Compare commits

..

6 Commits

Author SHA1 Message Date
Arseny Sher
f0cbd5353a Use in wp custom WAL reader gracefully handling missing WAL.
and disable recovery on start.
2023-10-04 12:51:26 +03:00
Arseny Sher
8ea21686e1 Add safekeeper test_late_init. 2023-10-04 12:50:47 +03:00
Arseny Sher
a8e7eede2a Add check that WAL segments are identical after recovery. 2023-09-20 13:34:44 +03:00
Arseny Sher
2b91f507a8 Make test_pageserver_http_get_wal_receiver_success not wait for keepalive. 2023-09-18 17:44:39 +03:00
Arseny Sher
bb2c3253c6 Introduce safekeeper peer recovery.
Implements fetching of WAL by safekeeper from another safekeeper by imitating
behaviour of last elected leader. This allows to avoid WAL accumulation on
compute and facilitates faster compute startup as it doesn't need to download
any WAL. Actually removing WAL download in walproposer is a matter of another
patch though.

There is a per timeline task which always runs, checking regularly if it should
start recovery frome someone, meaning there is something to fetch and there is
no streaming compute. It then proceeds with fetching, finishing when there is
nothing more to receive.

Implements https://github.com/neondatabase/neon/pull/4875
2023-09-18 17:44:38 +03:00
Arseny Sher
bdf3769a2b Don't use AppenRequestHeader.epoch_start_lsn.
It is simpler to get it once from ProposerEelected.
2023-09-18 17:44:38 +03:00
64 changed files with 1678 additions and 2402 deletions

View File

@@ -834,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.11
VM_BUILDER_VERSION: v0.17.5
steps:
- name: Checkout
@@ -1091,9 +1091,8 @@ jobs:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"

54
Cargo.lock generated
View File

@@ -636,7 +636,7 @@ dependencies = [
"sha1",
"sync_wrapper",
"tokio",
"tokio-tungstenite",
"tokio-tungstenite 0.20.0",
"tower",
"tower-layer",
"tower-service",
@@ -1941,15 +1941,15 @@ dependencies = [
[[package]]
name = "hyper-tungstenite"
version = "0.11.1"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cc7dcb1ab67cd336f468a12491765672e61a3b6b148634dbfe2fe8acd3fe7d9"
checksum = "880b8b1c98a5ec2a505c7c90db6d3f6f1f480af5655d9c5b55facc9382a5a5b5"
dependencies = [
"hyper",
"pin-project-lite",
"pin-project",
"tokio",
"tokio-tungstenite",
"tungstenite",
"tokio-tungstenite 0.18.0",
"tungstenite 0.18.0",
]
[[package]]
@@ -2908,9 +2908,9 @@ dependencies = [
[[package]]
name = "pin-project-lite"
version = "0.2.13"
version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58"
checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116"
[[package]]
name = "pin-utils"
@@ -4641,6 +4641,18 @@ dependencies = [
"xattr",
]
[[package]]
name = "tokio-tungstenite"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54319c93411147bced34cb5609a80e0a8e44c5999c93903a81cd866630ec0bfd"
dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite 0.18.0",
]
[[package]]
name = "tokio-tungstenite"
version = "0.20.0"
@@ -4650,7 +4662,7 @@ dependencies = [
"futures-util",
"log",
"tokio",
"tungstenite",
"tungstenite 0.20.0",
]
[[package]]
@@ -4965,9 +4977,28 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
[[package]]
name = "tungstenite"
version = "0.20.1"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9"
checksum = "30ee6ab729cd4cf0fd55218530c4522ed30b7b6081752839b68fcec8d0960788"
dependencies = [
"base64 0.13.1",
"byteorder",
"bytes",
"http",
"httparse",
"log",
"rand",
"sha1",
"thiserror",
"url",
"utf-8",
]
[[package]]
name = "tungstenite"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e862a1c4128df0112ab625f55cd5c934bcb4312ba80b39ae4b4835a3fd58e649"
dependencies = [
"byteorder",
"bytes",
@@ -5617,7 +5648,6 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"tungstenite",
"url",
"uuid",
]

View File

@@ -78,7 +78,7 @@ hostname = "0.3.1"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
hyper-tungstenite = "0.9"
inotify = "0.10.2"
itertools = "0.10"
jsonwebtoken = "8"

View File

@@ -589,7 +589,8 @@ RUN case "${PG_VERSION}" in \
echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
#########################################################################################
#

View File

@@ -153,6 +153,18 @@ neon-pg-ext-%: postgres-%
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
# pg_embedding was temporarily released as hnsw from this repo, when we only
# supported PostgreSQL 14 and 15
neon-pg-ext-v14: neon-pg-ext-hnsw-v14
neon-pg-ext-v15: neon-pg-ext-hnsw-v15
neon-pg-ext-hnsw-%: postgres-headers-% postgres-%
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
@@ -167,6 +179,9 @@ neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
.PHONY: neon-pg-ext
neon-pg-ext: \

View File

@@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev openssl python-poetry lsof libicu-dev
libcurl4-openssl-dev openssl python-poetry lsof
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel libcurl-devel openssl poetry lsof libicu-devel
protobuf-devel libcurl-devel openssl poetry lsof
```
* On Arch based systems, these packages are needed:
```bash

View File

@@ -1,599 +0,0 @@
# Seamless tenant migration
- Author: john@neon.tech
- Created on 2023-08-11
- Implemented on ..
## Summary
The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
migration safe". Following that,
this RFC is about how those migrations are to be done:
1. Seamlessly (without interruption to client availability)
2. Quickly (enabling faster operations)
3. Efficiently (minimizing I/O and $ cost)
These points are in priority order: if we have to sacrifice
efficiency to make a migration seamless for clients, we will
do so, etc.
This is accomplished by introducing two high level changes:
- A dual-attached state for tenants, used in a control-plane-orchestrated
migration procedure that preserves availability during a migration.
- Warm secondary locations for tenants, where on-disk content is primed
for a fast migration of the tenant from its current attachment to this
secondary location.
## Motivation
Migrating tenants between pageservers is essential to operating a service
at scale, in several contexts:
1. Responding to a pageserver node failure by migrating tenants to other pageservers
2. Balancing load and capacity across pageservers, for example when a user expands their
database and they need to migrate to a pageserver with more capacity.
3. Restarting pageservers for upgrades and maintenance
The current situation steps for migration are:
- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
- attach to new node
- re-configure endpoints to use the new node
Once [generation numbers](025-generation-numbers.md) are implemented,
the detach step is no longer critical for correctness. So, we can
- attach to a new node,
- re-configure endpoints to use the new node, and then
- detach from the old node.
However, this still does not meet our seamless/fast/efficient goals:
- Not fast: The new node will have to download potentially large amounts
of data from S3, which may take many minutes.
- Not seamless: If we attach to a new pageserver before detaching an old one,
the new one might delete some objects that interrupt availability of reads on the old one.
- Not efficient: the old pageserver will continue uploading
S3 content during the migration that will never be read.
The user expectations for availability are:
- For planned maintenance, there should be zero availability
gap. This expectation is fulfilled by this RFC.
- For unplanned changes (e.g. node failures), there should be
minimal availability gap. This RFC provides the _mechanism_
to fail over quickly, but does not provide the failure _detection_
nor failover _policy_.
## Non Goals
- Defining service tiers with different storage strategies: the same
level of HA & overhead will apply to all tenants. This doesn't rule out
adding such tiers in future.
- Enabling pageserver failover in the absence of a control plane: the control
plane will remain the source of truth for what should be attached where.
- Totally avoiding availability gaps on unplanned migrations during
a failure (we expect a small, bounded window of
read unavailability of very recent LSNs)
- Workload balancing: this RFC defines the mechanism for moving tenants
around, not the higher level logic for deciding who goes where.
- Defining all possible configuration flows for tenants: the migration process
defined in this RFC demonstrates the sufficiency of the pageserver API, but
is not the only kind of configuration change the control plane will ever do.
The APIs defined here should let the control plane move tenants around in
whatever way is needed while preserving data safety and read availability.
## Impacted components
Pageserver, control plane
## Terminology
- **Attachment**: a tenant is _attached_ to a pageserver if it has
been issued a generation number, and is running an instance of
the `Tenant` type, ingesting the WAL, and available to serve
page reads.
- **Location**: locations are a superset of attachments. A location
is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
- **Secondary location**: a location which is not currently attached.
- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
## Implementation (high level)
### Warm secondary locations
To enable faster migrations, we will identify at least one _secondary location_
for each tenant. This secondary location will keep a warm cache of layers
for the tenant, so that if it is later attached, it can catch up with the
latest LSN quickly: rather than downloading everything, it only has to replay
the recent part of the WAL to advance from the remote_consistent_offset to the
most recent LSN in the WAL.
The control plane is responsible for selecting secondary locations, and
calling into pageservers to configure tenants into a secondary mode at this
new location, as well as attaching the tenant in its existing primary location.
The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
to advise secondaries of which layers should be downloaded.
### Location modes
Currently, we consider a tenant to be in one of two states on a pageserver:
- Attached: active `Tenant` object, and layers on local disk
- Detached: no layers on local disk, no runtime state.
We will extend this with finer-grained modes, whose purpose will become
clear in later sections:
- **AttachedSingle**: equivalent the existing attached state.
- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
does not do deletions.
- **AttachedStale**: like AttachedSingle, holds a stale generation,
do not do any remote storage operations.
- **Secondary**: keep local state on disk, periodically update from S3.
- **Detached**: equivalent to existing detached state.
To control these finer grained states, a new pageserver API endpoint will be added.
### Cutover procedure
Define old location and new location as "Node A" and "Node B". Consider
the case where both nodes are available, and Node B was previously configured
as a secondary location for the tenant we are migrating.
The cutover procedure is orchestrated by the control plane, calling into
the pageservers' APIs:
1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
2. Increment generation, and call to Node B requesting it to enter AttachedMulti
state with the new generation.
3. Call to Node B, requesting it to download the latest hot layers from remote storage,
according to the latest heatmap flushed by Node A.
4. Wait for Node B's WAL ingestion to catch up with node A's
5. Update endpoints to use node B instead of node A
6. Call to node B requesting it to enter state AttachedSingle.
7. Call to node A requesting it to enter state Secondary
The following table summarizes how the state of the system advances:
| Step | Node A | Node B | Node used by endpoints |
| :-----------: | :------------: | :------------: | :--------------------: |
| 1 (_initial_) | AttachedSingle | Secondary | A |
| 2 | AttachedStale | AttachedMulti | A |
| 3 | AttachedStale | AttachedMulti | A |
| 4 | AttachedStale | AttachedMulti | A |
| 5 (_cutover_) | AttachedStale | AttachedMulti | B |
| 6 | AttachedStale | AttachedSingle | B |
| 7 (_final_) | Secondary | AttachedSingle | B |
The procedure described for a clean handover from a live node to a secondary
is also used for failure cases and for migrations to a location that is not
configured as a secondary, by simply skipping irrelevant steps, as described in
the following sections.
#### Migration from an unresponsive node
If node A is unavailable, then all calls into
node A are skipped and we don't wait for B to catch up before
switching updating the endpoints to use B.
#### Migration to a location that is not a secondary
If node B is initially in Detached state, the procedure is identical. Since Node B
is coming from a Detached state rather than Secondary, the download of layers and
catch up with WAL will take much longer.
We might do this if:
- Attached and secondary locations are both critically low on disk, and we need
to migrate to a third node with more resources available.
- We are migrating a tenant which does not use secondary locations to save on cost.
#### Permanent migration away from a node
In the final step of the migration, we generally request the original node to enter a Secondary
state. This is typical if we are doing a planned migration during maintenance, or to
balance CPU/network load away from a node.
One might also want to permanently migrate away: this can be done by simply removing the secondary
location after the migration is complete, or as an optimization by substituting the Detached state
for the Secondary state in the final step.
#### Cutover diagram
```mermaid
sequenceDiagram
participant CP as Control plane
participant A as Node A
participant B as Node B
participant E as Endpoint
CP->>A: PUT Flush & go to AttachedStale
note right of A: A continues to ingest WAL
CP->>B: PUT AttachedMulti
CP->>B: PUT Download layers from latest heatmap
note right of B: B downloads from S3
loop Poll until download complete
CP->>B: GET download status
end
activate B
note right of B: B ingests WAL
loop Poll until catch up
CP->>B: GET visible WAL
CP->>A: GET visible WAL
end
deactivate B
CP->>E: Configure to use Node B
E->>B: Connect for reads
CP->>B: PUT AttachedSingle
CP->>A: PUT Secondary
```
#### Cutover from an unavailable pageserver
This case is far simpler: we may skip straight to our intended
end state.
```mermaid
sequenceDiagram
participant A as Node A
participant CP as Control plane
participant B as Node B
participant E as Endpoint
note right of A: Node A offline
activate A
CP->>B: PUT AttachedSingle
CP->>E: Configure to use Node B
E->>B: Connect for reads
deactivate A
```
## Implementation (detail)
### Purpose of AttachedMulti, AttachedStale
#### AttachedMulti
Ordinarily, an attached pageserver whose generation is the latest may delete
layers at will (e.g. during compaction). If a previous generation pageserver
is also still attached, and in use by endpoints, then this layer deletion could
lead to a loss of availability for the endpoint when reading from the previous
generation pageserver.
The _AttachedMulti_ state simply disables deletions. These will be enqueued
in `RemoteTimelineClient` until the control plane transitions the
node into AttachedSingle, which unblocks deletions. Other remote storage operations
such as uploads are not blocked.
AttachedMulti is not required for data safety, only to preserve availability
on pageservers running with stale generations.
A node enters AttachedMulti only when explicitly asked to by the control plane. It should
only remain in this state for the duration of a migration.
If a control plane bug leaves
the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
by timeline deletion.
During AttachedMulti, the Tenant is free to drop layers from local disk in response to
disk pressure: only the deletion of remote layers is blocked.
#### AttachedStale
Currently, a pageserver with a stale generation number will continue to
upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
will not be read back by future generations of pageservers.
The _AttachedStale_ state disables S3 uploads. The stale pageserver
will continue to ingest the WAL and write layers to local disk, but not to
do any uploads to S3.
A node may enter AttachedStale in two ways:
- Explicitly, when control plane calls into the node at the start of a migration.
- Implicitly, when the node tries to validate some deletions and discovers
that its generation is stale.
The AttachedStale state also disables sending consumption metrics from
that location: it is interpreted as an indication that some other pageserver
is already attached or is about to be attached, and that new pageserver will
be responsible for sending consumption metrics.
#### Disk Pressure & AttachedStale
Over long periods of time, a tenant location in AttachedStale will accumulate data
on local disk, as it cannot evict any layers written since it entered the
AttachStale state. We rely on the control plane to revert the location to
Secondary or Detached at the end of a migration.
This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
since _all_ the attached tenants will go into AttachedStale, we will be doing no
uploads at all, therefore ingested data will cause disk usage to increase continuously.
Under nominal conditions, the available disk space on pageservers should be sufficient
to complete the evacuation before this becomes a problem, but we must also handle
the case where we hit a low disk situation while in this state.
The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
touches each Tenant when it determines that a low-disk condition requires
some layer eviction. Having selected layers for eviction, the eviction
task calls `Timeline::evict_layers`.
**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
layers are not yet uploaded to S3, then the block on uploads will be lifted. This
will result in leaking some objects once a migration is complete, but will enable
the node to manage its disk space properly: if a node is left with some tenants
in AttachedStale indefinitely due to a network partition or control plane bug,
these tenants will not cause a full disk condition.
### Warm secondary updates
#### Layer heatmap
The secondary location's job is to serve reads **with the same quality of service as the original location
was serving them around the time of a migration**. This does not mean the secondary
location needs the whole set of layers: inactive layers that might soon
be evicted on the attached pageserver need not be downloaded by the
secondary. A totally idle tenant only needs to maintain enough on-disk
state to enable a fast cold start (i.e. the most recent image layers are
typically sufficient).
To enable this, we introduce the concept of a _layer heatmap_, which
acts as an advisory input to secondary locations to decide which
layers to download from S3.
#### Attached pageserver
The attached pageserver, if in state AttachedSingle, periodically
uploads a serialized heat map to S3. It may skip this if there
is no change since the last time it uploaded (e.g. if the tenant
is totally idle).
Additionally, when the tenant is flushed to remote storage prior to a migration
(the first step in [cutover procedure](#cutover-procedure)),
the heatmap is written out. This enables a future attached pageserver
to get an up to date view when deciding which layers to download.
#### Secondary location behavior
Secondary warm locations run a simple loop, implemented separately from
the main `Tenant` type, which represents attached tenants:
- Download the layer heatmap
- Select any "hot enough" layers to download, if there is sufficient
free disk space.
- Download layers, if they were not previously evicted (see below)
- Download the latest index_part.json
- Check if any layers currently on disk are no longer referenced by
IndexPart & delete them
Note that the heatmap is only advisory: if a secondary location has plenty
of disk space, it may choose to retain layers that aren't referenced
by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
if a node is very low on disk space, it might opt to raise the heat threshold required
to both downloading a layer, until more disk space is available.
#### Secondary locations & disk pressure
Secondary locations are subject to eviction on disk pressure, just as
attached locations are. For eviction purposes, the access time of a
layer in a secondary location will be the access time given in the heatmap,
rather than the literal time at which the local layer file was accessed.
The heatmap will indicate which layers are in local storage on the attached
location. The secondary will always attempt to get back to having that
set of layers on disk, but to avoid flapping, it will remember the access
time of the layer it was most recently asked to evict, and layers whose
access time is below that will not be re-downloaded.
The resulting behavior is that after a layer is evicted from a secondary
location, it is only re-downloaded once the attached pageserver accesses
the layer and uploads a heatmap reflecting that access time. On a pageserver
restart, the secondary location will attempt to download all layers in
the heatmap again, if they are not on local disk.
This behavior will be slightly different when secondary locations are
used for "low energy tenants", but that is beyond the scope of this RFC.
### Location configuration API
Currently, the `/tenant/<tenant_id>/config` API defines various
tunables like compaction settings, which apply to the tenant irrespective
of which pageserver it is running on.
A new "location config" structure will be introduced, which defines
configuration which is per-tenant, but local to a particular pageserver,
such as the attachment mode and whether it is a secondary.
The pageserver will expose a new per-tenant API for setting
the state: `/tenant/<tenant_id>/location/config`.
Body content:
```
{
state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
generation: Option<u32>,
configuration: `Option<TenantConfig>`
flush: bool
}
```
Existing `/attach` and `/detach` endpoint will have the same
behavior as calling `/location/config` with `AttachedSingle` and `Detached`
states respectively. These endpoints will be deprecated and later
removed.
The generation attribute is mandatory for entering `AttachedSingle` or
`AttachedMulti`.
The configuration attribute is mandatory when entering any state other
than `Detached`. This configuration is the same as the body for
the existing `/tenant/<tenant_id>/config` endpoint.
The `flush` argument indicates whether the pageservers should flush
to S3 before proceeding: this only has any effect if the node is
currently in AttachedSingle or AttachedMulti. This is used
during the first phase of migration, when transitioning the
old pageserver to AttachedSingle.
The `/re-attach` API response will be extended to include a `state` as
well as a `generation`, enabling the pageserver to enter the
correct state for each tenant on startup.
### Database schema for locations
A new table `ProjectLocation`:
- pageserver_id: int
- tenant_id: TenantId
- generation: Option<int>
- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
Notes:
- It is legacy for a Project to have zero `ProjectLocation`s
- The `pageserver` column in `Project` now means "to which pageserver should
endpoints connect", rather than simply which pageserver is attached.
- The `generation` column in `Project` remains, and is incremented and used
to set the generation of `ProjectLocation` rows when they are set into
an attached state.
- The `Detached` state is implicitly represented as the absence of
a `ProjectLocation`.
### Executing migrations
Migrations will be implemented as Go functions, within the
existing `Operation` framework in the control plane. These
operations are persistent, such that they will always keep
trying until completion: this property is important to avoid
leaving garbage behind on pageservers, such as AttachedStale
locations.
### Recovery from failures during migration
During migration, the control plane may encounter failures of either
the original or new pageserver, or both:
- If the original fails, skip past waiting for the new pageserver
to catch up, and put it into AttachedSingle immediately.
- If the new node fails, put the old pageserver into Secondary
and then back into AttachedSingle (this has the effect of
retaining on-disk state and granting it a fresh generation number).
- If both nodes fail, keep trying until one of them is available
again.
### Control plane -> Pageserver reconciliation
A migration may be done while the old node is unavailable,
in which case the old node may still be running in an AttachedStale
state.
In this case, it is undesirable to have the migration `Operation`
stay alive until the old node eventually comes back online
and can be cleaned up. To handle this, the control plane
should run a background reconciliation process to compare
a pageserver's attachments with the database, and clean up
any that shouldn't be there any more.
Note that there will be no work to do if the old node was really
offline, as during startup it will call into `/re-attach` and
be updated that way. The reconciliation will only be needed
if the node was unavailable but still running.
## Alternatives considered
### Only enabling secondary locations for tenants on a higher service tier
This will make sense in future, especially for tiny databases that may be
downloaded from S3 in milliseconds when needed.
However, it is not wise to do it immediately, because pageservers contain
a mixture of higher and lower tier workloads. If we had 1 tenant with
a secondary location and 9 without, then those other 9 tenants will do
a lot of I/O as they try to recover from S3, which may degrade the
service of the tenant which had a secondary location.
Until we segregate tenant on different service tiers on different pageserver
nodes, or implement & test QoS to ensure that tenants with secondaries are
not harmed by tenants without, we should use the same failover approach
for all the tenants.
### Hot secondary locations (continuous WAL replay)
Instead of secondary locations populating their caches from S3, we could
have them consume the WAL from safekeepers. The downsides of this would be:
- Double load on safekeepers, which are a less scalable service than S3
- Secondary locations' on-disk state would end up subtly different to
the remote state, which would make synchronizing with S3 more complex/expensive
when going into attached state.
The downside of only updating secondary locations from S3 is that we will
have a delay during migration from replaying the LSN range between what's
in S3 and what's in the pageserver. This range will be very small on
planned migrations, as we have the old pageserver flush to S3 immediately
before attaching the new pageserver. On unplanned migrations (old pageserver
is unavailable), the range of LSNs to replay is bounded by the flush frequency
on the old pageserver. However, the migration doesn't have to wait for the
replay: it's just that not-yet-replayed LSNs will be unavailable for read
until the new pageserver catches up.
We expect that pageserver reads of the most recent LSNs will be relatively
rare, as for an active endpoint those pages will usually still be in the postgres
page cache: this leads us to prefer synchronizing from S3 on secondary
locations, rather than consuming the WAL from safekeepers.
### Cold secondary locations
It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
Warm caches on secondary locations are necessary to meet
our availability goals.
### Pageserver-granularity failover
Instead of migrating tenants individually, we could have entire spare nodes,
and on a node death, move all its work to one of these spares.
This approach is avoided for several reasons:
- we would still need fine-grained tenant migration for other
purposes such as balancing load
- by sharing the spare capacity over many peers rather than one spare node,
these peers may use the capacity for other purposes, until it is needed
to handle migrated tenants. e.g. for keeping a deeper cache of their
attached tenants.
### Readonly during migration
We could simplify migrations by making both previous and new nodes go into a
readonly state, then flush remote content from the previous node, then activate
attachment on the secondary node.
The downside to this approach is a potentially large gap in readability of
recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
to incur the extra cost of double-replaying the WAL onto old and new nodes' local
storage during a migration.
### Peer-to-peer pageserver communication
Rather than uploading the heatmap to S3, attached pageservers could make it
available to peers.
Currently, pageservers have no peer to peer communication, so adding this
for heatmaps would incur significant overhead in deployment and configuration
of the service, and ensuring that when a new pageserver is deployed, other
pageservers are updated to be aware of it.
As well as simplifying implementation, putting heatmaps in S3 will be useful
for future analytics purposes -- gathering aggregated statistics on activity
pattersn across many tenants may be done directly from data in S3.

View File

@@ -1,244 +0,0 @@
# Sharding Phase 1: Static Key-space Sharding
## Summary
To enable databases with sizes approaching the capacity of a pageserver's disk,
it is necessary to break up the storage for the database, or _shard_ it.
Sharding in general is a complex area. This RFC aims to define a modest initial
capability that will permit creating large-capacity databases using a static configuration
defined at time of Tenant creation.
## Motivation
Currently, all data for a Tenant, including all its timelines, is stored on a single
pageserver. The local storage required may be several times larger than the actual
database size, due to LSM write inflation.
If a database is larger than what one pageserver can hold, then it becomes impossible
for the pageserver to hold it in local storage, as it must do to provide service to
clients.
### Prior art
Numerous: sharding is a long-discussed feature for the pageserver.
Prior art in other distributed systems is too broad to capture here: pretty much
any scale out storage system does something like this.
## Requirements
- Enable creating a large (for example, 16TiB) database without requiring dedicated
pageserver nodes.
- Share read/write bandwidth costs for large databases across pageservers, as well
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
that disrupt service to other tenants.
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
does not write out a single contiguous ranges of page numbers.
*Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
that a user might create on a current-gen enterprise SSD should also work well on
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
pageserver backend is not the limiting factor in the database size*.
## Non Goals
- Independently distributing timelines within the same tenant. If a tenant has many
timelines, then sharding may be a less efficient mechanism for distributing load than
sharing out timelines between pageservers.
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
based on the idea that separate mechanisms will make sense for each dimension.
## Impacted Components
pageserver, control plane, safekeeper (optional)
## Terminology
**Key**: a postgres page number. In the sense that the pageserver is a versioned key-value store,
the page number is the key in that store.
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
of keys and LSNs as a two dimensional space.
## Implementation
### Key sharding vs. LSN sharding
When we think of sharding across the two dimensional key/lsn space, this is an
opportunity to think about how the two dimensions differ:
- Sharding the key space distributes the _write_ workload of ingesting data
and compacting. This work must be carefully managed so that exactly one
node owns a given key.
- Sharding the LSN space distributes the _historical read_ workload. This work
can be done by anyone without any special coordination, as long as they can
see the remote index and layers.
The key sharding is the harder part, and also the more urgent one, to support larger
capacity databases. Because distributing historical LSN read work is a relatively
simpler problem that most users don't have, we defer it to future work. It is anticipated
that some quite simple P2P offload model will enable distributing work for historical
reads: a node which is low on space can call out to peer to ask it to download and
serve reads from a historical layer.
### Key mapping scheme
Having decided to focus on key sharding, we must next decide how we will map
keys to shards.
It is proposed to use a "wide striping" approach, to obtain a good compromise
between data locality and avoiding entire large relations mapping to the same shard.
The mapping is quite simple:
- Define a stripe size, such as 256MiB. Map this to a key count, such that a contiguous
range of 256MiB keys would all fall into this stripe, i.e. divide by 8kiB to get 32k.
- Map a key to a stripe by integer division.
- Map a stripe to a shard by taking the shard index modulo the shard count.
This scheme will achieve a good balance as long as there is no aliasing of the keys
to the stripe width. In the example above, if someone had 4 shards and wrote
keys that were all 4*32k apart, they would all map to the same shard. However, we do
not have to worry about this, since end users do not control page numbers: as long as
we do not pick stripe sizes that map to any problematic postgres behaviors, we'll be fine.
### Important Types
#### `ShardMap`
Provides all the information needed to route a request for a particular
key to the correct pageserver:
- Stripe size
- Shard count
- Address of the pageserver hosting each shard
This structure's size is linear with the number of shards.
#### `ShardIdentity`
Provides the information needed to know whether a particular key belongs
to a particular shard:
- Stripe size
- Shard count
- Shard index
This structure's size is constant.
### Pageserver changes
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
TenantShards, which are just a `Tenant` plus a `ShardIdentity` telling it which part
of the keyspace it owns.
When the pageserver subscribes to a safekeeper for WAL updates, it must provide
its `ShardIdentity` to receive the relevant subset of the WAL.
When the pageserver writes layers and index_part.json to remote storage, it must
include the shard index & count in the name, to avoid collisions (the count is
necessary for future-proofing: the count will vary in time). These keys
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
exactly the same for TenantShards as it does for Tenants today: each shard will have
its own generation number.
The pageserver doesn't have to do anything special during ingestion, compaction
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
This will result in sparse layer files, containing keys only in the stripes that this
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
the key range, these should be updated to ignore gaps that are due to sharding, to
avoid spuriously splitting up layers ito stripe-sized pieces.
### Pageserver Controller changes
The pageserver controller is a new component, which is responsible for abstracting
away the business of managing individual tenant placement on pagservers. It will
also act as the abstraction on top of sharding, so that the control plane continue
to see a Tenant as a single object, even though the reality is that it is many
TenantShards.
For the rest of this RFC, think of the Pageserver Controller as a component of
the control plane. The actual implementation is beyond the scope of this RFC
and will be described in more detail elsewhere.
### Safekeeper changes
The safekeeper's API for subscribing to a WAL will be extended to enable callers
to provide a `ShardIdentity`. In this mode it will only send WAL entries that
fall within the keyspace belonging to the shard, and WAL entries that are to
be mirrored to all shards.
Metadata updates describing databases+relations are mirrored to
all shards, and other WAL messages are only provided to the shard
that owns the key being updated. For any operation that updates multiple
keys, it will be provided to all the shards whose key ranges intersect with
one or more of the keys referenced in the WAL message.
### Pageserver Controller
### Endpoints
Compute endpoints will need to:
- Accept a ShardMap as part of their configuration from the control plane
- Route pageserver requests according to that ShardMap
### Control Plane
#### Publishing ShardMap updates
The control plane will provide an API for the pageserver controller to publish updates
to the ShardMap for a tenant. When such an update is provided, it will be used to
update the configuration of any endpoints currently active for the tenant.
The ShardMap will be opaque to the Control Plane: it doesn't need to do anything with it
other than storing and passing on to endpoints.
#### Attaching via the Pageserver Controller
The Control Plane will issue attach/create API calls to the pageserver controller
instead of directly to pageservers. This will relieve the control plane of the need
to know about sharding.
#### Enabling sharding for large tenants
When a Tenant is created, it is up to the control plane to provide a hint to
the pageserver about how large it will be. This may be implemented as a service tier,
where users creating very large databases would be onboarded to the tier, and then
the Tenants they create would be created with a larger number of shards. For the
general population of users we should continue to use 1 shard by default.
## Next Steps
Clearly, the mechanism described in this RFC has substantial limitations:
- A) the number of shards in a tenant is defined at creation time.
- B) data is not distributed across the LSN dimension
To address `A`, a _splitting_ feature will later be added. One shard can split its
data into a number of children by doing a special compaction operation to generate
image layers broken up child-shard-wise, and then writing out an index_part.json for
each child. This will then require coordination with the pageserver controller to
safely attach these new child shards and then move them around to distribute work.
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
once a Tenant has been sharded, there is little value in merging it again.
To address `B`, it is envisaged to have some gossip mechanism for pageservers to communicate
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
ask another to go read the necessary layers from remote storage to serve the read. This
requires relativly little coordination because it is read-only: any node can service any
read. All reads to a particular shard would still flow through one node, but the
disk capactity & I/O impact of servicing the read would be distributed.
## FAQ/Alternatives
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
When a database is growing under a write workload, writes may predominantly hit the
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
is intensively re-writing a particular relation, if that relation lived in a particular
shard then it would not achieve our goal of distributing the write work across shards.
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
Two reasons:
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
database would still cause a load hotspot on the pageserver routing its read requests.
2. Implementing a proxy model as a stop-gap would not be a cheap option, because
it requires making pageservers aware of their peers, and adding synchronisation to
keep pageservers aware of their peers as they come and go.

View File

@@ -1,119 +0,0 @@
# Pageserver Controller Phase 1: Generations
## Summary
In the [generation numbers RFC](025-generation-numbers.md), it was proposed that
the console/control plane would act as the central coordinator for issuing generation
numbers.
That approach has not proven practical, so this RFC proposes an alternative implementation
where generation numbers are managed in a different service.
Calls to generation-aware pageserver APIs like create/attach will call out to this
new _pageserver controller_ to acquire generation numbers. This service will also
form the basis for satisfying future pageserver management requirements, such as
coordinating sharding, doing automatic capacity balancing, and many more.
## Motivation
This is a dependency for delivering high availability.
### Prior art
None
## Requirements
- Provide a hook for the pageserver to use when it receives an attach/create/load API
call, which will yield a generation that is safe for the pageserver to use.
- Implement the /re-attach and /validate APIs required for the generation numbers feature
to work.
## Non Goals
- This is not intended to interact with any components other than the pageserver, or
to integrate with the broader control plane in any way.
## Impacted Components
pageserver, pageserver controller (new)
## Implementation
We may start from the minimal `attachment_service` used in automated tests.
### Data store
For generation numbers, we need a persistent, linearizable data store. Postgres is sufficient for
this: we already have postgres instances used for other control plane work.
The storage for the Pageserver Controller will be independent of other components:
it might use the same physical database server but would use an independent database.
### Deployment
There will be one instance per region. In future we would aim to define the concept
of a pageserver cluster and have one controller per cluster, but in the short term
one per region will be functionally okay for current scale.
The pageserver controller will be deployed within kubernetes, in the same way as
the storage broker (which is currently via a [helm chart](https://github.com/neondatabase/helm-charts/tree/main/charts/neon-storage-broker)).
### Security
The pageserver controller's API will do authentication with JWT, the same as
the pageserver's existing API.
### Correctness
It is essential that pageservers call into the controller at the _very start_ of
handling attach/create/load API requests. They should not do any work at all until
they have acquired that generation number.
If the call fails, they must retry: it is not safe to proceed without a generation number.
## Future
Having a call chain that goes `Control plane -> Pageserver -> Pageserver controller`
is clearly a little strange: we are only doing this to avoid needing to make changes
to the control plane.
In future, we will change the control plane to call directly into the pageserver
controller, which would then call onwards into the pageserver. This would be a fairly
small change to the controller, since all the logic around storing and updating
generation numbers would stay the same: just the behavior of the API frontend
would be different.
The work to enable pageservers to communicate with the controller is not wasted,
because they still communicate in that direction when invoking `/re-attach`
and `/validate`
## Alternatives considered
### Run in the console/control plane codebase
The control plane is a large Go codebase that uses extensive code generation, and
has to be quite generic to manage many different types of component.
### Direct DB access
We could have pageservers call directly into a shared database to acquire and update
generation numbers (with carefully crafted transactions to protect against concurrent
attaches getting the same generation, etc).
Pros:
- No extra service required, simpler deployment
Cons:
- No future path to a cleaner architecture: the pageserver controller can be implemented
as an extensible place for implement more functionality in future, whereas a mechanism
to do generation numbers via SQL queries from the pageserver would be specialized
and the code would probably be disposed of in the relatively near future.
- Puts onus entirely on SQL query correctness to mediate concurrent access.
The pageserver controller also has to be correct in this respect in case there
is more than one instance running, but it is much less likely to hit this path,
so the overall risk of issues is lower when using a central service.
The main downside to that approach is that it doesn't provide the future path that
the pageserver controller does

View File

@@ -3,7 +3,7 @@
//!
use chrono::{DateTime, Utc};
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde::Serialize;
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
@@ -54,8 +54,8 @@ impl EventType {
}
}
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra, Metric> {
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra, Metric: Serialize> {
#[serde(flatten)]
#[serde(rename = "type")]
pub kind: EventType,

View File

@@ -561,7 +561,14 @@ impl CgroupWatcher {
/// Setting these values also affects the thresholds for receiving usage alerts.
#[derive(Debug)]
pub struct MemoryLimits {
pub high: u64,
high: u64,
max: u64,
}
impl MemoryLimits {
pub fn new(high: u64, max: u64) -> Self {
Self { max, high }
}
}
// Methods for manipulating the actual cgroup
@@ -638,7 +645,12 @@ impl CgroupWatcher {
/// Set cgroup memory.high and memory.max.
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
info!(limits.high, path = self.path(), "writing new memory limits",);
info!(
limits.high,
limits.max,
path = self.path(),
"writing new memory limits",
);
self.memory()
.context("failed to get memory subsystem while setting memory limits")?
.set_mem(cgroups_rs::memory::SetMemory {
@@ -647,7 +659,7 @@ impl CgroupWatcher {
high: Some(MaxValue::Value(
u64::min(limits.high, i64::MAX as u64) as i64
)),
max: None,
max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
})
.context("failed to set memory limits")
}
@@ -655,7 +667,7 @@ impl CgroupWatcher {
/// Given some amount of available memory, set the desired cgroup memory limits
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
let new_high = self.config.calculate_memory_high_value(available_memory);
let limits = MemoryLimits { high: new_high };
let limits = MemoryLimits::new(new_high, available_memory);
info!(
path = self.path(),
memory = ?limits,

View File

@@ -257,11 +257,12 @@ impl Runner {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
let limits = MemoryLimits {
let limits = MemoryLimits::new(
// new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
// since it is properly initialized in the previous cgroup if let block
high: new_cgroup_mem_high,
};
new_cgroup_mem_high,
available_memory,
);
cgroup
.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
@@ -327,9 +328,7 @@ impl Runner {
name = cgroup.path(),
"updating cgroup memory.high",
);
let limits = MemoryLimits {
high: new_cgroup_mem_high,
};
let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
cgroup
.set_limits(&limits)
.context("failed to set file cache size")?;

View File

@@ -3,8 +3,6 @@
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.
use anyhow::Result;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use std::cmp::Ordering;
use std::collections::BinaryHeap;
@@ -98,9 +96,9 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
}
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
@@ -127,7 +125,6 @@ async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Resul
prev_key = Some(curr.next());
true
},
ctx,
)
.await?;
let mut holes = heap.into_vec();
@@ -138,7 +135,6 @@ async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Resul
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
let storage_path = &cmd.path;
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
pageserver::virtual_file::init(10);
@@ -167,7 +163,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
parse_filename(&layer.file_name().into_string().unwrap())
{
if layer_file.is_delta {
layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
n_deltas += 1;
}
layers.push(layer_file);

View File

@@ -2,8 +2,6 @@ use std::path::{Path, PathBuf};
use anyhow::Result;
use clap::Subcommand;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
@@ -46,12 +44,12 @@ pub(crate) enum LayerCmd {
},
}
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
let path = path.as_ref();
virtual_file::init(10);
page_cache::init(100);
let file = FileBlockReader::new(VirtualFile::open(path).await?);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
actual_summary.index_start_blk,
@@ -69,12 +67,11 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
all.push((curr, BlobRef(value_offset)));
true
},
ctx,
)
.await?;
let cursor = BlockCursor::new_fileblockreader(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos(), ctx).await?;
let value = cursor.read_blob(v.pos()).await?;
println!("key:{} value_len:{}", k, value.len());
}
// TODO(chi): special handling for last key?
@@ -82,7 +79,6 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
}
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
match cmd {
LayerCmd::List { path } => {
for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? {
@@ -157,7 +153,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
);
if layer_file.is_delta {
read_delta_file(layer.path(), &ctx).await?;
read_delta_file(layer.path()).await?;
} else {
anyhow::bail!("not supported yet :(");
}

View File

@@ -14,7 +14,7 @@ use tracing::*;
use utils::id::NodeId;
mod metrics;
use metrics::MetricsKey;
use metrics::{Ids, MetricsKey};
mod disk_cache;
mod upload;
@@ -68,11 +68,10 @@ pub async fn collect_metrics(
},
);
let path: Arc<PathBuf> = Arc::new(local_disk_storage);
let final_path: Arc<PathBuf> = Arc::new(local_disk_storage);
let cancel = task_mgr::shutdown_token();
let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
let restore_and_reschedule = restore_and_reschedule(&final_path, metric_collection_interval);
let mut cached_metrics = tokio::select! {
_ = cancel.cancelled() => return Ok(()),
@@ -109,14 +108,14 @@ pub async fn collect_metrics(
// already here, better to try to flush the new values.
let flush = async {
match disk_cache::flush_metrics_to_disk(&metrics, &path).await {
match disk_cache::flush_metrics_to_disk(&metrics, &final_path).await {
Ok(()) => {
tracing::debug!("flushed metrics to disk");
}
Err(e) => {
// idea here is that if someone creates a directory as our path, then they
// idea here is that if someone creates a directory as our final_path, then they
// might notice it from the logs before shutdown and remove it
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
tracing::error!("failed to persist metrics to {final_path:?}: {e:#}");
}
}
};
@@ -153,10 +152,12 @@ pub async fn collect_metrics(
///
/// Cancellation safe.
async fn restore_and_reschedule(
path: &Arc<PathBuf>,
final_path: &Arc<PathBuf>,
metric_collection_interval: Duration,
) -> Cache {
let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(final_path.clone())
.await
{
Ok(found_some) => {
// there is no min needed because we write these sequentially in
// collect_all_metrics
@@ -174,11 +175,12 @@ async fn restore_and_reschedule(
use std::io::{Error, ErrorKind};
let root = e.root_cause();
let maybe_ioerr = root.downcast_ref::<Error>();
let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound);
if !is_not_found {
tracing::info!("failed to read any previous metrics from {path:?}: {e:#}");
tracing::info!("failed to read any previous metrics from {final_path:?}: {e:#}");
}
(HashMap::new(), None)

View File

@@ -9,13 +9,6 @@ pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result
let span = tracing::Span::current();
tokio::task::spawn_blocking(move || {
let _e = span.entered();
if let Some(parent) = path.parent() {
if let Err(e) = scan_and_delete_with_same_prefix(&path) {
tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
}
}
let mut file = std::fs::File::open(&*path)?;
let reader = std::io::BufReader::new(&mut file);
anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
@@ -25,68 +18,26 @@ pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result
.and_then(|x| x)
}
fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
let it = std::fs::read_dir(path.parent().expect("caller checked"))?;
let prefix = path.file_name().expect("caller checked").to_string_lossy();
for entry in it {
let entry = entry?;
if !entry.metadata()?.is_file() {
continue;
}
let file_name = entry.file_name();
if path.file_name().unwrap() == file_name {
// do not remove our actual file
continue;
}
let file_name = file_name.to_string_lossy();
if !file_name.starts_with(&*prefix) {
continue;
}
let path = entry.path();
if let Err(e) = std::fs::remove_file(&path) {
tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
} else {
tracing::info!("cleaned up old tempfile {file_name:?}");
}
}
Ok(())
}
pub(super) async fn flush_metrics_to_disk(
current_metrics: &Arc<Vec<RawMetric>>,
path: &Arc<PathBuf>,
final_path: &Arc<PathBuf>,
) -> anyhow::Result<()> {
use std::io::Write;
anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
anyhow::ensure!(
path.file_name().is_some(),
"path must have filename: {path:?}"
final_path.parent().is_some(),
"path must have parent: {final_path:?}"
);
let span = tracing::Span::current();
tokio::task::spawn_blocking({
let current_metrics = current_metrics.clone();
let path = path.clone();
let final_path = final_path.clone();
move || {
let _e = span.entered();
let parent = path.parent().expect("existence checked");
let file_name = path.file_name().expect("existence checked");
let mut tempfile = tempfile::Builder::new()
.prefix(file_name)
.suffix(".tmp")
.tempfile_in(parent)?;
tracing::debug!("using tempfile {:?}", tempfile.path());
let mut tempfile =
tempfile::NamedTempFile::new_in(final_path.parent().expect("existence checked"))?;
// write out all of the raw metrics, to be read out later on restart as cached values
{
@@ -101,17 +52,15 @@ pub(super) async fn flush_metrics_to_disk(
tempfile.flush()?;
tempfile.as_file().sync_all()?;
fail::fail_point!("before-persist-last-metrics-collected");
drop(tempfile.persist(&*final_path)?);
drop(tempfile.persist(&*path).map_err(|e| e.error)?);
let f = std::fs::File::open(path.parent().unwrap())?;
let f = std::fs::File::open(final_path.parent().unwrap())?;
f.sync_all()?;
anyhow::Ok(())
}
})
.await
.with_context(|| format!("write metrics to {path:?} join error"))
.and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
.with_context(|| format!("write metrics to {final_path:?} join error"))
.and_then(|x| x.with_context(|| format!("write metrics to {final_path:?}")))
}

View File

@@ -1,21 +1,36 @@
use crate::context::RequestContext;
use anyhow::Context;
use crate::tenant::mgr;
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use serde_with::serde_as;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use pageserver_api::models::TenantState;
use serde::Serialize;
use serde_with::{serde_as, DisplayFromStr};
use std::sync::Arc;
use std::time::SystemTime;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use anyhow::Context;
use super::{Cache, RawMetric};
// FIXME: all other consumption_metrics::Event stuff is over at uploading, maybe move?
#[serde_as]
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy)]
pub(super) struct Ids {
#[serde_as(as = "DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
}
/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
/// instead of static str.
// Do not rename any of these without first consulting with data team and partner
// management.
// FIXME: write those tests before refactoring to this!
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(super) enum Name {
/// Timeline last_record_lsn, absolute
@@ -44,7 +59,7 @@ pub(super) enum Name {
/// elsewhere.
#[serde_with::serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(crate) struct MetricsKey {
pub(super) struct MetricsKey {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
@@ -68,7 +83,7 @@ impl MetricsKey {
struct AbsoluteValueFactory(MetricsKey);
impl AbsoluteValueFactory {
const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
let key = self.0;
(key, (EventType::Absolute { time }, val))
}
@@ -83,7 +98,7 @@ struct IncrementalValueFactory(MetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
const fn from_until(
fn from_previous_up_to(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
@@ -91,11 +106,16 @@ impl IncrementalValueFactory {
) -> RawMetric {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
let when = EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
};
(key, (when, val))
(
key,
(
EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
},
val,
),
)
}
fn key(&self) -> &MetricsKey {
@@ -189,11 +209,9 @@ pub(super) async fn collect_all_metrics(
cached_metrics: &Cache,
ctx: &RequestContext,
) -> Vec<RawMetric> {
use pageserver_api::models::TenantState;
let started_at = std::time::Instant::now();
let tenants = match crate::tenant::mgr::list_tenants().await {
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(err) => {
tracing::error!("failed to list tenants: {:?}", err);
@@ -205,7 +223,7 @@ pub(super) async fn collect_all_metrics(
if state != TenantState::Active {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
mgr::get_tenant(id, true)
.await
.ok()
.map(|tenant| (id, tenant))
@@ -267,7 +285,7 @@ where
current_metrics
}
/// In-between abstraction to allow testing metrics without actual Tenants.
/// Testing helping in-between abstraction allowing testing metrics without actual Tenants.
struct TenantSnapshot {
resident_size: u64,
remote_size: u64,
@@ -423,14 +441,14 @@ impl TimelineSnapshot {
let up_to = now;
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
let key_value = written_size_delta_key.from_previous_up_to(prev.0, up_to, delta);
// written_size_delta
metrics.push(key_value);
// written_size
metrics.push((key, written_size_now));
} else {
// the cached value was ahead of us, report zero until we've caught up
metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
metrics.push(written_size_delta_key.from_previous_up_to(prev.0, up_to, 0));
// the cached value was ahead of us, report the same until we've caught up
metrics.push((key, (written_size_now.0, prev.1)));
}
@@ -450,6 +468,3 @@ impl TimelineSnapshot {
#[cfg(test)]
mod tests;
#[cfg(test)]
pub(crate) use tests::metric_examples;

View File

@@ -1,7 +1,13 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use super::*;
use chrono::{DateTime, Utc};
#[test]
fn startup_collected_timeline_metrics_before_advancing() {
@@ -27,7 +33,7 @@ fn startup_collected_timeline_metrics_before_advancing() {
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
snap.loaded_at.1.into(),
now,
0
@@ -67,7 +73,8 @@ fn startup_collected_timeline_metrics_second_round() {
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_previous_up_to(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
@@ -93,7 +100,11 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
// at t=before was the last time the last_record_lsn changed
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
// end time of this event is used for the next ones
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
before,
just_before,
0,
),
]);
let snap = TimelineSnapshot {
@@ -107,13 +118,81 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
just_before,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn metric_image_stability() {
// it is important that these strings stay as they are
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);
let now = DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z").unwrap();
let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z").unwrap();
let [now, before] = [DateTime::<Utc>::from(now), DateTime::from(before)];
let examples = [
(
line!(),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_previous_up_to(before, now, 0),
r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
MetricsKey::resident_size(tenant_id).at(now, 0),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
MetricsKey::synthetic_size(tenant_id).at(now, 1),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
),
];
let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(now, "1", 0);
for (line, (key, (kind, value)), expected) in examples {
let e = consumption_metrics::Event {
kind,
metric: key.metric,
idempotency_key: idempotency_key.to_string(),
value,
extra: Ids {
tenant_id: key.tenant_id,
timeline_id: key.timeline_id,
},
};
let actual = serde_json::to_string(&e).unwrap();
assert_eq!(expected, actual, "example from line {line}");
}
}
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
// it can happen that we lose the inmemorylayer but have previously sent metrics and we
@@ -141,7 +220,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
let mut cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
way_before,
before_restart,
// not taken into account, but the timestamps are important
@@ -155,7 +234,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
before_restart,
now,
0
@@ -173,7 +252,8 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_previous_up_to(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
]
);
@@ -279,19 +359,3 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
times
}
pub(crate) const fn metric_examples(
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [RawMetric; 6] {
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::resident_size(tenant_id).at(now, 0),
MetricsKey::synthetic_size(tenant_id).at(now, 1),
]
}

View File

@@ -1,21 +1,8 @@
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use serde_with::serde_as;
use consumption_metrics::{idempotency_key, Event, EventChunk, CHUNK_SIZE};
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::*;
use super::{metrics::Name, Cache, MetricsKey, RawMetric};
use utils::id::{TenantId, TimelineId};
/// How the metrics from pageserver are identified.
#[serde_with::serde_as]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
struct Ids {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
}
use super::{Cache, Ids, RawMetric};
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics(
@@ -26,21 +13,44 @@ pub(super) async fn upload_metrics(
metrics: &[RawMetric],
cached_metrics: &mut Cache,
) -> anyhow::Result<()> {
use bytes::BufMut;
let mut uploaded = 0;
let mut failed = 0;
let started_at = std::time::Instant::now();
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
let mut buffer = bytes::BytesMut::new();
let mut chunk_to_send = Vec::new();
while let Some(res) = iter.next() {
let (chunk, body) = res?;
for chunk in metrics.chunks(CHUNK_SIZE) {
chunk_to_send.clear();
// FIXME: this should always overwrite and truncate to chunk.len()
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
metric: curr_key.metric,
// FIXME: finally write! this to the prev allocation
idempotency_key: idempotency_key(node_id),
value: *curr_val,
extra: Ids {
tenant_id: curr_key.tenant_id,
timeline_id: curr_key.timeline_id,
},
}));
serde_json::to_writer(
(&mut buffer).writer(),
&EventChunk {
events: (&chunk_to_send).into(),
},
)?;
let body = buffer.split().freeze();
let event_bytes = body.len();
let is_last = iter.len() == 0;
let res = upload(client, metric_collection_endpoint, body, cancel, is_last)
let res = upload(client, metric_collection_endpoint, body, cancel)
.instrument(tracing::info_span!(
"upload",
%event_bytes,
@@ -78,150 +88,6 @@ pub(super) async fn upload_metrics(
Ok(())
}
// The return type is quite ugly, but we gain testability in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,
input: &'a [RawMetric],
factory: F,
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
where
F: KeyGen<'a> + 'a,
{
use bytes::BufMut;
struct Iter<'a, F> {
inner: std::slice::Chunks<'a, RawMetric>,
chunk_size: usize,
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
buffer: bytes::BytesMut,
// chunk amount of events are reused to produce the serialized document
scratch: Vec<Event<Ids, Name>>,
factory: F,
}
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
fn next(&mut self) -> Option<Self::Item> {
let chunk = self.inner.next()?;
if self.scratch.is_empty() {
// first round: create events with N strings
self.scratch.extend(
chunk
.iter()
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
);
} else {
// next rounds: update_in_place to reuse allocations
assert_eq!(self.scratch.len(), self.chunk_size);
self.scratch
.iter_mut()
.zip(chunk.iter())
.for_each(|(slot, raw_metric)| {
raw_metric.update_in_place(slot, &self.factory.generate())
});
}
let res = serde_json::to_writer(
(&mut self.buffer).writer(),
&EventChunk {
events: (&self.scratch[..chunk.len()]).into(),
},
);
match res {
Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))),
Err(e) => Some(Err(e)),
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
let buffer = bytes::BytesMut::new();
let inner = input.chunks(chunk_size);
let scratch = Vec::new();
Iter {
inner,
chunk_size,
buffer,
scratch,
factory,
}
}
trait RawMetricExt {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
}
impl RawMetricExt for RawMetric {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
let MetricsKey {
metric,
tenant_id,
timeline_id,
} = self.0;
let (kind, value) = self.1;
Event {
kind,
metric,
idempotency_key: key.to_string(),
value,
extra: Ids {
tenant_id,
timeline_id,
},
}
}
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
use std::fmt::Write;
let MetricsKey {
metric,
tenant_id,
timeline_id,
} = self.0;
let (kind, value) = self.1;
*event = Event {
kind,
metric,
idempotency_key: {
event.idempotency_key.clear();
write!(event.idempotency_key, "{key}").unwrap();
std::mem::take(&mut event.idempotency_key)
},
value,
extra: Ids {
tenant_id,
timeline_id,
},
};
}
}
trait KeyGen<'a>: Copy {
fn generate(&self) -> IdempotencyKey<'a>;
}
impl<'a> KeyGen<'a> for &'a str {
fn generate(&self) -> IdempotencyKey<'a> {
IdempotencyKey::generate(self)
}
}
enum UploadError {
Rejected(reqwest::StatusCode),
Reqwest(reqwest::Error),
@@ -253,16 +119,11 @@ impl UploadError {
}
}
// this is consumed by the test verifiers
static LAST_IN_BATCH: reqwest::header::HeaderName =
reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");
async fn upload(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
body: bytes::Bytes,
cancel: &CancellationToken,
is_last: bool,
) -> Result<(), UploadError> {
let warn_after = 3;
let max_attempts = 10;
@@ -273,24 +134,17 @@ async fn upload(
let res = client
.post(metric_collection_endpoint.clone())
.header(reqwest::header::CONTENT_TYPE, "application/json")
.header(
LAST_IN_BATCH.clone(),
if is_last { "true" } else { "false" },
)
.body(body)
.send()
.await;
let res = res.and_then(|res| res.error_for_status());
// 10 redirects are normally allowed, so we don't need worry about 3xx
match res {
Ok(_response) => Ok(()),
Err(e) => {
let status = e.status().filter(|s| s.is_client_error());
if let Some(status) = status {
// rejection used to be a thing when the server could reject a
// whole batch of metrics if one metric was bad.
Err(UploadError::Rejected(status))
} else {
Err(UploadError::Reqwest(e))
@@ -321,123 +175,3 @@ async fn upload(
res
}
#[cfg(test)]
mod tests {
use super::*;
use chrono::{DateTime, Utc};
use once_cell::sync::Lazy;
#[test]
fn chunked_serialization() {
let examples = metric_samples();
assert!(examples.len() > 1);
let factory = FixedGen::new(Utc::now(), "1", 42);
// need to use Event here because serde_json::Value uses default hashmap, not linked
// hashmap
#[derive(serde::Deserialize)]
struct EventChunk {
events: Vec<Event<Ids, Name>>,
}
let correct = serialize_in_chunks(examples.len(), &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
for chunk_size in 1..examples.len() {
let actual = serialize_in_chunks(chunk_size, &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
// if these are equal, it means that multi-chunking version works as well
assert_eq!(correct, actual);
}
}
#[derive(Clone, Copy)]
struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);
impl<'a> FixedGen<'a> {
fn new(now: chrono::DateTime<chrono::Utc>, node_id: &'a str, nonce: u16) -> Self {
FixedGen(now, node_id, nonce)
}
}
impl<'a> KeyGen<'a> for FixedGen<'a> {
fn generate(&self) -> IdempotencyKey<'a> {
IdempotencyKey::for_tests(self.0, self.1, self.2)
}
}
static SAMPLES_NOW: Lazy<DateTime<Utc>> = Lazy::new(|| {
DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
.unwrap()
.into()
});
#[test]
fn metric_image_stability() {
// it is important that these strings stay as they are
let examples = [
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
),
];
let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
let examples = examples.into_iter().zip(metric_samples());
for ((line, expected), (key, (kind, value))) in examples {
let e = consumption_metrics::Event {
kind,
metric: key.metric,
idempotency_key: idempotency_key.to_string(),
value,
extra: Ids {
tenant_id: key.tenant_id,
timeline_id: key.timeline_id,
},
};
let actual = serde_json::to_string(&e).unwrap();
assert_eq!(expected, actual, "example for {kind:?} from line {line}");
}
}
fn metric_samples() -> [RawMetric; 6] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);
let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
.unwrap()
.into();
let [now, before] = [*SAMPLES_NOW, before];
super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
}
}

View File

@@ -94,18 +94,6 @@ pub struct RequestContext {
task_kind: TaskKind,
download_behavior: DownloadBehavior,
access_stats_behavior: AccessStatsBehavior,
page_content_kind: PageContentKind,
}
/// The kind of access to the page cache.
#[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
pub enum PageContentKind {
Unknown,
DeltaLayerBtreeNode,
DeltaLayerValue,
ImageLayerBtreeNode,
ImageLayerValue,
InMemoryLayer,
}
/// Desired behavior if the operation requires an on-demand download
@@ -149,7 +137,6 @@ impl RequestContextBuilder {
task_kind,
download_behavior: DownloadBehavior::Download,
access_stats_behavior: AccessStatsBehavior::Update,
page_content_kind: PageContentKind::Unknown,
},
}
}
@@ -162,7 +149,6 @@ impl RequestContextBuilder {
task_kind: original.task_kind,
download_behavior: original.download_behavior,
access_stats_behavior: original.access_stats_behavior,
page_content_kind: original.page_content_kind,
},
}
}
@@ -181,11 +167,6 @@ impl RequestContextBuilder {
self
}
pub(crate) fn page_content_kind(mut self, k: PageContentKind) -> Self {
self.inner.page_content_kind = k;
self
}
pub fn build(self) -> RequestContext {
self.inner
}
@@ -282,8 +263,4 @@ impl RequestContext {
pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
self.access_stats_behavior
}
pub(crate) fn page_content_kind(&self) -> PageContentKind {
self.page_content_kind
}
}

View File

@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
{
pg_control = Some(control_file);
}
modification.flush(ctx).await?;
modification.flush().await?;
}
}
// We're done importing all the data files.
modification.commit(ctx).await?;
modification.commit().await?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush(ctx).await?;
modification.flush().await?;
}
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit(ctx).await?;
modification.commit().await?;
Ok(())
}

View File

@@ -1,4 +1,3 @@
use enum_map::EnumMap;
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
@@ -128,24 +127,22 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub struct PageCacheMetricsForTaskKind {
pub struct PageCacheMetrics {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_ephemeral: IntCounter,
pub read_accesses_immutable: IntCounter,
pub read_hits_ephemeral: IntCounter,
pub read_hits_immutable: IntCounter,
pub read_hits_materialized_page_exact: IntCounter,
pub read_hits_materialized_page_older_lsn: IntCounter,
}
pub struct PageCacheMetrics {
map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
}
static PAGE_CACHE_READ_HITS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_hits_total",
"Number of read accesses to the page cache that hit",
&["task_kind", "key_kind", "content_kind", "hit_kind"]
&["key_kind", "hit_kind"]
)
.expect("failed to define a metric")
});
@@ -154,73 +151,55 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_read_accesses_total",
"Number of read accesses to the page cache",
&["task_kind", "key_kind", "content_kind"]
&["key_kind"]
)
.expect("failed to define a metric")
});
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
let task_kind: &'static str = task_kind.into();
EnumMap::from_array(std::array::from_fn(|content_kind| {
let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
let content_kind: &'static str = content_kind.into();
PageCacheMetricsForTaskKind {
read_accesses_materialized_page: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
])
.unwrap()
},
read_accesses_materialized_page: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
read_accesses_immutable: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&[task_kind, "immutable", content_kind])
.unwrap()
},
read_accesses_ephemeral: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
},
read_hits_immutable: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
.unwrap()
},
read_accesses_immutable: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
read_hits_materialized_page_exact: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
"exact",
])
.unwrap()
},
read_hits_ephemeral: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["ephemeral", "-"])
.unwrap()
},
read_hits_materialized_page_older_lsn: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
"older_lsn",
])
.unwrap()
},
}
}))
})),
read_hits_immutable: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["immutable", "-"])
.unwrap()
},
read_hits_materialized_page_exact: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "exact"])
.unwrap()
},
read_hits_materialized_page_older_lsn: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&["materialized_page", "older_lsn"])
.unwrap()
},
});
impl PageCacheMetrics {
pub(crate) fn for_ctx(&self, ctx: &RequestContext) -> &PageCacheMetricsForTaskKind {
&self.map[ctx.task_kind()][ctx.page_content_kind()]
}
}
pub struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
@@ -1301,9 +1280,6 @@ use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::{Duration, Instant};
use crate::context::{PageContentKind, RequestContext};
use crate::task_mgr::TaskKind;
pub struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,

View File

@@ -85,7 +85,7 @@ use utils::{
lsn::Lsn,
};
use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -346,10 +346,8 @@ impl PageCache {
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
.inc();
@@ -370,12 +368,10 @@ impl PageCache {
{
if available_lsn == lsn {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_hits_materialized_page_exact
.inc();
} else {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_hits_materialized_page_older_lsn
.inc();
}
@@ -430,11 +426,10 @@ impl PageCache {
&self,
file_id: FileId,
blkno: u32,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key, ctx).await
self.lock_for_read(&mut cache_key).await
}
//
@@ -502,20 +497,14 @@ impl PageCache {
/// }
/// ```
///
async fn lock_for_read(
&self,
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
&crate::metrics::PAGE_CACHE.read_hits_immutable,
),
};
read_access.inc();

View File

@@ -1138,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -1154,7 +1154,7 @@ impl<'a> DatadirModification<'a> {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
writer.put(key, self.lsn, &value).await?;
} else {
retained_pending_updates.insert(key, value);
}
@@ -1174,14 +1174,14 @@ impl<'a> DatadirModification<'a> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
pub async fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value, ctx).await?;
writer.put(key, lsn, &value).await?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;

View File

@@ -187,7 +187,6 @@ task_local! {
Debug,
// NB: enumset::EnumSetType derives PartialEq, Eq, Clone, Copy
enumset::EnumSetType,
enum_map::Enum,
serde::Serialize,
serde::Deserialize,
strum_macros::IntoStaticStr,

View File

@@ -1504,7 +1504,7 @@ impl Tenant {
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(ctx)
.commit()
.await
.context("commit init_empty_test_timeline modification")?;
@@ -3538,24 +3538,14 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x10),
&Value::Image(TEST_IMG("foo at 0x10")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
.await?;
writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x20),
&Value::Image(TEST_IMG("foo at 0x20")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
.await?;
writer.finish_write(Lsn(0x20));
drop(writer);
@@ -3629,19 +3619,19 @@ mod tests {
// Insert a value on the timeline
writer
.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"), &ctx)
.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))
.await?;
writer
.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"), &ctx)
.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))
.await?;
writer.finish_write(Lsn(0x20));
writer
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"), &ctx)
.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))
.await?;
writer.finish_write(Lsn(0x30));
writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"), &ctx)
.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))
.await?;
writer.finish_write(Lsn(0x40));
@@ -3656,7 +3646,7 @@ mod tests {
.expect("Should have a local timeline");
let new_writer = newtline.writer().await;
new_writer
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"), &ctx)
.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))
.await?;
new_writer.finish_write(Lsn(0x40));
@@ -3679,11 +3669,7 @@ mod tests {
Ok(())
}
async fn make_some_layers(
tline: &Timeline,
start_lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<()> {
async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
let mut lsn = start_lsn;
#[allow(non_snake_case)]
{
@@ -3694,7 +3680,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3704,7 +3689,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3718,7 +3702,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3728,7 +3711,6 @@ mod tests {
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
ctx,
)
.await?;
writer.finish_write(lsn);
@@ -3745,7 +3727,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the flushing
@@ -3819,7 +3801,7 @@ mod tests {
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -3841,7 +3823,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3850,7 +3832,7 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
tline.set_broken("test".to_owned());
@@ -3891,7 +3873,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3916,7 +3898,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3925,7 +3907,7 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// run gc on parent
tenant
@@ -3950,7 +3932,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
@@ -3979,7 +3961,7 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
let child_tline = tenant
.branch_timeline_test(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
@@ -3990,7 +3972,7 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60), &ctx).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
@@ -4022,7 +4004,7 @@ mod tests {
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
let layer_map = tline.layers.read().await;
let level0_deltas = layer_map.layer_map().get_level0_deltas()?;
@@ -4105,12 +4087,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x10),
&Value::Image(TEST_IMG("foo at 0x10")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))
.await?;
writer.finish_write(Lsn(0x10));
drop(writer);
@@ -4120,12 +4097,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x20),
&Value::Image(TEST_IMG("foo at 0x20")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))
.await?;
writer.finish_write(Lsn(0x20));
drop(writer);
@@ -4135,12 +4107,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x30),
&Value::Image(TEST_IMG("foo at 0x30")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))
.await?;
writer.finish_write(Lsn(0x30));
drop(writer);
@@ -4150,12 +4117,7 @@ mod tests {
let writer = tline.writer().await;
writer
.put(
*TEST_KEY,
Lsn(0x40),
&Value::Image(TEST_IMG("foo at 0x40")),
&ctx,
)
.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))
.await?;
writer.finish_write(Lsn(0x40));
drop(writer);
@@ -4213,7 +4175,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4266,7 +4227,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4287,7 +4247,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4347,7 +4306,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
writer.finish_write(lsn);
@@ -4376,7 +4334,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
&ctx,
)
.await?;
println!("updating {} at {}", blknum, lsn);
@@ -4445,7 +4402,6 @@ mod tests {
test_key,
lsn,
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
&ctx,
)
.await?;
println!("updating [{}][{}] at {}", idx, blknum, lsn);
@@ -4518,7 +4474,7 @@ mod tests {
.init_empty_test_timeline()
.context("init_empty_test_timeline")?;
modification
.commit(&ctx)
.commit()
.await
.context("commit init_empty_test_timeline modification")?;

View File

@@ -11,7 +11,6 @@
//! len < 128: 0XXXXXXX
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
//!
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::tenant::block_io::BlockCursor;
use crate::virtual_file::VirtualFile;
@@ -20,13 +19,9 @@ use std::io::{Error, ErrorKind};
impl<'a> BlockCursor<'a> {
/// Read a blob into a new buffer.
pub async fn read_blob(
&self,
offset: u64,
ctx: &RequestContext,
) -> Result<Vec<u8>, std::io::Error> {
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
let mut buf = Vec::new();
self.read_blob_into_buf(offset, &mut buf, ctx).await?;
self.read_blob_into_buf(offset, &mut buf).await?;
Ok(buf)
}
/// Read blob into the given buffer. Any previous contents in the buffer
@@ -35,12 +30,11 @@ impl<'a> BlockCursor<'a> {
&self,
offset: u64,
dstbuf: &mut Vec<u8>,
ctx: &RequestContext,
) -> Result<(), std::io::Error> {
let mut blknum = (offset / PAGE_SZ as u64) as u32;
let mut off = (offset % PAGE_SZ as u64) as usize;
let mut buf = self.read_blk(blknum, ctx).await?;
let mut buf = self.read_blk(blknum).await?;
// peek at the first byte, to determine if it's a 1- or 4-byte length
let first_len_byte = buf[off];
@@ -56,7 +50,7 @@ impl<'a> BlockCursor<'a> {
// it is split across two pages
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
blknum += 1;
buf = self.read_blk(blknum, ctx).await?;
buf = self.read_blk(blknum).await?;
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
off = 4 - thislen;
} else {
@@ -77,7 +71,7 @@ impl<'a> BlockCursor<'a> {
if page_remain == 0 {
// continue on next page
blknum += 1;
buf = self.read_blk(blknum, ctx).await?;
buf = self.read_blk(blknum).await?;
off = 0;
page_remain = PAGE_SZ;
}
@@ -234,13 +228,12 @@ impl BlobWriter<false> {
#[cfg(test)]
mod tests {
use super::*;
use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
use crate::tenant::block_io::BlockReaderRef;
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
let temp_dir = tempfile::tempdir()?;
let path = temp_dir.path().join("file");
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Write part (in block to drop the file)
let mut offsets = Vec::new();
@@ -262,7 +255,7 @@ mod tests {
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new(rdr);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
let blob_read = rdr.read_blob(*offset, &ctx).await?;
let blob_read = rdr.read_blob(*offset).await?;
assert_eq!(
blob, &blob_read,
"mismatch for idx={idx} at offset={offset}"

View File

@@ -4,7 +4,6 @@
use super::ephemeral_file::EphemeralFile;
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
use crate::context::RequestContext;
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
use crate::virtual_file::VirtualFile;
use bytes::Bytes;
@@ -83,16 +82,12 @@ pub(crate) enum BlockReaderRef<'a> {
impl<'a> BlockReaderRef<'a> {
#[inline(always)]
async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
use BlockReaderRef::*;
match self {
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
Adapter(r) => r.read_blk(blknum, ctx).await,
FileBlockReader(r) => r.read_blk(blknum).await,
EphemeralFile(r) => r.read_blk(blknum).await,
Adapter(r) => r.read_blk(blknum).await,
#[cfg(test)]
TestDisk(r) => r.read_blk(blknum),
#[cfg(test)]
@@ -110,13 +105,11 @@ impl<'a> BlockReaderRef<'a> {
///
/// ```no_run
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
/// # use pageserver::context::RequestContext;
/// # let reader: FileBlockReader = unimplemented!("stub");
/// # let ctx: RequestContext = unimplemented!("stub");
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1, &ctx);
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
/// let buf = cursor.read_blk(2, &ctx);
/// let buf = cursor.read_blk(2);
/// // do stuff with 'buf'
/// ```
///
@@ -141,12 +134,8 @@ impl<'a> BlockCursor<'a> {
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
#[inline(always)]
pub async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum, ctx).await
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.reader.read_blk(blknum).await
}
}
@@ -180,15 +169,11 @@ impl FileBlockReader {
/// Returns a "lease" object that can be used to
/// access to the contents of the page. (For the page cache, the
/// lease object represents a lock on the buffer.)
pub async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.file_id, blknum, ctx)
.read_immutable_buf(self.file_id, blknum)
.await
.map_err(|e| {
std::io::Error::new(

View File

@@ -26,11 +26,7 @@ use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
use crate::{
context::{DownloadBehavior, RequestContext},
task_mgr::TaskKind,
tenant::block_io::{BlockReader, BlockWriter},
};
use crate::tenant::block_io::{BlockReader, BlockWriter};
// The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
pub const VALUE_SZ: usize = 5;
@@ -235,19 +231,14 @@ where
///
/// Read the value for given key. Returns the value, or None if it doesn't exist.
///
pub async fn get(&self, search_key: &[u8; L], ctx: &RequestContext) -> Result<Option<u64>> {
pub async fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
let mut result: Option<u64> = None;
self.visit(
search_key,
VisitDirection::Forwards,
|key, value| {
if key == search_key {
result = Some(value);
}
false
},
ctx,
)
self.visit(search_key, VisitDirection::Forwards, |key, value| {
if key == search_key {
result = Some(value);
}
false
})
.await?;
Ok(result)
}
@@ -262,7 +253,6 @@ where
search_key: &[u8; L],
dir: VisitDirection,
mut visitor: V,
ctx: &RequestContext,
) -> Result<bool>
where
V: FnMut(&[u8], u64) -> bool,
@@ -272,9 +262,7 @@ where
let block_cursor = self.reader.block_cursor();
while let Some((node_blknum, opt_iter)) = stack.pop() {
// Locate the node.
let node_buf = block_cursor
.read_blk(self.start_blk + node_blknum, ctx)
.await?;
let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
let node = OnDiskNode::deparse(node_buf.as_ref())?;
let prefix_len = node.prefix_len as usize;
@@ -363,14 +351,13 @@ where
#[allow(dead_code)]
pub async fn dump(&self) -> Result<()> {
let mut stack = Vec::new();
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
stack.push((self.root_blk, String::new(), 0, 0, 0));
let block_cursor = self.reader.block_cursor();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = block_cursor.read_blk(self.start_blk + blknum, &ctx).await?;
let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
@@ -701,8 +688,6 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;
@@ -740,8 +725,6 @@ pub(crate) mod tests {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let all_keys: Vec<&[u8; 6]> = vec![
b"xaaaaa", b"xaaaba", b"xaaaca", b"xabaaa", b"xababa", b"xabaca", b"xabada", b"xabadb",
];
@@ -762,12 +745,12 @@ pub(crate) mod tests {
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
assert_eq!(reader.get(key, &ctx).await?, Some(*val));
assert_eq!(reader.get(key).await?, Some(*val));
}
// And on some keys that don't exist
assert_eq!(reader.get(b"aaaaaa", &ctx).await?, None);
assert_eq!(reader.get(b"zzzzzz", &ctx).await?, None);
assert_eq!(reader.get(b"xaaabx", &ctx).await?, None);
assert_eq!(reader.get(b"aaaaaa").await?, None);
assert_eq!(reader.get(b"zzzzzz").await?, None);
assert_eq!(reader.get(b"xaaabx").await?, None);
// Test search with `visit` function
let search_key = b"xabaaa";
@@ -779,15 +762,10 @@ pub(crate) mod tests {
let mut data = Vec::new();
reader
.visit(
search_key,
VisitDirection::Forwards,
|key, value| {
data.push((key.to_vec(), value));
true
},
&ctx,
)
.visit(search_key, VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
assert_eq!(data, expected);
@@ -800,28 +778,18 @@ pub(crate) mod tests {
expected.reverse();
let mut data = Vec::new();
reader
.visit(
search_key,
VisitDirection::Backwards,
|key, value| {
data.push((key.to_vec(), value));
true
},
&ctx,
)
.visit(search_key, VisitDirection::Backwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
assert_eq!(data, expected);
// Backward scan where nothing matches
reader
.visit(
b"aaaaaa",
VisitDirection::Backwards,
|key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
},
&ctx,
)
.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| {
panic!("found unexpected key {}: {}", hex::encode(key), value);
})
.await?;
// Full scan
@@ -831,15 +799,10 @@ pub(crate) mod tests {
.collect();
let mut data = Vec::new();
reader
.visit(
&[0u8; 6],
VisitDirection::Forwards,
|key, value| {
data.push((key.to_vec(), value));
true
},
&ctx,
)
.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| {
data.push((key.to_vec(), value));
true
})
.await?;
assert_eq!(data, expected);
@@ -850,7 +813,6 @@ pub(crate) mod tests {
async fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
const NUM_KEYS: u64 = 1000;
@@ -889,14 +851,14 @@ pub(crate) mod tests {
for search_key_int in 0..(NUM_KEYS * 2 + 10) {
let search_key = u64::to_be_bytes(search_key_int);
assert_eq!(
reader.get(&search_key, &ctx).await?,
reader.get(&search_key).await?,
all_data.get(&search_key_int).cloned()
);
// Test a forward scan starting with this key
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
let expected = all_data
.range(search_key_int..)
@@ -908,7 +870,7 @@ pub(crate) mod tests {
// And a backwards scan
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
let expected = all_data
.range(..=search_key_int)
@@ -924,7 +886,7 @@ pub(crate) mod tests {
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Forwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Forwards, take_ten)
.await?;
let expected = all_data
.iter()
@@ -937,7 +899,7 @@ pub(crate) mod tests {
limit.store(usize::MAX, Ordering::Relaxed);
result.lock().unwrap().clear();
reader
.visit(&search_key, VisitDirection::Backwards, take_ten, &ctx)
.visit(&search_key, VisitDirection::Backwards, take_ten)
.await?;
let expected = all_data
.iter()
@@ -951,8 +913,6 @@ pub(crate) mod tests {
#[tokio::test]
async fn random_data() -> Result<()> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Generate random keys with exponential distribution, to
// exercise the prefix compression
const NUM_KEYS: usize = 100000;
@@ -979,24 +939,22 @@ pub(crate) mod tests {
// Test get() operation on all the keys
for (&key, &val) in all_data.iter() {
let search_key = u128::to_be_bytes(key);
assert_eq!(reader.get(&search_key, &ctx).await?, Some(val));
assert_eq!(reader.get(&search_key).await?, Some(val));
}
// Test get() operations on random keys, most of which will not exist
for _ in 0..100000 {
let key_int = rand::thread_rng().gen::<u128>();
let search_key = u128::to_be_bytes(key_int);
assert!(reader.get(&search_key, &ctx).await? == all_data.get(&key_int).cloned());
assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned());
}
// Test boundary cases
assert!(
reader.get(&u128::to_be_bytes(u128::MIN), &ctx).await?
== all_data.get(&u128::MIN).cloned()
reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned()
);
assert!(
reader.get(&u128::to_be_bytes(u128::MAX), &ctx).await?
== all_data.get(&u128::MAX).cloned()
reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned()
);
Ok(())
@@ -1027,7 +985,6 @@ pub(crate) mod tests {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
for (key, val) in disk_btree_test_data::TEST_DATA {
writer.append(&key, val)?;
@@ -1040,21 +997,16 @@ pub(crate) mod tests {
// Test get() operation on all the keys
for (key, val) in disk_btree_test_data::TEST_DATA {
assert_eq!(reader.get(&key, &ctx).await?, Some(val));
assert_eq!(reader.get(&key).await?, Some(val));
}
// Test full scan
let mut count = 0;
reader
.visit(
&[0u8; 26],
VisitDirection::Forwards,
|_key, _value| {
count += 1;
true
},
&ctx,
)
.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| {
count += 1;
true
})
.await?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());

View File

@@ -2,7 +2,6 @@
//! used to keep in-memory layers spilled on disk.
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::VirtualFile;
@@ -62,17 +61,13 @@ impl EphemeralFile {
self.len
}
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, io::Error> {
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
if flushed_blknums.contains(&(blknum as u64)) {
let cache = page_cache::get();
loop {
match cache
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
.read_immutable_buf(self.page_cache_file_id, blknum)
.await
.map_err(|e| {
std::io::Error::new(
@@ -108,11 +103,7 @@ impl EphemeralFile {
}
}
pub(crate) async fn write_blob(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> Result<u64, io::Error> {
pub(crate) async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
struct Writer<'a> {
ephemeral_file: &'a mut EphemeralFile,
/// The block to which the next [`push_bytes`] will write.
@@ -129,11 +120,7 @@ impl EphemeralFile {
})
}
#[inline(always)]
async fn push_bytes(
&mut self,
src: &[u8],
ctx: &RequestContext,
) -> Result<(), io::Error> {
async fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
let mut src_remaining = src;
while !src_remaining.is_empty() {
let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
@@ -159,7 +146,6 @@ impl EphemeralFile {
.read_immutable_buf(
self.ephemeral_file.page_cache_file_id,
self.blknum,
ctx,
)
.await
{
@@ -213,15 +199,15 @@ impl EphemeralFile {
if srcbuf.len() < 0x80 {
// short one-byte length header
let len_buf = [srcbuf.len() as u8];
writer.push_bytes(&len_buf, ctx).await?;
writer.push_bytes(&len_buf).await?;
} else {
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
len_buf[0] |= 0x80;
writer.push_bytes(&len_buf, ctx).await?;
writer.push_bytes(&len_buf).await?;
}
// Write the payload
writer.push_bytes(srcbuf, ctx).await?;
writer.push_bytes(srcbuf).await?;
if srcbuf.len() < 0x80 {
self.len += 1;
@@ -275,8 +261,6 @@ impl BlockReader for EphemeralFile {
#[cfg(test)]
mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use rand::{thread_rng, RngCore};
use std::fs;
@@ -284,15 +268,7 @@ mod tests {
fn harness(
test_name: &str,
) -> Result<
(
&'static PageServerConf,
TenantId,
TimelineId,
RequestContext,
),
io::Error,
> {
) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
let conf = PageServerConf::dummy_conf(repo_dir);
@@ -304,57 +280,46 @@ mod tests {
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
Ok((conf, tenant_id, timeline_id, ctx))
Ok((conf, tenant_id, timeline_id))
}
#[tokio::test]
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
let pos_foo = file.write_blob(b"foo", &ctx).await?;
let pos_foo = file.write_blob(b"foo").await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
let pos_bar = file.write_blob(b"bar", &ctx).await?;
let pos_bar = file.write_blob(b"bar").await?;
assert_eq!(
b"foo",
file.block_cursor()
.read_blob(pos_foo, &ctx)
.await?
.as_slice()
file.block_cursor().read_blob(pos_foo).await?.as_slice()
);
assert_eq!(
b"bar",
file.block_cursor()
.read_blob(pos_bar, &ctx)
.await?
.as_slice()
file.block_cursor().read_blob(pos_bar).await?.as_slice()
);
let mut blobs = Vec::new();
for i in 0..10000 {
let data = Vec::from(format!("blob{}", i).as_bytes());
let pos = file.write_blob(&data, &ctx).await?;
let pos = file.write_blob(&data).await?;
blobs.push((pos, data));
}
// also test with a large blobs
for i in 0..100 {
let data = format!("blob{}", i).as_bytes().repeat(100);
let pos = file.write_blob(&data, &ctx).await?;
let pos = file.write_blob(&data).await?;
blobs.push((pos, data));
}
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
for (pos, expected) in blobs {
let actual = cursor.read_blob(pos, &ctx).await?;
let actual = cursor.read_blob(pos).await?;
assert_eq!(actual, expected);
}
@@ -362,8 +327,8 @@ mod tests {
let mut large_data = Vec::new();
large_data.resize(20000, 0);
thread_rng().fill_bytes(&mut large_data);
let pos_large = file.write_blob(&large_data, &ctx).await?;
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
let pos_large = file.write_blob(&large_data).await?;
let result = file.block_cursor().read_blob(pos_large).await?;
assert_eq!(result, large_data);
Ok(())

View File

@@ -28,7 +28,7 @@
//! "values" part.
//!
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
@@ -317,11 +317,11 @@ impl DeltaLayer {
tree_reader.dump().await?;
let keys = DeltaLayerInner::load_keys(&inner, ctx).await?;
let keys = DeltaLayerInner::load_keys(&inner).await?;
// A subroutine to dump a single blob
async fn dump_blob(val: ValueRef<'_>, ctx: &RequestContext) -> Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
async fn dump_blob(val: ValueRef<'_>) -> Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -342,7 +342,7 @@ impl DeltaLayer {
for entry in keys {
let DeltaEntry { key, lsn, val, .. } = entry;
let desc = match dump_blob(val, ctx).await {
let desc = match dump_blob(val).await {
Ok(desc) => desc,
Err(err) => {
let err: anyhow::Error = err;
@@ -370,7 +370,7 @@ impl DeltaLayer {
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, lsn_range, reconstruct_state, ctx)
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
.await
}
@@ -453,12 +453,12 @@ impl DeltaLayer {
self.access_stats.record_access(access_kind, ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
async fn load_inner(&self) -> Result<Arc<DeltaLayerInner>> {
let path = self.path();
let summary = match &self.path_or_conf {
@@ -466,7 +466,7 @@ impl DeltaLayer {
PathOrConf::Path(_) => None,
};
let loaded = DeltaLayerInner::load(&path, summary, ctx).await?;
let loaded = DeltaLayerInner::load(&path, summary).await?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
@@ -554,7 +554,7 @@ impl DeltaLayer {
.load(LayerAccessKind::KeyIter, ctx)
.await
.context("load delta layer keys")?;
DeltaLayerInner::load_keys(inner, ctx)
DeltaLayerInner::load_keys(inner)
.await
.context("Layer index is corrupted")
}
@@ -849,14 +849,13 @@ impl DeltaLayerInner {
pub(super) async fn load(
path: &std::path::Path,
summary: Option<Summary>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -884,7 +883,6 @@ impl DeltaLayerInner {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let mut need_image = true;
// Scan the page versions backwards, starting from `lsn`.
@@ -899,38 +897,27 @@ impl DeltaLayerInner {
let mut offsets: Vec<(Lsn, u64)> = Vec::new();
tree_reader
.visit(
&search_key.0,
VisitDirection::Backwards,
|key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
.visit(&search_key.0, VisitDirection::Backwards, |key, value| {
let blob_ref = BlobRef(value);
if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] {
return false;
}
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
if entry_lsn < lsn_range.start {
return false;
}
offsets.push((entry_lsn, blob_ref.pos()));
!blob_ref.will_init()
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
!blob_ref.will_init()
})
.await?;
let ctx = &RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerValue)
.build();
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor
.read_blob_into_buf(pos, &mut buf, ctx)
.read_blob_into_buf(pos, &mut buf)
.await
.with_context(|| {
format!(
@@ -971,10 +958,9 @@ impl DeltaLayerInner {
}
}
pub(super) async fn load_keys<'a, 'b, T: AsRef<DeltaLayerInner> + Clone>(
this: &'a T,
ctx: &'b RequestContext,
) -> Result<Vec<DeltaEntry<'a>>> {
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
this: &T,
) -> Result<Vec<DeltaEntry<'_>>> {
let dl = this.as_ref();
let file = &dl.file;
@@ -1011,9 +997,6 @@ impl DeltaLayerInner {
all_keys.push(entry);
true
},
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
.build(),
)
.await?;
if let Some(last) = all_keys.last_mut() {
@@ -1043,9 +1026,9 @@ pub struct ValueRef<'a> {
impl<'a> ValueRef<'a> {
/// Loads the value from disk
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
pub async fn load(&self) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
let val = Value::des(&buf)?;
Ok(val)
}
@@ -1054,11 +1037,7 @@ impl<'a> ValueRef<'a> {
pub(crate) struct Adapter<T>(T);
impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
pub(crate) async fn read_blk(
&self,
blknum: u32,
ctx: &RequestContext,
) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum, ctx).await
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
self.0.as_ref().file.read_blk(blknum).await
}
}

View File

@@ -24,7 +24,7 @@
//! mapping from Key to an offset in the "values" part. The
//! actual page images are stored in the "values" part.
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
@@ -237,15 +237,10 @@ impl ImageLayer {
tree_reader.dump().await?;
tree_reader
.visit(
&[0u8; KEY_SIZE],
VisitDirection::Forwards,
|key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
},
ctx,
)
.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
true
})
.await?;
Ok(())
@@ -266,7 +261,7 @@ impl ImageLayer {
.load(LayerAccessKind::GetValueReconstructData, ctx)
.await?;
inner
.get_value_reconstruct_data(key, reconstruct_state, ctx)
.get_value_reconstruct_data(key, reconstruct_state)
.await
// FIXME: makes no sense to dump paths
.with_context(|| format!("read {}", self.path().display()))
@@ -340,12 +335,12 @@ impl ImageLayer {
) -> Result<&ImageLayerInner> {
self.access_stats.record_access(access_kind, ctx);
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.get_or_try_init(|| self.load_inner())
.await
.with_context(|| format!("Failed to load image layer {}", self.path().display()))
}
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
async fn load_inner(&self) -> Result<ImageLayerInner> {
let path = self.path();
let expected_summary = match &self.path_or_conf {
@@ -354,8 +349,7 @@ impl ImageLayer {
};
let loaded =
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary, ctx)
.await?;
ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
if let PathOrConf::Path(ref path) = self.path_or_conf {
// not production code
@@ -442,13 +436,12 @@ impl ImageLayerInner {
path: &std::path::Path,
lsn: Lsn,
summary: Option<Summary>,
ctx: &RequestContext,
) -> anyhow::Result<Self> {
let file = VirtualFile::open(path)
.await
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let summary_blk = file.read_blk(0).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
if let Some(mut expected_summary) = summary {
@@ -477,30 +470,16 @@ impl ImageLayerInner {
&self,
key: Key,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
let file = &self.file;
let tree_reader = DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
if let Some(offset) = tree_reader
.get(
&keybuf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
.build(),
)
.await?
{
if let Some(offset) = tree_reader.get(&keybuf).await? {
let blob = file
.block_cursor()
.read_blob(
offset,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::ImageLayerValue)
.build(),
)
.read_blob(offset)
.await
.with_context(|| format!("failed to read value from offset {}", offset))?;
let value = Bytes::from(blob);

View File

@@ -5,7 +5,7 @@
//! its position in the file, is kept in memory, though.
//!
use crate::config::PageServerConf;
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
use crate::context::RequestContext;
use crate::repository::{Key, Value};
use crate::tenant::block_io::BlockReader;
use crate::tenant::ephemeral_file::EphemeralFile;
@@ -106,7 +106,7 @@ impl InMemoryLayer {
/// debugging function to print out the contents of the layer
///
/// this is likely completly unused
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
pub async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
let inner = self.inner.read().await;
let end_str = self.end_lsn_or_max();
@@ -125,7 +125,7 @@ impl InMemoryLayer {
for (key, vec_map) in inner.index.iter() {
for (lsn, pos) in vec_map.as_slice() {
let mut desc = String::new();
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let val = Value::des(&buf);
match val {
Ok(Value::Image(img)) => {
@@ -158,15 +158,11 @@ impl InMemoryLayer {
key: Key,
lsn_range: Range<Lsn>,
reconstruct_state: &mut ValueReconstructState,
ctx: &RequestContext,
_ctx: &RequestContext,
) -> anyhow::Result<ValueReconstructResult> {
ensure!(lsn_range.start >= self.start_lsn);
let mut need_image = true;
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
@@ -175,7 +171,7 @@ impl InMemoryLayer {
if let Some(vec_map) = inner.index.get(&key) {
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
let buf = reader.read_blob(*pos, &ctx).await?;
let buf = reader.read_blob(*pos).await?;
let value = Value::des(&buf)?;
match value {
Value::Image(img) => {
@@ -267,13 +263,7 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub async fn put_value(
&self,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let inner: &mut _ = &mut *self.inner.write().await;
self.assert_writable();
@@ -285,15 +275,7 @@ impl InMemoryLayer {
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
inner
.file
.write_blob(
&buf,
&RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build(),
)
.await?
inner.file.write_blob(&buf).await?
};
let vec_map = inner.index.entry(key).or_default();
@@ -331,7 +313,7 @@ impl InMemoryLayer {
/// Write this frozen in-memory layer to disk.
///
/// Returns a new delta layer with all the same data as this in-memory layer
pub(crate) async fn write_to_disk(&self, ctx: &RequestContext) -> Result<DeltaLayer> {
pub(crate) async fn write_to_disk(&self) -> Result<DeltaLayer> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. There's one exception
@@ -361,14 +343,11 @@ impl InMemoryLayer {
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
for (key, vec_map) in keys.iter() {
let key = **key;
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer
.put_value_bytes(key, *lsn, &buf, will_init)

View File

@@ -471,7 +471,7 @@ impl Timeline {
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
// for redo.
let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -2518,18 +2518,13 @@ impl Timeline {
}
}
async fn lookup_cached_page(
&self,
key: &Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, Bytes)> {
async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
let cache = page_cache::get();
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
// We should look at the key to determine if it's a cacheable object
let (lsn, read_guard) = cache
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn, ctx)
.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
.await?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
@@ -2563,16 +2558,10 @@ impl Timeline {
Ok(layer)
}
async fn put_value(
&self,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
//info!("PUT: key {} at {}", key, lsn);
let layer = self.get_layer_for_write(lsn).await?;
layer.put_value(key, lsn, val, ctx).await?;
layer.put_value(key, lsn, val).await?;
Ok(())
}
@@ -2744,7 +2733,7 @@ impl Timeline {
// Normal case, write out a L0 delta layer file.
// `create_delta_layer` will not modify the layer map.
// We will remove frozen layer and add delta layer in one atomic operation later.
let layer = self.create_delta_layer(&frozen_layer, ctx).await?;
let layer = self.create_delta_layer(&frozen_layer).await?;
(
HashMap::from([(
layer.filename(),
@@ -2867,21 +2856,19 @@ impl Timeline {
async fn create_delta_layer(
self: &Arc<Self>,
frozen_layer: &Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> anyhow::Result<DeltaLayer> {
let span = tracing::info_span!("blocking");
let new_delta: DeltaLayer = tokio::task::spawn_blocking({
let _g = span.entered();
let self_clone = Arc::clone(self);
let frozen_layer = Arc::clone(frozen_layer);
let ctx = ctx.attached_child();
move || {
// Write it out
// Keep this inside `spawn_blocking` and `Handle::current`
// as long as the write path is still sync and the read impl
// is still not fully async. Otherwise executor threads would
// be blocked.
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk(&ctx))?;
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
let new_delta_path = new_delta.path();
// Sync it to disk.
@@ -3587,7 +3574,7 @@ impl Timeline {
key, lsn, ref val, ..
} in all_values_iter
{
let value = val.load(ctx).await?;
let value = val.load().await?;
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
// We need to check key boundaries once we reach next key or end of layer with the same key
if !same_key || lsn == dup_end_lsn {
@@ -4712,14 +4699,8 @@ impl<'a> TimelineWriter<'a> {
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub async fn put(
&self,
key: Key,
lsn: Lsn,
value: &Value,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value, ctx).await
pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> {
self.tl.put_value(key, lsn, value).await
}
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {

View File

@@ -650,12 +650,6 @@ mod tests {
File(File),
}
impl From<VirtualFile> for MaybeVirtualFile {
fn from(vf: VirtualFile) -> Self {
MaybeVirtualFile::VirtualFile(vf)
}
}
impl MaybeVirtualFile {
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
match self {
@@ -893,54 +887,4 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_atomic_overwrite_basic() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic");
std::fs::create_dir_all(&testdir).unwrap();
let path = testdir.join("myfile");
let tmp_path = testdir.join("myfile.tmp");
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "bar");
assert!(!tmp_path.exists());
drop(file);
}
#[tokio::test]
async fn test_atomic_overwrite_preexisting_tmp() {
let testdir =
crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp");
std::fs::create_dir_all(&testdir).unwrap();
let path = testdir.join("myfile");
let tmp_path = testdir.join("myfile.tmp");
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
assert!(tmp_path.exists());
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
.await
.unwrap();
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
let post = file.read_string().await.unwrap();
assert_eq!(post, "foo");
assert!(!tmp_path.exists());
drop(file);
}
}

View File

@@ -363,7 +363,7 @@ impl<'a> WalIngest<'a> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit(ctx).await?;
modification.commit().await?;
Ok(())
}
@@ -1561,7 +1561,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
m.commit(ctx).await?;
m.commit().await?;
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
Ok(walingest)
@@ -1580,22 +1580,22 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1681,7 +1681,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1723,7 +1723,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1736,7 +1736,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1761,7 +1761,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1800,7 +1800,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1819,7 +1819,7 @@ mod tests {
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check that rel is not visible anymore
assert_eq!(
@@ -1837,7 +1837,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1876,7 +1876,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit(&ctx).await?;
m.commit().await?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
@@ -1921,7 +1921,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
// Check reported size and contents after truncation
assert_eq!(
@@ -1970,7 +1970,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline
@@ -2017,7 +2017,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
}
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2033,7 +2033,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
@@ -2046,7 +2046,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
@@ -2062,7 +2062,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
.await?;
m.commit(&ctx).await?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber

View File

@@ -2,3 +2,4 @@ comment = '** Deprecated ** Please use pg_embedding instead'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true
trusted = true

View File

@@ -222,9 +222,8 @@ lfc_change_limit_hook(int newval, void *extra)
/*
* Stats collector detach shared memory, so we should not try to access shared memory here.
* Parallel workers first assign default value (0), so not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
return;
/* Open cache file if not done yet */

View File

@@ -566,9 +566,7 @@ WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
}
initStringInfo(&safekeeper[n_safekeepers].outbuf);
safekeeper[n_safekeepers].xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
if (safekeeper[n_safekeepers].xlogreader == NULL)
elog(FATAL, "Failed to allocate xlog reader");
safekeeper[n_safekeepers].xlogreader = NULL;
safekeeper[n_safekeepers].flushWrite = false;
safekeeper[n_safekeepers].startStreamingAt = InvalidXLogRecPtr;
safekeeper[n_safekeepers].streamingAt = InvalidXLogRecPtr;
@@ -716,6 +714,12 @@ ShutdownConnection(Safekeeper *sk)
sk->voteResponse.termHistory.entries = NULL;
HackyRemoveWalProposerEvent(sk);
if (sk->xlogreader)
{
NeonWALReaderFree(sk->xlogreader);
sk->xlogreader = NULL;
}
}
/*
@@ -1238,8 +1242,8 @@ HandleElectedProposer(void)
LSN_FORMAT_ARGS(truncateLsn),
LSN_FORMAT_ARGS(propEpochStartLsn));
/* Perform recovery */
if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn))
elog(FATAL, "Failed to recover state");
// if (!WalProposerRecovery(donor, greetRequest.timeline, truncateLsn, propEpochStartLsn))
// elog(FATAL, "Failed to recover state");
}
else if (syncSafekeepers)
{
@@ -1555,6 +1559,12 @@ SendProposerElected(Safekeeper *sk)
term_t lastCommonTerm;
int i;
/* It's a good moment to create WAL reader */
Assert(!sk->xlogreader);
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, propEpochStartLsn);
if (!sk->xlogreader)
elog(FATAL, "failed to allocate xlog reader");
/*
* Determine start LSN by comparing safekeeper's log term switch history
* and proposer's, searching for the divergence point.
@@ -1834,19 +1844,24 @@ SendAppendRequests(Safekeeper *sk)
/* write the WAL itself */
enlargeStringInfo(&sk->outbuf, req->endLsn - req->beginLsn);
if (!WALRead(sk->xlogreader,
if (!NeonWALRead(sk->xlogreader,
&sk->outbuf.data[sk->outbuf.len],
req->beginLsn,
req->endLsn - req->beginLsn,
#if PG_VERSION_NUM >= 150000
/* FIXME don't use hardcoded timeline_id here */
1,
1
#else
ThisTimeLineID,
ThisTimeLineID
#endif
&errinfo))
))
{
WALReadRaiseError(&errinfo);
elog(WARNING, "WAL reading for node %s:%s failed: %s",
sk->host, sk->port,
sk->xlogreader->err_msg);
ShutdownConnection(sk);
return false;
}
sk->outbuf.len += req->endLsn - req->beginLsn;

View File

@@ -2,6 +2,7 @@
#define __NEON_WALPROPOSER_H__
#include "access/xlogdefs.h"
#include "access/xlogreader.h"
#include "postgres.h"
#include "port.h"
#include "access/xlog_internal.h"
@@ -327,6 +328,24 @@ typedef struct AppendResponse
/* Other fields are fixed part */
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
#define NEON_WALREADER_ERR_MSG_LEN 128
/*
* Like WALRead, but returns error instead of throwing ERROR when segment is
* missing + doesn't attempt to read WAL before specified horizon -- basebackup
* LSN. Missing WAL should be fetched by peer recovery, or, alternatively, on
* demand WAL fetching from safekeepers should be implemented in NeonWALReader.
*/
typedef struct {
/* LSN before */
XLogRecPtr available_lsn;
WALSegmentContext segcxt;
WALOpenSegment seg;
int wre_errno;
/* Explains failure to read, static for simplicity. */
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
} NeonWALReader;
/*
* Descriptor of safekeeper
*/
@@ -358,7 +377,7 @@ typedef struct Safekeeper
/*
* WAL reader, allocated for each safekeeper.
*/
XLogReaderState *xlogreader;
NeonWALReader *xlogreader;
/*
* Streaming will start here; must be record boundary.
@@ -508,4 +527,9 @@ extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_
extern uint64 BackpressureThrottlingTime(void);
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn);
extern void NeonWALReaderFree(NeonWALReader *state);
extern bool NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
#endif /* __NEON_WALPROPOSER_H__ */

View File

@@ -12,6 +12,7 @@
#include "replication/slot.h"
#include "walproposer_utils.h"
#include "replication/walsender_private.h"
#include "utils/wait_event.h"
#include "storage/ipc.h"
#include "utils/builtins.h"
@@ -657,3 +658,185 @@ XLogBroadcastWalProposer(void)
set_ps_display(activitymsg);
}
}
/* palloc and initialize NeonWALReader */
NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn)
{
NeonWALReader *reader;
reader = (NeonWALReader *)
palloc_extended(sizeof(NeonWALReader),
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
if (!reader)
return NULL;
reader->available_lsn = available_lsn;
reader->seg.ws_file = -1;
reader->seg.ws_segno = 0;
reader->seg.ws_tli = 0;
reader->segcxt.ws_segsize = wal_segment_size;
return reader;
}
static void neon_wal_segment_close(NeonWALReader *state);
void
NeonWALReaderFree(NeonWALReader *state)
{
if (state->seg.ws_file != -1)
neon_wal_segment_close(state);
pfree(state);
}
/*
* Copy of vanilla wal_segment_open, but returns false in case of error instead
* of ERROR, with errno set.
*
* XLogReaderRoutine->segment_open callback for local pg_wal files
*/
static bool
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
TimeLineID *tli_p)
{
TimeLineID tli = *tli_p;
char path[MAXPGPATH];
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
if (state->seg.ws_file >= 0)
return true;
return false;
}
/* copy of vanilla wal_segment_close with NeonWALReader */
void
neon_wal_segment_close(NeonWALReader *state)
{
close(state->seg.ws_file);
/* need to check errno? */
state->seg.ws_file = -1;
}
/*
* Mostly copy of vanilla WALRead, but 1) returns error if requested data before
* available_lsn 2) returns error is segment is missing instead of throwing
* ERROR.
*
* Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
* fetched from timeline 'tli'.
*
* Returns true if succeeded, false if an error occurs, in which case
* 'state->errno' shows whether it was missing WAL (ENOENT) or something else,
* and 'err' the desciption.
*/
bool NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
{
char *p;
XLogRecPtr recptr;
Size nbytes;
if (startptr < state->available_lsn)
{
state->wre_errno = 0;
snprintf(state->err_msg, sizeof(state->err_msg), "failed to read WAL at %X/%X which is earlier than available %X/%X",
LSN_FORMAT_ARGS(startptr), LSN_FORMAT_ARGS(state->available_lsn));
return false;
}
p = buf;
recptr = startptr;
nbytes = count;
while (nbytes > 0)
{
uint32 startoff;
int segbytes;
int readbytes;
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
/*
* If the data we want is not in a segment we have open, close what we
* have (if anything) and open the next one, using the caller's
* provided openSegment callback.
*/
if (state->seg.ws_file < 0 ||
!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
tli != state->seg.ws_tli)
{
XLogSegNo nextSegNo;
if (state->seg.ws_file >= 0)
neon_wal_segment_close(state);
XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
if (!neon_wal_segment_open(state, nextSegNo, &tli))
{
char fname[MAXFNAMELEN];
state->wre_errno = errno;
XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
return false;
}
/* This shouldn't happen -- indicates a bug in segment_open */
Assert(state->seg.ws_file >= 0);
/* Update the current segment info. */
state->seg.ws_tli = tli;
state->seg.ws_segno = nextSegNo;
}
/* How many bytes are within this segment? */
if (nbytes > (state->segcxt.ws_segsize - startoff))
segbytes = state->segcxt.ws_segsize - startoff;
else
segbytes = nbytes;
#ifndef FRONTEND
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
#endif
/* Reset errno first; eases reporting non-errno-affecting errors */
errno = 0;
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
#ifndef FRONTEND
pgstat_report_wait_end();
#endif
if (readbytes <= 0)
{
char fname[MAXFNAMELEN];
XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
if (readbytes < 0)
{
state->wre_errno = errno;
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
fname, startoff, strerror(state->wre_errno));
}
else
{
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
fname, startoff);
}
return false;
}
/* Update state for read */
recptr += readbytes;
nbytes -= readbytes;
p += readbytes;
}
return true;
}

50
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -887,34 +887,34 @@ files = [
[[package]]
name = "cryptography"
version = "41.0.4"
version = "41.0.3"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"},
{file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"},
{file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"},
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"},
{file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"},
{file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"},
{file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"},
{file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"},
{file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"},
{file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"},
{file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"},
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"},
{file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"},
{file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"},
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"},
{file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"},
{file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"},
{file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"},
{file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"},
{file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"},
{file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"},
{file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"},
]
[package.dependencies]

View File

@@ -28,11 +28,10 @@ use tracing::{error, info, info_span, warn, Instrument};
use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
pub const NUM_RETRIES_CONNECT: u32 = 16;
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
pub const NUM_RETRIES_CONNECT: u32 = 10;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -554,7 +553,8 @@ impl ShouldRetry for compute::ConnectionError {
}
pub fn retry_after(num_retries: u32) -> time::Duration {
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
// 1.5 seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}
/// Finish client connection initialization: confirm auth success, send params, etc.

View File

@@ -303,7 +303,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
#[test]
fn connect_compute_total_wait() {
let mut total_wait = tokio::time::Duration::ZERO;
for num_retries in 1..NUM_RETRIES_CONNECT {
for num_retries in 1..10 {
total_wait += retry_after(num_retries);
}
assert!(total_wait < tokio::time::Duration::from_secs(12));
@@ -494,11 +494,11 @@ async fn connect_to_compute_non_retry_2() {
/// Retry for at most `NUM_RETRIES_CONNECT` times.
#[tokio::test]
async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 16);
assert_eq!(NUM_RETRIES_CONNECT, 10);
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
/* the 11th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.72.1"
channel = "1.72.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -2,7 +2,7 @@
// Main entry point for the safekeeper executable
//
use anyhow::{bail, Context, Result};
use clap::Parser;
use clap::{ArgAction, Parser};
use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, StreamExt};
@@ -105,6 +105,9 @@ struct Args {
/// it during this period passed as a human readable duration.
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
heartbeat_timeout: Duration,
/// Disable/enable peer recovery. Used for disabling it in tests.
#[arg(long, default_value = "true", action=ArgAction::Set)]
peer_recovery: bool,
/// Remote storage configuration for WAL backup (offloading to s3) as TOML
/// inline table, e.g.
/// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "<BUCKETNAME>", "bucket_region":"<REGION>", "concurrency_limit": 119}
@@ -268,6 +271,7 @@ async fn main() -> anyhow::Result<()> {
broker_endpoint: args.broker_endpoint,
broker_keepalive_interval: args.broker_keepalive_interval,
heartbeat_timeout: args.heartbeat_timeout,
peer_recovery_enabled: args.peer_recovery,
remote_storage: args.remote_storage,
max_offloader_lag_bytes: args.max_offloader_lag,
wal_backup_enabled: !args.disable_wal_backup,

View File

@@ -372,6 +372,13 @@ impl SafekeeperPostgresHandler {
/// from a walproposer recovery function. This connection gets a special handling:
/// safekeeper must stream all local WAL till the flush_lsn, whether committed or not.
pub fn is_walproposer_recovery(&self) -> bool {
self.appname == Some("wal_proposer_recovery".to_string())
match &self.appname {
None => false,
Some(appname) => {
appname == "wal_proposer_recovery" ||
// set by safekeeper peer recovery
appname.starts_with("safekeeper")
}
}
}
}

View File

@@ -16,8 +16,8 @@ use tokio::io::AsyncReadExt;
use utils::http::endpoint::request_span;
use crate::receive_wal::WalReceiverState;
use crate::safekeeper::ServerInfo;
use crate::safekeeper::Term;
use crate::safekeeper::{ServerInfo, TermLsn};
use crate::send_wal::WalSenderState;
use crate::timeline::PeerInfo;
use crate::{debug_dump, pull_timeline};
@@ -60,16 +60,25 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
.as_ref()
}
/// Same as TermSwitchEntry, but serializes LSN using display serializer
/// Same as TermLsn, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TermSwitchApiEntry {
pub term: Term,
#[serde_as(as = "DisplayFromStr")]
pub lsn: Lsn,
}
impl From<TermSwitchApiEntry> for TermLsn {
fn from(api_val: TermSwitchApiEntry) -> Self {
TermLsn {
term: api_val.term,
lsn: api_val.lsn,
}
}
}
/// Augment AcceptorState with epoch for convenience
#[derive(Debug, Serialize, Deserialize)]
pub struct AcceptorStateStatus {

View File

@@ -62,6 +62,7 @@ pub struct SafeKeeperConf {
pub broker_endpoint: Uri,
pub broker_keepalive_interval: Duration,
pub heartbeat_timeout: Duration,
pub peer_recovery_enabled: bool,
pub remote_storage: Option<RemoteStorageConfig>,
pub max_offloader_lag_bytes: u64,
pub backup_parallel_jobs: usize,
@@ -100,6 +101,7 @@ impl SafeKeeperConf {
.parse()
.expect("failed to parse default broker endpoint"),
broker_keepalive_interval: Duration::from_secs(5),
peer_recovery_enabled: true,
wal_backup_enabled: true,
backup_parallel_jobs: 1,
pg_auth: None,

View File

@@ -55,9 +55,12 @@ impl WalReceivers {
/// Register new walreceiver. Returned guard provides access to the slot and
/// automatically deregisters in Drop.
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
let slots = &mut self.mutex.lock().slots;
let walreceiver = WalReceiverState::Voting;
let walreceiver = WalReceiverState {
conn_id,
status: WalReceiverStatus::Voting,
};
// find empty slot or create new one
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
slots[pos] = Some(walreceiver);
@@ -96,6 +99,18 @@ impl WalReceivers {
self.mutex.lock().slots.iter().flatten().cloned().collect()
}
/// Get number of streaming walreceivers (normally 0 or 1) from compute.
pub fn get_num_streaming(self: &Arc<WalReceivers>) -> usize {
self.mutex
.lock()
.slots
.iter()
.flatten()
// conn_id.is_none skips recovery which also registers here
.filter(|s| s.conn_id.is_none() && matches!(s.status, WalReceiverStatus::Streaming))
.count()
}
/// Unregister walsender.
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
let mut shared = self.mutex.lock();
@@ -108,10 +123,17 @@ struct WalReceiversShared {
slots: Vec<Option<WalReceiverState>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalReceiverState {
/// None means it is recovery initiated by us (this safekeeper).
pub conn_id: Option<ConnectionId>,
pub status: WalReceiverStatus,
}
/// Walreceiver status. Currently only whether it passed voting stage and
/// started receiving the stream, but it is easy to add more if needed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum WalReceiverState {
pub enum WalReceiverStatus {
Voting,
Streaming,
}
@@ -136,8 +158,8 @@ impl Drop for WalReceiverGuard {
}
}
const MSG_QUEUE_SIZE: usize = 256;
const REPLY_QUEUE_SIZE: usize = 16;
pub const MSG_QUEUE_SIZE: usize = 256;
pub const REPLY_QUEUE_SIZE: usize = 16;
impl SafekeeperPostgresHandler {
/// Wrapper around handle_start_wal_push_guts handling result. Error is
@@ -261,7 +283,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
tli.clone(),
msg_rx,
reply_tx,
self.conn_id,
Some(self.conn_id),
));
// Forward all messages to WalAcceptor
@@ -317,31 +339,41 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
// even when it writes a steady stream of messages.
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
/// Takes messages from msg_rx, processes and pushes replies to reply_tx.
struct WalAcceptor {
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
/// replies to reply_tx; reading from socket and writing to disk in parallel is
/// beneficial for performance, this struct provides writing to disk part.
pub struct WalAcceptor {
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: Option<ConnectionId>,
}
impl WalAcceptor {
/// Spawn thread with WalAcceptor running, return handle to it.
fn spawn(
/// Spawn task with WalAcceptor running, return handle to it. Task returns
/// Ok(()) if either of channels has closed, and Err if any error during
/// message processing is encountered.
///
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
pub fn spawn(
tli: Arc<Timeline>,
msg_rx: Receiver<ProposerAcceptorMessage>,
reply_tx: Sender<AcceptorProposerMessage>,
conn_id: ConnectionId,
conn_id: Option<ConnectionId>,
) -> JoinHandle<anyhow::Result<()>> {
task::spawn(async move {
let mut wa = WalAcceptor {
tli,
msg_rx,
reply_tx,
conn_id,
};
let span_ttid = wa.tli.ttid; // satisfy borrow checker
wa.run()
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
.instrument(
info_span!("WAL acceptor", cid = %conn_id.unwrap_or(0), ttid = %span_ttid),
)
.await
})
}
@@ -355,7 +387,7 @@ impl WalAcceptor {
let _compute_conn_guard = ComputeConnectionGuard {
timeline: Arc::clone(&self.tli),
};
let walreceiver_guard = self.tli.get_walreceivers().register();
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
self.tli.update_status_notify().await?;
// After this timestamp we will stop processing AppendRequests and send a response
@@ -372,7 +404,7 @@ impl WalAcceptor {
// Update walreceiver state in shmem for reporting.
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
*walreceiver_guard.get() = WalReceiverState::Streaming;
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
}
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {

View File

@@ -1,17 +1,41 @@
//! This module implements pulling WAL from peer safekeepers if compute can't
//! provide it, i.e. safekeeper lags too much.
use std::sync::Arc;
use std::time::SystemTime;
use std::{fmt, pin::pin, sync::Arc};
use tokio::{select, time::sleep, time::Duration};
use tracing::{info, instrument};
use anyhow::{bail, Context};
use futures::StreamExt;
use postgres_protocol::message::backend::ReplicationMessage;
use tokio::sync::mpsc::{channel, Receiver, Sender};
use tokio::time::timeout;
use tokio::{
select,
time::sleep,
time::{self, Duration},
};
use tokio_postgres::replication::ReplicationStream;
use tokio_postgres::types::PgLsn;
use tracing::*;
use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
use crate::{timeline::Timeline, SafeKeeperConf};
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
use crate::{
http::routes::TimelineStatus,
receive_wal::MSG_QUEUE_SIZE,
safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
TermLsn, VoteRequest,
},
timeline::{PeerInfo, Timeline},
SafeKeeperConf,
};
/// Entrypoint for per timeline task which always runs, checking whether
/// recovery for this safekeeper is needed and starting it if so.
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
info!("started");
let mut cancellation_rx = match tli.get_cancellation_rx() {
Ok(rx) => rx,
@@ -22,19 +46,387 @@ pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
};
select! {
_ = recovery_main_loop(tli) => { unreachable!() }
_ = recovery_main_loop(tli, conf) => { unreachable!() }
_ = cancellation_rx.changed() => {
info!("stopped");
}
}
}
/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
/// fields to explain the choice.
#[derive(Debug)]
pub struct RecoveryNeededInfo {
/// my term
pub term: Term,
/// my last_log_term
pub last_log_term: Term,
/// my flush_lsn
pub flush_lsn: Lsn,
/// peers from which we can fetch WAL, for observability.
pub peers: Vec<PeerInfo>,
/// for observability
pub num_streaming_computes: usize,
pub donors: Vec<Donor>,
}
// Custom to omit not important fields from PeerInfo.
impl fmt::Display for RecoveryNeededInfo {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{{")?;
write!(
f,
"term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
self.term, self.last_log_term, self.flush_lsn
)?;
for p in self.peers.iter() {
write!(
f,
"PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
p.sk_id, p.term, p.last_log_term, p.flush_lsn
)?;
}
write!(
f,
"}} num_streaming_computes: {}, donors: {:?}",
self.num_streaming_computes, self.donors
)
}
}
#[derive(Clone, Debug)]
pub struct Donor {
pub sk_id: NodeId,
/// equals to last_log_term
pub term: Term,
pub flush_lsn: Lsn,
pub pg_connstr: String,
pub http_connstr: String,
}
impl From<&PeerInfo> for Donor {
fn from(p: &PeerInfo) -> Self {
Donor {
sk_id: p.sk_id,
term: p.term,
flush_lsn: p.flush_lsn,
pg_connstr: p.pg_connstr.clone(),
http_connstr: p.http_connstr.clone(),
}
}
}
const CHECK_INTERVAL_MS: u64 = 2000;
/// Check regularly whether we need to start recovery.
async fn recovery_main_loop(_tli: Arc<Timeline>) {
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
loop {
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
match recovery_needed_info.donors.first() {
Some(donor) => {
info!(
"starting recovery from donor {}: {}",
donor.sk_id, recovery_needed_info
);
match recover(tli.clone(), donor, &conf).await {
// Note: 'write_wal rewrites WAL written before' error is
// expected here and might happen if compute and recovery
// concurrently write the same data. Eventually compute
// should win.
Err(e) => warn!("recovery failed: {:#}", e),
Ok(msg) => info!("recovery finished: {}", msg),
}
}
None => {
trace!(
"recovery not needed or not possible: {}",
recovery_needed_info
);
}
}
sleep(check_duration).await;
}
}
/// Recover from the specified donor. Returns message explaining normal finish
/// reason or error.
async fn recover(
tli: Arc<Timeline>,
donor: &Donor,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
// Learn donor term switch history to figure out starting point.
let client = reqwest::Client::new();
let timeline_info: TimelineStatus = client
.get(format!(
"http://{}/v1/tenant/{}/timeline/{}",
donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
))
.send()
.await?
.json()
.await?;
if timeline_info.acceptor_state.term != donor.term {
bail!(
"donor term changed from {} to {}",
donor.term,
timeline_info.acceptor_state.term
);
}
// convert from API TermSwitchApiEntry into TermLsn.
let donor_th = TermHistory(
timeline_info
.acceptor_state
.term_history
.iter()
.map(|tl| Into::<TermLsn>::into(*tl))
.collect(),
);
// Now understand our term history.
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
let vote_response = match tli
.process_msg(&vote_request)
.await
.context("VoteRequest handling")?
{
Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
_ => {
bail!("unexpected VoteRequest response"); // unreachable
}
};
if vote_response.term != donor.term {
bail!(
"our term changed from {} to {}",
donor.term,
vote_response.term
);
}
let last_common_point = match TermHistory::find_highest_common_point(
&donor_th,
&vote_response.term_history,
vote_response.flush_lsn,
) {
None => bail!(
"couldn't find common point in histories, donor {:?}, sk {:?}",
donor_th,
vote_response.term_history,
),
Some(lcp) => lcp,
};
info!("found last common point at {:?}", last_common_point);
// truncate WAL locally
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
term: donor.term,
start_streaming_at: last_common_point.lsn,
term_history: donor_th,
timeline_start_lsn: Lsn::INVALID,
});
// Successful ProposerElected handling always returns None. If term changed,
// we'll find out that during the streaming. Note: it is expected to get
// 'refusing to overwrite correct WAL' here if walproposer reconnected
// concurrently, restart helps here.
tli.process_msg(&pe)
.await
.context("ProposerElected handling")?;
recovery_stream(tli, donor, last_common_point.lsn, conf).await
}
// Pull WAL from donor, assuming handshake is already done.
async fn recovery_stream(
tli: Arc<Timeline>,
donor: &Donor,
start_streaming_at: Lsn,
conf: &SafeKeeperConf,
) -> anyhow::Result<String> {
// TODO: pass auth token
let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
let mut cfg = cfg.to_tokio_postgres_config();
// It will make safekeeper give out not committed WAL (up to flush_lsn).
cfg.application_name(&format!("safekeeper_{}", conf.my_id));
cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
let connect_timeout = Duration::from_millis(10000);
let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
.await
{
Ok(client_and_conn) => client_and_conn?,
Err(_elapsed) => {
bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
}
};
trace!("connected to {:?}", donor);
// The connection object performs the actual communication with the
// server, spawn it off to run on its own.
let ttid = tli.ttid;
tokio::spawn(async move {
if let Err(e) = connection
.instrument(info_span!("recovery task connection poll", ttid = %ttid))
.await
{
// This logging isn't very useful as error is anyway forwarded to client.
trace!(
"tokio_postgres connection object finished with error: {}",
e
);
}
});
let query = format!(
"START_REPLICATION PHYSICAL {} (term='{}')",
start_streaming_at, donor.term
);
let copy_stream = client.copy_both_simple(&query).await?;
let physical_stream = ReplicationStream::new(copy_stream);
// As in normal walreceiver, do networking and writing to disk in parallel.
let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
let res = tokio::select! {
r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
r = read_replies(reply_rx, donor.term) => r.map(|()| None),
};
// Join the spawned WalAcceptor. At this point chans to/from it passed to
// network routines are dropped, so it will exit as soon as it touches them.
match wa.await {
Ok(Ok(())) => {
// WalAcceptor finished normally, termination reason is different
match res {
Ok(Some(success_desc)) => Ok(success_desc),
Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
Err(e) => Err(e), // network error or term change
}
}
Ok(Err(e)) => Err(e), // error while processing message
Err(e) => bail!("WalAcceptor panicked: {}", e),
}
}
// Perform network part of streaming: read data and push it to msg_tx, send KA
// to make sender hear from us. If there is nothing coming for a while, check
// for termination.
// Returns
// - Ok(None) if channel to WalAcceptor closed -- its task should return error.
// - Ok(Some(String)) if recovery successfully completed.
// - Err if error happened while reading/writing to socket.
async fn network_io(
physical_stream: ReplicationStream,
msg_tx: Sender<ProposerAcceptorMessage>,
donor: Donor,
tli: Arc<Timeline>,
conf: SafeKeeperConf,
) -> anyhow::Result<Option<String>> {
let mut physical_stream = pin!(physical_stream);
let mut last_received_lsn = Lsn::INVALID;
// tear down connection if no data arrives withing this period
let no_data_timeout = Duration::from_millis(30000);
loop {
let msg = match timeout(no_data_timeout, physical_stream.next()).await {
Ok(next) => match next {
None => bail!("unexpected end of replication stream"),
Some(msg) => msg.context("get replication message")?,
},
Err(_) => bail!("no message received within {:?}", no_data_timeout),
};
match msg {
ReplicationMessage::XLogData(xlog_data) => {
let ar_hdr = AppendRequestHeader {
term: donor.term,
epoch_start_lsn: Lsn::INVALID, // unused
begin_lsn: Lsn(xlog_data.wal_start()),
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
truncate_lsn: Lsn::INVALID, // do not attempt to advance
proposer_uuid: [0; 16],
};
let ar = AppendRequest {
h: ar_hdr,
wal_data: xlog_data.into_data(),
};
trace!(
"processing AppendRequest {}-{}, len {}",
ar.h.begin_lsn,
ar.h.end_lsn,
ar.wal_data.len()
);
last_received_lsn = ar.h.end_lsn;
if msg_tx
.send(ProposerAcceptorMessage::AppendRequest(ar))
.await
.is_err()
{
return Ok(None); // chan closed, WalAcceptor terminated
}
}
ReplicationMessage::PrimaryKeepAlive(_) => {
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
// do current donors still contain one we currently connected to?
if !recovery_needed_info
.donors
.iter()
.any(|d| d.sk_id == donor.sk_id)
{
// Most likely it means we are caughtup.
// note: just exiting makes tokio_postgres send CopyFail to the far end.
return Ok(Some(format!(
"terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
)));
}
}
_ => {}
}
// Send reply to each message to keep connection alive. Ideally we
// should do that once in a while instead, but this again requires
// stream split or similar workaround, and recovery is anyway not that
// performance critical.
//
// We do not know here real write/flush LSNs (need to take mutex again
// or check replies which are read in different future), but neither
// sender much cares about them, so just send last received.
physical_stream
.as_mut()
.standby_status_update(
PgLsn::from(last_received_lsn.0),
PgLsn::from(last_received_lsn.0),
PgLsn::from(last_received_lsn.0),
SystemTime::now(),
0,
)
.await?;
}
}
// Read replies from WalAcceptor. We are not interested much in sending them to
// donor safekeeper, so don't route them anywhere. However, we should check if
// term changes and exit if it does.
// Returns Ok(()) if channel closed, Err in case of term change.
async fn read_replies(
mut reply_rx: Receiver<AcceptorProposerMessage>,
donor_term: Term,
) -> anyhow::Result<()> {
loop {
match reply_rx.recv().await {
Some(msg) => {
if let AcceptorProposerMessage::AppendResponse(ar) = msg {
if ar.term != donor_term {
bail!("donor term changed from {} to {}", donor_term, ar.term);
}
}
}
None => return Ok(()), // chan closed, WalAcceptor terminated
}
}
}

View File

@@ -91,6 +91,59 @@ impl TermHistory {
}
TermHistory(res)
}
/// Find point of divergence between leader (walproposer) term history and
/// safekeeper. Arguments are not symmetrics as proposer history ends at
/// +infinity while safekeeper at flush_lsn.
/// C version is at walproposer SendProposerElected.
pub fn find_highest_common_point(
prop_th: &TermHistory,
sk_th: &TermHistory,
sk_wal_end: Lsn,
) -> Option<TermLsn> {
let (prop_th, sk_th) = (&prop_th.0, &sk_th.0); // avoid .0 below
// find last common term, if any...
let mut last_common_idx = None;
for i in 0..min(sk_th.len(), prop_th.len()) {
if prop_th[i].term != sk_th[i].term {
break;
}
// If term is the same, LSN must be equal as well.
assert!(
prop_th[i].lsn == sk_th[i].lsn,
"same term {} has different start LSNs: prop {}, sk {}",
prop_th[i].term,
prop_th[i].lsn,
sk_th[i].lsn
);
last_common_idx = Some(i);
}
let last_common_idx = match last_common_idx {
None => return None, // no common point
Some(lci) => lci,
};
// Now find where it ends at both prop and sk and take min. End of
// (common) term is the start of the next except it is the last one;
// there it is flush_lsn in case of safekeeper or, in case of proposer
// +infinity, so we just take flush_lsn then.
if last_common_idx == prop_th.len() - 1 {
Some(TermLsn {
term: prop_th[last_common_idx].term,
lsn: sk_wal_end,
})
} else {
let prop_common_term_end = prop_th[last_common_idx + 1].lsn;
let sk_common_term_end = if last_common_idx + 1 < sk_th.len() {
sk_th[last_common_idx + 1].lsn
} else {
sk_wal_end
};
Some(TermLsn {
term: prop_th[last_common_idx].term,
lsn: min(prop_common_term_end, sk_common_term_end),
})
}
}
}
/// Display only latest entries for Debug.
@@ -305,19 +358,19 @@ pub struct AcceptorGreeting {
/// Vote request sent from proposer to safekeepers
#[derive(Debug, Deserialize)]
pub struct VoteRequest {
term: Term,
pub term: Term,
}
/// Vote itself, sent from safekeeper to proposer
#[derive(Debug, Serialize)]
pub struct VoteResponse {
term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
vote_given: u64, // fixme u64 due to padding
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
// proposer to choose the most advanced one.
flush_lsn: Lsn,
pub flush_lsn: Lsn,
truncate_lsn: Lsn,
term_history: TermHistory,
pub term_history: TermHistory,
timeline_start_lsn: Lsn,
}
@@ -344,7 +397,8 @@ pub struct AppendRequest {
pub struct AppendRequestHeader {
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
pub term: Term,
// LSN since the proposer appends WAL; determines epoch switch point.
// TODO: remove this field, it in unused -- LSN of term switch can be taken
// from ProposerElected (as well as from term history).
pub epoch_start_lsn: Lsn,
/// start position of message in WAL
pub begin_lsn: Lsn,
@@ -759,7 +813,7 @@ where
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
msg.term, self.flush_lsn(), msg.start_streaming_at)
}
// Otherwise this shouldn't happen.
// Otherwise we must never attempt to truncate committed data.
assert!(
msg.start_streaming_at >= self.inmem.commit_lsn,
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
@@ -810,6 +864,14 @@ where
info!("start receiving WAL since {:?}", msg.start_streaming_at);
// Cache LSN where term starts to immediately fsync control file with
// commit_lsn once we reach it -- sync-safekeepers finishes when
// persisted commit_lsn on majority of safekeepers aligns.
self.epoch_start_lsn = match msg.term_history.0.last() {
None => bail!("proposer elected with empty term history"),
Some(term_lsn_start) => term_lsn_start.lsn,
};
Ok(None)
}
@@ -835,10 +897,7 @@ where
// file: walproposer in sync mode is very interested when this
// happens. Note: this is for sync-safekeepers mode only, as
// otherwise commit_lsn might jump over epoch_start_lsn.
// Also note that commit_lsn can reach epoch_start_lsn earlier
// that we receive new epoch_start_lsn, and we still need to sync
// control file in this case.
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
self.persist_control_file(self.state.clone()).await?;
}
@@ -902,7 +961,6 @@ where
// Now we know that we are in the same term as the proposer,
// processing the message.
self.epoch_start_lsn = msg.h.epoch_start_lsn;
self.inmem.proposer_uuid = msg.h.proposer_uuid;
// do the job
@@ -1185,4 +1243,65 @@ mod tests {
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
assert_eq!(sk.get_epoch(), 1);
}
#[test]
fn test_find_highest_common_point_none() {
let prop_th = TermHistory(vec![(0, Lsn(1)).into()]);
let sk_th = TermHistory(vec![(1, Lsn(1)).into(), (2, Lsn(2)).into()]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(3),),
None
);
}
#[test]
fn test_find_highest_common_point_middle() {
let prop_th = TermHistory(vec![
(1, Lsn(10)).into(),
(2, Lsn(20)).into(),
(4, Lsn(40)).into(),
]);
let sk_th = TermHistory(vec![
(1, Lsn(10)).into(),
(2, Lsn(20)).into(),
(3, Lsn(30)).into(), // sk ends last common term 2 at 30
]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(40),),
Some(TermLsn {
term: 2,
lsn: Lsn(30),
})
);
}
#[test]
fn test_find_highest_common_point_sk_end() {
let prop_th = TermHistory(vec![
(1, Lsn(10)).into(),
(2, Lsn(20)).into(), // last common term 2, sk will end it at 32 sk_end_lsn
(4, Lsn(40)).into(),
]);
let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32),),
Some(TermLsn {
term: 2,
lsn: Lsn(32),
})
);
}
#[test]
fn test_find_highest_common_point_walprop() {
let prop_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
assert_eq!(
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32),),
Some(TermLsn {
term: 2,
lsn: Lsn(32),
})
);
}
}

View File

@@ -418,10 +418,11 @@ impl SafekeeperPostgresHandler {
}
info!(
"starting streaming from {:?}, available WAL ends at {}, recovery={}",
"starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
start_pos,
end_pos,
matches!(end_watch, EndWatch::Flush(_))
matches!(end_watch, EndWatch::Flush(_)),
appname
);
// switch to copy
@@ -680,7 +681,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
}
}
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);
const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(5);
/// Wait until we have available WAL > start_pos or timeout expires. Returns
/// - Ok(Some(end_pos)) if needed lsn is successfully observed;

View File

@@ -11,6 +11,7 @@ use serde_with::DisplayFromStr;
use std::cmp::max;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{Mutex, MutexGuard};
use tokio::{
sync::{mpsc::Sender, watch},
@@ -27,7 +28,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use crate::receive_wal::WalReceivers;
use crate::recovery::recovery_main;
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
use crate::safekeeper::{
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
@@ -45,11 +46,12 @@ use crate::{debug_dump, wal_storage};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerInfo {
pub sk_id: NodeId,
pub term: Term,
/// Term of the last entry.
_last_log_term: Term,
pub last_log_term: Term,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
_flush_lsn: Lsn,
pub flush_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
@@ -61,16 +63,21 @@ pub struct PeerInfo {
#[serde(skip)]
#[serde(default = "Instant::now")]
ts: Instant,
pub pg_connstr: String,
pub http_connstr: String,
}
impl PeerInfo {
fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
PeerInfo {
sk_id: NodeId(sk_info.safekeeper_id),
_last_log_term: sk_info.last_log_term,
_flush_lsn: Lsn(sk_info.flush_lsn),
term: sk_info.term,
last_log_term: sk_info.last_log_term,
flush_lsn: Lsn(sk_info.flush_lsn),
commit_lsn: Lsn(sk_info.commit_lsn),
local_start_lsn: Lsn(sk_info.local_start_lsn),
pg_connstr: sk_info.safekeeper_connstr.clone(),
http_connstr: sk_info.http_connstr.clone(),
ts,
}
}
@@ -265,6 +272,20 @@ impl SharedState {
availability_zone: conf.availability_zone.clone(),
}
}
/// Get our latest view of alive peers status on the timeline.
/// We pass our own info through the broker as well, so when we don't have connection
/// to the broker returned vec is empty.
fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
let now = Instant::now();
self.peers_info
.0
.iter()
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
.filter(|p| now.duration_since(p.ts) <= heartbeat_timeout)
.cloned()
.collect()
}
}
#[derive(Debug, thiserror::Error)]
@@ -446,7 +467,9 @@ impl Timeline {
/// Bootstrap new or existing timeline starting background stasks.
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
// Start recovery task which always runs on the timeline.
tokio::spawn(recovery_main(self.clone(), conf.clone()));
if conf.peer_recovery_enabled {
tokio::spawn(recovery_main(self.clone(), conf.clone()));
}
}
/// Delete timeline from disk completely, by removing timeline directory. Background
@@ -680,20 +703,88 @@ impl Timeline {
Ok(())
}
/// Get our latest view of alive peers status on the timeline.
/// We pass our own info through the broker as well, so when we don't have connection
/// to the broker returned vec is empty.
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
let shared_state = self.write_shared_state().await;
let now = Instant::now();
shared_state
.peers_info
.0
.iter()
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
.filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
.cloned()
.collect()
shared_state.get_peers(conf.heartbeat_timeout)
}
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
/// to fetch, and we can do that without running elections; 2) there is no
/// actively streaming compute, as we don't want to compete with it.
///
/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
/// to its last_log_term so we are sure such a leader ever had been elected.
///
/// All possible donors are returned so that we could keep connection to the
/// current one if it is good even if it slightly lags behind.
///
/// Note that term conditions above might be not met, but safekeepers are
/// still not aligned on last flush_lsn. Generally in this case until
/// elections are run it is not possible to say which safekeeper should
/// recover from which one -- history which would be committed is different
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
/// Thus we don't try to predict it here.
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
let ss = self.write_shared_state().await;
let term = ss.sk.state.acceptor_state.term;
let last_log_term = ss.sk.get_epoch();
let flush_lsn = ss.sk.flush_lsn();
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
let mut peers = ss.get_peers(heartbeat_timeout);
// Sort by <last log term, lsn> pairs.
peers.sort_by(|p1, p2| {
let tl1 = TermLsn {
term: p1.last_log_term,
lsn: p1.flush_lsn,
};
let tl2 = TermLsn {
term: p2.last_log_term,
lsn: p2.flush_lsn,
};
tl2.cmp(&tl1) // desc
});
let num_streaming_computes = self.walreceivers.get_num_streaming();
let donors = if num_streaming_computes > 0 {
vec![] // If there is a streaming compute, don't try to recover to not intervene.
} else {
peers
.iter()
.filter_map(|candidate| {
// Are we interested in this candidate?
let candidate_tl = TermLsn {
term: candidate.last_log_term,
lsn: candidate.flush_lsn,
};
let my_tl = TermLsn {
term: last_log_term,
lsn: flush_lsn,
};
if my_tl < candidate_tl {
// Yes, we are interested. Can we pull from it without
// (re)running elections? It is possible if 1) his term
// is equal to his last_log_term so we could act on
// behalf of leader of this term (we must be sure he was
// ever elected) and 2) our term is not higher, or we'll refuse data.
if candidate.term == candidate.last_log_term && candidate.term >= term {
Some(Donor::from(candidate))
} else {
None
}
} else {
None
}
})
.collect()
};
RecoveryNeededInfo {
term,
last_log_term,
flush_lsn,
peers,
num_streaming_computes,
donors,
}
}
pub fn get_walsenders(&self) -> &Arc<WalSenders> {

View File

@@ -1,62 +0,0 @@
#!/usr/bin/env python3
#
# Script to download the basebackup from a pageserver to a tar file.
#
# This can be useful in disaster recovery.
#
import argparse
import psycopg2
from psycopg2.extensions import connection as PgConnection
def main(args: argparse.Namespace):
pageserver_connstr = args.pageserver_connstr
tenant_id = args.tenant
timeline_id = args.timeline
lsn = args.lsn
output_path = args.output_path
psconn: PgConnection = psycopg2.connect(pageserver_connstr)
psconn.autocommit = True
output = open(output_path, "wb")
with psconn.cursor() as pscur:
pscur.copy_expert(f"basebackup {tenant_id} {timeline_id} {lsn}", output)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tenant-id",
dest="tenant",
required=True,
help="Id of the tenant",
)
parser.add_argument(
"--timeline-id",
dest="timeline",
required=True,
help="Id of the timeline",
)
parser.add_argument(
"--lsn",
dest="lsn",
required=True,
help="LSN to take the basebackup at",
)
parser.add_argument(
"--pageserver-connstr",
dest="pageserver_connstr",
required=True,
help="libpq connection string of the pageserver",
)
parser.add_argument(
"--output",
dest="output_path",
required=True,
help="output path to write the basebackup to",
)
args = parser.parse_args()
main(args)

View File

@@ -2691,6 +2691,20 @@ class Safekeeper:
def data_dir(self) -> str:
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
def timeline_dir(self, tenant_id, timeline_id) -> str:
return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
def list_segments(self, tenant_id, timeline_id) -> List[str]:
"""
Get list of segment names of the given timeline.
"""
tli_dir = self.timeline_dir(tenant_id, timeline_id)
segments = []
for _, _, filenames in os.walk(tli_dir):
segments.extend([f for f in filenames if f != "safekeeper.control"])
segments.sort()
return segments
@dataclass
class SafekeeperTimelineStatus:

View File

@@ -1,44 +0,0 @@
import threading
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
#
# Test branching, when a transaction is in prepared state
#
@pytest.mark.timeout(600)
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
env.neon_cli.create_branch("test_lfc_resize", "empty")
endpoint = env.endpoints.create_start(
"test_lfc_resize",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
scale = 10
log.info("postgres is running on 'test_lfc_resize' branch")
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread.start()
conn = endpoint.connect()
cur = conn.cursor()
for i in range(n_resize):
cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
thread.join()

View File

@@ -157,6 +157,8 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
tenant_id, timeline_id = env.neon_cli.create_tenant()
endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id)
# insert something to force sk -> ps message
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
# Wait to make sure that we get a latest WAL receiver data.
# We need to wait here because it's possible that we don't have access to
# the latest WAL yet, when the `timeline_detail` API is first called.
@@ -168,7 +170,7 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv):
)
# Make a DB modification then expect getting a new WAL receiver's data.
endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')")
wait_until(
number_of_iterations=5,
interval=1,

View File

@@ -1,7 +1,5 @@
import json
import time
from dataclasses import dataclass
from pathlib import Path
from queue import SimpleQueue
from typing import Any, Dict, Set
@@ -30,7 +28,6 @@ def test_metric_collection(
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
@@ -38,9 +35,7 @@ def test_metric_collection(
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
uploads.put(events)
return Response(status=200)
# Require collecting metrics frequently, since we change
@@ -48,12 +43,15 @@ def test_metric_collection(
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
neon_env_builder.pageserver_config_override = (
f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
"""
+ "tenant_config={pitr_interval = '0 sec'}"
)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
@@ -65,7 +63,7 @@ def test_metric_collection(
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
env = neon_env_builder.init_start()
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
@@ -126,20 +124,19 @@ def test_metric_collection(
events = uploads.get(timeout=timeout)
if events == "ready":
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
events = uploads.get(timeout=timeout)
v.ingest(events)
break
else:
(events, is_last) = events
v.ingest(events, is_last)
v.ingest(events)
if "synthetic_storage_size" not in v.accepted_event_names():
log.info("waiting for synthetic storage size to be calculated and uploaded...")
rounds = 0
while "synthetic_storage_size" not in v.accepted_event_names():
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
events = uploads.get(timeout=timeout)
v.ingest(events)
rounds += 1
assert rounds < 10, "did not get synthetic_storage_size in 10 uploads"
# once we have it in verifiers, it will assert that future batches will contain it
@@ -153,161 +150,17 @@ def test_metric_collection(
events = uploads.get(timeout=timeout)
if events == "ready":
(events, is_last) = uploads.get(timeout=timeout * 3)
v.ingest(events, is_last)
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
events = uploads.get(timeout=timeout * 3)
v.ingest(events)
events = uploads.get(timeout=timeout)
v.ingest(events)
break
else:
(events, is_last) = events
v.ingest(events, is_last)
v.ingest(events)
httpserver.check()
def test_metric_collection_cleans_up_tempfile(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but it will make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
pageserver_http = env.pageserver.http_client()
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
# we expect uploads at 1Hz, on busy runners this could be too optimistic,
# so give 5s we only want to get the following upload after "ready" value.
timeout = 5
# these strings in the upload queue allow synchronizing with the uploads
# and the main test execution
uploads.put("ready")
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, _) = uploads.get(timeout=timeout)
break
# should really configure an env?
pageserver_http.configure_failpoints(("before-persist-last-metrics-collected", "exit"))
time.sleep(3)
env.pageserver.stop()
initially = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json")
assert (
len(initially.matching) == 2
), f"expecting actual file and tempfile, but not found: {initially.matching}"
uploads.put("ready")
env.pageserver.start()
while True:
events = uploads.get(timeout=timeout * 3)
if events == "ready":
(events, _) = uploads.get(timeout=timeout)
break
env.pageserver.stop()
later = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json")
# it is possible we shutdown the pageserver right at the correct time, so the old tempfile
# is gone, but we also have a new one.
only = set(["last_consumption_metrics.json"])
assert (
initially.matching.intersection(later.matching) == only
), "only initial tempfile should had been removed"
assert initially.other.issuperset(later.other), "no other files should had been removed"
@dataclass
class PrefixPartitionedFiles:
matching: Set[str]
other: Set[str]
def iterate_pageserver_workdir(path: Path, prefix: str) -> PrefixPartitionedFiles:
"""
Iterates the files in the workdir, returns two sets:
- files with the prefix
- files without the prefix
"""
matching = set()
other = set()
for entry in path.iterdir():
if not entry.is_file():
continue
if not entry.name.startswith(prefix):
other.add(entry.name)
else:
matching.add(entry.name)
return PrefixPartitionedFiles(matching, other)
class MetricsVerifier:
"""
A graph of per tenant per timeline verifiers, allowing one for each
@@ -318,7 +171,7 @@ class MetricsVerifier:
self.tenants: Dict[TenantId, TenantMetricsVerifier] = {}
pass
def ingest(self, events, is_last):
def ingest(self, events):
stringified = json.dumps(events, indent=2)
log.info(f"ingesting: {stringified}")
for event in events:
@@ -328,9 +181,8 @@ class MetricsVerifier:
self.tenants[id].ingest(event)
if is_last:
for t in self.tenants.values():
t.post_batch()
for t in self.tenants.values():
t.post_batch()
def accepted_event_names(self) -> Set[str]:
names: Set[str] = set()

View File

@@ -1,3 +1,4 @@
import filecmp
import os
import pathlib
import random
@@ -980,6 +981,137 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
endpoint.start()
# Test that we can create timeline with one safekeeper down and initialize it
# later when some data already had been written.
def test_late_init(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
sk1 = env.safekeepers[0]
sk1.stop()
# create and insert smth while safekeeper is down...
env.neon_cli.create_branch("test_late_init")
endpoint = env.endpoints.create_start("test_late_init")
endpoint.safe_psql("create table t(key int, value text)")
endpoint.safe_psql("insert into t select generate_series(1, 1000), 'payload'")
log.info("insert with safekeeper down done")
endpoint.stop() # stop compute
# stop another safekeeper, and start one which missed timeline creation
sk2 = env.safekeepers[1]
sk2.stop()
sk1.start()
# insert some more
endpoint = env.endpoints.create_start("test_late_init")
endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
# is timeline flush_lsn equal on provided safekeepers?
def is_flush_lsn_aligned(sk1_http_cli, sk2_http_cli, tenant_id, timeline_id):
return (
sk1_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn
== sk2_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn
)
# Test behaviour with one safekeeper down and missing a lot of WAL. Namely, that
# 1) walproposer can't recover node if it misses WAL written by previous computes, but
# still starts up and functions normally if two other sks are ok.
# 2) walproposer doesn't keep WAL after some threshold (pg_wal bloat is limited), but functions
# normally if two other sks are ok.
# 3) Lagged safekeeper can still recover by peer recovery.
def test_one_sk_down(neon_env_builder: NeonEnvBuilder):
pass
# Smaller version of test_one_sk_down testing peer recovery in isolation: that
# it works without compute at all.
def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_peer_recovery")
endpoint = env.endpoints.create_start("test_peer_recovery")
endpoint.safe_psql("create table t(key int, value text)")
sk1 = env.safekeepers[0]
sk1.stop()
# roughly fills one segment
endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'")
endpoint.stop() # stop compute
# now start safekeeper, but with peer recovery disabled
sk1.start(extra_opts=["--peer-recovery=false"])
# it should lag for about a segment
sk1_http_cli = sk1.http_client()
sk2 = env.safekeepers[1]
sk2_http_cli = sk2.http_client()
sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
log.info(
f"flush_lsns after insertion: sk1={sk1_tli_status.flush_lsn}, sk2={sk2_tli_status.flush_lsn}"
)
assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024
# wait a bit, lsns shouldn't change
# time.sleep(5)
sk1_tli_status = sk1_http_cli.timeline_status(tenant_id, timeline_id)
sk2_tli_status = sk2_http_cli.timeline_status(tenant_id, timeline_id)
log.info(
f"flush_lsns after waiting: sk1={sk1_tli_status.flush_lsn}, sk2={sk2_tli_status.flush_lsn}"
)
assert sk2_tli_status.flush_lsn - sk1_tli_status.flush_lsn >= 16 * 1024 * 1024
# now restart safekeeper with peer recovery enabled and wait for recovery
sk1.stop().start()
wait(
partial(is_flush_lsn_aligned, sk1_http_cli, sk2_http_cli, tenant_id, timeline_id),
"flush_lsn to get aligned",
wait_f=lambda sk1_http_cli=sk1_http_cli, sk2_http_cli=sk2_http_cli, tenant_id=tenant_id, timeline_id=timeline_id: log.info(
f"waiting for flush_lsn alignment, sk1.flush_lsn={sk1_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn}, sk2.flush_lsn={sk2_http_cli.timeline_status(tenant_id, timeline_id).flush_lsn}"
),
)
# check that WALs are identic after recovery
segs = sk1.list_segments(tenant_id, timeline_id)
log.info(f"segs are {segs}")
(_, mismatch, not_regular) = filecmp.cmpfiles(
sk1.timeline_dir(tenant_id, timeline_id),
sk2.timeline_dir(tenant_id, timeline_id),
segs,
shallow=False,
)
log.info(
f"filecmp result mismatch and not regular files:\n\t mismatch={mismatch}\n\t not_regular={not_regular}"
)
for f in mismatch:
f1 = os.path.join(sk1.timeline_dir(tenant_id, timeline_id), f)
f2 = os.path.join(sk2.timeline_dir(tenant_id, timeline_id), f)
stdout_filename = "{}.filediff".format(f2)
with open(stdout_filename, "w") as stdout_f:
subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True)
subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True)
cmd = "diff {}.hex {}.hex".format(f1, f2)
subprocess.run([cmd], stdout=stdout_f, shell=True)
assert (mismatch, not_regular) == ([], [])
# stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
env.safekeepers[2].stop()
endpoint = env.endpoints.create_start("test_peer_recovery")
endpoint.safe_psql("insert into t select generate_series(1,100), 'payload'")
class SafekeeperEnv:
def __init__(
self,

View File

@@ -1,5 +1,5 @@
{
"postgres-v16": "389ce36b4b3da7aa654a25e1b3f10b641319a87f",
"postgres-v15": "74cfe3e681836747a31fdbd47bdd14b3d81b0772",
"postgres-v16": "7a50f139c6269454ab9260c7a9752874b9089943",
"postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
"postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
}

View File

@@ -64,7 +64,6 @@ toml_edit = { version = "0.19", features = ["serde"] }
tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tungstenite = { version = "0.20" }
url = { version = "2", features = ["serde"] }
uuid = { version = "1", features = ["serde", "v4"] }