Compare commits

..

116 Commits

Author SHA1 Message Date
Conrad Ludgate
77e431d80a abstract out listen-accept loops 2024-10-07 16:52:37 +01:00
Conrad Ludgate
784571eac7 remove redundant generic 2024-10-07 16:52:37 +01:00
Conrad Ludgate
ffd0d875cf minor changes 2024-10-07 16:52:37 +01:00
Conrad Ludgate
dbf34a17ce rename to serverless backend 2024-10-07 16:52:37 +01:00
Conrad Ludgate
4001e24745 proxy: continue streamlining auth::Backend 2024-10-07 16:52:37 +01:00
Conrad Ludgate
d9a59c5a3f deduplicate some mode 2024-10-07 16:52:37 +01:00
Conrad Ludgate
c7afbe55c9 remove some duplication 2024-10-07 16:52:37 +01:00
Conrad Ludgate
d0930c9d1d remove console-redirect from mega-backend object 2024-10-07 16:52:37 +01:00
Conrad Ludgate
addfff61b5 create console_redirect_proxy solo-path 2024-10-07 16:52:37 +01:00
Conrad Ludgate
cb28721eee make well-defined console request backend 2024-10-07 16:52:37 +01:00
Conrad Ludgate
07076e88a9 shrink some uses of config 2024-10-07 16:52:37 +01:00
Conrad Ludgate
2feba8a3da remove auth backend from proxy config 2024-10-07 16:52:37 +01:00
Heikki Linnakangas
323bd018cd Make sure BufferTag padding bytes are cleared in hash keys (#9292)
The prefetch-queue hash table uses a BufferTag struct as the hash key,
and it's hashed using hash_bytes(). It's important that all the padding
bytes in the key are cleared, because hash_bytes() will include them.

I was getting compiler warnings like this on v14 and v15, when compiling
with -Warray-bounds:

    In function ‘prfh_lookup_hash_internal’,
inlined from ‘prfh_lookup’ at
pg_install/v14/include/postgresql/server/lib/simplehash.h:821:9,
inlined from ‘neon_read_at_lsnv’ at pgxn/neon/pagestore_smgr.c:2789:11,
inlined from ‘neon_read_at_lsn’ at pgxn/neon/pagestore_smgr.c:2904:2:
pg_install/v14/include/postgresql/server/storage/relfilenode.h:90:43:
warning: array subscript ‘PrefetchRequest[0]’ is partly outside array
bounds of ‘BufferTag[1]’ {aka ‘struct buftag[1]’} [-Warray-bounds]
       89 |         ((node1).relNode == (node2).relNode && \
          |         ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       90 |          (node1).dbNode == (node2).dbNode && \
          |          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~
       91 |          (node1).spcNode == (node2).spcNode)
          |          ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
pg_install/v14/include/postgresql/server/storage/buf_internals.h:116:9:
note: in expansion of macro ‘RelFileNodeEquals’
      116 |         RelFileNodeEquals((a).rnode, (b).rnode) && \
          |         ^~~~~~~~~~~~~~~~~
pgxn/neon/neon_pgversioncompat.h:25:31: note: in expansion of macro
‘BUFFERTAGS_EQUAL’
       25 | #define BufferTagsEqual(a, b) BUFFERTAGS_EQUAL(*(a), *(b))
          |                               ^~~~~~~~~~~~~~~~
pgxn/neon/pagestore_smgr.c:220:34: note: in expansion of macro
‘BufferTagsEqual’
220 | #define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag,
&(b)->buftag))
          |                                  ^~~~~~~~~~~~~~~
pg_install/v14/include/postgresql/server/lib/simplehash.h:280:77: note:
in expansion of macro ‘SH_EQUAL’
280 | #define SH_COMPARE_KEYS(tb, ahash, akey, b) (ahash ==
SH_GET_HASH(tb, b) && SH_EQUAL(tb, b->SH_KEY, akey))
| ^~~~~~~~
pg_install/v14/include/postgresql/server/lib/simplehash.h:799:21: note:
in expansion of macro ‘SH_COMPARE_KEYS’
      799 |                 if (SH_COMPARE_KEYS(tb, hash, key, entry))
          |                     ^~~~~~~~~~~~~~~
    pgxn/neon/pagestore_smgr.c: In function ‘neon_read_at_lsn’:
    pgxn/neon/pagestore_smgr.c:2742:25: note: object ‘buftag’ of size 20
     2742 |         BufferTag       buftag = {0};
          |                         ^~~~~~

This commit silences those warnings, although it's not clear to me why
the compiler complained like that in the first place. I found the issue
with padding bytes while looking into those warnings, but that was
coincidental, I don't think the padding bytes explain the warnings as
such.

In v16, the BUFFERTAGS_EQUAL macro was replaced with a static inline
function, and that also silences the compiler warning. Not clear to me
why.
2024-10-07 18:04:04 +03:00
Folke Behrens
ad267d849f proxy: Move module base files into module directory (#9297) 2024-10-07 16:25:34 +02:00
Conrad Ludgate
8cd7b5bf54 proxy: rename console -> control_plane, rename web -> console_redirect (#9266)
rename console -> control_plane
rename web -> console_redirect

I think these names are a little more representative.
2024-10-07 14:09:54 +01:00
Konstantin Knizhnik
47c3c9a413 Fix update of statistic for LFC/prefetch (#9272)
## Problem

See #9199

## Summary of changes

Fix update of hits/misses for LFC and prefetch introduced in
78938d1b59

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2024-10-07 12:21:16 +03:00
Arseny Sher
eae4470bb6 safekeeper: remove local WAL files ignoring peer_horizon_lsn. (#8900)
If peer safekeeper needs garbage collected segment it will be fetched
now from s3 using on-demand WAL download. Reduces danger of running out of disk space when safekeeper fails.
2024-10-04 19:07:39 +03:00
Ivan Efremov
2d248aea6f proxy: exclude triple logging of connect compute errors (#9277)
Fixes (#9020)
 - Use the compute::COULD_NOT_CONNECT for connection error message;
 - Eliminate logging for one connection attempt;
 - Typo fix.
2024-10-04 18:21:39 +03:00
Conrad Ludgate
6c05f89f7d proxy: add local-proxy to compute image (#8823)
1. Adds local-proxy to compute image and vm spec
2. Updates local-proxy config processing, writing PID to a file eagerly
3. Updates compute-ctl to understand local proxy compute spec and to
send SIGHUP to local-proxy over that pid.

closes https://github.com/neondatabase/cloud/issues/16867
2024-10-04 14:52:01 +00:00
Arseny Sher
db53f98725 neon walsender_hooks: take basebackup LSN directly. (#9263)
NeonWALReader needs to know LSN before which WAL is not available
locally, that is, basebackup LSN. Previously it was taken from
WalpropShmemState, but that's racy, as walproposer sets its there only
after successfull election. Get it directly with GetRedoStartLsn.

Should fix flakiness of
test_ondemand_wal_download_in_replication_slot_funcs etc.

ref #9201
2024-10-04 14:56:15 +01:00
Erik Grinaker
04a6222418 remote_storage: add head_object integration test (#9274) 2024-10-04 12:40:41 +01:00
Vlad Lazar
dcf7af5a16 storcon: do timeline creation on all attached location (#9237)
## Problem

Creation of a timelines during a reconciliation can lead to
unavailability if the user attempts to
start a compute before the storage controller has notified cplane of the
cut-over.

## Summary of changes

Create timelines on all currently attached locations. For the latest
location, we still look
at the database (this is a previously). With this change we also look
into the observed state
to find *other* attached locations.

Related https://github.com/neondatabase/neon/issues/9144
2024-10-04 11:56:43 +01:00
Erik Grinaker
37158d0424 pageserver: use conditional GET for secondary tenant heatmaps (#9236)
## Problem

Secondary tenant heatmaps were always downloaded, even when they hadn't
changed. This can be avoided by using a conditional GET request passing
the `ETag` of the previous heatmap.

## Summary of changes

The `ETag` was already plumbed down into the heatmap downloader, and
just needed further plumbing into the remote storage backends.

* Add a `DownloadOpts` struct and pass it to
`RemoteStorage::download()`.
* Add an optional `DownloadOpts::etag` field, which uses a conditional
GET and returns `DownloadError::Unmodified` on match.
2024-10-04 12:29:48 +02:00
Erik Grinaker
60fb840e1f Cargo.toml: enable sso for aws-config (#9261)
## Problem

The S3 tests couldn't use SSO authentication for local tests against S3.

## Summary of changes

Enable the `sso` feature of `aws-config`. Also run `cargo hakari
generate` which made some updates to `workspace_hack`.
2024-10-04 11:27:06 +01:00
Heikki Linnakangas
52232dd85c tests: Add a comment explaining the rules of NeonLocalCli wrappers (#9195) 2024-10-03 22:03:29 +03:00
Heikki Linnakangas
8ef0c38b23 tests: Rename NeonLocalCli functions to match the 'neon_local' commands (#9195)
This makes it more clear that the functions in NeonLocalCli are just
typed wrappers around the corresponding 'neon_local' commands.
2024-10-03 22:03:27 +03:00
Heikki Linnakangas
56bb1ac458 tests: Move NeonCli and friends to separate file (#9195)
In the passing, rename it to NeonLocalCli, to reflect that the binary
is called 'neon_local'.

Add wrapper for the 'timeline_import' command, eliminating the last
raw call to the raw_cli() function from tests, except for a few in
test_neon_cli.py which are about testing the 'neon_local' iteself. All
the other calls are now made through the strongly-typed wrapper
functions
2024-10-03 22:03:25 +03:00
Heikki Linnakangas
19db9e9aad tests: Replace direct calls to neon_cli with wrappers in NeonEnv (#9195)
Add wrappers for a few commands that didn't have them before. Move the
logic to generate tenant and timeline IDs from NeonCli to the callers,
so that NeonCli is more purely just a type-safe wrapper around
'neon_local'.
2024-10-03 22:03:22 +03:00
David Gomes
4e9b32c442 chore: makes some onboarding document improvements (#9216)
* I had to install `m4` in order to be able to run locally
* The docs/docker.md was missing a pointer to where the compute node
code is

(Was originally on #8888 but I am pulling this out)
2024-10-03 20:58:30 +02:00
David Gomes
2fac0b7fac chore: remove unnecessary comments in compute/Dockerfile.compute-node (#9253)
See [this
comment](https://github.com/neondatabase/neon/pull/8888#discussion_r1783130082).
2024-10-03 18:26:41 +00:00
Arpad Müller
e3d6ecaeee Revert hyper and tonic updates (#9268) 2024-10-03 19:21:22 +01:00
Arseny Sher
d785fcb5ff safekeeper: fix panic in debug_dump. (#9097)
Panic was triggered only when dump selected no timelines.

sentry report:
https://neondatabase.sentry.io/issues/5832368589/
2024-10-03 19:22:22 +03:00
Vlad Lazar
552fa2b972 pageserver: tweak oversized key read path warning (#9221)
## Problem

`Oversized vectored read [...]` logs are spewing in prod because we have
a few keys that
are unexpectedly large:
* reldir/relblock - these are unbounded, so it's known technical debt
* slru block - they can be a bit bigger than 128KiB due to storage
format overhead

## Summary of changes

* Bump threshold to 130KiB
* Don't warn on oversized reldir and dbdir keys 

Closes https://github.com/neondatabase/neon/issues/8967
2024-10-03 16:40:35 +01:00
Arpad Müller
9d93dd4807 Rename hyper 1.0 to hyper and hyper 0.14 to hyper0 (#9254)
Follow-up of #9234 to give hyper 1.0 the version-free name, and the
legacy version of hyper the one with the version number inside. As we
move away from hyper 0.14, we can remove the `hyper0` name piece by
piece.

Part of #9255
2024-10-03 16:33:43 +02:00
Heikki Linnakangas
53b6e1a01c vm-monitor: Upgrade axum from 0.6 to 0.7 (#9257)
Because:
- it's nice to be up-to-date,
- we already had axum 0.7 in our dependency tree, so this avoids having
to compile two versions, and
- removes one of the remaining dpendencies to hyper version 0

Also bumps the 'tokio-tungstenite' dependency, to avoid having two
versions in the dependency tree.
2024-10-03 16:49:39 +03:00
Joonas Koivunen
dbef1b064c chore: smaller layer changes (#9247)
Address minor technical debt in Layer inspired by #9224:

- layer usage as arg same as in spans
- avoid one Weak::upgrade
2024-10-03 09:38:45 +01:00
Heikki Linnakangas
6a9e2d657c Remove unnecessary dependencies from postgis-build image (#9211)
The apt install stage before this commit:

    0 upgraded, 391 newly installed, 0 to remove and 9 not upgraded.
    Need to get 261 MB of archives.

after:

    0 upgraded, 367 newly installed, 0 to remove and 9 not upgraded.
    Need to get 220 MB of archives.
2024-10-03 10:05:23 +03:00
Arpad Müller
2d8f6d7906 Suppress wal lag timeout warnings right after tenant attachment (#9232)
As seen in https://github.com/neondatabase/cloud/issues/17335, during
releases we can have ingest lags that are above the limits for warnings.
However, such lags are part of normal pageserver startup.

Therefore, calculate a certain cooldown timestamp until which we accept
lags up to a certain size. The heuristic is chosen to grow the later we get
to fully load the tenant, and we also add 60 seconds as a grace period
after that term.
2024-10-03 02:33:09 +01:00
Arpad Müller
1b176fe74a Use hyper 1.0 and tonic 0.12 in storage broker (#9234)
Fixes #9231 .

Upgrade hyper to 1.4.0 and use hyper 1.4 instead of 0.14 in the storage
broker, together with tonic 0.12. The two upgrades go hand in hand.

Thanks to the broker being independent from other components, we can
upgrade its hyper version without touching the other components, which
makes things easier.
2024-10-03 00:48:12 +02:00
Heikki Linnakangas
1dec93f129 Add compute_tools/ to the list of paths that trigger an E2E run on a PR (#9251)
compute_ctl is an important part of the interfaces between the control
plane and the compute, so it seems important to E2E test any changes
there.
2024-10-03 00:31:19 +03:00
Alexander Bayandin
16002f5e45 test_runner: bump requests and psycopg2-binary (#9248)
## Problem

```
Warning: The file chosen for install of requests 2.32.0 (requests-2.32.0-py3-none-any.whl) is yanked. Reason for being yanked: Yanked due to conflicts with CVE-2024-35195 mitigation
```

## Summary of changes
- Update `requests` to fix the warning
- Update `psycopg2-binary`
2024-10-02 21:26:45 +01:00
dotdister
09d4bad1be Change parentheses to clarify conditions in walproposer (#9180)
Some parentheses in conditional expressions are redundant or necessary
for clarity conditional expressions in walproposer.

## Summary of changes

Change some parentheses to clarify conditions in walproposer.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
2024-10-02 14:49:52 -04:00
Heikki Linnakangas
d20448986c Fix metric name of the 'getpage_wait_seconds_bucket' metric (#9242)
Per convention, histogram buckets have the '_bucket' suffix. I got that
wrong in commit 0d500bbd5b.

Fixes https://github.com/neondatabase/neon/issues/9241
2024-10-02 20:05:14 +03:00
John Spray
d54624153d tests: sync_after_each_test -> sync_between_tests (#9239)
## Problem

We are seeing frequent pageserver startup timelines while it calls
syncfs(). There is an existing fixture that syncs _after_ tests, but not
before the first one. We hypothesize that some failures are happening on
the first test in a job.

## Summary of changes

- extend the existing sync_after_each_test to be a sync between all
tests, including sync'ing before running the first test. That should
remove any ambiguity about whether the sync is happening on the correct
node.

This is an alternative to https://github.com/neondatabase/neon/pull/8957
-- I didn't realize until I saw Alexander's comment on that PR that we
have an existing hook that syncs filesystems and can be extended.
2024-10-02 17:44:25 +01:00
Alex Chi Z.
700885471f fix(test): only test num of L1 layers in compaction smoke test (#9186)
close https://github.com/neondatabase/neon/issues/9160

For whatever reason, pg17's WAL pattern seems different from others,
which triggers some flaky behavior within the compaction smoke test.

## Summary of changes

* Run L0 compaction before proceeding with the read benchmark.
* So that we can ensure the num of L0 layers is 0 and test the
compaction behavior only with L1 layers.

We have a threshold for triggering L0 compaction. In some cases, the
test case did not produce enough L0 layers to do a L0 compaction,
therefore leaving the layer map with 3+ L0 layers above the L1 layers.
This increases the average read depth for the timeline.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-10-02 17:42:35 +01:00
Vlad Lazar
38a8dcab9f storcon: add metric for long running reconciles (#9207)
## Problem

We don't have an alert for long running reconciles. Stuck reconciles are
problematic
as we've seen in a recent incident.

## Summary of changes

Add a new metric `storage_controller_reconcile_long_running_total` with
labels: `{tenant_id, shard_number, seq}`.
The metric is removed after the long running reconcile finishes. These
events should be rare, so we won't break
the bank on cardinality.

Related https://github.com/neondatabase/neon/issues/9150
2024-10-02 17:25:11 +01:00
Vlad Lazar
8dbfda98d4 storcon: ignore deleted timelines on new location catch-up (#9244)
## Problem

If a timeline was deleted right before waiting for LSNs to catch up
before the cut-over,
then we would wait forever. 

## Summary of changes

Fix the issue and add a test for timeline deletions mid migration. 

Related https://github.com/neondatabase/neon/issues/9144
2024-10-02 17:23:26 +01:00
John Spray
f875e107aa pageserver: tweak logging of "became visible" for layers (#9224)
## Problem

Recent change to avoid the "became visible" log messages from certain
tasks missed a task: the logical size calculation that happens as a
child of synthetic size calculation.

Related: https://github.com/neondatabase/neon/issues/9058

## Summary of changes

- Add OnDemandLogicalSize to the list of permitted tasks for reads
making a covered layer visible
- Tweak the log message to use layer name instead of key: this is more
terse, and easier to use when debugging, as one can search for it
elsewhere to see when the layer was written/downloaded etc.
2024-10-02 13:21:04 +01:00
Folke Behrens
1e90e792d6 proxy: Add timeout to webauth confirmation wait (#9227)
```shell
$ cargo run -p proxy --bin proxy -- --auth-backend=web --webauth-confirmation-timeout=5s
```

```
$ psql -h localhost -p 4432
NOTICE:  Welcome to Neon!
Authenticate by visiting within 5s:
    http://localhost:3000/psql_session/e946900c8a9bc6e9


psql: error: connection to server at "localhost" (::1), port 4432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?
connection to server at "localhost" (127.0.0.1), port 4432 failed: ERROR:  Disconnected due to inactivity after 5s.
```
2024-10-02 12:10:56 +02:00
Matthias van de Meent
ea32f1d0a3 Expose more granular wait event data to the user (#9163)
In PG17, there is this newfangled custom wait events system. This commit
adds that feature to Neon, so that users can see what their backends may
be waiting for when a PostgreSQL backend is playing the waiting game in
Neon code.
2024-10-02 11:12:50 +02:00
Heikki Linnakangas
2e3b7862d0 Fix compute metrics collector config (#9235) 2024-10-02 09:44:00 +01:00
Arpad Müller
387e569259 Update aws SDK crates (#9233)
This updates the aws SDK crates to their newest released versions.
2024-10-02 08:00:08 +02:00
Alex Chi Z.
31f12f6426 fix: ignore tonic to resolve advisories (#9230)
check-rust-style fails because tonic version too old, this does not seem
to be an easy fix, so ignore it from the deny list.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-10-01 19:26:54 -04:00
Heikki Linnakangas
8861e8a323 Fix the size of the perf counters shared memory array (#9226)
MaxBackends doesn't include auxiliary processes. Whenever an aux process
made IO operations that updated the counters, they would scribble over
shared memory beoynd the end of the array. The relsize cache hash table
comes after the array, so the symptom was an error about hash table
corruption in the relsize cache hash.
2024-10-01 20:07:51 +01:00
Arseny Sher
62e22dfd85 Backpressure: reset ps display after it is done. (#8980)
Previously we set the 'backpressure throttling' status, but overwrote
current one and never reset it back.
2024-10-01 20:55:05 +03:00
Arseny Sher
17672c88ff tests: wait walreceiver on sks to be gone on 'immediate' ep restart. (#9099)
When endpoint is stopped in immediate mode and started again there is a
chance of old connection delivering some WAL to safekeepers after second
start checked need for sync-safekeepers and thus grabbed basebackup LSN.
It makes basebackup unusable, so compute panics. Avoid flakiness by
waiting for walreceivers on safekeepers to be gone in such cases. A
better way would be to bump term on safekeepers if sync-safekeepers is
skipped, but it needs more infrastructure.

ref https://github.com/neondatabase/neon/issues/9079
2024-10-01 20:54:00 +03:00
Matthias van de Meent
6efdb1d0f3 Fix small memory accounting bug in libpagestore (#9223)
Found while searching for other issues in shared memory.

The bug should be benign, in that it over-allocates memory for this
struct, but doesn't allow for out-of-bounds writes.
2024-10-01 17:37:59 +01:00
Erik Grinaker
325de52e73 pageserver: remove TenantConfOpt::TryFrom<toml_edit::Item> (#9219)
Following #7656, `TenantConfOpt::TryFrom<toml_edit::Item>` appears to be
dead code. This patch removes `TenantConfOpt::TryFrom<toml_edit::Item>`.

The code does appear to be dead, since the TOML config is deserialized
into `TenantConfig` (via `LocationConfig`) and then converted into
`TenantConfOpt`.

This was verified by adding a panic to `try_from()` and running the
pageserver unit tests as well as a local end-to-end cluster (including
creating a new tenant and restarting the pageserver). This did not fail,
so this is not used on the common happy path at least. No explicit
`try_from` or `try_into` calls were found either.

Resolves #8918.
2024-10-01 16:35:18 +01:00
Anastasia Lubennikova
ce73db9316 Fix post_apply_config() (#9220)
Bring back post_apply_config() step 
that was accidentally removed in 78938d1
2024-10-01 16:28:58 +01:00
Shinya Kato
b675997f48 safekeeper: Fix a log message of HTTP worker (#9213)
## Problem
There is a wrong log message.

## Summary of changes
Fixed the log message.
2024-10-01 17:16:53 +02:00
Alex Chi Z.
49f99eb729 docs: add aux file v2 RFC (#9115)
aux v2 migration is near the end and I rewrote the RFC based on what I
proposed (several months before...) and what I actually implemented.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-10-01 15:56:54 +01:00
Heikki Linnakangas
0d500bbd5b Add new compute metrics to sql exporter (#9190)
These are the perf counters added in commit 263dfba6ee.

Note: This relies on 'neon' extension version 1.5. The default was
bumped to 1.5 in commit d696c41807.

---------

Co-authored-by: Matthias van de Meent <matthias@neon.tech>
2024-10-01 17:38:19 +03:00
Heikki Linnakangas
1b8b50755c Use debian packages for cmake again (#9212)
On bookworm, 'cmake' is new enough that we can just use it. On bullseye,
we can get a new-enough package from backports. By including 'cmake' in
the build-deps stage, we don't need to install it separately in all the
later build stages that need it.

See https://github.com/neondatabase/neon/pull/2699, where we switched to
downloading and building a specific version.
2024-10-01 15:09:09 +03:00
Conrad Ludgate
4391b25d01 proxy: ignore typ and use jwt.alg rather than jwk.alg (#9215)
Microsoft exposes JWKs without the alg header. It's only included on the
tokens. Not a problem.

Also noticed that wrt the `typ` header:
> It will typically not be used by applications when it is already known
that the object is a JWT. This parameter is ignored by JWT
implementations; any processing of this parameter is performed by the
JWT application.

Since we know we are expecting JWTs only, I've followed the guidance and
removed the validation.
2024-10-01 10:36:49 +01:00
John Spray
40b10b878a storage_scrubber: retry on index deletion failures (#9204)
## Problem

In automated tests running on AWS S3, we frequently see scrubber
failures when it can't delete an index.

`location_conf_churn`:

https://neon-github-public-dev.s3.amazonaws.com/reports/main/11076221056/index.html#/testresult/f89b1916b6a693e2

`scrubber_physical_gc`:

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9178/11074269153/index.html#/testresult/9885ed5aa0fe38b6

## Summary of changes

Wrap index deletion in a backoff::retry

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
2024-10-01 10:34:39 +01:00
David Gomes
d6c6b0a509 feat(compute): adds pg_session_jwt extension to compute image (#8888)
## Problem

We need the
[pg_session_jwt](https://github.com/neondatabase/pg_session_jwt/)
extension in the compute image. This PR adds it.

## Summary of changes

I added the `pg_session_jwt` extension in a very similar way to how the
pggraphql and pgtiktoken extensions were added (since they're all
written with pgrx). Then I tested this.

```
$ cd docker-compose/
$ PG_VERSION=16 TAG=10667533475 docker-compose up --build -d
$ psql postgresql://cloud_admin:cloud_admin@localhost:55433/postgres

cloud_admin@postgres=# create extension pg_session_jwt;
CREATE EXTENSION
Time: 43.048 ms

cloud_admin@postgres=# \df auth.*;
                              List of functions
┌────────┬──────────────────┬──────────────────┬─────────────────────┬──────┐
│ Schema │       Name       │ Result data type │ Argument data types │ Type │
├────────┼──────────────────┼──────────────────┼─────────────────────┼──────┤
│ auth   │ get              │ jsonb            │ s text              │ func │
│ auth   │ init             │ void             │ kid bigint, s jsonb │ func │
│ auth   │ jwt_session_init │ void             │ s text              │ func │
│ auth   │ user_id          │ text             │                     │ func │
└────────┴──────────────────┴──────────────────┴─────────────────────┴──────┘
(4 rows)

cloud_admin@postgres=# select auth.init(cast('1' as bigint), to_jsonb(TEXT '{ "kty": "EC", "kid": "571683be-33cf-4e67-bccc-8905c0ebb862", "crv": "P-521", "alg": "ES512", "x": "AM_GsnQvKML2yXdn_OsN8PdgO1Sf9XMXih5vQMKLmJkp-Iz_FFWJUt6uyR_qp4brr8Ji2kjGJgN4cQJpg2kskH7V", "y": "AZg-salw24lCmsBP-BCBa5jT6INkTwLtCOC7o0BIxDVvmIEH1-PQAJVYVJPTFvPMi_PLa0QlOm-ufJYkynwa2Mau" }'));
ERROR:  called `Result::unwrap()` on an `Err` value: Error("invalid type: string \"{ \\\"kty\\\": \\\"EC\\\", \\\"kid\\\": \\\"571683be-33cf-4e67-bccc-8905c0ebb862\\\", \\\"crv\\\": \\\"P-521\\\", \\\"alg\\\": \\\"ES512\\\", \\\"x\\\": \\\"AM_GsnQvKML2yXdn_OsN8PdgO1Sf9XMXih5vQMKLmJkp-Iz_FFWJUt6uyR_qp4brr8Ji2kjGJgN4cQJpg2kskH7V\\\", \\\"y\\\": \\\"AZg-salw24lCmsBP-BCBa5jT6INkTwLtCOC7o0BIxDVvmIEH1-PQAJVYVJPTFvPMi_PLa0QlOm-ufJYkynwa2Mau\\\" }\", expected struct JwkEcKey", line: 0, column: 0)
Time: 6.991 ms
```

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Move the download location to a proper URL
2024-10-01 10:29:56 +01:00
John Spray
d515727e94 tests: make test_multi_attach more stable (#9202)
## Problem

`test_multi_attach` is sometimes failing with `invalid compute status
for configuration request: Configuration`. This is likely a result of
the test attempting to reconfigure the compute at the same time as the
storage controller is doing so.

This test was originally written before the storage controller existed,
and is not expecting anything else to be reconfiguring computes at the
same time.

## Summary of changes

- Configure the tenant into scheduling policy `Stop` in the storage
controller at the start of the test, so that it won't try to do anything
to the tenant while the test is running.
2024-10-01 10:15:18 +01:00
Folke Behrens
2e508b1ff9 Upgrade OpenTelemetry and other tracing crates (#9200)
* tracing-utils now returns a `Layer` impl. Removes the need for crates
to
  import OTel crates.
* Drop the /v1/traces URI check. Verified that the code does the right
thing.
* Leave a TODO to hook in an error handler for OTel to log errors to
when it
  assumes the regular pipeline cannot be used/is broken.
2024-10-01 11:02:54 +02:00
John Spray
651ae44569 storage controller: drop out of blocking compute notification loop if migration origin becomes unavailable (#9147)
## Problem

The live migration code waits forever for the compute notification hook,
on the basis that until it succeeds, the compute is probably using the
old location and we shouldn't detach it.

However, if a pageserver stops or restarts in the background, then this
original location might no longer be available, so there is no point
waiting. Waiting is also actively harmful, because it prevents other
reconciliations happening for the tenant shard, such as during an
upgrade where a stuck "drain" migration might prevent the later "fill"
migration from moving the shard back to its original location.

## Summary of changes

- Refactor the notification wait loop into a function
- Add a checks during the loop, for the origin node's cancellation token
and an explicit HTTP request to the origin node to confirm the shard is
still attached there.

Closes: https://github.com/neondatabase/neon/issues/8901
2024-10-01 07:57:22 +00:00
Heikki Linnakangas
65bda19051 Remove unnecessary dev package from compute image (#9210)
libcurl4-openssl-dev is needed to build pgxn/, but libcurl4 is enough at
runtime.
2024-10-01 01:07:43 +03:00
Conrad Ludgate
94a5ca2817 proxy: auth broker (#8855)
Opens http2 connection to local-proxy and forwards requests over with
all headers and body

closes https://github.com/neondatabase/cloud/issues/16039
2024-09-30 20:43:45 +01:00
Arthur Petukhovsky
c07cea80bd Bump vm-builder v0.29.3 -> v0.35.0 (#9208)
We haven't updated it for a while. Now I need the update to add quotas
support to compute images
(https://github.com/neondatabase/cloud/issues/13127).

Previous update: https://github.com/neondatabase/neon/pull/7849
2024-09-30 19:18:42 +01:00
Conrad Ludgate
a2e2362ee9 add proxy-protocol header disable option (#9203)
resolves https://github.com/neondatabase/cloud/issues/18026
2024-09-30 18:11:50 +00:00
Heikki Linnakangas
0a567acdb9 tests: Move comment to more appropriate place
There is no 'pg_bin' in NeonEnv.
2024-09-30 17:56:43 +03:00
Heikki Linnakangas
69ea2776e9 tests: Remove creation of extra timelines in some tests
neon_cli.create_tenant() creates a new tenant *and* a timeline on the
tenant, with name "main". In most tests, there's no need to create
another timeline on the same tenant.

There are some more tests that do that, but in the remaining cases, I
wasn't be 100% if the presence of extra root timelines affect what the
tests test, so I left them alone.
2024-09-30 17:56:40 +03:00
Heikki Linnakangas
4dc9cb7cf9 tests: Remove some spurious list_timelines calls
These calls seem really out of place. We know what the initial tenant
and branch are in these tests, just like in all other tests.
2024-09-30 17:56:37 +03:00
John Spray
7424e7269c tests: longer timeout in test_delete_timeline_client_hangup (#9161)
## Problem

This test waits for a request to finish, and then expects deletion to
complete almost immediately. The request completes, but it's a 202, the
timeline is still deleting in the background: we need to be more
patient.

## Summary of changes

- Adjust iterations from 2 to 10 when waiting for deletion
2024-09-30 15:46:07 +01:00
a-masterov
5dc68e4e6a test_compatibility: fix the regexes detecting the version (#9205)
## Problem
The Neon components, built locally and by the GitHub workflow have
slightly different version prefixes (git: vs git-env:)
This does not allow running tests against local builds correctly.

## Summary of changes
The regular expressions were changed to work with both
prefixes.
2024-09-30 16:37:14 +02:00
John Spray
7cfd116856 pageserver: refactor immediate_gc into TenantManager (#9183)
## Problem

Legacy functions that were called as `mgr::` and relied on the static
TENANTS, see #5796

## Summary of changes

- Move the last stray function (immediate_gc) into TenantManager

Closes: https://github.com/neondatabase/neon/issues/5796
2024-09-30 09:27:28 +01:00
Heikki Linnakangas
d696c41807 Bump default neon extension version to 1.5 (#9188)
Commit 263dfba6ee introduced neon extension version 1.5, which included
some new functions and views for metrics. It didn't bump the default
neon extension number yet, so that we could still safely roll back to
the old binary if necessary. This bumps the default version.
2024-09-30 09:20:52 +03:00
Alexander Bayandin
3c72192065 CI(benchmarking): fix setting LD_LIBRARY_PATH (#9191)
## Problem

`pgbench-pgvector` job from Nightly Benchmarks fails with the error:

```
/__w/_temp/f45bc2eb-4c4c-4f0a-8030-99079303fa65.sh: line 17: LD_LIBRARY_PATH: unbound variable
```

## Summary of changes
- Fix `LD_LIBRARY_PATH: unbound variable` error in benchmarks
2024-09-29 22:27:53 +00:00
Alexander Bayandin
d2d9921761 CI(benchmarking): fix Nightly Benchmarks (#9178)
## Problem

Nightly Benchmarks have been broken for some time due to various
reasons, this PR fixes it

## Summary of changes
- Pull `build-tools` image from dockerhub for `benchmarking` workflow
- Use `aws-actions/configure-aws-credentials` to upload/download
artifacts from S3
- Fix Postgres 16 installation (for pgbench)
2024-09-28 02:44:22 +01:00
Arthur Petukhovsky
ba498a630a Set disk quotas on bind in compute_ctl (#8936)
Part of https://github.com/neondatabase/cloud/issues/13127. Resolves
#9153

What changed in this PR:
1. Adds `ComputeSpec.disk_quota_bytes: Option<u64>`
2. Adds new arg to compute_ctl: `--set-disk-quota-for-fs <mountpoint>`
3. Implements running `/neonvm/bin/set-disk-quota` with the right value
if both cmdline arg AND field in the spec are specified
4. Patches `/etc/sudoers.d` to allow `compute_ctl` to set quota with
sudo

This PR is very similar to the swap support added earlier, you can take
a look at it as prior art: #7434

In theory, it can be implemented outside of compute_ctl when we will
have a separate neonvm daemon, but we are not there yet. Current
implementation is the simplest possible to unblock computes with larger
disks.

All code related to usage of `/neonvm/bin/set-disk-quota` is located in
`disk_quota.rs`. We need to call this script with the following
arguments: `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`. Quotas
are set on the filesystem level, so we need to provide path to the
directory that filesystem was mounted to.

I tested this change locally with
https://github.com/neondatabase/cloud/pull/17270. It should be safe to
merge, because this feature is gated by both cmdline arg and field in
the spec. If control-plane doesn't set values in both places,
compute_ctl won't be affected by this change.
2024-09-27 20:52:22 +01:00
Heikki Linnakangas
e989a5e4a2 neon_local: Use clap derive macros to parse the CLI args (#9103)
This is easier to work with.
2024-09-27 22:08:46 +03:00
Alex Chi Z.
cde1654d7b fix(pageserver): abort process if fsync fails (#9108)
close https://github.com/neondatabase/neon/issues/8140

The original issue is rather vague on what we should do. After
discussion w/ @problame we decided to narrow down the problems we want
to solve in that issue.

* read path -- do not panic for now.
* write path -- panic only on write errors (i.e., device error, fsync
error), but not on no-space for now.

The guideline is that if the pageserver behavior could lead to violation
of persistent constraints (i.e., return an operation as successful but
not actually persisting things), we should panic. Fsync is the place
where both of us agree that we should panic, because if fsync fails, the
kernel will mark dirty pages as clean, and the next fsync will not
necessarily return false. This would make the storage client assume the
operation is successful.

## Summary of changes

Make fsync panic on fatal errors.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-09-27 19:58:50 +01:00
Heikki Linnakangas
cf6a776fcf tests: Reduce the # of iterations in safekeeper::test_random_schedules (#9182)
To make it faster. On my laptop, it takes about 30 before this commit.
In the arm64 debug variant in CI, it takes about 120 s. Reduce it by
factor of 4.
2024-09-27 16:25:35 +00:00
Matthias van de Meent
5c5871111a WalProposer: Read WAL directly from WAL buffers in PG17 (#9171)
This reduces the overhead of the WalProposer when it is not being
throttled by SK WAL acceptance rate
2024-09-27 17:47:05 +02:00
Yuchen Liang
d56c4e7a38 pageserver: remove AdjacentVectoredReadBuilder and bump minmimum io_buffer_alignment to 512 (#9175)
Part of #8130

## Problem

After deploying https://github.com/neondatabase/infra/pull/1927, we
shipped `io_buffer_alignment=512` to all prod region. The
`AdjacentVectoredReadBuilder` code path is no longer taken and we are
running pageserver unit tests 6 times in the CI. Removing it would
reduce the test duration by 30-60s.

## Summary of changes

- Remove `AdjacentVectoredReadBuilder` code.
- Bump the minimum `io_buffer_alignment` requirement to at least 512
bytes.
- Use default `io_buffer_alignment` for Rust unit tests.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-09-27 16:41:42 +01:00
Conrad Ludgate
43b2445d0b proxy: add jwks endpoint to control plane and mock providers (#9165) 2024-09-27 16:08:43 +01:00
Yuchen Liang
42ef08db47 fix(pageserver): LSN lease edge cases around restarts/migrations (#9055)
Part of #7497, closes #8817.

## Problem

See #8817. 

## Summary of changes

**compute_ctl**

- Renew lsn lease as soon as `/configure` updates pageserver_connstr,
use `state_changed` Condvar for synchronization.

**pageserver**

As mentioned in
https://github.com/neondatabase/neon/issues/8817#issuecomment-2315768076,
we still want some permanent error reported if a lease cannot be
granted. By considering attachment mode and the added
`lsn_lease_deadline` when processing lease requests, we can also bound
the case of bad requests to a very short period after migration/restart.

- Refactor https://github.com/neondatabase/neon/pull/9024 and move
`lsn_lease_deadline` to `AttachedTenantConf` so timeline can easily
access it.
- Have separate HTTP `init_lsn_lease` and  libpq `renew_lsn_lease` API.
  - Always do LSN verification for the initial HTTP lease request.
- LSN verification for the renewal is **still done** when tenants are
not in `AttachedSingle` and we have pass the `lsn_lease_deadline`, which
give plenty of time for compute to renew the lease.
 
**neon_local**

- add and call `timeline_init_lsn_lease` mgmt_api at static endpoint
start. The initial lsn lease http request is sent when we run `cargo
neon endpoint start <static endpoint>`.


## Testing

- Extend `test_readonly_node_gc` to do pageserver restarts and
migration.

## Future Work

- The control plane should make the initial lease request through HTTP
when creating a static endpoint. This is currently only done in
`neon_local`.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-09-27 09:56:52 -04:00
Tristan Partin
fc962c9605 Use long options when calling initdb
Verbosity in this case is good when reading the code. Short options are
better when operating in an interactive shell.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-09-27 08:22:16 -05:00
Heikki Linnakangas
357fa070a3 Add gdb to build-tools (#9125)
So that compute_ctl can use it to print backtrace on core dumps

See issue #2800.
2024-09-27 15:36:24 +03:00
Heikki Linnakangas
02cdd37b56 Dump backtrace if a core dump is called just "core" (#9125)
I hope this lets us capture backtraces in CI. At least it makes it
work on my laptop, which is valuable even if we need to do more for
CI.

See issue #2800.
2024-09-27 15:36:24 +03:00
Vlad Lazar
fa354a65ab libs: improve logging on PG connection errors (#9130)
## Problem
We get some unexpected errors, but don't know who they're happening for.

## Summary of change
Add tenant id and peer address to PG connection error logs.

Related https://github.com/neondatabase/cloud/issues/17336
2024-09-27 12:36:43 +01:00
Arseny Sher
40f7930a7d safekeeper: skip syncfs on start if --no-sync is specified. (#9166)
https://neondb.slack.com/archives/C059ZC138NR/p1727350911890989?thread_ts=1727350211.370869&cid=C059ZC138NR
2024-09-27 09:59:38 +03:00
Conrad Ludgate
ec07a1ecc9 proxy: make local-proxy config by signal with PID, refine JWKS apis with role caching (#9164) 2024-09-26 19:01:48 +01:00
Arseny Sher
c4cdfe66ac Fix flakiness of test_timeline_copy.
Timeline might be not initialized when timeline_start_lsn is
queried. Spotted by CI.
2024-09-26 19:01:45 +03:00
Alex Chi Z.
42e19e952f fix(pageserver): categorize client error in basebackup metrics (#9110)
We separated client error from basebackup error log lines in
https://github.com/neondatabase/neon/pull/7523, but we didn't do
anything for the metrics. In this patch, we fixed it.

ref https://github.com/neondatabase/neon/issues/8970

## Summary of changes

We use the same criteria as in `log_query_error` producing an info line
(instead of error) for the metrics. We added a `client_error` category
for the basebackup query time metrics.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-09-26 11:38:19 -04:00
John Spray
3d255d601b pageserver: rename control plane client & chunk validation requests (#8997)
## Problem

- In https://github.com/neondatabase/neon/pull/8784, the validate
controller API is modified to check generations directly in the
database. It batches tenants into separate queries to avoid generating a
huge statement, but
- While updating this, I realized that "control_plane_client" is a kind
of confusing name for the client code now that it primarily talks to the
storage controller (the case of talking to the control plane will go
away in a few months).

## Summary of changes

- Big rename to "ControllerUpcallClient" -- this reflects the storage
controller's api naming, where the paths used by the pageserver are in
`/upcall/`
- When sending validate requests, break them up into chunks so that we
avoid possible edge cases of generating any HTTP requests that require
database I/O across many thousands of tenants.

This PR mixes a functional change with a refactor, but the commits are
cleanly separated -- only the last commit is a functional change.

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
2024-09-26 16:06:34 +01:00
Arthur Petukhovsky
80e974d05b fix(compute_ctl): race condition in configurator (#9162)
There was a tricky race condition in compute_ctl, that sometimes makes
configurator skip updates. It makes a deadlock because:
- control-plane cannot configure compute, because it's in
ConfigurationPending state
- compute_ctl doesn't do any reconfiguration because
`configurator_main_loop` missed notification for it

Full sequence that reproduces the issue:
1. `start_compute` finishes works and changes status
`self.set_status(ComputeStatus::Running);`
2. configurator received update about `Running` state and dropped the
mutex lock in the iteration
3. `/configure` request was triggered at the same time as step 1, and
got the mutex lock
4. same `/configure` request set the spec and updated the state to
`ConfigurationPending`, also sent a notification
5. next iteration in configurator got the mutex lock, but missed the
notification

There are more details in this slack thread:
https://neondb.slack.com/archives/C03438W3FLZ/p1727281028478689?thread_ts=1727261220.483799&cid=C03438W3FLZ

---------

Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
2024-09-26 15:42:17 +01:00
Alexander Bayandin
7fdf1ab5b6 CI: run compatibility tests on Postgres 17 (#9145)
## Problem

The latest storage release has generated artifacts for Postgres 17,
so we can enable compatibility tests this version

## Summary of changes
- Unskip `test_backward_compatibility` / `test_forward_compatibility` on
Postgres 17
2024-09-26 15:17:01 +01:00
Arpad Müller
7bae78186b Forbid creation of child timelines of archived timeline (#9122)
We don't want to allow any new child timelines of archived timelines. If
you want any new child timelines, you should first un-archive the
timeline.
 
Part of #8088
2024-09-26 02:05:25 +02:00
Heikki Linnakangas
7e560dd00e chore: Silence clippy warning with nightly (#9157)
The warning:

    warning: first doc comment paragraph is too long
      --> pageserver/src/tenant/checks.rs:7:1
       |
7 | / /// Checks whether a layer map is valid (i.e., is a valid result
of the current compaction algorithm if no...
8 | | /// The function checks if we can split the LSN range of a delta
layer only at the LSNs of the delta layer...
    9  | | ///
    10 | | /// ```plain
       | |_
       |
= help: for further information visit
https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph
= note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default
    help: add an empty line
       |
7 ~ /// Checks whether a layer map is valid (i.e., is a valid result of
the current compaction algorithm if nothing goes wrong).
    8  + ///
       |

Fix by applying the suggestion.
2024-09-25 21:29:16 +00:00
Tristan Partin
684e924211 Fix compute_logical_snapshot_files for v14
The function, pg_ls_logicalsnapdir(), was added in version 15.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-09-25 16:25:17 -05:00
Tristan Partin
8ace9ea25f Format long single DATA line in pgxn/Makefile
This should be a little more readable.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2024-09-25 16:25:17 -05:00
Alex Chi Z.
6a4f49b08b fix(pageserver): passthrough partition cancel error (#9154)
close https://github.com/neondatabase/neon/issues/9142

## Summary of changes

passthrough CollectKeyspaceError::Cancelled to
CompactionError::ShuttingDown

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-09-25 21:35:33 +01:00
Alexander Bayandin
c6e89445e2 CI(promote-images): fix prod ECR auth (#9146)
A cherry-pick from the previous release (#9131)

## Problem
Login to prod ECR doesn't work anymore:
```
Retrieving registries data through *** SDK...
*** ECR detected with eu-central-1 region
Error: The security token included in the request is invalid.
```

## Summary of changes
- Fix login to prod ECR by using `aws-actions/configure-aws-credentials`
2024-09-25 18:22:39 +01:00
Vlad Lazar
04f32b9526 tests: remove patching up of az id column (#8968)
This was required since the compat tests used a snapshot generated from
a version of neon local which didn't contain the availability_zone_id
column.
2024-09-25 17:22:32 +01:00
Heikki Linnakangas
6f2333f52b CI: Leave out unnecessary build files from binary artifact (#9135)
The pg_install/build directory contains .o files and such intermediate
results from the build, which are not needed in the final tarball.
Except for src/test/regress/regress.so and a few other .so files in that
directory; keep those.

This reduces the size of the neon-Linux-X64-release-artifact.tar.zst
artifact from about 1.5 GB to 700 MB.

(I attempted this a long time ago already, by moving the build/
directory out of pg_install altogether, see PR #2127. But I never got
around to finish that work.)

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2024-09-25 19:07:20 +03:00
Yuchen Liang
d447f49bc3 fix(pageserver): handle lsn lease requests for unnormalized lsns (#9137)
Fixes https://github.com/neondatabase/neon/issues/9098.

## Problem

See
https://github.com/neondatabase/neon/issues/9098#issuecomment-2372484969.

### Related

A similar problem happened with branch creation, which was discussed
[here](https://github.com/neondatabase/neon/pull/2143#issuecomment-1199969052)
and fixed by https://github.com/neondatabase/neon/pull/2529.

## Summary of changes

- Normalize the lsn on pageserver side upon lsn lease request, stores
the normalized LSN.

Signed-off-by: Yuchen Liang <yuchen@neon.tech>
2024-09-25 14:57:38 +00:00
Vlad Lazar
c5972389aa storcon: include timeline ID in LSN waiting logs (#9141)
## Problem
Hard to tell which timeline is holding the migration.

## Summary of Changes
Add timeline id to log.
2024-09-25 15:54:41 +01:00
Matthias van de Meent
c4f5736d5a Build images for PG17 using Debian 12 "Bookworm" (#9132)
This increases the support window of the OS used for PG17 by 2 years
compared to the previous usage of Debian 11 "Bullseye".
2024-09-25 17:50:05 +03:00
Alexey Kondratov
518f598e2d docs(rfc): Independent compute release flow (#8881)
Related to https://github.com/neondatabase/cloud/issues/11698
2024-09-25 16:24:09 +02:00
John Spray
4b711caf5e storage controller: make proxying of GETs to pageservers more robust (#9065)
## Problem

These commits are split off from
https://github.com/neondatabase/neon/pull/8971/commits where I was
fixing this to make a better scale test pass -- Vlad also independently
recognized these issues with cloudbench in
https://github.com/neondatabase/neon/issues/9062.

1. The storage controller proxies GET requests to pageservers based on
their intent, not the ground truth of where they're really attached.
2. Proxied requests can race with scheduling to tenants, resulting in
404 responses if the request hits the wrong pageserver.

Closes: https://github.com/neondatabase/neon/issues/9062

## Summary of changes

1. If a shard has a running reconciler, then use the database
generation_pageserver to decide who to proxy the request to
2. If such a request gets a 404 response and its scheduled node has
changed since the request was dispatched.
2024-09-25 13:56:39 +00:00
Vlad Lazar
2cf47b1477 storcon: do az aware scheduling (#9083)
## Problem

Storage controller didn't previously consider AZ locality between
compute and pageservers
when scheduling nodes. Control plane has this feature, and, since we are
migrating tenants
away from it, we need feature parity to avoid perf degradations.

## Summary of changes

The change itself is fairly simple:
1. Thread az info into the scheduler
2. Add an extra member to the scheduling scores

Step (2) deserves some more discussion. Let's break it down by the shard
type being scheduled:

**Attached Shards**

We wish for attached shards of a tenant to end up in the preferred AZ of
the tenant since that
is where the compute is like to be. 

The AZ member for `NodeAttachmentSchedulingScore` has been placed
below the affinity score (so it's got the second biggest weight for
picking the node). The rationale for going
below the affinity score is to avoid having all shards of a single
tenant placed on the same node in 2 node
regions, since that would mean that one tenant can drive the general
workload of an entire pageserver.
I'm not 100% sure this is the right decision, so open to discussing
hoisting the AZ up to first place.

 **Secondary Shards**

We wish for secondary shards of a tenant to be scheduled in a different
AZ from the preferred one
for HA purposes.

The AZ member for `NodeSecondarySchedulingScore` has been placed first,
so nodes in different AZs
from the preferred one will always be considered first. On small
clusters, this can mean that all the secondaries
of a tenant are scheduled to the same pageserver, but secondaries don't
use up as many resources as the
attached location, so IMO the argument made for attached shards doesn't
hold.

Related: https://github.com/neondatabase/neon/issues/8848
2024-09-25 14:31:04 +01:00
Folke Behrens
7dcfcccf7c Re-export git-version from utils and remove as direct dep (#9138) 2024-09-25 14:38:35 +02:00
269 changed files with 8904 additions and 4478 deletions

View File

@@ -3,19 +3,23 @@ name: Prepare benchmarking databases by restoring dumps
on:
workflow_call:
# no inputs needed
defaults:
run:
shell: bash -euxo pipefail {0}
jobs:
setup-databases:
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
strategy:
fail-fast: false
matrix:
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
database: [ clickbench, tpch, userexample ]
env:
LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
PLATFORM: ${{ matrix.platform }}
@@ -23,7 +27,10 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
@@ -32,13 +39,13 @@ jobs:
run: |
case "${PLATFORM}" in
neon)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
aws-rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
aws-aurora-serverless-v2-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
@@ -46,10 +53,17 @@ jobs:
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -57,23 +71,23 @@ jobs:
path: /tmp/neon/
prefix: latest
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
- name: Create benchmark_restore_status table if it does not exist
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
DATABASE_NAME: ${{ matrix.database }}
# to avoid a race condition of multiple jobs trying to create the table at the same time,
# to avoid a race condition of multiple jobs trying to create the table at the same time,
# we use an advisory lock
run: |
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
SELECT pg_advisory_lock(4711);
SELECT pg_advisory_lock(4711);
CREATE TABLE IF NOT EXISTS benchmark_restore_status (
databasename text primary key,
restore_done boolean
);
SELECT pg_advisory_unlock(4711);
"
- name: Check if restore is already done
id: check-restore-done
env:
@@ -107,7 +121,7 @@ jobs:
DATABASE_NAME: ${{ matrix.database }}
run: |
mkdir -p /tmp/dumps
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
- name: Replace database name in connection string
if: steps.check-restore-done.outputs.skip != 'true'
@@ -126,17 +140,17 @@ jobs:
else
new_connstr="${base_connstr}/${DATABASE_NAME}"
fi
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
- name: Restore dump
if: steps.check-restore-done.outputs.skip != 'true'
env:
DATABASE_NAME: ${{ matrix.database }}
DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
# the following works only with larger computes:
# the following works only with larger computes:
# PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
# we add the || true because:
# the dumps were created with Neon and contain neon extensions that are not
# the dumps were created with Neon and contain neon extensions that are not
# available in RDS, so we will always report an error, but we can ignore it
run: |
${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \

View File

@@ -236,9 +236,7 @@ jobs:
# run pageserver tests with different settings
for io_engine in std-fs tokio-epoll-uring ; do
for io_buffer_alignment in 0 1 512 ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
# Run separate tests for real S3
@@ -257,7 +255,15 @@ jobs:
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
- name: Install postgres binaries
run: cp -a pg_install /tmp/neon/pg_install
run: |
# Use tar to copy files matching the pattern, preserving the paths in the destionation
tar c \
pg_install/v* \
pg_install/build/*/src/test/regress/*.so \
pg_install/build/*/src/test/regress/pg_regress \
pg_install/build/*/src/test/isolation/isolationtester \
pg_install/build/*/src/test/isolation/pg_isolation_regress \
| tar x -C /tmp/neon
- name: Upload Neon artifact
uses: ./.github/actions/upload

View File

@@ -12,7 +12,6 @@ on:
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 3 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
inputs:
region_id:
@@ -59,7 +58,7 @@ jobs:
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
id-token: write # aws-actions/configure-aws-credentials
strategy:
fail-fast: false
matrix:
@@ -68,12 +67,10 @@ jobs:
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -86,7 +83,10 @@ jobs:
runs-on: ${{ matrix.RUNNER }}
container:
image: ${{ matrix.IMAGE }}
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
@@ -164,6 +164,10 @@ jobs:
replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 16
@@ -174,12 +178,21 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
@@ -267,7 +280,7 @@ jobs:
region_id_default=${{ env.DEFAULT_REGION_ID }}
runner_default='["self-hosted", "us-east-2", "x64"]'
runner_azure='["self-hosted", "eastus2", "x64"]'
image_default="369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned"
image_default="neondatabase/build-tools:pinned"
matrix='{
"pg_version" : [
16
@@ -344,7 +357,7 @@ jobs:
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
id-token: write # aws-actions/configure-aws-credentials
strategy:
fail-fast: false
@@ -371,7 +384,7 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary on Azure runners
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
@@ -492,17 +505,15 @@ jobs:
permissions:
contents: write
statuses: write
id-token: write # Required for OIDC authentication in azure runners
id-token: write # aws-actions/configure-aws-credentials
strategy:
fail-fast: false
matrix:
include:
- PLATFORM: "neonvm-captest-pgvector"
RUNNER: [ self-hosted, us-east-2, x64 ]
IMAGE: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
- PLATFORM: "azure-captest-pgvector"
RUNNER: [ self-hosted, eastus2, x64 ]
IMAGE: neondatabase/build-tools:pinned
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
@@ -511,13 +522,16 @@ jobs:
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: ${{ matrix.RUNNER }}
container:
image: ${{ matrix.IMAGE }}
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
@@ -527,17 +541,26 @@ jobs:
# instead of using Neon artifacts containing pgbench
- name: Install postgresql-16 where pytest expects it
run: |
# Just to make it easier to test things locally on macOS (with arm64)
arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.4-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.4-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.4-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.4-1.pgdg110+1_amd64.deb pg
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib
LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
export LD_LIBRARY_PATH
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
/tmp/neon/pg_install/v16/bin/pgbench --version
/tmp/neon/pg_install/v16/bin/psql --version
@@ -559,7 +582,7 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
- name: Configure AWS credentials # necessary on Azure runners to read/write from/to S3
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
@@ -620,6 +643,10 @@ jobs:
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
strategy:
@@ -638,12 +665,22 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -714,6 +751,10 @@ jobs:
#
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
strategy:
@@ -731,12 +772,22 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:
@@ -806,6 +857,10 @@ jobs:
user-examples-compare:
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
strategy:
@@ -822,12 +877,22 @@ jobs:
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
image: neondatabase/build-tools:pinned
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours
- name: Download Neon artifact
uses: ./.github/actions/download
with:

View File

@@ -36,7 +36,7 @@ jobs:
strategy:
matrix:
arch: [ x64 ]
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -79,9 +79,7 @@ jobs:
push: true
pull: true
file: Dockerfile.build-tools
cache-from: |
type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}-does-not-exist-2
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}

View File

@@ -6,7 +6,6 @@ on:
- main
- release
- release-proxy
- bayandin/test
pull_request:
defaults:
@@ -28,7 +27,7 @@ env:
jobs:
check-permissions:
if: false
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
uses: ./.github/workflows/check-permissions.yml
with:
github-event-name: ${{ github.event_name }}
@@ -79,6 +78,7 @@ jobs:
id: build-tag
check-build-tools-image:
needs: [ check-permissions ]
uses: ./.github/workflows/check-build-tools-image.yml
build-build-tools-image:
@@ -341,7 +341,7 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
SYNC_AFTER_EACH_TEST: true
SYNC_BETWEEN_TESTS: true
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -602,7 +602,20 @@ jobs:
strategy:
fail-fast: false
matrix:
version: [ v14, v15, v16, v17 ]
version:
# Much data was already generated on old PG versions with bullseye's
# libraries, the locales of which can cause data incompatibilities.
# However, new PG versions should check if they can be built on newer
# images, as that reduces the support burden of old and ancient
# distros.
- pg: v14
debian: bullseye-slim
- pg: v15
debian: bullseye-slim
- pg: v16
debian: bullseye-slim
- pg: v17
debian: bookworm-slim
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
@@ -645,41 +658,46 @@ jobs:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
DEBIAN_FLAVOR=${{ matrix.version.debian }}
provenance: false
push: true
pull: true
file: compute/Dockerfile.compute-node
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
- name: Build neon extensions test image
if: matrix.version == 'v16'
if: matrix.version.pg == 'v16'
uses: docker/build-push-action@v6
with:
context: .
build-args: |
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
PG_VERSION=${{ matrix.version }}
PG_VERSION=${{ matrix.version.pg }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
DEBIAN_FLAVOR=${{ matrix.version.debian }}
provenance: false
push: true
pull: true
file: compute/Dockerfile.compute-node
target: neon-pg-ext-test
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
- name: Build compute-tools image
# compute-tools are Postgres independent, so build it only once
if: matrix.version == 'v17'
# We pick 16, because that builds on debian 11 with older glibc (and is
# thus compatible with newer glibc), rather than 17 on Debian 12, as
# that isn't guaranteed to be compatible with Debian 11
if: matrix.version.pg == 'v16'
uses: docker/build-push-action@v6
with:
target: compute-tools-image
@@ -688,6 +706,7 @@ jobs:
GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG=${{ needs.tag.outputs.build-tag }}
TAG=${{ needs.build-build-tools-image.outputs.image-tag }}
DEBIAN_FLAVOR=${{ matrix.version.debian }}
provenance: false
push: true
pull: true
@@ -754,7 +773,7 @@ jobs:
matrix:
version: [ v14, v15, v16, v17 ]
env:
VM_BUILDER_VERSION: v0.29.3
VM_BUILDER_VERSION: v0.35.0
steps:
- uses: actions/checkout@v4
@@ -843,6 +862,9 @@ jobs:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
permissions:
id-token: write # for `aws-actions/configure-aws-credentials`
env:
VERSIONS: v14 v15 v16 v17
@@ -887,13 +909,19 @@ jobs:
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
- name: Configure AWS-prod credentials
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
mask-aws-account-id: true
role-to-assume: ${{ secrets.PROD_GHA_OIDC_ROLE }}
- name: Login to prod ECR
uses: docker/login-action@v3
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
with:
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
- name: Copy all images to prod ECR
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
@@ -1162,10 +1190,9 @@ jobs:
files_to_promote+=("s3://${BUCKET}/${s3_key}")
# TODO Add v17
for pg_version in v14 v15 v16; do
for pg_version in v14 v15 v16 v17; do
# We run less tests for debug builds, so we don't need to promote them
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then
if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v17" ] ; }; then
continue
fi

View File

@@ -33,7 +33,7 @@ jobs:
IMAGE_TAG: |
${{ hashFiles('Dockerfile.build-tools',
'.github/workflows/check-build-tools-image.yml',
'.github/workflows/build-build-tools-image.yml') }}-test
'.github/workflows/build-build-tools-image.yml') }}
run: |
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT

View File

@@ -102,12 +102,17 @@ jobs:
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
# List of directories that contain code which affect compute images.
#
# This isn't exhaustive, just the paths that are most directly compute-related.
# For example, compute_ctl also depends on libs/utils, but we don't trigger
# an e2e run on that.
vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)

493
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -53,15 +53,15 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.3", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.26"
aws-sdk-iam = "1.15.0"
aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
aws-sdk-s3 = "1.52"
aws-sdk-iam = "1.46.0"
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.9"
aws-smithy-types = "1.2"
aws-credential-types = "1.2.0"
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
aws-types = "1.2.0"
axum = { version = "0.6.20", features = ["ws"] }
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
aws-types = "1.3"
axum = { version = "0.7.5", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
bindgen = "0.70"
@@ -96,10 +96,13 @@ hmac = "0.12.1"
hostname = "0.4"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
http-body-util = "0.1.2"
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
tokio-tungstenite = "0.20.0"
hyper0 = { package = "hyper", version = "0.14" }
hyper = "1.4"
hyper-util = "0.1"
tokio-tungstenite = "0.21.0"
indexmap = "2"
indoc = "2"
ipnet = "2.9.0"
@@ -116,9 +119,10 @@ notify = "6.0.0"
num_cpus = "1.15"
num-traits = "0.2.15"
once_cell = "1.13"
opentelemetry = "0.20.0"
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.12.0"
opentelemetry = "0.24"
opentelemetry_sdk = "0.24"
opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.16"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
@@ -131,7 +135,7 @@ rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
reqwest-middleware = "0.3.0"
reqwest-retry = "0.5"
routerify = "3"
@@ -177,8 +181,8 @@ toml_edit = "0.22"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-error = "0.2"
tracing-opentelemetry = "0.25"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
try-lock = "0.2.5"
twox-hash = { version = "1.6.3", default-features = false }

View File

@@ -13,6 +13,9 @@ RUN useradd -ms /bin/bash nonroot -b /home
SHELL ["/bin/bash", "-c"]
# System deps
#
# 'gdb' is included so that we get backtraces of core dumps produced in
# regression tests
RUN set -e \
&& apt update \
&& apt install -y \
@@ -24,6 +27,7 @@ RUN set -e \
cmake \
curl \
flex \
gdb \
git \
gnupg \
gzip \

View File

@@ -58,7 +58,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf openssl flex bison icu4c pkg-config
brew install protobuf openssl flex bison icu4c pkg-config m4
# add openssl to PATH, required for ed25519 keys generation in neon_local
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc

View File

@@ -3,17 +3,34 @@ ARG REPOSITORY=neondatabase
ARG IMAGE=build-tools
ARG TAG=pinned
ARG BUILD_TAG
ARG DEBIAN_FLAVOR=bullseye-slim
#########################################################################################
#
# Layer "build-deps"
#
#########################################################################################
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
FROM debian:$DEBIAN_FLAVOR AS build-deps
ARG DEBIAN_FLAVOR
RUN case $DEBIAN_FLAVOR in \
# Version-specific installs for Bullseye (PG14-PG16):
# The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18.
# Install newer version (3.25) from backports.
bullseye*) \
echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \
VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \
;; \
# Version-specific installs for Bookworm (PG17):
bookworm*) \
VERSION_INSTALLS="cmake"; \
;; \
esac && \
apt update && \
apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \
$VERSION_INSTALLS
#########################################################################################
#
@@ -87,7 +104,7 @@ FROM build-deps AS postgis-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y cmake gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \
libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \
libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
protobuf-c-compiler xsltproc
@@ -165,7 +182,7 @@ RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -198,27 +215,6 @@ FROM build-deps AS h3-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
case "$(uname -m)" in \
"x86_64") \
export CMAKE_CHECKSUM=739d372726cb23129d57a539ce1432453448816e345e1545f6127296926b6754 \
;; \
"aarch64") \
export CMAKE_CHECKSUM=281b42627c9a1beed03e29706574d04c6c53fae4994472e90985ef018dd29c02 \
;; \
*) \
echo "Unsupported architecture '$(uname -m)'. Supported are x86_64 and aarch64" && exit 1 \
;; \
esac && \
wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-$(uname -m).sh \
-q -O /tmp/cmake-install.sh \
&& echo "${CMAKE_CHECKSUM} /tmp/cmake-install.sh" | sha256sum --check \
&& chmod u+x /tmp/cmake-install.sh \
&& /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
&& rm /tmp/cmake-install.sh
RUN case "${PG_VERSION}" in "v17") \
mkdir -p /h3/usr/ && \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
@@ -504,8 +500,6 @@ RUN case "${PG_VERSION}" in "v17") \
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
;; \
esac && \
apt-get update && \
apt-get install -y cmake && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xzf ../timescaledb.tar.gz --strip-components=1 -C . && \
@@ -593,8 +587,7 @@ RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt-get update && \
apt-get install -y \
cmake \
apt-get install --no-install-recommends -y \
libboost-iostreams1.74-dev \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
@@ -759,7 +752,7 @@ ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y curl libclang-dev cmake && \
apt-get install --no-install-recommends -y curl libclang-dev && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
@@ -869,6 +862,25 @@ RUN case "${PG_VERSION}" in "v17") \
cargo pgrx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
# Layer "pg-session-jwt-build"
# Compile "pg_session_jwt" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-session-jwt-build
ARG PG_VERSION
RUN case "${PG_VERSION}" in "v17") \
echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
esac && \
wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \
echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgrx install --release
#########################################################################################
#
# Layer "wal2json-build"
@@ -965,6 +977,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1027,7 +1040,8 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de
#
#########################################################################################
FROM debian:bullseye-slim AS compute-tools-image
FROM debian:$DEBIAN_FLAVOR AS compute-tools-image
ARG DEBIAN_FLAVOR
COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
@@ -1037,12 +1051,16 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu
#
#########################################################################################
FROM debian:bullseye-slim AS pgbouncer
FROM debian:$DEBIAN_FLAVOR AS pgbouncer
ARG DEBIAN_FLAVOR
RUN set -e \
&& apt-get update \
&& apt-get install -y \
&& apt-get install --no-install-recommends -y \
build-essential \
git \
ca-certificates \
autoconf \
automake \
libevent-dev \
libtool \
pkg-config
@@ -1057,6 +1075,20 @@ RUN set -e \
&& make -j $(nproc) dist_man_MANS= \
&& make install dist_man_MANS=
#########################################################################################
#
# Compile the Neon-specific `local_proxy` binary
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy
#########################################################################################
#
# Layers "postgres-exporter" and "sql-exporter"
@@ -1150,11 +1182,6 @@ RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
# cmake is required for the h3 test
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
apt-get update && apt-get install -y cmake
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
@@ -1179,7 +1206,8 @@ ENV PGDATABASE=postgres
# Put it all together into the final image
#
#########################################################################################
FROM debian:bullseye-slim
FROM debian:$DEBIAN_FLAVOR
ARG DEBIAN_FLAVOR
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
@@ -1199,6 +1227,10 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
# local_proxy and its config
COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy
RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
# Metrics exporter binaries and configuration files
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
@@ -1211,21 +1243,34 @@ COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)
# liblz4-1 for lz4
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libgeos, libsfcgal1, and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost* for rdkit
# ca-certificates for communicating with s3 by compute_ctl
RUN apt update && \
RUN apt update && \
case $DEBIAN_FLAVOR in \
# Version-specific installs for Bullseye (PG14-PG16):
# libicu67, locales for collations (including ICU and plpgsql_check)
# libgdal28, libproj19 for PostGIS
bullseye*) \
VERSION_INSTALLS="libicu67 libgdal28 libproj19"; \
;; \
# Version-specific installs for Bookworm (PG17):
# libicu72, locales for collations (including ICU and plpgsql_check)
# libgdal32, libproj25 for PostGIS
bookworm*) \
VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \
;; \
esac && \
apt install --no-install-recommends -y \
gdb \
libicu67 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
@@ -1234,17 +1279,16 @@ RUN apt update && \
libboost-system1.74.0 \
libossp-uuid16 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
libprotobuf-c1 \
libsfcgal1 \
libxml2 \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
libcurl4 \
locales \
procps \
ca-certificates && \
ca-certificates \
$VERSION_INSTALLS && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -94,6 +94,68 @@ metrics:
query: |
select sum(pg_database_size(datname)) as total from pg_database;
- metric_name: getpage_wait_seconds_count
type: counter
help: 'Number of getpage requests'
values: [getpage_wait_seconds_count]
query_ref: neon_perf_counters
- metric_name: getpage_wait_seconds_sum
type: counter
help: 'Time spent in getpage requests'
values: [getpage_wait_seconds_sum]
query_ref: neon_perf_counters
- metric_name: getpage_prefetch_requests_total
type: counter
help: 'Number of getpage issued for prefetching'
values: [getpage_prefetch_requests_total]
query_ref: neon_perf_counters
- metric_name: getpage_sync_requests_total
type: counter
help: 'Number of synchronous getpage issued'
values: [getpage_sync_requests_total]
query_ref: neon_perf_counters
- metric_name: getpage_prefetch_misses_total
type: counter
help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
values: [getpage_prefetch_misses_total]
query_ref: neon_perf_counters
- metric_name: getpage_prefetch_discards_total
type: counter
help: 'Number of prefetch responses issued but not used'
values: [getpage_prefetch_discards_total]
query_ref: neon_perf_counters
- metric_name: pageserver_requests_sent_total
type: counter
help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
values: [pageserver_requests_sent_total]
query_ref: neon_perf_counters
- metric_name: pageserver_disconnects_total
type: counter
help: 'Number of times that the connection to the pageserver was lost'
values: [pageserver_disconnects_total]
query_ref: neon_perf_counters
- metric_name: pageserver_send_flushes_total
type: counter
help: 'Number of flushes to the pageserver connection'
values: [pageserver_send_flushes_total]
query_ref: neon_perf_counters
- metric_name: getpage_wait_seconds_bucket
type: counter
help: 'Histogram buckets of getpage request latency'
key_labels:
- bucket_le
values: [value]
query_ref: getpage_wait_seconds_buckets
# DEPRECATED
- metric_name: lfc_approximate_working_set_size
type: gauge
@@ -195,7 +257,7 @@ metrics:
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
-- temporary snapshot files are renamed to the actual snapshot files after they are
-- completely built. We only WAL-log the completely built snapshot files.
(SELECT COUNT(*) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
@@ -245,3 +307,25 @@ metrics:
CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
FROM pg_replication_slots;
queries:
- query_name: neon_perf_counters
query: |
WITH c AS (
SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
)
SELECT d.*
FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
getpage_wait_seconds_count numeric,
getpage_wait_seconds_sum numeric,
getpage_prefetch_requests_total numeric,
getpage_sync_requests_total numeric,
getpage_prefetch_misses_total numeric,
getpage_prefetch_discards_total numeric,
pageserver_requests_sent_total numeric,
pageserver_disconnects_total numeric,
pageserver_send_flushes_total numeric
);
- query_name: getpage_wait_seconds_buckets
query: |
SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';

View File

@@ -11,10 +11,18 @@ commands:
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/resize-swap'
- name: chmod-set-disk-quota
user: root
sysvInitAction: sysinit
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
- name: pgbouncer
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
- name: local_proxy
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
@@ -30,11 +38,12 @@ commands:
shutdownHook: |
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
files:
- filename: compute_ctl-resize-swap
- filename: compute_ctl-sudoers
content: |
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
# as root without requiring entering a password (NOPASSWD), regardless of hostname (ALL)
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
# regardless of hostname (ALL)
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
- filename: cgconfig.conf
content: |
# Configuration for cgroups in VM compute nodes
@@ -100,7 +109,7 @@ merge: |
&& apt install --no-install-recommends -y \
sudo \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
COPY compute_ctl-resize-swap /etc/sudoers.d/compute_ctl-resize-swap
COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers
COPY cgconfig.conf /etc/cgconfig.conf

View File

@@ -11,16 +11,18 @@ testing = []
[dependencies]
anyhow.workspace = true
camino.workspace = true
chrono.workspace = true
cfg-if.workspace = true
clap.workspace = true
flate2.workspace = true
futures.workspace = true
hyper = { workspace = true, features = ["full"] }
hyper0 = { workspace = true, features = ["full"] }
nix.workspace = true
notify.workspace = true
num_cpus.workspace = true
opentelemetry.workspace = true
opentelemetry_sdk.workspace = true
postgres.workspace = true
regex.workspace = true
serde_json.workspace = true

View File

@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use compute_tools::disk_quota::set_disk_quota;
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
use signal_hook::consts::{SIGQUIT, SIGTERM};
use signal_hook::{consts::SIGINT, iterator::Signals};
@@ -151,6 +152,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
let spec_json = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
Ok(ProcessCliResult {
connstr,
@@ -161,6 +163,7 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
spec_json,
spec_path,
resize_swap_on_bind,
set_disk_quota_for_fs,
})
}
@@ -173,6 +176,7 @@ struct ProcessCliResult<'clap> {
spec_json: Option<&'clap String>,
spec_path: Option<&'clap String>,
resize_swap_on_bind: bool,
set_disk_quota_for_fs: Option<&'clap String>,
}
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
@@ -214,7 +218,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
}
if !startup_tracing_carrier.is_empty() {
use opentelemetry::propagation::TextMapPropagator;
use opentelemetry::sdk::propagation::TraceContextPropagator;
use opentelemetry_sdk::propagation::TraceContextPropagator;
let guard = TraceContextPropagator::new()
.extract(&startup_tracing_carrier)
.attach();
@@ -293,6 +297,7 @@ fn wait_spec(
pgbin,
ext_remote_storage,
resize_swap_on_bind,
set_disk_quota_for_fs,
http_port,
..
}: ProcessCliResult,
@@ -373,6 +378,7 @@ fn wait_spec(
compute,
http_port,
resize_swap_on_bind,
set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
})
}
@@ -381,6 +387,7 @@ struct WaitSpecResult {
// passed through from ProcessCliResult
http_port: u16,
resize_swap_on_bind: bool,
set_disk_quota_for_fs: Option<String>,
}
fn start_postgres(
@@ -390,6 +397,7 @@ fn start_postgres(
compute,
http_port,
resize_swap_on_bind,
set_disk_quota_for_fs,
}: WaitSpecResult,
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
// We got all we need, update the state.
@@ -403,6 +411,7 @@ fn start_postgres(
);
// before we release the mutex, fetch the swap size (if any) for later.
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
drop(state);
// Launch remaining service threads
@@ -422,8 +431,8 @@ fn start_postgres(
// OOM-killed during startup because swap wasn't available yet.
match resize_swap(size_bytes) {
Ok(()) => {
let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_gib, "resized swap");
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%size_bytes, %size_mib, "resized swap");
}
Err(err) => {
let err = err.context("failed to resize swap");
@@ -432,10 +441,29 @@ fn start_postgres(
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{err:?}"));
state.status = ComputeStatus::Failed;
compute.state_changed.notify_all();
compute.set_failed_status(err);
delay_exit = true;
}
}
}
// Set disk quota if the compute spec says so
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
(disk_quota_bytes, set_disk_quota_for_fs)
{
match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
Ok(()) => {
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
info!(%disk_quota_bytes, %size_mib, "set disk quota");
}
Err(err) => {
let err = err.context("failed to set disk quota");
error!("{err:#}");
// Mark compute startup as failed; don't try to start postgres, and report this
// error to the control plane when it next asks.
prestartup_failed = true;
compute.set_failed_status(err);
delay_exit = true;
}
}
@@ -450,16 +478,7 @@ fn start_postgres(
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:#}", err);
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
// Notify others that Postgres failed to start. In case of configuring the
// empty compute, it's likely that API handler is still waiting for compute
// state change. With this we will notify it that compute is in Failed state,
// so control plane will know about it earlier and record proper error instead
// of timeout.
compute.state_changed.notify_all();
drop(state); // unlock
compute.set_failed_status(err);
delay_exit = true;
None
}
@@ -750,6 +769,11 @@ fn cli() -> clap::Command {
.long("resize-swap-on-bind")
.action(clap::ArgAction::SetTrue),
)
.arg(
Arg::new("set-disk-quota-for-fs")
.long("set-disk-quota-for-fs")
.value_name("SET_DISK_QUOTA_FOR_FS")
)
}
/// When compute_ctl is killed, send also termination signal to sync-safekeepers

View File

@@ -10,6 +10,7 @@ use std::sync::atomic::AtomicU32;
use std::sync::atomic::Ordering;
use std::sync::{Condvar, Mutex, RwLock};
use std::thread;
use std::time::Duration;
use std::time::Instant;
use anyhow::{Context, Result};
@@ -33,6 +34,7 @@ use nix::sys::signal::{kill, Signal};
use remote_storage::{DownloadError, RemotePath};
use crate::checker::create_availability_check_data;
use crate::local_proxy;
use crate::logger::inlinify;
use crate::pg_helpers::*;
use crate::spec::*;
@@ -305,6 +307,13 @@ impl ComputeNode {
self.state_changed.notify_all();
}
pub fn set_failed_status(&self, err: anyhow::Error) {
let mut state = self.state.lock().unwrap();
state.error = Some(format!("{err:?}"));
state.status = ComputeStatus::Failed;
self.state_changed.notify_all();
}
pub fn get_status(&self) -> ComputeStatus {
self.state.lock().unwrap().status
}
@@ -710,7 +719,7 @@ impl ComputeNode {
info!("running initdb");
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
Command::new(initdb_bin)
.args(["-D", pgdata])
.args(["--pgdata", pgdata])
.output()
.expect("cannot start initdb process");
@@ -878,6 +887,11 @@ impl ComputeNode {
// 'Close' connection
drop(client);
if let Some(ref local_proxy) = spec.local_proxy_config {
info!("configuring local_proxy");
local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
}
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut connstr = connstr.clone();
@@ -928,6 +942,19 @@ impl ComputeNode {
});
}
if let Some(ref local_proxy) = spec.local_proxy_config {
info!("configuring local_proxy");
// Spawn a thread to do the configuration,
// so that we don't block the main thread that starts Postgres.
let local_proxy = local_proxy.clone();
let _handle = Some(thread::spawn(move || {
if let Err(err) = local_proxy::configure(&local_proxy) {
error!("error while configuring local_proxy: {err:?}");
}
}));
}
// Write new config
let pgdata_path = Path::new(&self.pgdata);
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
@@ -1015,6 +1042,19 @@ impl ComputeNode {
});
}
if let Some(local_proxy) = &pspec.spec.local_proxy_config {
info!("configuring local_proxy");
// Spawn a thread to do the configuration,
// so that we don't block the main thread that starts Postgres.
let local_proxy = local_proxy.clone();
let _handle = thread::spawn(move || {
if let Err(err) = local_proxy::configure(&local_proxy) {
error!("error while configuring local_proxy: {err:?}");
}
});
}
info!(
"start_compute spec.remote_extensions {:?}",
pspec.spec.remote_extensions
@@ -1052,19 +1092,26 @@ impl ComputeNode {
let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
if pspec.spec.mode == ComputeMode::Primary {
if !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::with_compute_ctl_tmp_override(
pgdata_path,
"neon.max_cluster_size=-1",
|| {
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
Ok(())
},
)?;
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
Ok(())
})?;
self.pg_reload_conf()?;
}
self.post_apply_config()?;
}
let startup_end_time = Utc::now();
@@ -1123,6 +1170,9 @@ impl ComputeNode {
//
// Use that as a default location and pattern, except macos where core dumps are written
// to /cores/ directory by default.
//
// With default Linux settings, the core dump file is called just "core", so check for
// that too.
pub fn check_for_core_dumps(&self) -> Result<()> {
let core_dump_dir = match std::env::consts::OS {
"macos" => Path::new("/cores/"),
@@ -1134,8 +1184,17 @@ impl ComputeNode {
let files = fs::read_dir(core_dump_dir)?;
let cores = files.filter_map(|entry| {
let entry = entry.ok()?;
let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
Some(entry.path())
let is_core_dump = match entry.file_name().to_str()? {
n if n.starts_with("core.") => true,
"core" => true,
_ => false,
};
if is_core_dump {
Some(entry.path())
} else {
None
}
});
// Print backtrace for each core dump
@@ -1386,6 +1445,36 @@ LIMIT 100",
}
Ok(remote_ext_metrics)
}
/// Waits until current thread receives a state changed notification and
/// the pageserver connection strings has changed.
///
/// The operation will time out after a specified duration.
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
let state = self.state.lock().unwrap();
let old_pageserver_connstr = state
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_connstr
.clone();
let mut unchanged = true;
let _ = self
.state_changed
.wait_timeout_while(state, duration, |s| {
let pageserver_connstr = &s
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_connstr;
unchanged = pageserver_connstr == &old_pageserver_connstr;
unchanged
})
.unwrap();
if !unchanged {
info!("Pageserver config changed");
}
}
}
pub fn forward_termination_signal() {

View File

@@ -11,9 +11,17 @@ use crate::compute::ComputeNode;
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
info!("waiting for reconfiguration requests");
loop {
let state = compute.state.lock().unwrap();
let mut state = compute.state_changed.wait(state).unwrap();
let mut state = compute.state.lock().unwrap();
// We have to re-check the status after re-acquiring the lock because it could be that
// the status has changed while we were waiting for the lock, and we might not need to
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
// we are waiting for a condition variable that will never be signaled.
if state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();
}
// Re-check the status after waking up
if state.status == ComputeStatus::ConfigurationPending {
info!("got configuration request");
state.status = ComputeStatus::Configuration;

View File

@@ -0,0 +1,25 @@
use anyhow::Context;
pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
let size_kb = size_bytes / 1024;
// run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
let child_result = std::process::Command::new("/usr/bin/sudo")
.arg(DISK_QUOTA_BIN)
.arg(size_kb.to_string())
.arg(fs_mountpoint)
.spawn();
child_result
.context("spawn() failed")
.and_then(|mut child| child.wait().context("wait() failed"))
.and_then(|status| match status.success() {
true => Ok(()),
false => Err(anyhow::anyhow!("process exited with {status}")),
})
// wrap any prior error with the overall context that we couldn't run the command
.with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
}

View File

@@ -2,6 +2,9 @@
//! configuration.
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
extern crate hyper0 as hyper;
pub mod checker;
pub mod config;
pub mod configurator;
@@ -10,7 +13,9 @@ pub mod http;
pub mod logger;
pub mod catalog;
pub mod compute;
pub mod disk_quota;
pub mod extension_server;
pub mod local_proxy;
pub mod lsn_lease;
mod migration;
pub mod monitor;

View File

@@ -0,0 +1,56 @@
//! Local Proxy is a feature of our BaaS Neon Authorize project.
//!
//! Local Proxy validates JWTs and manages the pg_session_jwt extension.
//! It also maintains a connection pool to postgres.
use anyhow::{Context, Result};
use camino::Utf8Path;
use compute_api::spec::LocalProxySpec;
use nix::sys::signal::Signal;
use utils::pid_file::{self, PidFileRead};
pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> {
write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?;
notify_local_proxy("/etc/local_proxy/pid".as_ref())?;
Ok(())
}
/// Create or completely rewrite configuration file specified by `path`
fn write_local_proxy_conf(path: &Utf8Path, local_proxy: &LocalProxySpec) -> Result<()> {
let config =
serde_json::to_string_pretty(local_proxy).context("serializing LocalProxySpec to json")?;
std::fs::write(path, config).with_context(|| format!("writing {path}"))?;
Ok(())
}
/// Notify local proxy about a new config file.
fn notify_local_proxy(path: &Utf8Path) -> Result<()> {
match pid_file::read(path)? {
// if the file doesn't exist, or isn't locked, local_proxy isn't running
// and will naturally pick up our config later
PidFileRead::NotExist | PidFileRead::NotHeldByAnyProcess(_) => {}
PidFileRead::LockedByOtherProcess(pid) => {
// From the pid_file docs:
//
// > 1. The other process might exit at any time, turning the given PID stale.
// > 2. There is a small window in which `claim_for_current_process` has already
// > locked the file but not yet updates its contents. [`read`] will return
// > this variant here, but with the old file contents, i.e., a stale PID.
// >
// > The kernel is free to recycle PID once it has been `wait(2)`ed upon by
// > its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
// > system call on it, bears the risk of killing an unrelated process.
// > This is an inherent limitation of using pidfiles.
// > The only race-free solution is to have a supervisor-process with a lifetime
// > that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
//
// This is an ok risk as we only send a SIGHUP which likely won't actually
// kill the process, only reload config.
nix::sys::signal::kill(pid, Signal::SIGHUP).context("sending signal to local_proxy")?;
}
}
Ok(())
}

View File

@@ -1,4 +1,3 @@
use tracing_opentelemetry::OpenTelemetryLayer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
@@ -23,8 +22,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
.with_writer(std::io::stderr);
// Initialize OpenTelemetry
let otlp_layer =
tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");
// Put it all together
tracing_subscriber::registry()

View File

@@ -57,10 +57,10 @@ fn lsn_lease_bg_task(
.max(valid_duration / 2);
info!(
"Succeeded, sleeping for {} seconds",
"Request succeeded, sleeping for {} seconds",
sleep_duration.as_secs()
);
thread::sleep(sleep_duration);
compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
}
}
@@ -89,10 +89,7 @@ fn acquire_lsn_lease_with_retry(
.map(|connstr| {
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
if let Some(storage_auth_token) = &spec.storage_auth_token {
info!("Got storage auth token from spec file");
config.password(storage_auth_token.clone());
} else {
info!("Storage auth token not set");
}
config
})
@@ -108,9 +105,11 @@ fn acquire_lsn_lease_with_retry(
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
}
Err(e) => {
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
thread::sleep(Duration::from_millis(retry_period_ms as u64));
compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
retry_period_ms as u64,
));
retry_period_ms *= 1.5;
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
}

View File

@@ -9,12 +9,12 @@ anyhow.workspace = true
camino.workspace = true
clap.workspace = true
comfy-table.workspace = true
git-version.workspace = true
futures.workspace = true
humantime.workspace = true
nix.workspace = true
once_cell.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
hyper0.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["blocking", "json"] }
scopeguard.workspace = true

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
//! Branch mappings for convenience
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user.
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default, deny_unknown_fields)]
pub struct BranchMappings {
/// Default tenant ID to use with the 'neon_local' command line utility, when
/// --tenant_id is not explicitly specified. This comes from the branches.
pub default_tenant_id: Option<TenantId>,
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
pub mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}
impl BranchMappings {
pub fn register_branch_mapping(
&mut self,
branch_name: String,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
let existing_values = self.mappings.entry(branch_name.clone()).or_default();
let existing_ids = existing_values
.iter()
.find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
if let Some((_, old_timeline_id)) = existing_ids {
if old_timeline_id == &timeline_id {
Ok(())
} else {
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
}
} else {
existing_values.push((tenant_id, timeline_id));
Ok(())
}
}
pub fn get_branch_timeline_id(
&self,
branch_name: &str,
tenant_id: TenantId,
) -> Option<TimelineId> {
// If it looks like a timeline ID, return it as it is
if let Ok(timeline_id) = branch_name.parse::<TimelineId>() {
return Some(timeline_id);
}
self.mappings
.get(branch_name)?
.iter()
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
.map(|&(_, timeline_id)| timeline_id)
.map(TimelineId::from)
}
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
self.mappings
.iter()
.flat_map(|(name, tenant_timelines)| {
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
(TenantTimelineId::new(tenant_id, timeline_id), name.clone())
})
})
.collect()
}
pub fn persist(&self, path: &Path) -> anyhow::Result<()> {
let content = &toml::to_string_pretty(self)?;
fs::write(path, content).with_context(|| {
format!(
"Failed to write branch information into path '{}'",
path.display()
)
})
}
pub fn load(path: &Path) -> anyhow::Result<BranchMappings> {
let branches_file_contents = fs::read_to_string(path)?;
Ok(toml::from_str(branches_file_contents.as_str())?)
}
}

View File

@@ -561,6 +561,7 @@ impl Endpoint {
operation_uuid: None,
features: self.features.clone(),
swap_size_bytes: None,
disk_quota_bytes: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
@@ -598,6 +599,7 @@ impl Endpoint {
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;

View File

@@ -168,6 +168,9 @@ pub struct NeonStorageControllerConf {
#[serde(with = "humantime_serde")]
pub heartbeat_interval: Duration,
#[serde(with = "humantime_serde")]
pub long_reconcile_threshold: Option<Duration>,
}
impl NeonStorageControllerConf {
@@ -190,6 +193,7 @@ impl Default for NeonStorageControllerConf {
split_threshold: None,
max_secondary_lag_bytes: None,
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
long_reconcile_threshold: None,
}
}
}

View File

@@ -113,7 +113,7 @@ impl SafekeeperNode {
pub async fn start(
&self,
extra_opts: Vec<String>,
extra_opts: &[String],
retry_timeout: &Duration,
) -> anyhow::Result<()> {
print!(
@@ -196,7 +196,7 @@ impl SafekeeperNode {
]);
}
args.extend(extra_opts);
args.extend_from_slice(extra_opts);
background_process::start_process(
&format!("safekeeper-{id}"),

View File

@@ -3,7 +3,7 @@ use crate::{
local_env::{LocalEnv, NeonStorageControllerConf},
};
use camino::{Utf8Path, Utf8PathBuf};
use hyper::Uri;
use hyper0::Uri;
use nix::unistd::Pid;
use pageserver_api::{
controller_api::{
@@ -347,7 +347,7 @@ impl StorageController {
if !tokio::fs::try_exists(&pg_data_path).await? {
let initdb_args = [
"-D",
"--pgdata",
pg_data_path.as_ref(),
"--username",
&username(),
@@ -517,6 +517,13 @@ impl StorageController {
args.push(format!("--max-secondary-lag-bytes={lag}"))
}
if let Some(threshold) = self.config.long_reconcile_threshold {
args.push(format!(
"--long-reconcile-threshold={}",
humantime::Duration::from(threshold)
))
}
args.push(format!(
"--neon-local-repo-dir={}",
self.env.base_data_dir.display()

View File

@@ -4,8 +4,8 @@ use std::{str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, ShardSchedulingPolicy,
TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
},
models::{
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
@@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
availability_zone_id,
availability_zone_id: AvailabilityZone(availability_zone_id),
}),
)
.await?;

View File

@@ -5,7 +5,7 @@
Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14.
- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node).
And additional intermediate image:

View File

@@ -0,0 +1,112 @@
# AUX file v2
## Summary
This is a retrospective RFC describing a new storage strategy for AUX files.
## Motivation
The original aux file storage strategy stores everything in a single `AUX_FILES_KEY`.
Every time the compute node streams a `neon-file` record to the pageserver, it will
update the aux file hash map, and then write the serialized hash map into the key.
This creates serious space bloat. There was a fix to log delta records (i.e., update
a key in the hash map) to the aux file key. In this way, the pageserver only stores
the deltas at each of the LSNs. However, this improved v1 storage strategy still
requires us to store everything in an aux file cache in memory, because we cannot
fetch a single key (or file) from the compound `AUX_FILES_KEY`.
### Prior art
For storing large amount of small files, we can use a key-value store where the key
is the filename and the value is the file content.
## Requirements
- No space bloat, fixed space amplification.
- No write bloat, fixed write amplification.
## Impacted Components
pageserver
## Sparse Keyspace
In pageserver, we had assumed the keyspaces are always contiguous. For example, if the keyspace 0x0000-0xFFFF
exists in the pageserver, every single key in the key range would exist in the storage. Based on the prior
assumption, there are code that traverses the keyspace by iterating every single key.
```rust
loop {
// do something
key = key.next();
}
```
If a keyspace is very large, for example, containing `2^64` keys, this loop will take infinite time to run.
Therefore, we introduce the concept of sparse keyspace in this RFC. For a sparse keyspace, not every key would
exist in the key range. Developers should not attempt to iterate every single key in the keyspace. Instead,
they should fetch all the layer files in the key range, and then do a merge of them.
In aux file v2, we store aux files within the sparse keyspace of the prefix `AUX_KEY_PREFIX`.
## AUX v2 Keyspace and Key Mapping
Pageserver uses fixed-size keys. The key is 128b. In order to store files of arbitrary filenames into the
keyspace, we assign a predetermined prefix based on the directory storing the aux file, and use the FNV hash
of the filename for the rest bits of the key. The encoding scheme is defined in `encode_aux_file_key`.
For example, `pg_logical/mappings/test1` will be encoded as:
```
62 0000 01 01 7F8B83D94F7081693471ABF91C
^ aux prefix
^ assigned prefix of pg_logical/
^ assigned prefix of mappings/
^ 13B FNV hash of test1
^ not used due to key representation
```
The prefixes of the directories should be assigned every time we add a new type of aux file into the storage within `aux_file.rs`. For all directories without an assigned prefix, it will be put into the `0xFFFF` keyspace.
Note that inside pageserver, there are two representations of the keys: the 18B full key representation
and the 16B compact key representation. For the 18B representation, some fields have restricted ranges
of values. Therefore, the aux keys only use the 16B compact portion of the full key.
It is possible that two files get mapped to the same key due to hash collision. Therefore, the value of
each of the aux key is an array that contains all filenames and file content that should be stored in
this key.
We use `Value::Image` to store the aux keys. Therefore, page reconstruction works in the same way as before,
and we do not need addition code to support reconstructing the value. We simply get the latest image from
the storage.
## Inbound Logical Replication Key Mapping
For inbound logical replication, Postgres needs the `replorigin_checkpoint` file to store the data.
This file not directly stored in the pageserver using the aux v2 mechanism. It is constructed during
generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
## Sparse Keyspace Read Path
There are two places we need to read the aux files from the pageserver:
* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error.
## Compaction and Image Layer Generation
With the add of sparse keyspaces, we also modified the compaction code to accommodate the fact that sparse keyspaces do not have every single key stored in the storage.
* L0 compaction: we modified the hole computation code so that it can handle sparse keyspaces when computing holes.
* Image layer creation: instead of calling `key.next()` and getting/reconstructing images for every single key, we use the vectored get API to scan all keys in the keyspace at a given LSN. Image layers are only created if there are too many delta layers between the latest LSN and the last image layer we generated for sparse keyspaces. The created image layer always cover the full aux key range for now, and could be optimized later.
## Migration
We decided not to make the new aux storage strategy (v1) compatible with the original one (v1). One feasible way of doing a seamless migration is to store new data in aux v2 while old data in aux v1, but this complicates file deletions. We want all users to start with a clean state with no aux files in the storage, and therefore, we need to do manual migrations for users using aux v1 by using the [migration script](https://github.com/neondatabase/aux_v2_migration).
During the period of migration, we store the aux policy in the `index_part.json` file. When a tenant is attached
with no policy set, the pageserver will scan the aux file keyspaces to identify the current aux policy being used (v1 or v2).
If a timeline has aux v1 files stored, it will use aux file policy v1 unless we do a manual migration for them. Otherwise, the default aux file policy for new timelines is aux v2. Users enrolled in logical replication before we set aux v2 as default use aux v1 policy. Users who tried setting up inbound replication (which was not supported at that time) may also create some file entries in aux v1 store, even if they did not enroll in the logical replication testing program.
The code for aux v2 migration is in https://github.com/neondatabase/aux_v2_migration. The toolkit scans all projects with logical replication enabled. For all these projects, it put the computes into maintenance mode (suspend all of then), call the migration API to switch the aux file policy on the pageserver (which drops all replication states), and restart all the computes.

View File

@@ -0,0 +1,343 @@
# Independent compute release
Created at: 2024-08-30. Author: Alexey Kondratov (@ololobus)
## Summary
This document proposes an approach to fully independent compute release flow. It attempts to
cover the following features:
- Process is automated as much as possible to minimize human errors.
- Compute<->storage protocol compatibility is ensured.
- A transparent release history is available with an easy rollback strategy.
- Although not in the scope of this document, there is a viable way to extend the proposed release
flow to achieve the canary and/or blue-green deployment strategies.
## Motivation
Previously, the compute release was tightly coupled to the storage release. This meant that once
some storage nodes got restarted with a newer version, all new compute starts using these nodes
automatically got a new version. Thus, two releases happen in parallel, which increases the blast
radius and makes ownership fuzzy.
Now, we practice a manual v0 independent compute release flow -- after getting a new compute release
image and tag, we pin it region by region using Admin UI. It's better, but it still has its own flaws:
1. It's a simple but fairly manual process, as you need to click through a few pages.
2. It's prone to human errors, e.g., you could mistype or copy the wrong compute tag.
3. We now require an additional approval in the Admin UI, which partially solves the 2.,
but also makes the whole process pretty annoying, as you constantly need to go back
and forth between two people.
## Non-goals
It's not the goal of this document to propose a design for some general-purpose release tool like Helm.
The document considers how the current compute fleet is orchestrated at Neon. Even if we later
decide to split the control plane further (e.g., introduce a separate compute controller), the proposed
release process shouldn't change much, i.e., the releases table and API will reside in
one of the parts.
Achieving the canary and/or blue-green deploy strategies is out of the scope of this document. They
were kept in mind, though, so it's expected that the proposed approach will lay down the foundation
for implementing them in future iterations.
## Impacted components
Compute, control plane, CI, observability (some Grafana dashboards may require changes).
## Prior art
One of the very close examples is how Helm tracks [releases history](https://helm.sh/docs/helm/helm_history/).
In the code:
- [Release](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/release.go#L20-L43)
- [Release info](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/info.go#L24-L40)
- [Release status](https://github.com/helm/helm/blob/2b30cf4b61d587d3f7594102bb202b787b9918db/pkg/release/status.go#L18-L42)
TL;DR it has several important attributes:
- Revision -- unique release ID/primary key. It is not the same as the application version,
because the same version can be deployed several times, e.g., after a newer version rollback.
- App version -- version of the application chart/code.
- Config -- set of overrides to the default config of the application.
- Status -- current status of the release in the history.
- Timestamps -- tracks when a release was created and deployed.
## Proposed implementation
### Separate release branch
We will use a separate release branch, `release-compute`, to have a clean history for releases and commits.
In order to avoid confusion with storage releases, we will use a different prefix for compute [git release
tags](https://github.com/neondatabase/neon/releases) -- `release-compute-XXXX`. We will use the same tag for
Docker images as well. The `neondatabase/compute-node-v16:release-compute-XXXX` looks longer and a bit redundant,
but it's better to have image and git tags in sync.
Currently, control plane relies on the numeric compute and storage release versions to decide on compute->storage
compatibility. Once we implement this proposal, we should drop this code as release numbers will be completely
independent. The only constraint we want is that it must monotonically increase within the same release branch.
### Compute config/settings manifest
We will create a new sub-directory `compute` and file `compute/manifest.yaml` with a structure:
```yaml
pg_settings:
# Common settings for primaries and secondaries of all versions.
common:
wal_log_hints: "off"
max_wal_size: "1024"
per_version:
14:
# Common settings for both replica and primary of version PG 14
common:
shared_preload_libraries: "neon,pg_stat_statements,extension_x"
15:
common:
shared_preload_libraries: "neon,pg_stat_statements,extension_x"
# Settings that should be applied only to
replica:
# Available only starting Postgres 15th
recovery_prefetch: "off"
# ...
17:
common:
# For example, if third-party `extension_x` is not yet available for PG 17
shared_preload_libraries: "neon,pg_stat_statements"
replica:
recovery_prefetch: "off"
```
**N.B.** Setting value should be a string with `on|off` for booleans and a number (as a string)
without units for all numeric settings. That's how the control plane currently operates.
The priority of settings will be (a higher number is a higher priority):
1. Any static and hard-coded settings in the control plane
2. `pg_settings->common`
3. Per-version `common`
4. Per-version `replica`
5. Any per-user/project/endpoint overrides in the control plane
6. Any dynamic setting calculated based on the compute size
**N.B.** For simplicity, we do not do any custom logic for `shared_preload_libraries`, so it's completely
overridden if specified on some level. Make sure that you include all necessary extensions in it when you
do any overrides.
**N.B.** There is a tricky question about what to do with custom compute image pinning we sometimes
do for particular projects and customers. That's usually some ad-hoc work and images are based on
the latest compute image, so it's relatively safe to assume that we could use settings from the latest compute
release. If for some reason that's not true, and further overrides are needed, it's also possible to do
on the project level together with pinning the image, so it's on-call/engineer/support responsibility to
ensure that compute starts with the specified custom image. The only real risk is that compute image will get
stale and settings from new releases will drift away, so eventually it will get something incompatible,
but i) this is some operational issue, as we do not want stale images anyway, and ii) base settings
receive something really new so rarely that the chance of this happening is very low. If we want to solve it completely,
then together with pinning the image we could also pin the matching release revision in the control plane.
The compute team will own the content of `compute/manifest.yaml`.
### Control plane: releases table
In order to store information about releases, the control plane will use a table `compute_releases` with the following
schema:
```sql
CREATE TABLE compute_releases (
-- Unique release ID
-- N.B. Revision won't by synchronized across all regions, because all control planes are technically independent
-- services. We have the same situation with Helm releases as well because they could be deployed and rolled back
-- independently in different clusters.
revision BIGSERIAL PRIMARY KEY,
-- Numeric version of the compute image, e.g. 9057
version BIGINT NOT NULL,
-- Compute image tag, e.g. `release-9057`
tag TEXT NOT NULL,
-- Current release status. Currently, it will be a simple enum
-- * `deployed` -- release is deployed and used for new compute starts.
-- Exactly one release can have this status at a time.
-- * `superseded` -- release has been replaced by a newer one.
-- But we can always extend it in the future when we need more statuses
-- for more complex deployment strategies.
status TEXT NOT NULL,
-- Any additional metadata for compute in the corresponding release
manifest JSONB NOT NULL,
-- Timestamp when release record was created in the control plane database
created_at TIMESTAMP NOT NULL DEFAULT now(),
-- Timestamp when release deployment was finished
deployed_at TIMESTAMP
);
```
We keep track of the old releases not only for the sake of audit, but also because we usually have ~30% of
old computes started using the image from one of the previous releases. Yet, when users want to reconfigure
them without restarting, the control plane needs to know what settings are applicable to them, so we also need
information about the previous releases that are readily available. There could be some other auxiliary info
needed as well: supported extensions, compute flags, etc.
**N.B.** Here, we can end up in an ambiguous situation when the same compute image is deployed twice, e.g.,
it was deployed once, then rolled back, and then deployed again, potentially with a different manifest. Yet,
we could've started some computes with the first deployment and some with the second. Thus, when we need to
look up the manifest for the compute by its image tag, we will see two records in the table with the same tag,
but different revision numbers. We can assume that this could happen only in case of rollbacks, so we
can just take the latest revision for the given tag.
### Control plane: management API
The control plane will implement new API methods to manage releases:
1. `POST /management/api/v2/compute_releases` to create a new release. With payload
```json
{
"version": 9057,
"tag": "release-9057",
"manifest": {}
}
```
and response
```json
{
"revision": 53,
"version": 9057,
"tag": "release-9057",
"status": "deployed",
"manifest": {},
"created_at": "2024-08-15T15:52:01.0000Z",
"deployed_at": "2024-08-15T15:52:01.0000Z",
}
```
Here, we can actually mix-in custom (remote) extensions metadata into the `manifest`, so that the control plane
will get information about all available extensions not bundled into compute image. The corresponding
workflow in `neondatabase/build-custom-extensions` should produce it as an artifact and make
it accessible to the workflow in the `neondatabase/infra`. See the complete release flow below. Doing that,
we put a constraint that new custom extension requires new compute release, which is good for the safety,
but is not exactly what we want operational-wise (we want to be able to deploy new extensions without new
images). Yet, it can be solved incrementally: v0 -- do not do anything with extensions at all;
v1 -- put them into the same manifest; v2 -- make them separate entities with their own lifecycle.
**N.B.** This method is intended to be used in CI workflows, and CI/network can be flaky. It's reasonable
to assume that we could retry the request several times, even though it's already succeeded. Although it's
not a big deal to create several identical releases one-by-one, it's better to avoid it, so the control plane
should check if the latest release is identical and just return `304 Not Modified` in this case.
2. `POST /management/api/v2/compute_releases/rollback` to rollback to any previously deployed release. With payload
including the revision of the release to rollback to:
```json
{
"revision": 52
}
```
Rollback marks the current release as `superseded` and creates a new release with all the same data as the
requested revision, but with a new revision number.
This rollback API is not strictly needed, as we can just use `infra` repo workflow to deploy any
available tag. It's still nice to have for on-call and any urgent matters, for example, if we need
to rollback and GitHub is down. It's much easier to specify only the revision number vs. crafting
all the necessary data for the new release payload.
### Compute->storage compatibility tests
In order to safely release new compute versions independently from storage, we need to ensure that the currently
deployed storage is compatible with the new compute version. Currently, we maintain backward compatibility
in storage, but newer computes may require a newer storage version.
Remote end-to-end (e2e) tests [already accept](https://github.com/neondatabase/cloud/blob/e3468d433e0d73d02b7d7e738d027f509b522408/.github/workflows/testing.yml#L43-L48)
`storage_image_tag` and `compute_image_tag` as separate inputs. That means that we could reuse e2e tests to ensure
compatibility between storage and compute:
1. Pick the latest storage release tag and use it as `storage_image_tag`.
2. Pick a new compute tag built in the current compute release PR and use it as `compute_image_tag`.
Here, we should use a temporary ECR image tag, because the final tag will be known only after the release PR is merged.
3. Trigger e2e tests as usual.
### Release flow
```mermaid
sequenceDiagram
actor oncall as Compute on-call person
participant neon as neondatabase/neon
box private
participant cloud as neondatabase/cloud
participant exts as neondatabase/build-custom-extensions
participant infra as neondatabase/infra
end
box cloud
participant preprod as Pre-prod control plane
participant prod as Production control plane
participant k8s as Compute k8s
end
oncall ->> neon: Open release PR into release-compute
activate neon
neon ->> cloud: CI: trigger e2e compatibility tests
activate cloud
cloud -->> neon: CI: e2e tests pass
deactivate cloud
neon ->> neon: CI: pass PR checks, get approvals
deactivate neon
oncall ->> neon: Merge release PR into release-compute
activate neon
neon ->> neon: CI: pass checks, build and push images
neon ->> exts: CI: trigger extensions build
activate exts
exts -->> neon: CI: extensions are ready
deactivate exts
neon ->> neon: CI: create release tag
neon ->> infra: Trigger release workflow using the produced tag
deactivate neon
activate infra
infra ->> infra: CI: pass checks
infra ->> preprod: Release new compute image to pre-prod automatically <br/> POST /management/api/v2/compute_releases
activate preprod
preprod -->> infra: 200 OK
deactivate preprod
infra ->> infra: CI: wait for per-region production deploy approvals
oncall ->> infra: CI: approve deploys region by region
infra ->> k8s: Prewarm new compute image
infra ->> prod: POST /management/api/v2/compute_releases
activate prod
prod -->> infra: 200 OK
deactivate prod
deactivate infra
```
## Further work
As briefly mentioned in other sections, eventually, we would like to use more complex deployment strategies.
For example, we can pass a fraction of the total compute starts that should use the new release. Then we can
mark the release as `partial` or `canary` and monitor its performance. If everything is fine, we can promote it
to `deployed` status. If not, we can roll back to the previous one.
## Alternatives
In theory, we can try using Helm as-is:
1. Write a compute Helm chart. That will actually have only some config map, which the control plane can access and read.
N.B. We could reuse the control plane chart as well, but then it's not a fully independent release again and even more fuzzy.
2. The control plane will read it and start using the new compute version for new starts.
Drawbacks:
1. Helm releases work best if the workload is controlled by the Helm chart itself. Then you can have different
deployment strategies like rolling update or canary or blue/green deployments. At Neon, the compute starts are controlled
by control plane, so it makes it much more tricky.
2. Releases visibility will suffer, i.e. instead of a nice table in the control plane and Admin UI, we would need to use
`helm` cli and/or K8s UIs like K8sLens.
3. We do not restart all computes shortly after the new version release. This means that for some features and compatibility
purpose (see above) control plane may need some auxiliary info from the previous releases.

View File

@@ -50,6 +50,16 @@ pub struct ComputeSpec {
#[serde(default)]
pub swap_size_bytes: Option<u64>,
/// If compute_ctl was passed `--set-disk-quota-for-fs`, a value of `Some(_)` instructs
/// compute_ctl to run `/neonvm/bin/set-disk-quota` with the given size and fs, when the
/// spec is first received.
///
/// Both this field and `--set-disk-quota-for-fs` are required, so that the control plane's
/// spec generation doesn't need to be aware of the actual compute it's running on, while
/// guaranteeing gradual rollout of disk quota.
#[serde(default)]
pub disk_quota_bytes: Option<u64>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
@@ -96,6 +106,10 @@ pub struct ComputeSpec {
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<usize>,
/// Local Proxy configuration used for JWT authentication
#[serde(default)]
pub local_proxy_config: Option<LocalProxySpec>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -268,6 +282,24 @@ pub struct GenericOption {
/// declare a `trait` on it.
pub type GenericOptions = Option<Vec<GenericOption>>;
/// Configured the local_proxy application with the relevant JWKS and roles it should
/// use for authorizing connect requests using JWT.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct LocalProxySpec {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub jwks: Option<Vec<JwksSettings>>,
}
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct JwksSettings {
pub id: String,
pub role_names: Vec<String>,
pub jwks_url: String,
pub provider_name: String,
pub jwt_audience: Option<String>,
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -296,7 +296,14 @@ pub mod defaults {
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB
/// Soft limit for the maximum size of a vectored read.
///
/// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys
/// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record,
/// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record
/// is `sizeof(Transactionid) * 32768 + (some fixed overhead from 'timestamp`, the Vec length and whatever extra serde serialization adds)`.
/// That is, slightly above 128 kB.
pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB
pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
ImageCompressionAlgorithm::Zstd { level: Some(1) };

View File

@@ -1,4 +1,5 @@
use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::str::FromStr;
use std::time::{Duration, Instant};
@@ -57,7 +58,7 @@ pub struct NodeRegisterRequest {
pub listen_http_addr: String,
pub listen_http_port: u16,
pub availability_zone_id: String,
pub availability_zone_id: AvailabilityZone,
}
#[derive(Serialize, Deserialize)]
@@ -74,10 +75,19 @@ pub struct TenantPolicyRequest {
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct AvailabilityZone(pub String);
impl Display for AvailabilityZone {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
#[derive(Serialize, Deserialize)]
pub struct ShardsPreferredAzsRequest {
#[serde(flatten)]
pub preferred_az_ids: HashMap<TenantShardId, String>,
pub preferred_az_ids: HashMap<TenantShardId, AvailabilityZone>,
}
#[derive(Serialize, Deserialize)]

View File

@@ -748,6 +748,16 @@ impl Key {
self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
}
#[inline(always)]
pub fn is_rel_dir_key(&self) -> bool {
self.field1 == 0x00
&& self.field2 != 0
&& self.field3 != 0
&& self.field4 == 0
&& self.field5 == 0
&& self.field6 == 1
}
/// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
#[inline(always)]
pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {

View File

@@ -984,6 +984,7 @@ pub fn short_error(e: &QueryError) -> String {
}
fn log_query_error(query: &str, e: &QueryError) {
// If you want to change the log level of a specific error, also re-categorize it in `BasebackupQueryTimeOngoingRecording`.
match e {
QueryError::Disconnected(ConnectionError::Io(io_error)) => {
if is_expected_io_error(io_error) {

View File

@@ -93,9 +93,9 @@ impl Conf {
);
let output = self
.new_pg_command("initdb")?
.arg("-D")
.arg("--pgdata")
.arg(&self.datadir)
.args(["-U", "postgres", "--no-instructions", "--no-sync"])
.args(["--username", "postgres", "--no-instructions", "--no-sync"])
.output()?;
debug!("initdb output: {:?}", output);
ensure!(

View File

@@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true
bytes.workspace = true
camino = { workspace = true, features = ["serde1"] }
humantime-serde.workspace = true
hyper = { workspace = true, features = ["stream"] }
hyper0 = { workspace = true, features = ["stream"] }
futures.workspace = true
serde.workspace = true
serde_json.workspace = true

View File

@@ -14,7 +14,7 @@ use std::time::SystemTime;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
use azure_core::{Continuable, RetryOptions};
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
@@ -33,10 +33,10 @@ use tracing::debug;
use utils::backoff;
use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind};
use crate::ListingObject;
use crate::{
config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing,
ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError,
DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata,
TimeTravelError, TimeoutOrCancel,
};
pub struct AzureBlobStorage {
@@ -259,6 +259,7 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
if let Some(http_err) = error.as_http_error() {
match http_err.status() {
StatusCode::NotFound => DownloadError::NotFound,
StatusCode::NotModified => DownloadError::Unmodified,
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
_ => DownloadError::Other(anyhow::Error::new(error)),
}
@@ -484,11 +485,16 @@ impl RemoteStorage for AzureBlobStorage {
async fn download(
&self,
from: &RemotePath,
opts: &DownloadOpts,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let builder = blob_client.get();
let mut builder = blob_client.get();
if let Some(ref etag) = opts.etag {
builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()))
}
self.download_for_builder(builder, cancel).await
}

View File

@@ -5,6 +5,8 @@ pub enum DownloadError {
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// The caller provided an ETag, and the file was not modified.
Unmodified,
/// A cancellation token aborted the download, typically during
/// tenant detach or process shutdown.
Cancelled,
@@ -24,6 +26,7 @@ impl std::fmt::Display for DownloadError {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Unmodified => write!(f, "File was not modified"),
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
DownloadError::Timeout => write!(f, "timeout"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
@@ -38,7 +41,7 @@ impl DownloadError {
pub fn is_permanent(&self) -> bool {
use DownloadError::*;
match self {
BadInput(_) | NotFound | Cancelled => true,
BadInput(_) | NotFound | Unmodified | Cancelled => true,
Timeout | Other(_) => false,
}
}

View File

@@ -161,6 +161,14 @@ pub struct Listing {
pub keys: Vec<ListingObject>,
}
/// Options for downloads. The default value is a plain GET.
#[derive(Default)]
pub struct DownloadOpts {
/// If given, returns [`DownloadError::Unmodified`] if the object still has
/// the same ETag (using If-None-Match).
pub etag: Option<Etag>,
}
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files.
@@ -245,6 +253,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn download(
&self,
from: &RemotePath,
opts: &DownloadOpts,
cancel: &CancellationToken,
) -> Result<Download, DownloadError>;
@@ -401,16 +410,18 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
/// See [`RemoteStorage::download`]
pub async fn download(
&self,
from: &RemotePath,
opts: &DownloadOpts,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
match self {
Self::LocalFs(s) => s.download(from, cancel).await,
Self::AwsS3(s) => s.download(from, cancel).await,
Self::AzureBlob(s) => s.download(from, cancel).await,
Self::Unreliable(s) => s.download(from, cancel).await,
Self::LocalFs(s) => s.download(from, opts, cancel).await,
Self::AwsS3(s) => s.download(from, opts, cancel).await,
Self::AzureBlob(s) => s.download(from, opts, cancel).await,
Self::Unreliable(s) => s.download(from, opts, cancel).await,
}
}
@@ -572,7 +583,7 @@ impl GenericRemoteStorage {
) -> Result<Download, DownloadError> {
match byte_range {
Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
None => self.download(from, cancel).await,
None => self.download(from, &DownloadOpts::default(), cancel).await,
}
}

View File

@@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use crate::{
Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError,
TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath,
TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -494,11 +494,17 @@ impl RemoteStorage for LocalFs {
async fn download(
&self,
from: &RemotePath,
opts: &DownloadOpts,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
let file_metadata = file_metadata(&target_path).await?;
let etag = mock_etag(&file_metadata);
if opts.etag.as_ref() == Some(&etag) {
return Err(DownloadError::Unmodified);
}
let source = ReaderStream::new(
fs::OpenOptions::new()
@@ -519,7 +525,6 @@ impl RemoteStorage for LocalFs {
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let etag = mock_etag(&file_metadata);
Ok(Download {
metadata,
last_modified: file_metadata
@@ -692,7 +697,7 @@ mod fs_tests {
) -> anyhow::Result<String> {
let cancel = CancellationToken::new();
let download = storage
.download(remote_storage_path, &cancel)
.download(remote_storage_path, &DownloadOpts::default(), &cancel)
.await
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
ensure!(
@@ -773,8 +778,8 @@ mod fs_tests {
"We should upload and download the same contents"
);
let non_existing_path = "somewhere/else";
match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?;
match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await {
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
}
@@ -1101,7 +1106,13 @@ mod fs_tests {
storage.upload(body, len, &path, None, &cancel).await?;
}
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
let read = aggregate(
storage
.download(&path, &DownloadOpts::default(), &cancel)
.await?
.download_stream,
)
.await?;
assert_eq!(body, read);
let shorter = Bytes::from_static(b"shorter body");
@@ -1112,7 +1123,13 @@ mod fs_tests {
storage.upload(body, len, &path, None, &cancel).await?;
}
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
let read = aggregate(
storage
.download(&path, &DownloadOpts::default(), &cancel)
.await?
.download_stream,
)
.await?;
assert_eq!(shorter, read);
Ok(())
}
@@ -1145,7 +1162,13 @@ mod fs_tests {
storage.upload(body, len, &path, None, &cancel).await?;
}
let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
let read = aggregate(
storage
.download(&path, &DownloadOpts::default(), &cancel)
.await?
.download_stream,
)
.await?;
assert_eq!(body, read);
Ok(())

View File

@@ -28,12 +28,13 @@ use aws_sdk_s3::{
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use http_types::StatusCode;
use aws_smithy_types::{body::SdkBody, DateTime};
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use hyper0::Body;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
use utils::backoff;
@@ -44,8 +45,8 @@ use crate::{
error::Cancelled,
metrics::{start_counting_cancelled_wait, start_measuring_requests},
support::PermitCarrying,
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath,
RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
@@ -67,6 +68,7 @@ pub struct S3Bucket {
struct GetObjectRequest {
bucket: String,
key: String,
etag: Option<String>,
range: Option<String>,
}
impl S3Bucket {
@@ -248,13 +250,18 @@ impl S3Bucket {
let started_at = start_measuring_requests(kind);
let get_object = self
let mut builder = self
.client
.get_object()
.bucket(request.bucket)
.key(request.key)
.set_range(request.range)
.send();
.set_range(request.range);
if let Some(etag) = request.etag {
builder = builder.if_none_match(etag);
}
let get_object = builder.send();
let get_object = tokio::select! {
res = get_object => res,
@@ -277,6 +284,20 @@ impl S3Bucket {
);
return Err(DownloadError::NotFound);
}
Err(SdkError::ServiceError(e))
// aws_smithy_runtime_api::http::response::StatusCode isn't
// re-exported by any aws crates, so just check the numeric
// status against http_types::StatusCode instead of pulling it.
if e.raw().status().as_u16() == StatusCode::NotModified =>
{
// Count an unmodified file as a success.
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
AttemptOutcome::Ok,
started_at,
);
return Err(DownloadError::Unmodified);
}
Err(e) => {
crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
kind,
@@ -773,6 +794,7 @@ impl RemoteStorage for S3Bucket {
async fn download(
&self,
from: &RemotePath,
opts: &DownloadOpts,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
@@ -781,6 +803,7 @@ impl RemoteStorage for S3Bucket {
GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
etag: opts.etag.as_ref().map(|e| e.to_string()),
range: None,
},
cancel,
@@ -807,6 +830,7 @@ impl RemoteStorage for S3Bucket {
GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
etag: None,
range,
},
cancel,

View File

@@ -12,8 +12,8 @@ use std::{collections::hash_map::Entry, sync::Arc};
use tokio_util::sync::CancellationToken;
use crate::{
Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage,
StorageMetadata, TimeTravelError,
Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath,
RemoteStorage, StorageMetadata, TimeTravelError,
};
pub struct UnreliableWrapper {
@@ -167,11 +167,12 @@ impl RemoteStorage for UnreliableWrapper {
async fn download(
&self,
from: &RemotePath,
opts: &DownloadOpts,
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
self.attempt(RemoteOp::Download(from.clone()))
.map_err(DownloadError::Other)?;
self.inner.download(from, cancel).await
self.inner.download(from, opts, cancel).await
}
async fn download_byte_range(

View File

@@ -1,8 +1,7 @@
use anyhow::Context;
use camino::Utf8Path;
use futures::StreamExt;
use remote_storage::ListingMode;
use remote_storage::RemotePath;
use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath};
use std::sync::Arc;
use std::{collections::HashSet, num::NonZeroU32};
use test_context::test_context;
@@ -284,7 +283,10 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
ctx.client.upload(data, len, &path, None, &cancel).await?;
// Normal download request
let dl = ctx.client.download(&path, &cancel).await?;
let dl = ctx
.client
.download(&path, &DownloadOpts::default(), &cancel)
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
@@ -337,6 +339,54 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
Ok(())
}
/// Tests that conditional downloads work properly, by returning
/// DownloadError::Unmodified when the object ETag matches the given ETag.
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn download_conditional(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return Ok(());
};
let cancel = CancellationToken::new();
// Create a file.
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?;
let data = bytes::Bytes::from_static("foo".as_bytes());
let (stream, len) = wrap_stream(data);
ctx.client.upload(stream, len, &path, None, &cancel).await?;
// Download it to obtain its etag.
let mut opts = DownloadOpts::default();
let download = ctx.client.download(&path, &opts, &cancel).await?;
// Download with the etag yields DownloadError::Unmodified.
opts.etag = Some(download.etag);
let result = ctx.client.download(&path, &opts, &cancel).await;
assert!(
matches!(result, Err(DownloadError::Unmodified)),
"expected DownloadError::Unmodified, got {result:?}"
);
// Replace the file contents.
let data = bytes::Bytes::from_static("bar".as_bytes());
let (stream, len) = wrap_stream(data);
ctx.client.upload(stream, len, &path, None, &cancel).await?;
// A download with the old etag should yield the new file.
let download = ctx.client.download(&path, &opts, &cancel).await?;
assert_ne!(download.etag, opts.etag.unwrap(), "ETag did not change");
// A download with the new etag should yield Unmodified again.
opts.etag = Some(download.etag);
let result = ctx.client.download(&path, &opts, &cancel).await;
assert!(
matches!(result, Err(DownloadError::Unmodified)),
"expected DownloadError::Unmodified, got {result:?}"
);
Ok(())
}
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
@@ -364,7 +414,10 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
// Normal download request
ctx.client.copy_object(&path, &path_dest, &cancel).await?;
let dl = ctx.client.download(&path_dest, &cancel).await?;
let dl = ctx
.client
.download(&path_dest, &DownloadOpts::default(), &cancel)
.await?;
let buf = download_to_vec(dl).await?;
assert_eq!(&buf, &orig);
@@ -376,3 +429,56 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
Ok(())
}
/// Tests that head_object works properly.
#[test_context(MaybeEnabledStorage)]
#[tokio::test]
async fn head_object(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
let MaybeEnabledStorage::Enabled(ctx) = ctx else {
return Ok(());
};
let cancel = CancellationToken::new();
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?;
// Errors on missing file.
let result = ctx.client.head_object(&path, &cancel).await;
assert!(
matches!(result, Err(DownloadError::NotFound)),
"expected NotFound, got {result:?}"
);
// Create the file.
let data = bytes::Bytes::from_static("foo".as_bytes());
let (stream, len) = wrap_stream(data);
ctx.client.upload(stream, len, &path, None, &cancel).await?;
// Fetch the head metadata.
let object = ctx.client.head_object(&path, &cancel).await?;
assert_eq!(
object,
ListingObject {
key: path.clone(),
last_modified: object.last_modified, // ignore
size: 3
}
);
// Wait for a couple of seconds, and then update the file to check the last
// modified timestamp.
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
let data = bytes::Bytes::from_static("bar".as_bytes());
let (stream, len) = wrap_stream(data);
ctx.client.upload(stream, len, &path, None, &cancel).await?;
let new = ctx.client.head_object(&path, &cancel).await?;
assert!(
!new.last_modified
.duration_since(object.last_modified)?
.is_zero(),
"last_modified did not advance"
);
Ok(())
}

View File

@@ -12,8 +12,8 @@ use anyhow::Context;
use camino::Utf8Path;
use futures_util::StreamExt;
use remote_storage::{
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
RemoteStorageKind, S3Config,
DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
RemoteStorageConfig, RemoteStorageKind, S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
@@ -121,7 +121,8 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
// A little check to ensure that our clock is not too far off from the S3 clock
{
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
let opts = DownloadOpts::default();
let dl = retry(|| ctx.client.download(&path2, &opts, &cancel)).await?;
let last_modified = dl.last_modified;
let half_wt = WAIT_TIME.mul_f32(0.5);
let t0_hwt = t0 + half_wt;
@@ -159,7 +160,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
println!("after recovery to t2: {t2_files_recovered:?}");
assert_eq!(t2_files, t2_files_recovered);
let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
let path2_recovered_t2 = download_to_vec(
ctx.client
.download(&path2, &DownloadOpts::default(), &cancel)
.await?,
)
.await?;
assert_eq!(path2_recovered_t2, new_data.as_bytes());
// after recovery to t1: path1 is back, path2 has the old content
@@ -170,7 +176,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
println!("after recovery to t1: {t1_files_recovered:?}");
assert_eq!(t1_files, t1_files_recovered);
let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
let path2_recovered_t1 = download_to_vec(
ctx.client
.download(&path2, &DownloadOpts::default(), &cancel)
.await?,
)
.await?;
assert_eq!(path2_recovered_t1, old_data.as_bytes());
// after recovery to t0: everything is gone except for path1
@@ -416,7 +427,7 @@ async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
let started_at = std::time::Instant::now();
let mut stream = ctx
.client
.download(&path, &cancel)
.download(&path, &DownloadOpts::default(), &cancel)
.await
.expect("download succeeds")
.download_stream;
@@ -491,7 +502,7 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
{
let stream = ctx
.client
.download(&path, &cancel)
.download(&path, &DownloadOpts::default(), &cancel)
.await
.expect("download succeeds")
.download_stream;

View File

@@ -5,13 +5,15 @@ edition.workspace = true
license.workspace = true
[dependencies]
hyper.workspace = true
opentelemetry = { workspace = true, features=["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
hyper0.workspace = true
opentelemetry = { workspace = true, features = ["trace"] }
opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
[dev-dependencies]
tracing-subscriber.workspace = true # For examples in docs

View File

@@ -1,7 +1,7 @@
//! Tracing wrapper for Hyper HTTP server
use hyper::HeaderMap;
use hyper::{Body, Request, Response};
use hyper0::HeaderMap;
use hyper0::{Body, Request, Response};
use std::future::Future;
use tracing::Instrument;
use tracing_opentelemetry::OpenTelemetrySpanExt;

View File

@@ -10,7 +10,6 @@
//!
//! ```rust,no_run
//! use tracing_subscriber::prelude::*;
//! use tracing_opentelemetry::OpenTelemetryLayer;
//!
//! #[tokio::main]
//! async fn main() {
@@ -22,7 +21,7 @@
//! .with_writer(std::io::stderr);
//!
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new);
//! let otlp_layer = tracing_utils::init_tracing("my_application").await;
//!
//! // Put it all together
//! tracing_subscriber::registry()
@@ -35,15 +34,15 @@
#![deny(unsafe_code)]
#![deny(clippy::undocumented_unsafe_blocks)]
use opentelemetry::sdk::Resource;
use opentelemetry::KeyValue;
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT};
pub use tracing_opentelemetry::OpenTelemetryLayer;
pub mod http;
use opentelemetry::trace::TracerProvider;
use opentelemetry::KeyValue;
use opentelemetry_sdk::Resource;
use tracing::Subscriber;
use tracing_subscriber::registry::LookupSpan;
use tracing_subscriber::Layer;
/// Set up OpenTelemetry exporter, using configuration from environment variables.
///
/// `service_name` is set as the OpenTelemetry 'service.name' resource (see
@@ -71,7 +70,10 @@ pub mod http;
///
/// This doesn't block, but is marked as 'async' to hint that this must be called in
/// asynchronous execution context.
pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trace::Tracer> {
pub async fn init_tracing<S>(service_name: &str) -> Option<impl Layer<S>>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
@@ -80,9 +82,10 @@ pub async fn init_tracing(service_name: &str) -> Option<opentelemetry::sdk::trac
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
/// tasks.
pub fn init_tracing_without_runtime(
service_name: &str,
) -> Option<opentelemetry::sdk::trace::Tracer> {
pub fn init_tracing_without_runtime<S>(service_name: &str) -> Option<impl Layer<S>>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
return None;
};
@@ -113,54 +116,36 @@ pub fn init_tracing_without_runtime(
Some(init_tracing_internal(service_name.to_string()))
}
fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer {
// Set up exporter from the OTEL_EXPORTER_* environment variables
let mut exporter = opentelemetry_otlp::new_exporter().http().with_env();
fn init_tracing_internal<S>(service_name: String) -> impl Layer<S>
where
S: Subscriber + for<'span> LookupSpan<'span>,
{
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
let exporter = opentelemetry_otlp::new_exporter().http();
// XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the
// OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the
// OpenTelemetry spec at
// <https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#endpoint-urls-for-otlphttp>,
// the full exporter URL is formed by appending "/v1/traces" to the value
// of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does
// that with the grpc-tonic exporter. Other exporters, like the HTTP
// exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without
// appending "/v1/traces".
//
// See https://github.com/open-telemetry/opentelemetry-rust/pull/950
//
// Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting
// the endpoint url with the "/v1/traces" path ourselves. If the bug is
// fixed in a later version, we can remove this code. But if we don't
// remember to remove this, it won't do any harm either, as the crate will
// just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint
// is set directly with `with_endpoint`.
if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() {
if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) {
if !endpoint.ends_with('/') {
endpoint.push('/');
}
endpoint.push_str("v1/traces");
exporter = exporter.with_endpoint(endpoint);
}
}
// TODO: opentelemetry::global::set_error_handler() with custom handler that
// bypasses default tracing layers, but logs regular looking log
// messages.
// Propagate trace information in the standard W3C TraceContext format.
opentelemetry::global::set_text_map_propagator(
opentelemetry::sdk::propagation::TraceContextPropagator::new(),
opentelemetry_sdk::propagation::TraceContextPropagator::new(),
);
opentelemetry_otlp::new_pipeline()
let tracer = opentelemetry_otlp::new_pipeline()
.tracing()
.with_exporter(exporter)
.with_trace_config(
opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new(
.with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource(
Resource::new(vec![KeyValue::new(
opentelemetry_semantic_conventions::resource::SERVICE_NAME,
service_name,
)])),
)
.install_batch(opentelemetry::runtime::Tokio)
)]),
))
.install_batch(opentelemetry_sdk::runtime::Tokio)
.expect("could not initialize opentelemetry exporter")
.tracer("global");
tracing_opentelemetry::layer().with_tracer(tracer)
}
// Shutdown trace pipeline gracefully, so that it has a chance to send any

View File

@@ -19,9 +19,10 @@ bincode.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
git-version.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
hyper0 = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
jsonwebtoken.workspace = true

View File

@@ -2,6 +2,8 @@
//! between other crates in this repository.
#![deny(clippy::undocumented_unsafe_blocks)]
extern crate hyper0 as hyper;
pub mod backoff;
/// `Lsn` type implements common tasks on Log Sequence Numbers
@@ -92,6 +94,10 @@ pub mod toml_edit_ext;
pub mod circuit_breaker;
// Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)]
pub use git_version;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:
@@ -131,7 +137,7 @@ macro_rules! project_git_version {
($const_identifier:ident) => {
// this should try GIT_VERSION first only then git_version::git_version!
const $const_identifier: &::core::primitive::str = {
const __COMMIT_FROM_GIT: &::core::primitive::str = git_version::git_version! {
const __COMMIT_FROM_GIT: &::core::primitive::str = $crate::git_version::git_version! {
prefix = "",
fallback = "unknown",
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha

View File

@@ -7,11 +7,13 @@ use axum::{
extract::{ws::WebSocket, State, WebSocketUpgrade},
response::Response,
};
use axum::{routing::get, Router, Server};
use axum::{routing::get, Router};
use clap::Parser;
use futures::Future;
use std::net::SocketAddr;
use std::{fmt::Debug, time::Duration};
use sysinfo::{RefreshKind, System, SystemExt};
use tokio::net::TcpListener;
use tokio::{sync::broadcast, task::JoinHandle};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
@@ -132,14 +134,14 @@ pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Res
args,
});
let addr = args.addr();
let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail"))
let addr_str = args.addr();
let addr: SocketAddr = addr_str.parse().expect("parsing address should not fail");
let listener = TcpListener::bind(&addr)
.await
.with_context(|| format!("failed to bind to {addr}"))?;
info!(addr, "server bound");
bound
.serve(app.into_make_service())
info!(addr_str, "server bound");
axum::serve(listener, app.into_make_service())
.await
.context("server exited")?;

View File

@@ -27,11 +27,10 @@ crc32c.workspace = true
either.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
hyper0.workspace = true
itertools.workspace = true
md5.workspace = true
nix.workspace = true

View File

@@ -736,4 +736,22 @@ impl Client {
.await
.map_err(Error::ReceiveBody)
}
pub async fn timeline_init_lsn_lease(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<LsnLease> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/lsn_lease",
self.mgmt_api_endpoint,
);
self.request(Method::POST, &uri, LsnLeaseRequest { lsn })
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
}

View File

@@ -12,7 +12,6 @@ anyhow.workspace = true
async-stream.workspace = true
clap = { workspace = true, features = ["string"] }
futures.workspace = true
git-version.workspace = true
itertools.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true

View File

@@ -10,7 +10,6 @@ license.workspace = true
anyhow.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["string"] }
git-version.workspace = true
humantime.workspace = true
pageserver = { path = ".." }
pageserver_api.workspace = true

View File

@@ -15,7 +15,7 @@ use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::config::PageserverIdentity;
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::controller_upcall_client::ControllerUpcallClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::{COMPUTE_REQUEST_RUNTIME, WALRECEIVER_RUNTIME};
@@ -396,7 +396,7 @@ fn start_pageserver(
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
remote_storage.clone(),
ControlPlaneClient::new(conf, &shutdown_pageserver),
ControllerUpcallClient::new(conf, &shutdown_pageserver),
conf,
);
if let Some(deletion_workers) = deletion_workers {
@@ -575,7 +575,7 @@ fn start_pageserver(
.build()
.map_err(|err| anyhow!(err))?;
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
let server = hyper0::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown({
let cancel = cancel.clone();

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use futures::Future;
use pageserver_api::{
controller_api::NodeRegisterRequest,
controller_api::{AvailabilityZone, NodeRegisterRequest},
shard::TenantShardId,
upcall_api::{
ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
@@ -17,9 +17,12 @@ use utils::{backoff, failpoint_support, generation::Generation, id::NodeId};
use crate::{config::PageServerConf, virtual_file::on_fatal_io_error};
use pageserver_api::config::NodeMetadata;
/// The Pageserver's client for using the control plane API: this is a small subset
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
pub struct ControlPlaneClient {
/// The Pageserver's client for using the storage controller upcall API: this is a small API
/// for dealing with generations (see docs/rfcs/025-generation-numbers.md).
///
/// The server presenting this API may either be the storage controller or some other
/// service (such as the Neon control plane) providing a store of generation numbers.
pub struct ControllerUpcallClient {
http_client: reqwest::Client,
base_url: Url,
node_id: NodeId,
@@ -45,7 +48,7 @@ pub trait ControlPlaneGenerationsApi {
) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
}
impl ControlPlaneClient {
impl ControllerUpcallClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
@@ -114,7 +117,7 @@ impl ControlPlaneClient {
}
}
impl ControlPlaneGenerationsApi for ControlPlaneClient {
impl ControlPlaneGenerationsApi for ControllerUpcallClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(
&self,
@@ -148,10 +151,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
match az_id_from_metadata {
Some(az_id) => Some(az_id),
Some(az_id) => Some(AvailabilityZone(az_id)),
None => {
tracing::warn!("metadata.json does not contain an 'availability_zone_id' field");
conf.availability_zone.clone()
conf.availability_zone.clone().map(AvailabilityZone)
}
}
};
@@ -216,29 +219,38 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
.join("validate")
.expect("Failed to build validate path");
let request = ValidateRequest {
tenants: tenants
.into_iter()
.map(|(id, gen)| ValidateRequestTenant {
id,
gen: gen
.into()
.expect("Generation should always be valid for a Tenant doing deletions"),
})
.collect(),
};
// When sending validate requests, break them up into chunks so that we
// avoid possible edge cases of generating any HTTP requests that
// require database I/O across many thousands of tenants.
let mut result: HashMap<TenantShardId, bool> = HashMap::with_capacity(tenants.len());
for tenant_chunk in (tenants).chunks(128) {
let request = ValidateRequest {
tenants: tenant_chunk
.iter()
.map(|(id, generation)| ValidateRequestTenant {
id: *id,
gen: (*generation).into().expect(
"Generation should always be valid for a Tenant doing deletions",
),
})
.collect(),
};
failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel);
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
failpoint_support::sleep_millis_async!(
"control-plane-client-validate-sleep",
&self.cancel
);
if self.cancel.is_cancelled() {
return Err(RetryForeverError::ShuttingDown);
}
let response: ValidateResponse =
self.retry_http_forever(&re_attach_path, request).await?;
for rt in response.tenants {
result.insert(rt.id, rt.valid);
}
}
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
Ok(response
.tenants
.into_iter()
.map(|rt| (rt.id, rt.valid))
.collect())
Ok(result.into_iter().collect())
}
}

View File

@@ -6,7 +6,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
@@ -622,7 +622,7 @@ impl DeletionQueue {
/// If remote_storage is None, then the returned workers will also be None.
pub fn new<C>(
remote_storage: GenericRemoteStorage,
control_plane_client: Option<C>,
controller_upcall_client: Option<C>,
conf: &'static PageServerConf,
) -> (Self, Option<DeletionQueueWorkers<C>>)
where
@@ -662,7 +662,7 @@ impl DeletionQueue {
conf,
backend_rx,
executor_tx,
control_plane_client,
controller_upcall_client,
lsn_table.clone(),
cancel.clone(),
),
@@ -704,7 +704,7 @@ mod test {
use tokio::task::JoinHandle;
use crate::{
control_plane_client::RetryForeverError,
controller_upcall_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
};

View File

@@ -25,8 +25,8 @@ use tracing::info;
use tracing::warn;
use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::controller_upcall_client::ControlPlaneGenerationsApi;
use crate::controller_upcall_client::RetryForeverError;
use crate::metrics;
use crate::virtual_file::MaybeFatalIo;
@@ -61,7 +61,7 @@ where
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
// Client for calling into control plane API for validation of deletes
control_plane_client: Option<C>,
controller_upcall_client: Option<C>,
// DeletionLists which are waiting generation validation. Not safe to
// execute until [`validate`] has processed them.
@@ -94,7 +94,7 @@ where
conf: &'static PageServerConf,
rx: tokio::sync::mpsc::Receiver<ValidatorQueueMessage>,
tx: tokio::sync::mpsc::Sender<DeleterMessage>,
control_plane_client: Option<C>,
controller_upcall_client: Option<C>,
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
cancel: CancellationToken,
) -> Self {
@@ -102,7 +102,7 @@ where
conf,
rx,
tx,
control_plane_client,
controller_upcall_client,
lsn_table,
pending_lists: Vec::new(),
validated_lists: Vec::new(),
@@ -145,8 +145,8 @@ where
return Ok(());
}
let tenants_valid = if let Some(control_plane_client) = &self.control_plane_client {
match control_plane_client
let tenants_valid = if let Some(controller_upcall_client) = &self.controller_upcall_client {
match controller_upcall_client
.validate(tenant_generations.iter().map(|(k, v)| (*k, *v)).collect())
.await
{

View File

@@ -56,6 +56,7 @@ use utils::http::endpoint::request_span;
use utils::http::request::must_parse_query_param;
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -80,7 +81,6 @@ use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
use crate::{config::PageServerConf, tenant::mgr};
use crate::{disk_usage_eviction_task, tenant};
use pageserver_api::models::{
StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest,
@@ -589,6 +589,10 @@ async fn timeline_create_handler(
StatusCode::SERVICE_UNAVAILABLE,
HttpErrorBody::from_msg(e.to_string()),
),
Err(e @ tenant::CreateTimelineError::AncestorArchived) => json_response(
StatusCode::NOT_ACCEPTABLE,
HttpErrorBody::from_msg(e.to_string()),
),
Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
StatusCode::SERVICE_UNAVAILABLE,
HttpErrorBody::from_msg("tenant shutting down".to_string()),
@@ -820,7 +824,7 @@ async fn get_lsn_by_timestamp_handler(
let lease = if with_lease {
timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
.init_lsn_lease(lsn, timeline.get_lsn_lease_length_for_ts(), &ctx)
.inspect_err(|_| {
warn!("fail to grant a lease to {}", lsn);
})
@@ -1688,9 +1692,18 @@ async fn lsn_lease_handler(
let timeline =
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
.await?;
let result = timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
.map_err(|e| ApiError::InternalServerError(e.context("lsn lease http handler")))?;
let result = async {
timeline
.init_lsn_lease(lsn, timeline.get_lsn_lease_length(), &ctx)
.map_err(|e| {
ApiError::InternalServerError(
e.context(format!("invalid lsn lease request at {lsn}")),
)
})
}
.instrument(info_span!("init_lsn_lease", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await?;
json_response(StatusCode::OK, result)
}
@@ -1706,8 +1719,13 @@ async fn timeline_gc_handler(
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let state = get_state(&request);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let gc_result = mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
let gc_result = state
.tenant_manager
.immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx)
.await?;
json_response(StatusCode::OK, gc_result)
}
@@ -1724,6 +1742,10 @@ async fn timeline_compact_handler(
let state = get_state(&request);
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
flags |= CompactFlags::ForceL0Compaction;
}
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
flags |= CompactFlags::ForceRepartition;
}
@@ -1770,6 +1792,9 @@ async fn timeline_checkpoint_handler(
let state = get_state(&request);
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
flags |= CompactFlags::ForceL0Compaction;
}
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
flags |= CompactFlags::ForceRepartition;
}

View File

@@ -6,13 +6,15 @@ pub mod basebackup;
pub mod config;
pub mod consumption_metrics;
pub mod context;
pub mod control_plane_client;
pub mod controller_upcall_client;
pub mod deletion_queue;
pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub mod l0_flush;
extern crate hyper0 as hyper;
use futures::{stream::FuturesUnordered, StreamExt};
pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;

View File

@@ -8,6 +8,8 @@ use metrics::{
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use postgres_backend::{is_expected_io_error, QueryError};
use pq_proto::framed::ConnectionError;
use strum::{EnumCount, VariantNames};
use strum_macros::{IntoStaticStr, VariantNames};
use tracing::warn;
@@ -1508,6 +1510,7 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
pub(crate) struct BasebackupQueryTime {
ok: Histogram,
error: Histogram,
client_error: Histogram,
}
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
@@ -1521,6 +1524,7 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
BasebackupQueryTime {
ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
error: vec.get_metric_with_label_values(&["error"]).unwrap(),
client_error: vec.get_metric_with_label_values(&["client_error"]).unwrap(),
}
});
@@ -1557,7 +1561,7 @@ impl BasebackupQueryTime {
}
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
pub(crate) fn observe<T>(self, res: &Result<T, QueryError>) {
let elapsed = self.start.elapsed();
let ex_throttled = self
.ctx
@@ -1576,10 +1580,15 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
elapsed
}
};
let metric = if res.is_ok() {
&self.parent.ok
} else {
&self.parent.error
// If you want to change categorize of a specific error, also change it in `log_query_error`.
let metric = match res {
Ok(_) => &self.parent.ok,
Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
if is_expected_io_error(io_error) =>
{
&self.parent.client_error
}
Err(_) => &self.parent.error,
};
metric.observe(ex_throttled.as_secs_f64());
}

View File

@@ -273,10 +273,20 @@ async fn page_service_conn_main(
info!("Postgres client disconnected ({io_error})");
Ok(())
} else {
Err(io_error).context("Postgres connection error")
let tenant_id = conn_handler.timeline_handles.tenant_id();
Err(io_error).context(format!(
"Postgres connection error for tenant_id={:?} client at peer_addr={}",
tenant_id, peer_addr
))
}
}
other => other.context("Postgres query error"),
other => {
let tenant_id = conn_handler.timeline_handles.tenant_id();
other.context(format!(
"Postgres query error for tenant_id={:?} client peer_addr={}",
tenant_id, peer_addr
))
}
}
}
@@ -340,6 +350,10 @@ impl TimelineHandles {
}
})
}
fn tenant_id(&self) -> Option<TenantId> {
self.wrapper.tenant_id.get().copied()
}
}
pub(crate) struct TenantManagerWrapper {
@@ -819,7 +833,7 @@ impl PageServerHandler {
set_tracing_field_shard_id(&timeline);
let lease = timeline
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
.renew_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
.inspect_err(|e| {
warn!("{e}");
})

View File

@@ -21,6 +21,7 @@ use futures::stream::FuturesUnordered;
use futures::StreamExt;
use pageserver_api::models;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::LsnLease;
use pageserver_api::models::TimelineArchivalState;
use pageserver_api::models::TimelineState;
use pageserver_api::models::TopTenantShardItem;
@@ -96,6 +97,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::walingest::WalLagCooldown;
use crate::walredo;
use crate::InitializationOrder;
use std::collections::hash_map::Entry;
@@ -182,27 +184,54 @@ pub struct TenantSharedResources {
pub(super) struct AttachedTenantConf {
tenant_conf: TenantConfOpt,
location: AttachedLocationConfig,
/// The deadline before which we are blocked from GC so that
/// leases have a chance to be renewed.
lsn_lease_deadline: Option<tokio::time::Instant>,
}
impl AttachedTenantConf {
fn new(tenant_conf: TenantConfOpt, location: AttachedLocationConfig) -> Self {
// Sets a deadline before which we cannot proceed to GC due to lsn lease.
//
// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
// length, we guarantee that all the leases we granted before will have a chance to renew
// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
Some(
tokio::time::Instant::now()
+ tenant_conf
.lsn_lease_length
.unwrap_or(LsnLease::DEFAULT_LENGTH),
)
} else {
// We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
// because we don't do GC in these modes.
None
};
Self {
tenant_conf,
location,
lsn_lease_deadline,
}
}
fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => Ok(Self {
tenant_conf: location_conf.tenant_conf,
location: *attach_conf,
}),
LocationMode::Attached(attach_conf) => {
Ok(Self::new(location_conf.tenant_conf, *attach_conf))
}
LocationMode::Secondary(_) => {
anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
}
}
}
fn is_gc_blocked_by_lsn_lease_deadline(&self) -> bool {
self.lsn_lease_deadline
.map(|d| tokio::time::Instant::now() < d)
.unwrap_or(false)
}
}
struct TimelinePreload {
timeline_id: TimelineId,
@@ -291,6 +320,9 @@ pub struct Tenant {
/// background warmup.
pub(crate) activate_now_sem: tokio::sync::Semaphore,
/// Time it took for the tenant to activate. Zero if not active yet.
attach_wal_lag_cooldown: Arc<std::sync::OnceLock<WalLagCooldown>>,
// Cancellation token fires when we have entered shutdown(). This is a parent of
// Timelines' cancellation token.
pub(crate) cancel: CancellationToken,
@@ -563,6 +595,8 @@ pub enum CreateTimelineError {
AncestorLsn(anyhow::Error),
#[error("ancestor timeline is not active")]
AncestorNotActive,
#[error("ancestor timeline is archived")]
AncestorArchived,
#[error("tenant shutting down")]
ShuttingDown,
#[error(transparent)]
@@ -970,11 +1004,15 @@ impl Tenant {
// Remote preload is complete.
drop(remote_load_completion);
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
let attach_start = std::time::Instant::now();
let attached = {
let _attach_timer = Some(TENANT.attach.start_timer());
tenant_clone.attach(preload, &ctx).await
};
let attach_duration = attach_start.elapsed();
_ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration));
match attached {
Ok(()) => {
@@ -1698,6 +1736,11 @@ impl Tenant {
return Err(CreateTimelineError::AncestorNotActive);
}
if ancestor_timeline.is_archived() == Some(true) {
info!("tried to branch archived timeline");
return Err(CreateTimelineError::AncestorArchived);
}
if let Some(lsn) = ancestor_start_lsn.as_mut() {
*lsn = lsn.align();
@@ -1815,6 +1858,11 @@ impl Tenant {
info!("Skipping GC in location state {:?}", conf.location);
return Ok(GcResult::default());
}
if conf.is_gc_blocked_by_lsn_lease_deadline() {
info!("Skipping GC because lsn lease deadline is not reached");
return Ok(GcResult::default());
}
}
let _guard = match self.gc_block.start().await {
@@ -2623,6 +2671,8 @@ impl Tenant {
Arc::new(AttachedTenantConf {
tenant_conf: new_tenant_conf.clone(),
location: inner.location,
// Attached location is not changed, no need to update lsn lease deadline.
lsn_lease_deadline: inner.lsn_lease_deadline,
})
});
@@ -2712,6 +2762,7 @@ impl Tenant {
pg_version,
state,
last_aux_file_policy,
self.attach_wal_lag_cooldown.clone(),
self.cancel.child_token(),
);
@@ -2818,6 +2869,7 @@ impl Tenant {
Some(Duration::from_secs(3600 * 24)),
)),
activate_now_sem: tokio::sync::Semaphore::new(0),
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
cancel: CancellationToken::default(),
gate: Gate::default(),
timeline_get_throttle: Arc::new(throttle::Throttle::new(
@@ -3880,9 +3932,9 @@ async fn run_initdb(
let _permit = INIT_DB_SEMAPHORE.acquire().await;
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
.args(["-D", initdb_target_dir.as_ref()])
.args(["-U", &conf.superuser])
.args(["-E", "utf8"])
.args(["--pgdata", initdb_target_dir.as_ref()])
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
@@ -4454,13 +4506,17 @@ mod tests {
tline.freeze_and_flush().await.map_err(|e| e.into())
}
#[tokio::test]
#[tokio::test(start_paused = true)]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let (tenant, ctx) =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
.await?
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
@@ -7237,9 +7293,17 @@ mod tests {
Ok(())
}
#[tokio::test]
#[tokio::test(start_paused = true)]
async fn test_lsn_lease() -> anyhow::Result<()> {
let (tenant, ctx) = TenantHarness::create("test_lsn_lease").await?.load().await;
let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
.await
.unwrap()
.load()
.await;
// Advance to the lsn lease deadline so that GC is not blocked by
// initial transition into AttachedSingle.
tokio::time::advance(tenant.get_lsn_lease_length()).await;
tokio::time::resume();
let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let end_lsn = Lsn(0x100);
@@ -7267,24 +7331,33 @@ mod tests {
let leased_lsns = [0x30, 0x50, 0x70];
let mut leases = Vec::new();
let _: anyhow::Result<_> = leased_lsns.iter().try_for_each(|n| {
leases.push(timeline.make_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)?);
Ok(())
leased_lsns.iter().for_each(|n| {
leases.push(
timeline
.init_lsn_lease(Lsn(*n), timeline.get_lsn_lease_length(), &ctx)
.expect("lease request should succeed"),
);
});
// Renewing with shorter lease should not change the lease.
let updated_lease_0 =
timeline.make_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)?;
assert_eq!(updated_lease_0.valid_until, leases[0].valid_until);
let updated_lease_0 = timeline
.renew_lsn_lease(Lsn(leased_lsns[0]), Duration::from_secs(0), &ctx)
.expect("lease renewal should succeed");
assert_eq!(
updated_lease_0.valid_until, leases[0].valid_until,
" Renewing with shorter lease should not change the lease."
);
// Renewing with a long lease should renew lease with later expiration time.
let updated_lease_1 = timeline.make_lsn_lease(
Lsn(leased_lsns[1]),
timeline.get_lsn_lease_length() * 2,
&ctx,
)?;
assert!(updated_lease_1.valid_until > leases[1].valid_until);
let updated_lease_1 = timeline
.renew_lsn_lease(
Lsn(leased_lsns[1]),
timeline.get_lsn_lease_length() * 2,
&ctx,
)
.expect("lease renewal should succeed");
assert!(
updated_lease_1.valid_until > leases[1].valid_until,
"Renewing with a long lease should renew lease with later expiration time."
);
// Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
info!(
@@ -7301,7 +7374,8 @@ mod tests {
&CancellationToken::new(),
&ctx,
)
.await?;
.await
.unwrap();
// Keeping everything <= Lsn(0x80) b/c leases:
// 0/10: initdb layer
@@ -7315,13 +7389,16 @@ mod tests {
// Make lease on a already GC-ed LSN.
// 0/80 does not have a valid lease + is below latest_gc_cutoff
assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
let res = timeline.make_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx);
assert!(res.is_err());
timeline
.init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
.expect_err("lease request on GC-ed LSN should fail");
// Should still be able to renew a currently valid lease
// Assumption: original lease to is still valid for 0/50.
let _ =
timeline.make_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)?;
// (use `Timeline::init_lsn_lease` for testing so it always does validation)
timeline
.init_lsn_lease(Lsn(leased_lsns[1]), timeline.get_lsn_lease_length(), &ctx)
.expect("lease renewal with validation should succeed");
Ok(())
}

View File

@@ -5,6 +5,7 @@ use itertools::Itertools;
use super::storage_layer::LayerName;
/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong).
///
/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example,
///
/// ```plain

View File

@@ -8,7 +8,6 @@
//! We cannot use global or default config instead, because wrong settings
//! may lead to a data loss.
//!
use anyhow::bail;
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::CompactionAlgorithmSettings;
@@ -441,29 +440,6 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
}
}
impl TryFrom<toml_edit::Item> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
match item {
toml_edit::Item::Value(value) => {
let d = value.into_deserializer();
return serde_path_to_error::deserialize(d)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
toml_edit::Item::Table(table) => {
let deserializer =
toml_edit::de::Deserializer::from(toml_edit::DocumentMut::from(table));
return serde_path_to_error::deserialize(deserializer)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
_ => {
bail!("expected non-inline table but found {item}")
}
}
}
}
/// This is a conversion from our internal tenant config object to the one used
/// in external APIs.
impl From<TenantConfOpt> for models::TenantConfig {

View File

@@ -1,29 +1,12 @@
use std::{collections::HashMap, time::Duration};
use std::collections::HashMap;
use super::remote_timeline_client::index::GcBlockingReason;
use tokio::time::Instant;
use utils::id::TimelineId;
type TimelinesBlocked = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
use super::remote_timeline_client::index::GcBlockingReason;
#[derive(Default)]
struct Storage {
timelines_blocked: TimelinesBlocked,
/// The deadline before which we are blocked from GC so that
/// leases have a chance to be renewed.
lsn_lease_deadline: Option<Instant>,
}
type Storage = HashMap<TimelineId, enumset::EnumSet<GcBlockingReason>>;
impl Storage {
fn is_blocked_by_lsn_lease_deadline(&self) -> bool {
self.lsn_lease_deadline
.map(|d| Instant::now() < d)
.unwrap_or(false)
}
}
/// GcBlock provides persistent (per-timeline) gc blocking and facilitates transient time based gc
/// blocking.
/// GcBlock provides persistent (per-timeline) gc blocking.
#[derive(Default)]
pub(crate) struct GcBlock {
/// The timelines which have current reasons to block gc.
@@ -66,17 +49,6 @@ impl GcBlock {
}
}
/// Sets a deadline before which we cannot proceed to GC due to lsn lease.
///
/// We do this as the leases mapping are not persisted to disk. By delaying GC by lease
/// length, we guarantee that all the leases we granted before will have a chance to renew
/// when we run GC for the first time after restart / transition from AttachedMulti to AttachedSingle.
pub(super) fn set_lsn_lease_deadline(&self, lsn_lease_length: Duration) {
let deadline = Instant::now() + lsn_lease_length;
let mut g = self.reasons.lock().unwrap();
g.lsn_lease_deadline = Some(deadline);
}
/// Describe the current gc blocking reasons.
///
/// TODO: make this json serializable.
@@ -102,7 +74,7 @@ impl GcBlock {
) -> anyhow::Result<bool> {
let (added, uploaded) = {
let mut g = self.reasons.lock().unwrap();
let set = g.timelines_blocked.entry(timeline.timeline_id).or_default();
let set = g.entry(timeline.timeline_id).or_default();
let added = set.insert(reason);
// LOCK ORDER: intentionally hold the lock, see self.reasons.
@@ -133,7 +105,7 @@ impl GcBlock {
let (remaining_blocks, uploaded) = {
let mut g = self.reasons.lock().unwrap();
match g.timelines_blocked.entry(timeline.timeline_id) {
match g.entry(timeline.timeline_id) {
Entry::Occupied(mut oe) => {
let set = oe.get_mut();
set.remove(reason);
@@ -147,7 +119,7 @@ impl GcBlock {
}
}
let remaining_blocks = g.timelines_blocked.len();
let remaining_blocks = g.len();
// LOCK ORDER: intentionally hold the lock while scheduling; see self.reasons
let uploaded = timeline
@@ -172,11 +144,11 @@ impl GcBlock {
pub(crate) fn before_delete(&self, timeline: &super::Timeline) {
let unblocked = {
let mut g = self.reasons.lock().unwrap();
if g.timelines_blocked.is_empty() {
if g.is_empty() {
return;
}
g.timelines_blocked.remove(&timeline.timeline_id);
g.remove(&timeline.timeline_id);
BlockingReasons::clean_and_summarize(g).is_none()
};
@@ -187,11 +159,10 @@ impl GcBlock {
}
/// Initialize with the non-deleted timelines of this tenant.
pub(crate) fn set_scanned(&self, scanned: TimelinesBlocked) {
pub(crate) fn set_scanned(&self, scanned: Storage) {
let mut g = self.reasons.lock().unwrap();
assert!(g.timelines_blocked.is_empty());
g.timelines_blocked
.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
assert!(g.is_empty());
g.extend(scanned.into_iter().filter(|(_, v)| !v.is_empty()));
if let Some(reasons) = BlockingReasons::clean_and_summarize(g) {
tracing::info!(summary=?reasons, "initialized with gc blocked");
@@ -205,7 +176,6 @@ pub(super) struct Guard<'a> {
#[derive(Debug)]
pub(crate) struct BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: bool,
timelines: usize,
reasons: enumset::EnumSet<GcBlockingReason>,
}
@@ -214,8 +184,8 @@ impl std::fmt::Display for BlockingReasons {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"tenant_blocked_by_lsn_lease_deadline: {}, {} timelines block for {:?}",
self.tenant_blocked_by_lsn_lease_deadline, self.timelines, self.reasons
"{} timelines block for {:?}",
self.timelines, self.reasons
)
}
}
@@ -223,15 +193,13 @@ impl std::fmt::Display for BlockingReasons {
impl BlockingReasons {
fn clean_and_summarize(mut g: std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let mut reasons = enumset::EnumSet::empty();
g.timelines_blocked.retain(|_key, value| {
g.retain(|_key, value| {
reasons = reasons.union(*value);
!value.is_empty()
});
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
if !g.timelines_blocked.is_empty() || blocked_by_lsn_lease_deadline {
if !g.is_empty() {
Some(BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
timelines: g.timelines_blocked.len(),
timelines: g.len(),
reasons,
})
} else {
@@ -240,17 +208,14 @@ impl BlockingReasons {
}
fn summarize(g: &std::sync::MutexGuard<'_, Storage>) -> Option<Self> {
let blocked_by_lsn_lease_deadline = g.is_blocked_by_lsn_lease_deadline();
if g.timelines_blocked.is_empty() && !blocked_by_lsn_lease_deadline {
if g.is_empty() {
None
} else {
let reasons = g
.timelines_blocked
.values()
.fold(enumset::EnumSet::empty(), |acc, next| acc.union(*next));
Some(BlockingReasons {
tenant_blocked_by_lsn_lease_deadline: blocked_by_lsn_lease_deadline,
timelines: g.timelines_blocked.len(),
timelines: g.len(),
reasons,
})
}

View File

@@ -30,8 +30,8 @@ use utils::{backoff, completion, crashsafe};
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::control_plane_client::{
ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
use crate::controller_upcall_client::{
ControlPlaneGenerationsApi, ControllerUpcallClient, RetryForeverError,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
@@ -122,7 +122,7 @@ pub(crate) enum ShardSelector {
Known(ShardIndex),
}
/// A convenience for use with the re_attach ControlPlaneClient function: rather
/// A convenience for use with the re_attach ControllerUpcallClient function: rather
/// than the serializable struct, we build this enum that encapsulates
/// the invariant that attached tenants always have generations.
///
@@ -219,7 +219,11 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
+ TEMP_FILE_SUFFIX;
let tmp_path = path_with_suffix_extension(&path, &rand_suffix);
fs::rename(path.as_ref(), &tmp_path).await?;
fs::File::open(parent).await?.sync_all().await?;
fs::File::open(parent)
.await?
.sync_all()
.await
.maybe_fatal_err("safe_rename_tenant_dir")?;
Ok(tmp_path)
}
@@ -341,7 +345,7 @@ async fn init_load_generations(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
);
emergency_generations(tenant_confs)
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
} else if let Some(client) = ControllerUpcallClient::new(conf, cancel) {
info!("Calling control plane API to re-attach tenants");
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
match client.re_attach(conf).await {
@@ -949,12 +953,6 @@ impl TenantManager {
(LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
match attach_conf.generation.cmp(&tenant.generation) {
Ordering::Equal => {
if attach_conf.attach_mode == AttachmentMode::Single {
tenant
.gc_block
.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
}
// A transition from Attached to Attached in the same generation, we may
// take our fast path and just provide the updated configuration
// to the tenant.
@@ -2199,6 +2197,82 @@ impl TenantManager {
Ok((wanted_bytes, shard_count as u32))
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn immediate_gc(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, ApiError> {
let tenant = {
let guard = self.tenants.read().unwrap();
guard
.get(&tenant_shard_id)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?
};
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx: RequestContext =
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
fail::fail_point!("immediate_gc_task_pre");
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
}
tracing::info!(
total = js.len(),
"starting to wait for the gc'd layers to be dropped"
);
while let Some(res) = js.join_next().await {
res.expect("wait_drop should not panic");
}
}
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().map(|x| &x.remote_client);
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
}
}
#[derive(Debug, thiserror::Error)]
@@ -2343,7 +2417,7 @@ enum TenantSlotDropError {
/// Errors that can happen any time we are walking the tenant map to try and acquire
/// the TenantSlot for a particular tenant.
#[derive(Debug, thiserror::Error)]
pub enum TenantMapError {
pub(crate) enum TenantMapError {
// Tried to read while initializing
#[error("tenant map is still initializing")]
StillInitializing,
@@ -2373,7 +2447,7 @@ pub enum TenantMapError {
/// The `old_value` may be dropped before the SlotGuard is dropped, by calling
/// `drop_old_value`. It is an error to call this without shutting down
/// the conents of `old_value`.
pub struct SlotGuard {
pub(crate) struct SlotGuard {
tenant_shard_id: TenantShardId,
old_value: Option<TenantSlot>,
upserted: bool,
@@ -2766,81 +2840,6 @@ use {
utils::http::error::ApiError,
};
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))]
pub(crate) async fn immediate_gc(
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<GcResult, ApiError> {
let tenant = {
let guard = TENANTS.read().unwrap();
guard
.get(&tenant_shard_id)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?
};
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx: RequestContext =
ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let _gate_guard = tenant.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
fail::fail_point!("immediate_gc_task_pre");
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
#[cfg(feature = "testing")]
{
// we need to synchronize with drop completion for python tests without polling for
// log messages
if let Ok(result) = result.as_mut() {
let mut js = tokio::task::JoinSet::new();
for layer in std::mem::take(&mut result.doomed_layers) {
js.spawn(layer.wait_drop());
}
tracing::info!(
total = js.len(),
"starting to wait for the gc'd layers to be dropped"
);
while let Some(res) = js.join_next().await {
res.expect("wait_drop should not panic");
}
}
let timeline = tenant.get_timeline(timeline_id, false).ok();
let rtc = timeline.as_ref().map(|x| &x.remote_client);
if let Some(rtc) = rtc {
// layer drops schedule actions on remote timeline client to actually do the
// deletions; don't care about the shutdown error, just exit fast
drop(rtc.wait_completion().await);
}
}
result.map_err(|e| match e {
GcError::TenantCancelled | GcError::TimelineCancelled => ApiError::ShuttingDown,
GcError::TimelineNotFound => {
ApiError::NotFound(anyhow::anyhow!("Timeline not found").into())
}
other => ApiError::InternalServerError(anyhow::anyhow!(other)),
})
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;

View File

@@ -27,7 +27,7 @@ use crate::tenant::Generation;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use utils::pausable_failpoint;
@@ -153,7 +153,9 @@ async fn download_object<'a>(
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;
let download = storage.download(src_path, cancel).await?;
let download = storage
.download(src_path, &DownloadOpts::default(), cancel)
.await?;
pausable_failpoint!("before-downloading-layer-stream-pausable");
@@ -178,6 +180,7 @@ async fn download_object<'a>(
destination_file
.flush()
.await
.maybe_fatal_err("download_object sync_all")
.with_context(|| format!("flush source file at {dst_path}"))
.map_err(DownloadError::Other)?;
@@ -185,6 +188,7 @@ async fn download_object<'a>(
destination_file
.sync_all()
.await
.maybe_fatal_err("download_object sync_all")
.with_context(|| format!("failed to fsync source file at {dst_path}"))
.map_err(DownloadError::Other)?;
@@ -202,7 +206,9 @@ async fn download_object<'a>(
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;
let mut download = storage.download(src_path, cancel).await?;
let mut download = storage
.download(src_path, &DownloadOpts::default(), cancel)
.await?;
pausable_failpoint!("before-downloading-layer-stream-pausable");
@@ -232,6 +238,7 @@ async fn download_object<'a>(
destination_file
.sync_all()
.await
.maybe_fatal_err("download_object sync_all")
.with_context(|| format!("failed to fsync source file at {dst_path}"))
.map_err(DownloadError::Other)?;
@@ -341,7 +348,9 @@ async fn do_download_index_part(
let index_part_bytes = download_retry_forever(
|| async {
let download = storage.download(&remote_path, cancel).await?;
let download = storage
.download(&remote_path, &DownloadOpts::default(), cancel)
.await?;
let mut bytes = Vec::new();
@@ -523,10 +532,15 @@ pub(crate) async fn download_initdb_tar_zst(
.with_context(|| format!("tempfile creation {temp_path}"))
.map_err(DownloadError::Other)?;
let download = match storage.download(&remote_path, cancel).await {
let download = match storage
.download(&remote_path, &DownloadOpts::default(), cancel)
.await
{
Ok(dl) => dl,
Err(DownloadError::NotFound) => {
storage.download(&remote_preserved_path, cancel).await?
storage
.download(&remote_preserved_path, &DownloadOpts::default(), cancel)
.await?
}
Err(other) => Err(other)?,
};

View File

@@ -49,7 +49,7 @@ use futures::Future;
use metrics::UIntGauge;
use pageserver_api::models::SecondaryProgress;
use pageserver_api::shard::TenantShardId;
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, warn, Instrument};
@@ -944,36 +944,34 @@ impl<'a> TenantDownloader<'a> {
) -> Result<HeatMapDownload, UpdateError> {
debug_assert_current_span_has_tenant_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// TODO: pull up etag check into the request, to do a conditional GET rather than
// issuing a GET and then maybe ignoring the response body
// (https://github.com/neondatabase/neon/issues/6199)
tracing::debug!("Downloading heatmap for secondary tenant",);
let heatmap_path = remote_heatmap_path(tenant_shard_id);
let cancel = &self.secondary_state.cancel;
let opts = DownloadOpts {
etag: prev_etag.cloned(),
};
backoff::retry(
|| async {
let download = self
let download = match self
.remote_storage
.download(&heatmap_path, cancel)
.download(&heatmap_path, &opts, cancel)
.await
.map_err(UpdateError::from)?;
{
Ok(download) => download,
Err(DownloadError::Unmodified) => return Ok(HeatMapDownload::Unmodified),
Err(err) => return Err(err.into()),
};
SECONDARY_MODE.download_heatmap.inc();
if Some(&download.etag) == prev_etag {
Ok(HeatMapDownload::Unmodified)
} else {
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
Ok(HeatMapDownload::Modified(HeatMapModified {
etag: download.etag,
last_modified: download.last_modified,
bytes: heatmap_bytes,
}))
}
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
Ok(HeatMapDownload::Modified(HeatMapModified {
etag: download.etag,
last_modified: download.last_modified,
bytes: heatmap_bytes,
}))
},
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
FAILED_DOWNLOAD_WARN_THRESHOLD,
@@ -984,6 +982,7 @@ impl<'a> TenantDownloader<'a> {
.await
.ok_or_else(|| UpdateError::Cancelled)
.and_then(|x| x)
.inspect(|_| SECONDARY_MODE.download_heatmap.inc())
}
/// Download heatmap layers that are not present on local disk, or update their

View File

@@ -40,11 +40,11 @@ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadCoalesceMode, VectoredReadPlanner,
VectoredReadPlanner,
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{anyhow, bail, ensure, Context, Result};
@@ -53,6 +53,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::DBDIR_KEY;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
@@ -589,7 +590,9 @@ impl DeltaLayerWriterInner {
);
// fsync the file
file.sync_all().await?;
file.sync_all()
.await
.maybe_fatal_err("delta_layer sync_all")?;
trace!("created delta layer {}", self.path);
@@ -961,14 +964,25 @@ impl DeltaLayerInner {
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.filter_map(|(_, blob_meta)| {
if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
// The size of values for these keys is unbounded and can
// grow very large in pathological cases.
None
} else {
Some(format!("{}@{}", blob_meta.key, blob_meta.lsn))
}
})
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
largest_read_size,
read_size_soft_max,
offenders
);
if !offenders.is_empty() {
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
largest_read_size,
read_size_soft_max,
offenders
);
}
}
largest_read_size
@@ -1133,7 +1147,7 @@ impl DeltaLayerInner {
ctx: &RequestContext,
) -> anyhow::Result<usize> {
use crate::tenant::vectored_blob_io::{
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
BlobMeta, ChunkedVectoredReadBuilder, VectoredReadExtended,
};
use futures::stream::TryStreamExt;
@@ -1183,8 +1197,8 @@ impl DeltaLayerInner {
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
let mut read_builder: Option<VectoredReadBuilder> = None;
let read_mode = VectoredReadCoalesceMode::get();
let mut read_builder: Option<ChunkedVectoredReadBuilder> = None;
let align = virtual_file::get_io_buffer_alignment();
let max_read_size = self
.max_vectored_read_bytes
@@ -1228,12 +1242,12 @@ impl DeltaLayerInner {
{
None
} else {
read_builder.replace(VectoredReadBuilder::new(
read_builder.replace(ChunkedVectoredReadBuilder::new(
offsets.start.pos(),
offsets.end.pos(),
meta,
max_read_size,
read_mode,
align,
))
}
} else {

View File

@@ -41,7 +41,7 @@ use crate::tenant::vectored_blob_io::{
};
use crate::tenant::PageReconstructError;
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::{self, VirtualFile};
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
@@ -49,6 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
use pageserver_api::config::MaxVectoredReadBytes;
use pageserver_api::key::DBDIR_KEY;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
@@ -587,14 +588,25 @@ impl ImageLayerInner {
.blobs_at
.as_slice()
.iter()
.map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn))
.filter_map(|(_, blob_meta)| {
if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
// The size of values for these keys is unbounded and can
// grow very large in pathological cases.
None
} else {
Some(format!("{}@{}", blob_meta.key, blob_meta.lsn))
}
})
.join(", ");
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
buf_size,
max_vectored_read_bytes,
offenders
);
if !offenders.is_empty() {
tracing::warn!(
"Oversized vectored read ({} > {}) for keys {}",
buf_size,
max_vectored_read_bytes,
offenders
);
}
}
let buf = BytesMut::with_capacity(buf_size);
@@ -889,7 +901,9 @@ impl ImageLayerWriterInner {
// set inner.file here. The first read will have to re-open it.
// fsync the file
file.sync_all().await?;
file.sync_all()
.await
.maybe_fatal_err("image_layer sync_all")?;
trace!("created image layer {}", self.path);

View File

@@ -442,11 +442,13 @@ impl Layer {
// Visibility was modified to Visible: maybe log about this
match ctx.task_kind() {
TaskKind::CalculateSyntheticSize
| TaskKind::OndemandLogicalSizeCalculation
| TaskKind::GarbageCollector
| TaskKind::MgmtRequest => {
// This situation is expected in code paths do binary searches of the LSN space to resolve
// an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size,
// and on-demand for certain HTTP API requests.
// and on-demand for certain HTTP API requests. On-demand logical size calculation is also included
// because it is run as a sub-task of synthetic size.
}
_ => {
// In all other contexts, it is unusual to do I/O involving layers which are not visible at
@@ -456,8 +458,8 @@ impl Layer {
// This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object
// which was covered by a concurrent compaction.
tracing::info!(
"Layer {} became visible as a result of access",
self.0.desc.key()
layer=%self,
"became visible as a result of access",
);
}
}
@@ -686,7 +688,9 @@ impl Drop for LayerInner {
// and we could be delaying shutdown for nothing.
}
if let Some(timeline) = self.timeline.upgrade() {
let timeline = self.timeline.upgrade();
if let Some(timeline) = timeline.as_ref() {
// Only need to decrement metrics if the timeline still exists: otherwise
// it will have already de-registered these metrics via TimelineMetrics::shutdown
if self.desc.is_delta() {
@@ -717,7 +721,6 @@ impl Drop for LayerInner {
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().layer_name();
let file_size = self.layer_desc().file_size;
let timeline = self.timeline.clone();
let meta = self.metadata();
let status = self.status.take();
@@ -727,7 +730,7 @@ impl Drop for LayerInner {
// carry this until we are finished for [`Layer::wait_drop`] support
let _status = status;
let Some(timeline) = timeline.upgrade() else {
let Some(timeline) = timeline else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);

View File

@@ -330,7 +330,6 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
tenant.gc_block.set_lsn_lease_deadline(tenant.get_lsn_lease_length());
loop {
tokio::select! {
_ = cancel.cancelled() => {

View File

@@ -48,7 +48,6 @@ use utils::{
sync::gate::{Gate, GateGuard},
};
use std::pin::pin;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
@@ -62,14 +61,17 @@ use std::{
collections::btree_map::Entry,
ops::{Deref, Range},
};
use std::{pin::pin, sync::OnceLock};
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
config::AttachmentMode,
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
},
walingest::WalLagCooldown,
walredo,
};
use crate::{
@@ -112,7 +114,7 @@ use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIndex;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::to_pg_timestamp;
use postgres_ffi::{to_pg_timestamp, v14::xlog_utils, WAL_SEGMENT_SIZE};
use utils::{
completion,
generation::Generation,
@@ -428,6 +430,8 @@ pub struct Timeline {
pub(crate) l0_flush_global_state: L0FlushGlobalState,
pub(crate) handles: handle::PerTimelineState<crate::page_service::TenantManagerTypes>,
pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
}
pub struct WalReceiverInfo {
@@ -736,6 +740,7 @@ pub enum GetLogicalSizePriority {
pub(crate) enum CompactFlags {
ForceRepartition,
ForceImageLayerCreation,
ForceL0Compaction,
EnhancedGcBottomMostCompaction,
DryRun,
}
@@ -1324,27 +1329,53 @@ impl Timeline {
Ok(())
}
/// Obtains a temporary lease blocking garbage collection for the given LSN.
///
/// This function will error if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is also
/// no existing lease to renew. If there is an existing lease in the map, the lease will be renewed only if
/// the request extends the lease. The returned lease is therefore the maximum between the existing lease and
/// the requesting lease.
pub(crate) fn make_lsn_lease(
/// Initializes an LSN lease. The function will return an error if the requested LSN is less than the `latest_gc_cutoff_lsn`.
pub(crate) fn init_lsn_lease(
&self,
lsn: Lsn,
length: Duration,
ctx: &RequestContext,
) -> anyhow::Result<LsnLease> {
self.make_lsn_lease(lsn, length, true, ctx)
}
/// Renews a lease at a particular LSN. The requested LSN is not validated against the `latest_gc_cutoff_lsn` when we are in the grace period.
pub(crate) fn renew_lsn_lease(
&self,
lsn: Lsn,
length: Duration,
ctx: &RequestContext,
) -> anyhow::Result<LsnLease> {
self.make_lsn_lease(lsn, length, false, ctx)
}
/// Obtains a temporary lease blocking garbage collection for the given LSN.
///
/// If we are in `AttachedSingle` mode and is not blocked by the lsn lease deadline, this function will error
/// if the requesting LSN is less than the `latest_gc_cutoff_lsn` and there is no existing request present.
///
/// If there is an existing lease in the map, the lease will be renewed only if the request extends the lease.
/// The returned lease is therefore the maximum between the existing lease and the requesting lease.
fn make_lsn_lease(
&self,
lsn: Lsn,
length: Duration,
init: bool,
_ctx: &RequestContext,
) -> anyhow::Result<LsnLease> {
let lease = {
// Normalize the requested LSN to be aligned, and move to the first record
// if it points to the beginning of the page (header).
let lsn = xlog_utils::normalize_lsn(lsn, WAL_SEGMENT_SIZE);
let mut gc_info = self.gc_info.write().unwrap();
let valid_until = SystemTime::now() + length;
let entry = gc_info.leases.entry(lsn);
let lease = {
if let Entry::Occupied(mut occupied) = entry {
match entry {
Entry::Occupied(mut occupied) => {
let existing_lease = occupied.get_mut();
if valid_until > existing_lease.valid_until {
existing_lease.valid_until = valid_until;
@@ -1356,20 +1387,28 @@ impl Timeline {
}
existing_lease.clone()
} else {
// Reject already GC-ed LSN (lsn < latest_gc_cutoff)
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
if lsn < *latest_gc_cutoff_lsn {
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
Entry::Vacant(vacant) => {
// Reject already GC-ed LSN (lsn < latest_gc_cutoff) if we are in AttachedSingle and
// not blocked by the lsn lease deadline.
let validate = {
let conf = self.tenant_conf.load();
conf.location.attach_mode == AttachmentMode::Single
&& !conf.is_gc_blocked_by_lsn_lease_deadline()
};
if init || validate {
let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
if lsn < *latest_gc_cutoff_lsn {
bail!("tried to request a page version that was garbage collected. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
}
}
let dt: DateTime<Utc> = valid_until.into();
info!("lease created, valid until {}", dt);
entry.or_insert(LsnLease { valid_until }).clone()
vacant.insert(LsnLease { valid_until }).clone()
}
};
lease
}
};
Ok(lease)
@@ -1946,8 +1985,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
}
// TODO(yuchen): remove unused flag after implementing https://github.com/neondatabase/neon/issues/8072
#[allow(unused)]
pub(crate) fn get_lsn_lease_length_for_ts(&self) -> Duration {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2097,6 +2134,7 @@ impl Timeline {
pg_version: u32,
state: TimelineState,
aux_file_policy: Option<AuxFilePolicy>,
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2238,6 +2276,8 @@ impl Timeline {
l0_flush_global_state: resources.l0_flush_global_state,
handles: Default::default(),
attach_wal_lag_cooldown,
};
if aux_file_policy == Some(AuxFilePolicy::V1) {
@@ -3597,7 +3637,7 @@ impl Timeline {
ctx,
)
.await
.map_err(|e| FlushLayerError::from_anyhow(self, e))?;
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
if self.cancel.is_cancelled() {
return Err(FlushLayerError::Cancelled);
@@ -3836,16 +3876,20 @@ impl Timeline {
partition_size: u64,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<((KeyPartitioning, SparseKeyPartitioning), Lsn)> {
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
let Ok(mut partitioning_guard) = self.partitioning.try_lock() else {
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
// and hence before the compaction task starts.
anyhow::bail!("repartition() called concurrently, this should not happen");
return Err(CompactionError::Other(anyhow!(
"repartition() called concurrently, this should not happen"
)));
};
let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard;
if lsn < *partition_lsn {
anyhow::bail!("repartition() called with LSN going backwards, this should not happen");
return Err(CompactionError::Other(anyhow!(
"repartition() called with LSN going backwards, this should not happen"
)));
}
let distance = lsn.0 - partition_lsn.0;
@@ -4447,6 +4491,12 @@ pub(crate) enum CompactionError {
Other(anyhow::Error),
}
impl CompactionError {
pub fn is_cancelled(&self) -> bool {
matches!(self, CompactionError::ShuttingDown)
}
}
impl From<CollectKeySpaceError> for CompactionError {
fn from(err: CollectKeySpaceError) -> Self {
match err {

View File

@@ -11,6 +11,7 @@ pub(crate) struct RangeAnalysis {
has_image: bool,
num_of_deltas_above_image: usize,
total_num_of_deltas: usize,
num_of_l0: usize,
}
impl Timeline {
@@ -20,8 +21,10 @@ impl Timeline {
let mut delta_ranges = Vec::new();
let mut image_ranges = Vec::new();
let num_of_l0;
let all_layer_files = {
let guard = self.layers.read().await;
num_of_l0 = guard.layer_map().unwrap().level0_deltas().len();
guard.all_persistent_layers()
};
let lsn = self.get_last_record_lsn();
@@ -82,6 +85,7 @@ impl Timeline {
has_image: image_layer.is_some(),
num_of_deltas_above_image: maybe_delta_layers.len(),
total_num_of_deltas: pitr_delta_layers.len(),
num_of_l0,
});
}

View File

@@ -353,7 +353,13 @@ impl Timeline {
// 2. Compact
let timer = self.metrics.compact_time_histo.start_timer();
let fully_compacted = self.compact_level0(target_file_size, ctx).await?;
let fully_compacted = self
.compact_level0(
target_file_size,
flags.contains(CompactFlags::ForceL0Compaction),
ctx,
)
.await?;
timer.stop_and_record();
let mut partitioning = dense_partitioning;
@@ -390,7 +396,7 @@ impl Timeline {
// error but continue.
//
// Suppress error when it's due to cancellation
if !self.cancel.is_cancelled() {
if !self.cancel.is_cancelled() && !err.is_cancelled() {
tracing::error!("could not compact, repartitioning keyspace failed: {err:?}");
}
(1, false)
@@ -658,6 +664,7 @@ impl Timeline {
async fn compact_level0(
self: &Arc<Self>,
target_file_size: u64,
force_compaction_ignore_threshold: bool,
ctx: &RequestContext,
) -> Result<bool, CompactionError> {
let CompactLevel0Phase1Result {
@@ -679,9 +686,15 @@ impl Timeline {
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx)
.instrument(phase1_span)
.await?
self.compact_level0_phase1(
phase1_layers_locked,
stats,
target_file_size,
force_compaction_ignore_threshold,
&ctx,
)
.instrument(phase1_span)
.await?
};
if new_layers.is_empty() && deltas_to_compact.is_empty() {
@@ -700,6 +713,7 @@ impl Timeline {
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
force_compaction_ignore_threshold: bool,
ctx: &RequestContext,
) -> Result<CompactLevel0Phase1Result, CompactionError> {
stats.read_lock_held_spawn_blocking_startup_micros =
@@ -711,11 +725,26 @@ impl Timeline {
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
debug!(
level0_deltas = level0_deltas.len(),
threshold, "too few deltas to compact"
);
return Ok(CompactLevel0Phase1Result::default());
if force_compaction_ignore_threshold {
if !level0_deltas.is_empty() {
info!(
level0_deltas = level0_deltas.len(),
threshold, "too few deltas to compact, but forcing compaction"
);
} else {
info!(
level0_deltas = level0_deltas.len(),
threshold, "too few deltas to compact, cannot force compaction"
);
return Ok(CompactLevel0Phase1Result::default());
}
} else {
debug!(
level0_deltas = level0_deltas.len(),
threshold, "too few deltas to compact"
);
return Ok(CompactLevel0Phase1Result::default());
}
}
let mut level0_deltas = level0_deltas

View File

@@ -185,171 +185,7 @@ pub(crate) enum VectoredReadExtended {
No,
}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum VectoredReadCoalesceMode {
/// Only coalesce exactly adjacent reads.
AdjacentOnly,
/// In addition to adjacent reads, also consider reads whose corresponding
/// `end` and `start` offsets reside at the same chunk.
Chunked(usize),
}
impl VectoredReadCoalesceMode {
/// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
/// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
pub(crate) fn get() -> Self {
let align = virtual_file::get_io_buffer_alignment_raw();
if align == 0 {
VectoredReadCoalesceMode::AdjacentOnly
} else {
VectoredReadCoalesceMode::Chunked(align)
}
}
}
pub(crate) enum VectoredReadBuilder {
Adjacent(AdjacentVectoredReadBuilder),
Chunked(ChunkedVectoredReadBuilder),
}
impl VectoredReadBuilder {
fn new_impl(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: Option<usize>,
mode: VectoredReadCoalesceMode,
) -> Self {
match mode {
VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
),
VectoredReadCoalesceMode::Chunked(chunk_size) => {
Self::Chunked(ChunkedVectoredReadBuilder::new(
start_offset,
end_offset,
meta,
max_read_size,
chunk_size,
))
}
}
}
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: usize,
mode: VectoredReadCoalesceMode,
) -> Self {
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
}
pub(crate) fn new_streaming(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
mode: VectoredReadCoalesceMode,
) -> Self {
Self::new_impl(start_offset, end_offset, meta, None, mode)
}
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
match self {
VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
}
}
pub(crate) fn build(self) -> VectoredRead {
match self {
VectoredReadBuilder::Adjacent(builder) => builder.build(),
VectoredReadBuilder::Chunked(builder) => builder.build(),
}
}
pub(crate) fn size(&self) -> usize {
match self {
VectoredReadBuilder::Adjacent(builder) => builder.size(),
VectoredReadBuilder::Chunked(builder) => builder.size(),
}
}
}
pub(crate) struct AdjacentVectoredReadBuilder {
/// Start offset of the read.
start: u64,
// End offset of the read.
end: u64,
/// Start offset and metadata for each blob in this read
blobs_at: VecMap<u64, BlobMeta>,
max_read_size: Option<usize>,
}
impl AdjacentVectoredReadBuilder {
/// Start building a new vectored read.
///
/// Note that by design, this does not check against reading more than `max_read_size` to
/// support reading larger blobs than the configuration value. The builder will be single use
/// however after that.
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: Option<usize>,
) -> Self {
let mut blobs_at = VecMap::default();
blobs_at
.append(start_offset, meta)
.expect("First insertion always succeeds");
Self {
start: start_offset,
end: end_offset,
blobs_at,
max_read_size,
}
}
/// Attempt to extend the current read with a new blob if the start
/// offset matches with the current end of the vectored read
/// and the resuting size is below the max read size
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
tracing::trace!(start, end, "trying to extend");
let size = (end - start) as usize;
let not_limited_by_max_read_size = {
if let Some(max_read_size) = self.max_read_size {
self.size() + size <= max_read_size
} else {
true
}
};
if self.end == start && not_limited_by_max_read_size {
self.end = end;
self.blobs_at
.append(start, meta)
.expect("LSNs are ordered within vectored reads");
return VectoredReadExtended::Yes;
}
VectoredReadExtended::No
}
pub(crate) fn size(&self) -> usize {
(self.end - self.start) as usize
}
pub(crate) fn build(self) -> VectoredRead {
VectoredRead {
start: self.start,
end: self.end,
blobs_at: self.blobs_at,
}
}
}
/// A vectored read builder that tries to coalesce all reads that fits in a chunk.
pub(crate) struct ChunkedVectoredReadBuilder {
/// Start block number
start_blk_no: usize,
@@ -373,7 +209,7 @@ impl ChunkedVectoredReadBuilder {
/// Note that by design, this does not check against reading more than `max_read_size` to
/// support reading larger blobs than the configuration value. The builder will be single use
/// however after that.
pub(crate) fn new(
fn new_impl(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
@@ -396,6 +232,25 @@ impl ChunkedVectoredReadBuilder {
}
}
pub(crate) fn new(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
max_read_size: usize,
align: usize,
) -> Self {
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align)
}
pub(crate) fn new_streaming(
start_offset: u64,
end_offset: u64,
meta: BlobMeta,
align: usize,
) -> Self {
Self::new_impl(start_offset, end_offset, meta, None, align)
}
/// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
///
/// The resulting size also must be below the max read size.
@@ -474,17 +329,17 @@ pub struct VectoredReadPlanner {
max_read_size: usize,
mode: VectoredReadCoalesceMode,
align: usize,
}
impl VectoredReadPlanner {
pub fn new(max_read_size: usize) -> Self {
let mode = VectoredReadCoalesceMode::get();
let align = virtual_file::get_io_buffer_alignment();
Self {
blobs: BTreeMap::new(),
prev: None,
max_read_size,
mode,
align,
}
}
@@ -545,7 +400,7 @@ impl VectoredReadPlanner {
}
pub fn finish(self) -> Vec<VectoredRead> {
let mut current_read_builder: Option<VectoredReadBuilder> = None;
let mut current_read_builder: Option<ChunkedVectoredReadBuilder> = None;
let mut reads = Vec::new();
for (key, blobs_for_key) in self.blobs {
@@ -558,12 +413,12 @@ impl VectoredReadPlanner {
};
if extended == VectoredReadExtended::No {
let next_read_builder = VectoredReadBuilder::new(
let next_read_builder = ChunkedVectoredReadBuilder::new(
start_offset,
end_offset,
BlobMeta { key, lsn },
self.max_read_size,
self.mode,
self.align,
);
let prev_read_builder = current_read_builder.replace(next_read_builder);
@@ -688,7 +543,7 @@ impl<'a> VectoredBlobReader<'a> {
/// `handle` gets called and when the current key would just exceed the read_size and
/// max_cnt constraints.
pub struct StreamingVectoredReadPlanner {
read_builder: Option<VectoredReadBuilder>,
read_builder: Option<ChunkedVectoredReadBuilder>,
// Arguments for previous blob passed into [`StreamingVectoredReadPlanner::handle`]
prev: Option<(Key, Lsn, u64)>,
/// Max read size per batch. This is not a strict limit. If there are [0, 100) and [100, 200), while the `max_read_size` is 150,
@@ -699,21 +554,21 @@ pub struct StreamingVectoredReadPlanner {
/// Size of the current batch
cnt: usize,
mode: VectoredReadCoalesceMode,
align: usize,
}
impl StreamingVectoredReadPlanner {
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
assert!(max_cnt > 0);
assert!(max_read_size > 0);
let mode = VectoredReadCoalesceMode::get();
let align = virtual_file::get_io_buffer_alignment();
Self {
read_builder: None,
prev: None,
max_cnt,
max_read_size,
cnt: 0,
mode,
align,
}
}
@@ -762,11 +617,11 @@ impl StreamingVectoredReadPlanner {
}
None => {
self.read_builder = {
Some(VectoredReadBuilder::new_streaming(
Some(ChunkedVectoredReadBuilder::new_streaming(
start_offset,
end_offset,
BlobMeta { key, lsn },
self.mode,
self.align,
))
};
}
@@ -1092,7 +947,7 @@ mod tests {
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
let mut buf = BytesMut::with_capacity(reserved_bytes);
let mode = VectoredReadCoalesceMode::get();
let align = virtual_file::get_io_buffer_alignment();
let vectored_blob_reader = VectoredBlobReader::new(&file);
let meta = BlobMeta {
key: Key::MIN,
@@ -1104,7 +959,8 @@ mod tests {
if idx + 1 == offsets.len() {
continue;
}
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
let read_builder =
ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align);
let read = read_builder.build();
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
assert_eq!(result.blobs.len(), 1);

View File

@@ -466,6 +466,7 @@ impl VirtualFile {
&[]
};
utils::crashsafe::overwrite(&final_path, &tmp_path, content)
.maybe_fatal_err("crashsafe_overwrite")
})
.await
.expect("blocking task is never aborted")
@@ -475,7 +476,7 @@ impl VirtualFile {
pub async fn sync_all(&self) -> Result<(), Error> {
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
let (_file_guard, res) = io_engine::get().sync_all(file_guard).await;
res
res.maybe_fatal_err("sync_all")
})
}
@@ -483,7 +484,7 @@ impl VirtualFile {
pub async fn sync_data(&self) -> Result<(), Error> {
with_file!(self, StorageIoOperation::Fsync, |file_guard| {
let (_file_guard, res) = io_engine::get().sync_data(file_guard).await;
res
res.maybe_fatal_err("sync_data")
})
}
@@ -1147,7 +1148,9 @@ pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize)
panic!("virtual_file::init called twice");
}
if set_io_buffer_alignment(io_buffer_alignment).is_err() {
panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
panic!(
"IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}"
);
}
io_engine::init(engine);
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
@@ -1174,14 +1177,16 @@ fn get_open_files() -> &'static OpenFiles {
static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
/// Returns true if `x` is zero or a power of two.
fn is_zero_or_power_of_two(x: usize) -> bool {
(x == 0) || ((x & (x - 1)) == 0)
/// Returns true if the alignment is a power of two and is greater or equal to 512.
fn is_valid_io_buffer_alignment(align: usize) -> bool {
align.is_power_of_two() && align >= 512
}
/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is
/// not a power of two or less than 512 bytes.
#[allow(unused)]
pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
if is_zero_or_power_of_two(align) {
if is_valid_io_buffer_alignment(align) {
IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
Ok(())
} else {
@@ -1189,19 +1194,19 @@ pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
}
}
/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
/// Gets the io buffer alignment.
///
/// This function should be used to check the raw config value.
pub(crate) fn get_io_buffer_alignment_raw() -> usize {
/// This function should be used for getting the actual alignment value to use.
pub(crate) fn get_io_buffer_alignment() -> usize {
let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
if cfg!(test) {
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
if let Some(test_align) = utils::env::var(env_var_name) {
if is_zero_or_power_of_two(test_align) {
if is_valid_io_buffer_alignment(test_align) {
test_align
} else {
panic!("IO buffer alignment ({test_align}) is not a power of two");
panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}");
}
} else {
align
@@ -1211,14 +1216,6 @@ pub(crate) fn get_io_buffer_alignment_raw() -> usize {
}
}
/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
///
/// This function should be used for getting the actual alignment value to use.
pub(crate) fn get_io_buffer_alignment() -> usize {
let align = get_io_buffer_alignment_raw();
align.max(1)
}
#[cfg(test)]
mod tests {
use crate::context::DownloadBehavior;

View File

@@ -21,7 +21,10 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoken Rust code.
use std::sync::Arc;
use std::sync::OnceLock;
use std::time::Duration;
use std::time::Instant;
use std::time::SystemTime;
use pageserver_api::shard::ShardIdentity;
@@ -69,7 +72,29 @@ impl CheckPoint {
}
}
/// Temporary limitation of WAL lag warnings after attach
///
/// After tenant attach, we want to limit WAL lag warnings because
/// we don't look at the WAL until the attach is complete, which
/// might take a while.
pub struct WalLagCooldown {
/// Until when should this limitation apply at all
active_until: std::time::Instant,
/// The maximum lag to suppress. Lags above this limit get reported anyways.
max_lag: Duration,
}
impl WalLagCooldown {
pub fn new(attach_start: Instant, attach_duration: Duration) -> Self {
Self {
active_until: attach_start + attach_duration * 3 + Duration::from_secs(120),
max_lag: attach_duration * 2 + Duration::from_secs(60),
}
}
}
pub struct WalIngest {
attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
shard: ShardIdentity,
checkpoint: CheckPoint,
checkpoint_modified: bool,
@@ -103,6 +128,7 @@ impl WalIngest {
shard: *timeline.get_shard_identity(),
checkpoint,
checkpoint_modified: false,
attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(),
warn_ingest_lag: WarnIngestLag {
lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)),
@@ -1429,6 +1455,13 @@ impl WalIngest {
Ok(lag) => {
if lag > conf.wait_lsn_timeout {
rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| {
if let Some(cooldown) = self.attach_wal_lag_cooldown.get() {
if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag {
return;
}
} else {
// Still loading? We shouldn't be here
}
let lag = humantime::format_duration(lag);
warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout");
})

View File

@@ -25,7 +25,18 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql neon--1.4--1.5.sql neon--1.5--1.4.sql
DATA = \
neon--1.0.sql \
neon--1.0--1.1.sql \
neon--1.1--1.2.sql \
neon--1.2--1.3.sql \
neon--1.3--1.4.sql \
neon--1.4--1.5.sql \
neon--1.5--1.4.sql \
neon--1.4--1.3.sql \
neon--1.3--1.2.sql \
neon--1.2--1.1.sql \
neon--1.1--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -42,6 +42,7 @@
#include "hll.h"
#include "bitmap.h"
#include "neon.h"
#define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0)
@@ -173,7 +174,9 @@ lfc_disable(char const *op)
* If the reason of error is ENOSPC, then truncation of file may
* help to reclaim some space
*/
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE);
int rc = ftruncate(lfc_desc, 0);
pgstat_report_wait_end();
if (rc < 0)
elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
@@ -769,8 +772,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
if (iteration_hits != 0)
{
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
rc = preadv(lfc_desc, iov, blocks_in_chunk,
((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
pgstat_report_wait_end();
if (rc != (BLCKSZ * blocks_in_chunk))
{
@@ -944,8 +949,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
lfc_ctl->writes += blocks_in_chunk;
LWLockRelease(lfc_lock);
pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
rc = pwritev(lfc_desc, iov, blocks_in_chunk,
((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
pgstat_report_wait_end();
if (rc != BLCKSZ * blocks_in_chunk)
{
lfc_disable("write");

View File

@@ -490,7 +490,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE,
PQsocket(shard->conn),
0,
PG_WAIT_EXTENSION);
WAIT_EVENT_NEON_PS_STARTING);
elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc);
if (rc & WL_LATCH_SET)
{
@@ -512,7 +512,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE,
PQsocket(shard->conn),
0,
PG_WAIT_EXTENSION);
WAIT_EVENT_NEON_PS_STARTING);
elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc);
if (rc & WL_LATCH_SET)
{
@@ -608,7 +608,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
WAIT_EVENT_NEON_PS_CONFIGURING);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -656,7 +657,8 @@ static int
call_PQgetCopyData(shardno_t shard_no, char **buffer)
{
int ret;
PGconn *pageserver_conn = page_servers[shard_no].conn;
PageServer *shard = &page_servers[shard_no];
PGconn *pageserver_conn = shard->conn;
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
@@ -666,7 +668,8 @@ retry:
WaitEvent event;
/* Sleep until there's something to do */
(void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
WAIT_EVENT_NEON_PS_READ);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -937,7 +940,7 @@ PagestoreShmemInit(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pagestore_shared = ShmemInitStruct("libpagestore shared state",
PagestoreShmemSize(),
sizeof(PagestoreShmemState),
&found);
if (!found)
{

View File

@@ -41,6 +41,9 @@
#include "pagestore_client.h"
#include "control_plane_connector.h"
#include "walsender_hooks.h"
#if PG_MAJORVERSION_NUM >= 16
#include "storage/ipc.h"
#endif
PG_MODULE_MAGIC;
void _PG_init(void);
@@ -49,6 +52,23 @@ static int logical_replication_max_snap_files = 300;
static int running_xacts_overflow_policy;
#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
static void neon_shmem_startup_hook(void);
#endif
#if PG_MAJORVERSION_NUM >= 17
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
uint32 WAIT_EVENT_NEON_LFC_READ;
uint32 WAIT_EVENT_NEON_LFC_TRUNCATE;
uint32 WAIT_EVENT_NEON_LFC_WRITE;
uint32 WAIT_EVENT_NEON_PS_STARTING;
uint32 WAIT_EVENT_NEON_PS_CONFIGURING;
uint32 WAIT_EVENT_NEON_PS_SEND;
uint32 WAIT_EVENT_NEON_PS_READ;
uint32 WAIT_EVENT_NEON_WAL_DL;
#endif
enum RunningXactsOverflowPolicies {
OP_IGNORE,
OP_SKIP,
@@ -635,6 +655,9 @@ _PG_init(void)
*/
#if PG_VERSION_NUM >= 160000
load_file("$libdir/neon_rmgr", false);
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_shmem_startup_hook;
#endif
pg_init_libpagestore();
@@ -721,3 +744,25 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
{
PG_RETURN_UINT64(BackpressureThrottlingTime());
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_shmem_startup_hook(void)
{
/* Initialize */
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
#if PG_PG_MAJORVERSION_NUM >= 17
WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate");
WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write");
WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting");
WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring");
WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO");
WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
#endif
}
#endif

View File

@@ -1,8 +1,6 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
# TODO: bump default version to 1.5, after we are certain that we don't
# need to rollback the compute image
default_version = '1.4'
default_version = '1.5'
module_pathname = '$libdir/neon'
relocatable = true
trusted = true

View File

@@ -12,6 +12,7 @@
#ifndef NEON_H
#define NEON_H
#include "access/xlogreader.h"
#include "utils/wait_event.h"
/* GUCs */
extern char *neon_auth_token;
@@ -22,6 +23,28 @@ extern char *wal_acceptors_list;
extern int wal_acceptor_reconnect_timeout;
extern int wal_acceptor_connection_timeout;
#if PG_MAJORVERSION_NUM >= 17
extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
extern uint32 WAIT_EVENT_NEON_LFC_READ;
extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE;
extern uint32 WAIT_EVENT_NEON_LFC_WRITE;
extern uint32 WAIT_EVENT_NEON_PS_STARTING;
extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING;
extern uint32 WAIT_EVENT_NEON_PS_SEND;
extern uint32 WAIT_EVENT_NEON_PS_READ;
extern uint32 WAIT_EVENT_NEON_WAL_DL;
#else
#define WAIT_EVENT_NEON_LFC_MAINTENANCE PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ
#define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE
#define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE
#define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_PS_READ PG_WAIT_EXTENSION
#define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ
#endif
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);

View File

@@ -27,7 +27,8 @@ NeonPerfCountersShmemSize(void)
{
Size size = 0;
size = add_size(size, mul_size(MaxBackends, sizeof(neon_per_backend_counters)));
size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
sizeof(neon_per_backend_counters)));
return size;
}
@@ -39,7 +40,7 @@ NeonPerfCountersShmemInit(void)
neon_per_backend_counters_shared =
ShmemInitStruct("Neon perf counters",
mul_size(MaxBackends,
mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
sizeof(neon_per_backend_counters)),
&found);
Assert(found == IsUnderPostmaster);
@@ -137,7 +138,7 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->pageserver_requests_sent_total;
i++;
metrics[i].name = "pageserver_requests_disconnects_total";
metrics[i].name = "pageserver_disconnects_total";
metrics[i].is_bucket = false;
metrics[i].value = (double) counters->pageserver_disconnects_total;
i++;
@@ -192,7 +193,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
/* We put all the tuples into a tuplestore in one go. */
InitMaterializedSRF(fcinfo, 0);
for (int procno = 0; procno < MaxBackends; procno++)
for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
{
PGPROC *proc = GetPGProcByNumber(procno);
int pid = proc->pid;
@@ -231,7 +232,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
InitMaterializedSRF(fcinfo, 0);
/* Aggregate the counters across all backends */
for (int procno = 0; procno < MaxBackends; procno++)
for (int procno = 0; procno < NUM_NEON_PERF_COUNTER_SLOTS; procno++)
{
neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];

View File

@@ -96,6 +96,14 @@ typedef struct
/* Pointer to the shared memory array of neon_per_backend_counters structs */
extern neon_per_backend_counters *neon_per_backend_counters_shared;
/*
* Size of the perf counters array in shared memory. One slot for each backend
* and aux process. IOW one for each PGPROC slot, except for slots reserved
* for prepared transactions, because they're not real processes and cannot do
* I/O.
*/
#define NUM_NEON_PERF_COUNTER_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS)
#if PG_VERSION_NUM >= 170000
#define MyNeonCounters (&neon_per_backend_counters_shared[MyProcNumber])
#else

View File

@@ -803,15 +803,19 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
bool is_prefetch)
{
uint64 min_ring_index;
PrefetchRequest req;
PrefetchRequest hashkey;
#if USE_ASSERT_CHECKING
bool any_hits = false;
#endif
/* We will never read further ahead than our buffer can store. */
nblocks = Max(1, Min(nblocks, readahead_buffer_size));
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
/*
* Use an intermediate PrefetchRequest struct as the hash key to ensure
* correct alignment and that the padding bytes are cleared.
*/
memset(&hashkey.buftag, 0, sizeof(BufferTag));
hashkey.buftag = tag;
Retry:
min_ring_index = UINT64_MAX;
@@ -837,8 +841,8 @@ Retry:
slot = NULL;
entry = NULL;
req.buftag.blockNum = tag.blockNum + i;
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
hashkey.buftag.blockNum = tag.blockNum + i;
entry = prfh_lookup(MyPState->prf_hash, &hashkey);
if (entry != NULL)
{
@@ -849,7 +853,7 @@ Retry:
Assert(slot->status != PRFS_UNUSED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
Assert(BUFFERTAGS_EQUAL(slot->buftag, req.buftag));
Assert(BUFFERTAGS_EQUAL(slot->buftag, hashkey.buftag));
/*
* If the caller specified a request LSN to use, only accept
@@ -886,12 +890,19 @@ Retry:
{
min_ring_index = Min(min_ring_index, ring_index);
/* The buffered request is good enough, return that index */
pgBufferUsage.prefetch.duplicates++;
if (is_prefetch)
pgBufferUsage.prefetch.duplicates++;
else
pgBufferUsage.prefetch.hits++;
continue;
}
}
}
else if (!is_prefetch)
{
pgBufferUsage.prefetch.misses += 1;
MyNeonCounters->getpage_prefetch_misses_total++;
}
/*
* We can only leave the block above by finding that there's
* no entry that can satisfy this request, either because there
@@ -974,7 +985,7 @@ Retry:
* We must update the slot data before insertion, because the hash
* function reads the buffer tag from the slot.
*/
slot->buftag = req.buftag;
slot->buftag = hashkey.buftag;
slot->shard_no = get_shard_number(&tag);
slot->my_ring_index = ring_index;
@@ -1773,6 +1784,20 @@ neon_init(void)
if (MyPState != NULL)
return;
/*
* Sanity check that theperf counters array is sized correctly. We got
* this wrong once, and the formula for max number of backends and aux
* processes might well change in the future, so better safe than sorry.
* This is a very cheap check so we do it even without assertions. On
* v14, this gets called before initializing MyProc, so we cannot perform
* the check here. That's OK, we don't expect the logic to change in old
* releases.
*/
#if PG_VERSION_NUM>=150000
if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
elog(ERROR, "MyNeonCounters points past end of array");
#endif
prfs_size = offsetof(PrefetchState, prf_buffer) +
sizeof(PrefetchRequest) * readahead_buffer_size;
@@ -2728,14 +2753,19 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
uint64 ring_index;
PrfHashEntry *entry;
PrefetchRequest *slot;
BufferTag buftag = {0};
PrefetchRequest hashkey;
Assert(PointerIsValid(request_lsns));
Assert(nblocks >= 1);
CopyNRelFileInfoToBufTag(buftag, rinfo);
buftag.forkNum = forkNum;
buftag.blockNum = base_blockno;
/*
* Use an intermediate PrefetchRequest struct as the hash key to ensure
* correct alignment and that the padding bytes are cleared.
*/
memset(&hashkey.buftag, 0, sizeof(BufferTag));
CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
hashkey.buftag.forkNum = forkNum;
hashkey.buftag.blockNum = base_blockno;
/*
* The redo process does not lock pages that it needs to replay but are
@@ -2753,7 +2783,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
* weren't for the behaviour of the LwLsn cache that uses the highest
* value of the LwLsn cache when the entry is not found.
*/
prefetch_register_bufferv(buftag, request_lsns, nblocks, mask, false);
prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);
for (int i = 0; i < nblocks; i++)
{
@@ -2774,8 +2804,8 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
* Try to find prefetched page in the list of received pages.
*/
Retry:
buftag.blockNum = blockno;
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
hashkey.buftag.blockNum = blockno;
entry = prfh_lookup(MyPState->prf_hash, &hashkey);
if (entry != NULL)
{
@@ -2783,7 +2813,6 @@ Retry:
if (neon_prefetch_response_usable(reqlsns, slot))
{
ring_index = slot->my_ring_index;
pgBufferUsage.prefetch.hits += 1;
}
else
{
@@ -2813,10 +2842,7 @@ Retry:
{
if (entry == NULL)
{
pgBufferUsage.prefetch.misses += 1;
MyNeonCounters->getpage_prefetch_misses_total++;
ring_index = prefetch_register_bufferv(buftag, reqlsns, 1, NULL, false);
ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
Assert(ring_index != UINT64_MAX);
slot = GetPrfSlot(ring_index);
}
@@ -2841,8 +2867,8 @@ Retry:
} while (!prefetch_wait_for(ring_index));
Assert(slot->status == PRFS_RECEIVED);
Assert(memcmp(&buftag, &slot->buftag, sizeof(BufferTag)) == 0);
Assert(buftag.blockNum == base_blockno + i);
Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
Assert(hashkey.buftag.blockNum == base_blockno + i);
resp = slot->response;
@@ -3045,6 +3071,9 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
/* Read all blocks from LFC, so we're done */
if (lfc_result == nblocks)
return;

View File

@@ -213,7 +213,7 @@ WalProposerPoll(WalProposer *wp)
rc = wp->api.wait_event_set(wp, timeout, &sk, &events);
/* Exit loop if latch is set (we got new WAL) */
if ((rc == 1 && events & WL_LATCH_SET))
if (rc == 1 && (events & WL_LATCH_SET))
break;
/*

View File

@@ -422,6 +422,9 @@ backpressure_throttling_impl(void)
TimestampTz start,
stop;
bool retry = false;
char *new_status = NULL;
const char *old_status;
int len;
if (PointerIsValid(PrevProcessInterruptsCallback))
retry = PrevProcessInterruptsCallback();
@@ -442,14 +445,24 @@ backpressure_throttling_impl(void)
if (lag == 0)
return retry;
/* Suspend writers until replicas catch up */
set_ps_display("backpressure throttling");
old_status = get_ps_display(&len);
new_status = (char *) palloc(len + 64 + 1);
memcpy(new_status, old_status, len);
snprintf(new_status + len, 64, "backpressure throttling: lag %lu", lag);
set_ps_display(new_status);
new_status[len] = '\0'; /* truncate off " backpressure ..." to later reset the ps */
elog(DEBUG2, "backpressure throttling: lag %lu", lag);
start = GetCurrentTimestamp();
pg_usleep(BACK_PRESSURE_DELAY);
stop = GetCurrentTimestamp();
pg_atomic_add_fetch_u64(&walprop_shared->backpressureThrottlingTime, stop - start);
/* Reset ps display */
set_ps_display(new_status);
pfree(new_status);
return true;
}
@@ -1473,11 +1486,33 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
{
NeonWALReadResult res;
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
#if PG_MAJORVERSION_NUM >= 17
if (!sk->wp->config->syncSafekeepers)
{
Size rbytes;
rbytes = WALReadFromBuffers(buf, startptr, count,
walprop_pg_get_timeline_id());
startptr += rbytes;
count -= rbytes;
}
#endif
if (count == 0)
{
res = NEON_WALREAD_SUCCESS;
}
else
{
Assert(count > 0);
/* Now read the remaining WAL from the WAL file */
res = NeonWALRead(sk->xlogreader,
buf,
startptr,
count,
walprop_pg_get_timeline_id());
}
if (res == NEON_WALREAD_SUCCESS)
{
@@ -1779,7 +1814,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
* If wait is terminated by latch set (walsenders' latch is set on each
* wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH)
*/
if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger)
if ((rc == 1 && (event.events & WL_LATCH_SET)) || late_cv_trigger)
{
/* Reset our latch */
ResetLatch(MyLatch);
@@ -1791,7 +1826,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
* If the event contains something about the socket, it means we got an
* event from a safekeeper socket.
*/
if (rc == 1 && (event.events & (WL_SOCKET_MASK)))
if (rc == 1 && (event.events & WL_SOCKET_MASK))
{
*sk = (Safekeeper *) event.user_data;
*events = event.events;

View File

@@ -160,7 +160,7 @@ NeonWALPageRead(
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events,
sock,
timeout_ms,
WAIT_EVENT_WAL_SENDER_MAIN);
WAIT_EVENT_NEON_WAL_DL);
}
}
}
@@ -191,13 +191,14 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
if (!wal_reader)
{
XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn);
XLogRecPtr basebackupLsn = GetRedoStartLsn();
if (epochStartLsn == 0)
/* should never happen */
if (basebackupLsn == 0)
{
elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!");
elog(ERROR, "unable to start walsender when basebackupLsn is 0");
}
wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] ");
wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ");
}
xlr->page_read = NeonWALPageRead;
xlr->segment_open = NeonWALReadSegmentOpen;

154
poetry.lock generated
View File

@@ -2064,73 +2064,80 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
[[package]]
name = "psycopg2-binary"
version = "2.9.6"
version = "2.9.9"
description = "psycopg2 - Python-PostgreSQL Database Adapter"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
files = [
{file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"},
{file = "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"},
{file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"},
{file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"},
{file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"},
{file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"},
{file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"},
{file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"},
{file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"},
{file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"},
{file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"},
{file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"},
{file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"},
{file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"},
{file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"},
]
[[package]]
@@ -2577,7 +2584,6 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2702,13 +2708,13 @@ files = [
[[package]]
name = "requests"
version = "2.32.0"
version = "2.32.3"
description = "Python HTTP for Humans."
optional = false
python-versions = ">=3.8"
files = [
{file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"},
{file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"},
{file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
{file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
]
[package.dependencies]
@@ -3131,16 +3137,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -3378,4 +3374,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055"
content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1"

View File

@@ -24,12 +24,12 @@ bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
chrono.workspace = true
clap.workspace = true
compute_api.workspace = true
consumption_metrics.workspace = true
dashmap.workspace = true
env_logger.workspace = true
framed-websockets.workspace = true
futures.workspace = true
git-version.workspace = true
hashbrown.workspace = true
hashlink.workspace = true
hex.workspace = true
@@ -38,7 +38,7 @@ hostname.workspace = true
http.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
hyper0.workspace = true
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
@@ -82,7 +82,6 @@ tokio-postgres-rustls.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
tracing-utils.workspace = true
tracing.workspace = true

View File

@@ -3,8 +3,8 @@ use crate::{
auth::{self, backend::ComputeCredentialKeys, AuthFlow},
compute,
config::AuthenticationConfig,
console::AuthSecret,
context::RequestMonitoring,
control_plane::AuthSecret,
sasl,
stream::{PqStream, Stream},
};

View File

@@ -1,18 +1,24 @@
use crate::{
auth, compute,
auth,
cache::Cached,
compute,
config::AuthenticationConfig,
console::{self, provider::NodeInfo},
context::RequestMonitoring,
control_plane::{self, provider::NodeInfo, CachedNodeInfo},
error::{ReportableError, UserFacingError},
proxy::connect_compute::ComputeConnectBackend,
stream::PqStream,
waiters,
};
use async_trait::async_trait;
use pq_proto::BeMessage as Be;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::SslMode;
use tracing::{info, info_span};
use super::ComputeCredentialKeys;
#[derive(Debug, Error)]
pub(crate) enum WebAuthError {
#[error(transparent)]
@@ -25,6 +31,11 @@ pub(crate) enum WebAuthError {
Io(#[from] std::io::Error),
}
#[derive(Debug)]
pub struct ConsoleRedirectBackend {
console_uri: reqwest::Url,
}
impl UserFacingError for WebAuthError {
fn to_string_client(&self) -> String {
"Internal error".to_string()
@@ -57,7 +68,40 @@ pub(crate) fn new_psql_session_id() -> String {
hex::encode(rand::random::<[u8; 8]>())
}
pub(super) async fn authenticate(
impl ConsoleRedirectBackend {
pub fn new(console_uri: reqwest::Url) -> Self {
Self { console_uri }
}
pub(crate) async fn authenticate(
&self,
ctx: &RequestMonitoring,
auth_config: &'static AuthenticationConfig,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<ConsoleRedirectNodeInfo> {
authenticate(ctx, auth_config, &self.console_uri, client)
.await
.map(ConsoleRedirectNodeInfo)
}
}
pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo);
#[async_trait]
impl ComputeConnectBackend for ConsoleRedirectNodeInfo {
async fn wake_compute(
&self,
_ctx: &RequestMonitoring,
) -> Result<CachedNodeInfo, control_plane::errors::WakeComputeError> {
Ok(Cached::new_uncached(self.0.clone()))
}
fn get_keys(&self) -> &ComputeCredentialKeys {
&ComputeCredentialKeys::None
}
}
async fn authenticate(
ctx: &RequestMonitoring,
auth_config: &'static AuthenticationConfig,
link_uri: &reqwest::Url,
@@ -70,7 +114,7 @@ pub(super) async fn authenticate(
let (psql_session_id, waiter) = loop {
let psql_session_id = new_psql_session_id();
match console::mgmt::get_waiter(&psql_session_id) {
match control_plane::mgmt::get_waiter(&psql_session_id) {
Ok(waiter) => break (psql_session_id, waiter),
Err(_e) => continue,
}
@@ -89,7 +133,12 @@ pub(super) async fn authenticate(
// Wait for web console response (see `mgmt`).
info!(parent: &span, "waiting for console's reply...");
let db_info = waiter.await.map_err(WebAuthError::from)?;
let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter)
.await
.map_err(|_elapsed| {
auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into())
})?
.map_err(WebAuthError::from)?;
if auth_config.ip_allowlist_check_enabled {
if let Some(allowed_ips) = &db_info.allowed_ips {

Some files were not shown because too many files have changed in this diff Show More