Compare commits

..

141 Commits

Author SHA1 Message Date
Shany Pozin
2266ee5971 Merge pull request #4803 from neondatabase/releases/2023-07-25
Release 2023-07-25
2023-07-25 14:21:07 +03:00
Shany Pozin
b58445d855 Merge pull request #4746 from neondatabase/releases/2023-07-18
Release 2023-07-18
2023-07-18 14:45:39 +03:00
Conrad Ludgate
36050e7f3d Merge branch 'release' into releases/2023-07-18 2023-07-18 12:00:09 +01:00
Alexander Bayandin
33360ed96d Merge pull request #4705 from neondatabase/release-2023-07-12
Release 2023-07-12 (only proxy)
2023-07-12 19:44:36 +01:00
Conrad Ludgate
39a28d1108 proxy wake_compute loop (#4675)
## Problem

If we fail to wake up the compute node, a subsequent connect attempt
will definitely fail. However, kubernetes won't fail the connection
immediately, instead it hangs until we timeout (10s).

## Summary of changes

Refactor the loop to allow fast retries of compute_wake and to skip a
connect attempt.
2023-07-12 18:40:11 +01:00
Conrad Ludgate
efa6aa134f allow repeated IO errors from compute node (#4624)
## Problem

#4598 compute nodes are not accessible some time after wake up due to
kubernetes DNS not being fully propagated.

## Summary of changes

Update connect retry mechanism to support handling IO errors and
sleeping for 100ms

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
2023-07-12 18:40:06 +01:00
Alexander Bayandin
2c724e56e2 Merge pull request #4646 from neondatabase/releases/2023-07-06-hotfix
Release 2023-07-06 (add pg_embedding extension only)
2023-07-06 12:19:52 +01:00
Alexander Bayandin
feff887c6f Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-06 09:39:41 +01:00
Vadim Kharitonov
353d915fcf Merge pull request #4633 from neondatabase/releases/2023-07-05
Release 2023-07-05
2023-07-05 15:10:47 +02:00
Vadim Kharitonov
2e38098cbc Merge branch 'release' into releases/2023-07-05 2023-07-05 12:41:48 +02:00
Vadim Kharitonov
a6fe5ea1ac Merge pull request #4571 from neondatabase/releases/2023-06-27
Release 2023-06-27
2023-06-27 12:55:33 +02:00
Vadim Kharitonov
05b0aed0c1 Merge branch 'release' into releases/2023-06-27 2023-06-27 12:22:12 +02:00
Alex Chi Z
cd1705357d Merge pull request #4561 from neondatabase/releases/2023-06-23-hotfix
Release 2023-06-23 (pageserver-only)
2023-06-23 15:38:50 -04:00
Christian Schwarz
6bc7561290 don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker
The consumption metrics synthetic size worker does logical size calculation.
Logical size calculation currently does synchronous disk IO.
This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures.

While there's work on the way to move the synchronous disk IO into spawn_blocking,
the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME.

Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME
executor threads on CPU or sync disk IO.
That work isn't done yet, as many of the mgmt tasks still _do_ disk IO.
But it's not as intensive as the logical size calculations that we're fixing here.

While we're at it, fix disk-usage-based eviction in a similar way.
It wasn't the culprit here, according to prod logs, but it can theoretically be
a little CPU-intensive.

More context, including graphs from Prod:
https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949

(cherry picked from commit d6e35222ea)
2023-06-23 20:54:07 +02:00
Christian Schwarz
fbd3ac14b5 Merge pull request #4544 from neondatabase/releases/2023-06-21-hotfix
Release 2023-06-21 (fixup for post-merge failed 2023-06-20)
2023-06-21 16:54:34 +03:00
Christian Schwarz
e437787c8f cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:52:56 +03:00
Christian Schwarz
3460dbf90b Merge pull request #4536 from neondatabase/releases/2023-06-20
Release 2023-06-20 (actually 2023-06-21)
2023-06-21 14:19:14 +03:00
Vadim Kharitonov
6b89d99677 Merge pull request #4521 from neondatabase/release_2023-06-15
Release 2023 06 15
2023-06-15 17:40:01 +02:00
Vadim Kharitonov
6cc8ea86e4 Merge branch 'main' into release_2023-06-15 2023-06-15 16:50:44 +02:00
Shany Pozin
e62a492d6f Merge pull request #4486 from neondatabase/releases/2023-06-13
Release 2023-06-13
2023-06-13 15:21:35 +03:00
Alexey Kondratov
a475cdf642 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, it wasn't clear from the log when Postgres started up
completely if catalog updates were skipped.

Follow-up for 4936ab6
2023-06-13 13:37:24 +02:00
Stas Kelvich
7002c79a47 Merge pull request #4447 from neondatabase/release_proxy_08-06-2023
Release proxy 08 06 2023
2023-06-08 21:02:54 +03:00
Vadim Kharitonov
ee6cf357b4 Merge pull request #4427 from neondatabase/releases/2023-06-06
Release 2023-06-06
2023-06-06 14:42:21 +02:00
Vadim Kharitonov
e5c2086b5f Merge branch 'release' into releases/2023-06-06 2023-06-06 12:33:56 +02:00
Shany Pozin
5f1208296a Merge pull request #4395 from neondatabase/releases/2023-06-01
Release 2023-06-01
2023-06-01 10:58:00 +03:00
Stas Kelvich
88e8e473cd Merge pull request #4345 from neondatabase/release-23-05-25-proxy
Release 23-05-25, take 3
2023-05-25 19:40:43 +03:00
Stas Kelvich
b0a77844f6 Add SQL-over-HTTP endpoint to Proxy
This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON
response structure resembling that of the node-postgres driver. This method,
using HTTP POST, achieves smaller amortized latencies in edge setups due to
fewer round trips and an enhanced open connection reuse by the v8 engine.

This update involves several intricacies:
1. SQL injection protection: We employed the extended query protocol, modifying
   the rust-postgres driver to send queries in one roundtrip using a text
   protocol rather than binary, bypassing potential issues like those identified
   in https://github.com/sfackler/rust-postgres/issues/1030.

2. Postgres type compatibility: As not all postgres types have binary
   representations (e.g., acl's in pg_class), we adjusted rust-postgres to
   respond with text protocol, simplifying serialization and fixing queries with
   text-only types in response.

3. Data type conversion: Considering JSON supports fewer data types than
   Postgres, we perform conversions where possible, passing all other types as
   strings. Key conversions include:
   - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain
     text)
   - postgres bool, null, text -> json bool, null, string
   - postgres array -> json array
   - postgres json and jsonb -> json object

4. Alignment with node-postgres: To facilitate integration with js libraries,
   we've matched the response structure of node-postgres, returning command tags
   and column oids. Command tag capturing was added to the rust-postgres
   functionality as part of this change.
2023-05-25 17:59:17 +03:00
Vadim Kharitonov
1baf464307 Merge pull request #4309 from neondatabase/releases/2023-05-23
Release 2023-05-23
2023-05-24 11:56:54 +02:00
Alexander Bayandin
e9b8e81cea Merge branch 'release' into releases/2023-05-23 2023-05-23 12:54:08 +01:00
Alexander Bayandin
85d6194aa4 Fix regress-tests job for Postgres 15 on release branch (#4254)
## Problem

Compatibility tests don't support Postgres 15 yet, but we're still
trying to upload compatibility snapshot (which we do not collect).

Ref
https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129

## Summary of changes

Add `pg_version` parameter to `run-python-test-set` actions and do not
upload compatibility snapshot for Postgres 15
2023-05-16 17:19:12 +01:00
Vadim Kharitonov
333a7a68ef Merge pull request #4245 from neondatabase/releases/2023-05-16
Release 2023-05-16
2023-05-16 13:38:40 +02:00
Vadim Kharitonov
6aa4e41bee Merge branch 'release' into releases/2023-05-16 2023-05-16 12:48:23 +02:00
Joonas Koivunen
840183e51f try: higher page_service timeouts to isolate an issue 2023-05-11 16:24:53 +03:00
Shany Pozin
cbccc94b03 Merge pull request #4184 from neondatabase/releases/2023-05-09
Release 2023-05-09
2023-05-09 15:30:36 +03:00
Stas Kelvich
fce227df22 Merge pull request #4163 from neondatabase/main
Release 23-05-05
2023-05-05 15:56:23 +03:00
Stas Kelvich
bd787e800f Merge pull request #4133 from neondatabase/main
Release 23-04-01
2023-05-01 18:52:46 +03:00
Shany Pozin
4a7704b4a3 Merge pull request #4131 from neondatabase/sp/hotfix_adding_sks_us_west
Hotfix: Adding 4 new pageservers and two sets of safekeepers to us west 2
2023-05-01 15:17:38 +03:00
Shany Pozin
ff1119da66 Add 2 new sets of safekeepers to us-west2 2023-05-01 14:35:31 +03:00
Shany Pozin
4c3ba1627b Add 4 new Pageservers for retool launch 2023-05-01 14:34:38 +03:00
Vadim Kharitonov
1407174fb2 Merge pull request #4110 from neondatabase/vk/release_2023-04-28
Release 2023 04 28
2023-04-28 17:43:16 +02:00
Vadim Kharitonov
ec9dcb1889 Merge branch 'release' into vk/release_2023-04-28 2023-04-28 16:32:26 +02:00
Joonas Koivunen
d11d781afc revert: "Add check for duplicates of generated image layers" (#4104)
This reverts commit 732acc5.

Reverted PR: #3869

As noted in PR #4094, we do in fact try to insert duplicates to the
layer map, if L0->L1 compaction is interrupted. We do not have a proper
fix for that right now, and we are in a hurry to make a release to
production, so revert the changes related to this to the state that we
have in production currently. We know that we have a bug here, but
better to live with the bug that we've had in production for a long
time, than rush a fix to production without testing it in staging first.

Cc: #4094, #4088
2023-04-28 16:31:35 +02:00
Anastasia Lubennikova
4e44565b71 Merge pull request #4000 from neondatabase/releases/2023-04-11
Release 2023-04-11
2023-04-11 17:47:41 +03:00
Stas Kelvich
4ed51ad33b Add more proxy cnames 2023-04-11 15:59:35 +03:00
Arseny Sher
1c1ebe5537 Merge pull request #3946 from neondatabase/releases/2023-04-04
Release 2023-04-04
2023-04-04 14:38:40 +04:00
Christian Schwarz
c19cb7f386 Merge pull request #3935 from neondatabase/releases/2023-04-03
Release 2023-04-03
2023-04-03 16:19:49 +02:00
Vadim Kharitonov
4b97d31b16 Merge pull request #3896 from neondatabase/releases/2023-03-28
Release 2023-03-28
2023-03-28 17:58:06 +04:00
Shany Pozin
923ade3dd7 Merge pull request #3855 from neondatabase/releases/2023-03-21
Release 2023-03-21
2023-03-21 13:12:32 +02:00
Arseny Sher
b04e711975 Merge pull request #3825 from neondatabase/release-2023-03-15
Release 2023.03.15
2023-03-15 15:38:00 +03:00
Arseny Sher
afd0a6b39a Forward framed read buf contents to compute before proxy pass.
Otherwise they get lost. Normally buffer is empty before proxy pass, but this is
not the case with pipeline mode of out npm driver; fixes connection hangup
introduced by b80fe41af3 for it.

fixes https://github.com/neondatabase/neon/issues/3822
2023-03-15 15:36:06 +04:00
Lassi Pölönen
99752286d8 Use RollingUpdate strategy also for legacy proxy (#3814)
## Describe your changes
We have previously changed the neon-proxy to use RollingUpdate. This
should be enabled in legacy proxy too in order to avoid breaking
connections for the clients and allow for example backups to run even
during deployment. (https://github.com/neondatabase/neon/pull/3683)

## Issue ticket number and link
https://github.com/neondatabase/neon/issues/3333
2023-03-15 15:35:51 +04:00
Arseny Sher
15df93363c Merge pull request #3804 from neondatabase/release-2023-03-13
Release 2023.03.13
2023-03-13 20:25:40 +03:00
Vadim Kharitonov
bc0ab741af Merge pull request #3758 from neondatabase/releases/2023-03-07
Release 2023-03-07
2023-03-07 12:38:47 +01:00
Christian Schwarz
51d9dfeaa3 Merge pull request #3743 from neondatabase/releases/2023-03-03
Release 2023-03-03
2023-03-03 19:20:21 +01:00
Shany Pozin
f63cb18155 Merge pull request #3713 from neondatabase/releases/2023-02-28
Release 2023-02-28
2023-02-28 12:52:24 +02:00
Arseny Sher
0de603d88e Merge pull request #3707 from neondatabase/release-2023-02-24
Release 2023-02-24

Hotfix for UNLOGGED tables. Contains #3706
Also contains rebase on 14.7 and 15.2 #3581
2023-02-25 00:32:11 +04:00
Heikki Linnakangas
240913912a Fix UNLOGGED tables.
Instead of trying to create missing files on the way, send init fork contents as
main fork from pageserver during basebackup. Add test for that. Call
put_rel_drop for init forks; previously they weren't removed. Bump
vendor/postgres to revert previous approach on Postgres side.

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>

ref https://github.com/neondatabase/postgres/pull/264
ref https://github.com/neondatabase/postgres/pull/259
ref https://github.com/neondatabase/neon/issues/1222
2023-02-24 23:54:53 +04:00
MMeent
91a4ea0de2 Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581)
## Describe your changes
Rebase vendored PostgreSQL onto 14.7 and 15.2

## Issue ticket number and link

#3579

## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
    ```
The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL
14 and 15.2 for PostgreSQL 15.
    ```
2023-02-24 23:54:42 +04:00
Arseny Sher
8608704f49 Merge pull request #3691 from neondatabase/release-2023-02-23
Release 2023-02-23

Hotfix for the unlogged tables with indexes issue.

neondatabase/postgres#259
neondatabase/postgres#262
2023-02-23 13:39:33 +04:00
Arseny Sher
efef68ce99 Bump vendor/postgres to include hotfix for unlogged tables with indexes.
https://github.com/neondatabase/postgres/pull/259
https://github.com/neondatabase/postgres/pull/262
2023-02-23 08:49:43 +04:00
Joonas Koivunen
8daefd24da Merge pull request #3679 from neondatabase/releases/2023-02-22
Releases/2023-02-22
2023-02-22 15:56:55 +02:00
Arthur Petukhovsky
46cc8b7982 Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)
We migrated all timelines to
`safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be
removed.
2023-02-22 15:07:57 +02:00
Sergey Melnikov
38cd90dd0c Add -v to ansible invocations (#3670)
To get more debug output on failures
2023-02-22 15:07:57 +02:00
Joonas Koivunen
a51b269f15 fix: hold permit until GetObject eof (#3663)
previously we applied the ratelimiting only up to receiving the headers
from s3, or somewhere near it. the commit adds an adapter which carries
the permit until the AsyncRead has been disposed.

fixes #3662.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
43bf6d0a0f calculate_logical_size: no longer use spawn_blocking (#3664)
Calculation of logical size is now async because of layer downloads, so
we shouldn't use spawn_blocking for it. Use of `spawn_blocking`
exhausted resources which are needed by `tokio::io::copy` when copying
from a stream to a file which lead to deadlock.

Fixes: #3657
2023-02-22 15:07:57 +02:00
Joonas Koivunen
15273a9b66 chore: ignore all compaction inactive tenant errors (#3665)
these are happening in tests because of #3655 but they sure took some
time to appear.

makes the `Compaction failed, retrying in 2s: Cannot run compaction
iteration on inactive tenant` into a globally allowed error, because it
has been seen failing on different test cases.
2023-02-22 15:07:57 +02:00
Joonas Koivunen
78aca668d0 fix: log download failed error (#3661)
Fixes #3659
2023-02-22 15:07:57 +02:00
Vadim Kharitonov
acbf4148ea Merge pull request #3656 from neondatabase/releases/2023-02-21
Release 2023-02-21
2023-02-21 16:03:48 +01:00
Vadim Kharitonov
6508540561 Merge branch 'release' into releases/2023-02-21 2023-02-21 15:31:16 +01:00
Arthur Petukhovsky
a41b5244a8 Add new safekeeper to ap-southeast-1 prod (#3645) (#3646)
To trigger deployment of #3645 to production.
2023-02-20 15:22:49 +00:00
Shany Pozin
2b3189be95 Merge pull request #3600 from neondatabase/releases/2023-02-14
Release 2023-02-14
2023-02-15 13:31:30 +02:00
Vadim Kharitonov
248563c595 Merge pull request #3553 from neondatabase/releases/2023-02-07
Release 2023-02-07
2023-02-07 14:07:44 +01:00
Vadim Kharitonov
14cd6ca933 Merge branch 'release' into releases/2023-02-07 2023-02-07 12:11:56 +01:00
Vadim Kharitonov
eb36403e71 Release 2023 01 31 (#3497)
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Christian Schwarz <christian@neon.tech>
Co-authored-by: Alexey Kondratov <kondratov.aleksey@gmail.com>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Shany Pozin <shany@neon.tech>
Co-authored-by: Sergey Melnikov <sergey@neon.tech>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Lassi Pölönen <lassi.polonen@iki.fi>
2023-01-31 15:06:35 +02:00
Anastasia Lubennikova
3c6f779698 Merge pull request #3411 from neondatabase/release_2023_01_23
Fix Release 2023 01 23
2023-01-23 20:10:03 +02:00
Joonas Koivunen
f67f0c1c11 More tenant size fixes (#3410)
Small changes, but hopefully this will help with the panic detected in
staging, for which we cannot get the debugging information right now
(end-of-branch before branch-point).
2023-01-23 17:46:13 +02:00
Shany Pozin
edb02d3299 Adding pageserver3 to staging (#3403) 2023-01-23 17:46:13 +02:00
Konstantin Knizhnik
664a69e65b Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) 2023-01-23 17:46:13 +02:00
Anastasia Lubennikova
478322ebf9 Fix tenant size orphans (#3377)
Before only the timelines which have passed the `gc_horizon` were
processed which failed with orphans at the tree_sort phase. Example
input in added `test_branched_empty_timeline_size` test case.

The PR changes iteration to happen through all timelines, and in
addition to that, any learned branch points will be calculated as they
would had been in the original implementation if the ancestor branch had
been over the `gc_horizon`.

This also changes how tenants where all timelines are below `gc_horizon`
are handled. Previously tenant_size 0 was returned, but now they will
have approximately `initdb_lsn` worth of tenant_size.

The PR also adds several new tenant size tests that describe various corner
cases of branching structure and `gc_horizon` setting.
They are currently disabled to not consume time during CI.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-01-23 17:46:13 +02:00
Joonas Koivunen
802f174072 fix: dont stop pageserver if we fail to calculate synthetic size 2023-01-23 17:46:13 +02:00
Alexey Kondratov
47f9890bae [compute_ctl] Make role deletion spec processing idempotent (#3380)
Previously, we were trying to re-assign owned objects of the already
deleted role. This were causing a crash loop in the case when compute
was restarted with a spec that includes delta operation for role
deletion. To avoid such cases, check that role is still present before
calling `reassign_owned_objects`.

Resolves neondatabase/cloud#3553
2023-01-23 17:46:13 +02:00
Christian Schwarz
262265daad Revert "Use actual temporary dir for pageserver unit tests"
This reverts commit 826e89b9ce.

The problem with that commit was that it deletes the TempDir while
there are still EphemeralFile instances open.

At first I thought this could be fixed by simply adding

  Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None))

to TenantHarness::drop, but it turned out to be insufficient.

So, reverting the commit until we find a proper solution.

refs https://github.com/neondatabase/neon/issues/3385
2023-01-23 17:46:13 +02:00
bojanserafimov
300da5b872 Improve layer map docstrings (#3382) 2023-01-23 17:46:13 +02:00
Heikki Linnakangas
7b22b5c433 Switch to 'tracing' for logging, restructure code to make use of spans.
Refactors Compute::prepare_and_run. It's split into subroutines
differently, to make it easier to attach tracing spans to the
different stages. The high-level logic for waiting for Postgres to
exit is moved to the caller.

Replace 'env_logger' with 'tracing', and add `#instrument` directives
to different stages fo the startup process. This is a fairly
mechanical change, except for the changes in 'spec.rs'. 'spec.rs'
contained some complicated formatting, where parts of log messages
were printed directly to stdout with `print`s. That was a bit messed
up because the log normally goes to stderr, but those lines were
printed to stdout. In our docker images, stderr and stdout both go to
the same place so you wouldn't notice, but I don't think it was
intentional.

This changes the log format to the default
'tracing_subscriber::format' format. It's different from the Postgres
log format, however, and because both compute_tools and Postgres print
to the same log, it's now a mix of two different formats.  I'm not
sure how the Grafana log parsing pipeline can handle that. If it's a
problem, we can build custom formatter to change the compute_tools log
format to be the same as Postgres's, like it was before this commit,
or we can change the Postgres log format to match tracing_formatter's,
or we can start printing compute_tool's log output to a different
destination than Postgres
2023-01-23 17:46:12 +02:00
Kirill Bulatov
ffca97bc1e Enable logs in unit tests 2023-01-23 17:46:12 +02:00
Kirill Bulatov
cb356f3259 Use actual temporary dir for pageserver unit tests 2023-01-23 17:46:12 +02:00
Vadim Kharitonov
c85374295f Change SENTRY_ENVIRONMENT from "development" to "staging" 2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
4992160677 Fix metric_collection_endpoint for prod.
It was incorrectly set to staging url
2023-01-23 17:46:12 +02:00
Heikki Linnakangas
bd535b3371 If an error happens while checking for core dumps, don't panic.
If we panic, we skip the 30s wait in 'main', and don't give the
console a chance to observe the error. Which is not nice.

Spotted by @ololobus at
https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981
2023-01-23 17:46:12 +02:00
Kirill Bulatov
d90c5a03af Add more io::Error context when fail to operate on a path (#3254)
I have a test failure that shows 

```
Caused by:
    0: Failed to reconstruct a page image:
    1: Directory not empty (os error 39)
```

but does not really show where exactly that happens.

https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d

The PR aims to add more context in debugging that issue.
2023-01-23 17:46:12 +02:00
Anastasia Lubennikova
2d02cc9079 Merge pull request #3365 from neondatabase/main
Release 2023-01-17
2023-01-17 16:41:34 +02:00
Christian Schwarz
49ad94b99f Merge pull request #3301 from neondatabase/release-2023-01-10
Release 2023-01-10
2023-01-10 16:42:26 +01:00
Christian Schwarz
948a217398 Merge commit '95bf19b85a06b27a7fc3118dee03d48648efab15' into release-2023-01-10
Conflicts:
        .github/helm-values/neon-stress.proxy-scram.yaml
        .github/helm-values/neon-stress.proxy.yaml
        .github/helm-values/staging.proxy-scram.yaml
        .github/helm-values/staging.proxy.yaml
        All of the above were deleted in `main` after we hotfixed them
        in `release. Deleting them here
        storage_broker/src/bin/storage_broker.rs
        Hotfix toned down logging, but `main` has sinced implemented
        a proper fix. Taken `main`'s side, see
        https://neondb.slack.com/archives/C033RQ5SPDH/p1673354385387479?thread_ts=1673354306.474729&cid=C033RQ5SPDH

closes https://github.com/neondatabase/neon/issues/3287
2023-01-10 15:40:14 +01:00
Dmitry Rodionov
125381eae7 Merge pull request #3236 from neondatabase/dkr/retrofit-sk4-sk4-change
Move zenith-1-sk-3 to zenith-1-sk-4 (#3164)
2022-12-30 14:13:50 +03:00
Arthur Petukhovsky
cd01bbc715 Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) 2022-12-30 12:32:52 +02:00
Dmitry Rodionov
d8b5e3b88d Merge pull request #3229 from neondatabase/dkr/add-pageserver-for-release
add pageserver to new region see https://github.com/neondatabase/aws/pull/116

decrease log volume for pageserver
2022-12-30 12:34:04 +03:00
Dmitry Rodionov
06d25f2186 switch to debug from info to produce less noise 2022-12-29 17:48:47 +02:00
Dmitry Rodionov
f759b561f3 add pageserver to new region see https://github.com/neondatabase/aws/pull/116 2022-12-29 17:17:35 +02:00
Sergey Melnikov
ece0555600 Push proxy metrics to Victoria Metrics (#3106) 2022-12-16 14:44:49 +02:00
Joonas Koivunen
73ea0a0b01 fix(remote_storage): use cached credentials (#3128)
IMDSv2 has limits, and if we query it on every s3 interaction we are
going to go over those limits. Changes the s3_bucket client
configuration to use:
- ChainCredentialsProvider to handle env variables or imds usage
- LazyCachingCredentialsProvider to actually cache any credentials

Related: https://github.com/awslabs/aws-sdk-rust/issues/629
Possibly related: https://github.com/neondatabase/neon/issues/3118
2022-12-16 14:44:49 +02:00
Arseny Sher
d8f6d6fd6f Merge pull request #3126 from neondatabase/broker-lb-release
Deploy broker with L4 LB in new env.
2022-12-16 01:25:28 +03:00
Arseny Sher
d24de169a7 Deploy broker with L4 LB in new env.
Seems to be fixing issue with missing keepalives.
2022-12-16 01:45:32 +04:00
Arseny Sher
0816168296 Hotfix: terminate subscription if channel is full.
Might help as a hotfix, but need to understand root better.
2022-12-15 12:23:56 +03:00
Dmitry Rodionov
277b44d57a Merge pull request #3102 from neondatabase/main
Hotfix. See commits for details
2022-12-14 19:38:43 +03:00
MMeent
68c2c3880e Merge pull request #3038 from neondatabase/main
Release 22-12-14
2022-12-14 14:35:47 +01:00
Arthur Petukhovsky
49da498f65 Merge pull request #2833 from neondatabase/main
Release 2022-11-16
2022-11-17 08:44:10 +01:00
Stas Kelvich
2c76ba3dd7 Merge pull request #2718 from neondatabase/main-rc-22-10-28
Release 22-10-28
2022-10-28 20:33:56 +03:00
Arseny Sher
dbe3dc69ad Merge branch 'main' into main-rc-22-10-28
Release 22-10-28.
2022-10-28 19:10:11 +04:00
Arseny Sher
8e5bb3ed49 Enable etcd compaction in neon_local. 2022-10-27 12:53:20 +03:00
Stas Kelvich
ab0be7b8da Avoid debian-testing packages in compute Dockerfiles
plv8 can only be built with a fairly new gold linker version. We used to install
it via binutils packages from testing, but it also updates libc and that causes
troubles in the resulting image as different extensions were built against
different libc versions. We could either use libc from debian-testing everywhere
or restrain from using testing packages and install necessary programs manually.
This patch uses the latter approach: gold for plv8 and cmake for h3 are
installed manually.

In a passing declare h3_postgis as a safe extension (previous omission).
2022-10-27 12:53:20 +03:00
bojanserafimov
b4c55f5d24 Move pagestream api to libs/pageserver_api (#2698) 2022-10-27 12:53:20 +03:00
mikecaat
ede70d833c Add a docker-compose example file (#1943) (#2666)
Co-authored-by: Masahiro Ikeda <masahiro.ikeda.us@hco.ntt.co.jp>
2022-10-27 12:53:20 +03:00
Sergey Melnikov
70c3d18bb0 Do not release to new staging proxies on release (#2685) 2022-10-27 12:53:20 +03:00
bojanserafimov
7a491f52c4 Add draw_timeline binary (#2688) 2022-10-27 12:53:20 +03:00
Alexander Bayandin
323c4ecb4f Add data format backward compatibility tests (#2626) 2022-10-27 12:53:20 +03:00
Anastasia Lubennikova
3d2466607e Merge pull request #2692 from neondatabase/main-rc
Release 2022-10-25
2022-10-25 18:18:58 +03:00
Anastasia Lubennikova
ed478b39f4 Merge branch 'release' into main-rc 2022-10-25 17:06:33 +03:00
Stas Kelvich
91585a558d Merge pull request #2678 from neondatabase/stas/hotfix_schema
Hotfix to disable grant create on public schema
2022-10-22 02:54:31 +03:00
Stas Kelvich
93467eae1f Hotfix to disable grant create on public schema
`GRANT CREATE ON SCHEMA public` fails if there is no schema `public`.
Disable it in release for now and make a better fix later (it is
needed for v15 support).
2022-10-22 02:26:28 +03:00
Stas Kelvich
f3aac81d19 Merge pull request #2668 from neondatabase/main
Release 2022-10-21
2022-10-21 15:21:42 +03:00
Stas Kelvich
979ad60c19 Merge pull request #2581 from neondatabase/main
Release 2022-10-07
2022-10-07 16:50:55 +03:00
Stas Kelvich
9316cb1b1f Merge pull request #2573 from neondatabase/main
Release 2022-10-06
2022-10-07 11:07:06 +03:00
Anastasia Lubennikova
e7939a527a Merge pull request #2377 from neondatabase/main
Release 2022-09-01
2022-09-01 20:20:44 +03:00
Arthur Petukhovsky
36d26665e1 Merge pull request #2299 from neondatabase/main
* Check for entire range during sasl validation (#2281)

* Gen2 GH runner (#2128)

* Re-add rustup override

* Try s3 bucket

* Set git version

* Use v4 cache key to prevent problems

* Switch to v5 for key

* Add second rustup fix

* Rebase

* Add kaniko steps

* Fix typo and set compress level

* Disable global run default

* Specify shell for step

* Change approach with kaniko

* Try less verbose shell spec

* Add submodule pull

* Add promote step

* Adjust dependency chain

* Try default swap again

* Use env

* Don't override aws key

* Make kaniko build conditional

* Specify runs on

* Try without dependency link

* Try soft fail

* Use image with git

* Try passing to next step

* Fix duplicate

* Try other approach

* Try other approach

* Fix typo

* Try other syntax

* Set env

* Adjust setup

* Try step 1

* Add link

* Try global env

* Fix mistake

* Debug

* Try other syntax

* Try other approach

* Change order

* Move output one step down

* Put output up one level

* Try other syntax

* Skip build

* Try output

* Re-enable build

* Try other syntax

* Skip middle step

* Update check

* Try first step of dockerhub push

* Update needs dependency

* Try explicit dir

* Add missing package

* Try other approach

* Try other approach

* Specify region

* Use with

* Try other approach

* Add debug

* Try other approach

* Set region

* Follow AWS example

* Try github approach

* Skip Qemu

* Try stdin

* Missing steps

* Add missing close

* Add echo debug

* Try v2 endpoint

* Use v1 endpoint

* Try without quotes

* Revert

* Try crane

* Add debug

* Split steps

* Fix duplicate

* Add shell step

* Conform to options

* Add verbose flag

* Try single step

* Try workaround

* First request fails hunch

* Try bullseye image

* Try other approach

* Adjust verbose level

* Try previous step

* Add more debug

* Remove debug step

* Remove rogue indent

* Try with larger image

* Add build tag step

* Update workflow for testing

* Add tag step for test

* Remove unused

* Update dependency chain

* Add ownership fix

* Use matrix for promote

* Force update

* Force build

* Remove unused

* Add new image

* Add missing argument

* Update dockerfile copy

* Update Dockerfile

* Update clone

* Update dockerfile

* Go to correct folder

* Use correct format

* Update dockerfile

* Remove cd

* Debug find where we are

* Add debug on first step

* Changedir to postgres

* Set workdir

* Use v1 approach

* Use other dependency

* Try other approach

* Try other approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update approach

* Update dockerfile

* Update dockerfile

* Add workspace hack

* Update Dockerfile

* Update Dockerfile

* Update Dockerfile

* Change last step

* Cleanup pull in prep for review

* Force build images

* Add condition for latest tagging

* Use pinned version

* Try without name value

* Remove more names

* Shorten names

* Add kaniko comments

* Pin kaniko

* Pin crane and ecr helper

* Up one level

* Switch to pinned tag for rust image

* Force update for test

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>

* Add missing step output, revert one deploy step (#2285)

* Add missing step output, revert one deploy step

* Conform to syntax

* Update approach

* Add missing value

* Add missing needs

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Error for fatal not git repo (#2286)

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Use main, not branch for ref check (#2288)

* Use main, not branch for ref check

* Add more debug

* Count main, not head

* Try new approach

* Conform to syntax

* Update approach

* Get full history

* Skip checkout

* Cleanup debug

* Remove more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix docker zombie process issue (#2289)

* Fix docker zombie process issue

* Init everywhere

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Fix 1.63 clippy lints (#2282)

* split out timeline metrics, track layer map loading and size calculation

* reset rust cache for clippy run to avoid an ICE

additionally remove trailing whitespaces

* Rename pg_control_ffi.h to bindgen_deps.h, for clarity.

The pg_control_ffi.h name implies that it only includes stuff related to
pg_control.h. That's mostly true currently, but really the point of the
file is to include everything that we need to generate Rust definitions
from.

* Make local mypy behave like CI mypy (#2291)

* Fix flaky pageserver restarts in tests (#2261)

* Remove extra type aliases (#2280)

* Update cachepot endpoint (#2290)

* Update cachepot endpoint

* Update dockerfile & remove env

* Update image building process

* Cannot use metadata endpoint for this

* Update workflow

* Conform to kaniko syntax

* Update syntax

* Update approach

* Update dockerfiles

* Force update

* Update dockerfiles

* Update dockerfile

* Cleanup dockerfiles

* Update s3 test location

* Revert s3 experiment

* Add more debug

* Specify aws region

* Remove debug, add prefix

* Remove one more debug

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* workflows/benchmarking: increase timeout (#2294)

* Rework `init` in pageserver CLI  (#2272)

* Do not create initial tenant and timeline (adjust Python tests for that)
* Rework config handling during init, add --update-config to manage local config updates

* Fix: Always build images (#2296)

* Always build images

* Remove unused

Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>

* Move auto-generated 'bindings' to a separate inner module.

Re-export only things that are used by other modules.

In the future, I'm imagining that we run bindgen twice, for Postgres
v14 and v15. The two sets of bindings would go into separate
'bindings_v14' and 'bindings_v15' modules.

Rearrange postgres_ffi modules.

Move function, to avoid Postgres version dependency in timelines.rs
Move function to generate a logical-message WAL record to postgres_ffi.

* fix cargo test

* Fix walreceiver and safekeeper bugs (#2295)

- There was an issue with zero commit_lsn `reason: LaggingWal { current_commit_lsn: 0/0, new_commit_lsn: 1/6FD90D38, threshold: 10485760 } }`. The problem was in `send_wal.rs`, where we initialized `end_pos = Lsn(0)` and in some cases sent it to the pageserver.
- IDENTIFY_SYSTEM previously returned `flush_lsn` as a physical end of WAL. Now it returns `flush_lsn` (as it was) to walproposer and `commit_lsn` to everyone else including pageserver.
- There was an issue with backoff where connection was cancelled right after initialization: `connected!` -> `safekeeper_handle_db: Connection cancelled` -> `Backoff: waiting 3 seconds`. The problem was in sleeping before establishing the connection. This is fixed by reworking retry logic.
- There was an issue with getting `NoKeepAlives` reason in a loop. The issue is probably the same as the previous.
- There was an issue with filtering safekeepers based on retry attempts, which could filter some safekeepers indefinetely. This is fixed by using retry cooldown duration instead of retry attempts.
- Some `send_wal.rs` connections failed with errors without context. This is fixed by adding a timeline to safekeepers errors.

New retry logic works like this:
- Every candidate has a `next_retry_at` timestamp and is not considered for connection until that moment
- When walreceiver connection is closed, we update `next_retry_at` using exponential backoff, increasing the cooldown on every disconnect.
- When `last_record_lsn` was advanced using the WAL from the safekeeper, we reset the retry cooldown and exponential backoff, allowing walreceiver to reconnect to the same safekeeper instantly.

* on safekeeper registration pass availability zone param (#2292)

Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@b04468bf-cdf4-41eb-9c94-aff4ca55e4bf.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@4795e9ee-4f32-401f-85f3-f316263b62b8.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@2f8bc4e5-4ec2-4ea2-adb1-65d863c4a558.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@27565b2b-72d5-4742-9898-a26c9033e6f9.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@ecc96c26-c6c4-4664-be6e-34f7c3f89a3c.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@7caff3a5-bf03-4202-bd0e-f1a93c86bdae.fritz.box>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Anton Galitsyn <agalitsyn@users.noreply.github.com>
2022-08-18 15:32:33 +03:00
Arthur Petukhovsky
873347f977 Merge pull request #2275 from neondatabase/main
* github/workflows: Fix git dubious ownership (#2223)

* Move relation size cache from WalIngest to DatadirTimeline (#2094)

* Move relation sie cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Reestore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* refactor: replace lazy-static with once-cell (#2195)

- Replacing all the occurrences of lazy-static with `once-cell::sync::Lazy`
- fixes #1147

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>

* Add more buckets to pageserver latency metrics (#2225)

* ignore record property warning to fix benchmarks

* increase statement timeout

* use event so it fires only if workload thread successfully finished

* remove debug log

* increase timeout to pass test with real s3

* avoid duplicate parameter, increase timeout

* Major migration script (#2073)

This script can be used to migrate a tenant across breaking storage versions, or (in the future) upgrading postgres versions. See the comment at the top for an overview.

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>

* Fix etcd typos

* Fix links to safekeeper protocol docs. (#2188)

safekeeper/README_PROTO.md was moved to docs/safekeeper-protocol.md in
commit 0b14fdb078, as part of reorganizing the docs into 'mdbook' format.

Fixes issue #1475. Thanks to @banks for spotting the outdated references.

In addition to fixing the above issue, this patch also fixes other broken links as a result of 0b14fdb078. See https://github.com/neondatabase/neon/pull/2188#pullrequestreview-1055918480.

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>

* Update CONTRIBUTING.md

* Update CONTRIBUTING.md

* support node id and remote storage params in docker_entrypoint.sh

* Safe truncate (#2218)

* Move relation sie cache to layered timeline

* Fix obtaining current LSN for relation size cache

* Resolve merge conflicts

* Resolve merge conflicts

* Reestore 'lsn' field in DatadirModification

* adjust DatadirModification lsn in ingest_record

* Fix formatting

* Pass lsn to get_relsize

* Fix merge conflict

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Update pageserver/src/pgdatadir_mapping.rs

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Check if relation exists before trying to truncat it

refer #1932

* Add test reporducing FSM truncate problem

Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>

* Fix exponential backoff values

* Update back `vendor/postgres` back; it was changed accidentally. (#2251)

Commit 4227cfc96e accidentally reverted vendor/postgres to an older
version. Update it back.

* Add pageserver checkpoint_timeout option.

To flush inmemory layer eventually when no new data arrives, which helps
safekeepers to suspend activity (stop pushing to the broker). Default 10m should
be ok.

* Share exponential backoff code and fix logic for delete task failure (#2252)

* Fix bug when import large (>1GB) relations (#2172)

Resolves #2097 

- use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get`
- update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. 
  + `small` is the old test
  + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build

* Fix timeline physical size flaky tests (#2244)

Resolves #2212.

- use `wait_for_last_flush_lsn` in `test_timeline_physical_size_*` tests

## Context
Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers into disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk.

### Note
The `LayerFlushThread` threads are processed **concurrently**, so it's possible that the above error still persists even with this patch. However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly

* postgres_ffi/waldecoder: validate more header fields

* postgres_ffi/waldecoder: remove unused startlsn

* postgres_ffi/waldecoder: introduce explicit `enum State`

Previously it was emulated with a combination of nullable fields.
This change should make the logic more readable.

* disable `test_import_from_pageserver_multisegment` (#2258)

This test failed consistently on `main` now. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause for the test failure.

See: #2255, #2256

* get_binaries uses DOCKER_TAG taken from docker image build step (#2260)

* [proxy] Rework wire format of the password hack and some errors (#2236)

The new format has a few benefits: it's shorter, simpler and
human-readable as well. We don't use base64 anymore, since
url encoding got us covered.

We also show a better error in case we couldn't parse the
payload; the users should know it's all about passing the
correct project name.

* test_runner/pg_clients: collect docker logs (#2259)

* get_binaries script fix (#2263)

* get_binaries uses DOCKER_TAG taken from docker image build step

* remove docker tag discovery at all and fix get_binaries for version variable

* Better storage sync logs (#2268)

* Find end of WAL on safekeepers using WalStreamDecoder.

We could make it inside wal_storage.rs, but taking into account that
 - wal_storage.rs reading is async
 - we don't need s3 here
 - error handling is different; error during decoding is normal
I decided to put it separately.

Test
cargo test test_find_end_of_wal_last_crossing_segment
prepared earlier by @yeputons passes now.

Fixes https://github.com/neondatabase/neon/issues/544
      https://github.com/neondatabase/cloud/issues/2004
Supersedes https://github.com/neondatabase/neon/pull/2066

* Improve walreceiver logic (#2253)

This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers.

- There was a bug which looks like `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered all safekeepers in some strange cases. I removed this filter, it should probably help with #2237
- Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down.
- Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second.
- `LaggingWal` check now uses `commit_lsn` directly from safekeeper. This fixes the issue with often reconnects, when compute generates WAL really fast.
- `NoWalTimeout` is rewritten to trigger only when we know about the new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper has stuck.

* increase timeout in wait_for_upload to avoid spurious failures when testing with real s3

* Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274)

* Set up a workflow to run pgbench against captest (#2077)

Signed-off-by: Ankur Srivastava <best.ankur@gmail.com>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Konstantin Knizhnik <knizhnik@garret.ru>
Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>
Co-authored-by: Ankur Srivastava <ansrivas@users.noreply.github.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
Co-authored-by: Dmitry Rodionov <dmitry@neon.tech>
Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Kirill Bulatov <kirill@neon.tech>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
Co-authored-by: Thang Pham <thang@neon.tech>
Co-authored-by: Stas Kelvich <stas.kelvich@gmail.com>
Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
Co-authored-by: Egor Suvorov <egor@neon.tech>
Co-authored-by: Andrey Taranik <andrey@cicd.team>
Co-authored-by: Dmitry Ivanov <ivadmi5@gmail.com>
2022-08-15 21:30:45 +03:00
Arthur Petukhovsky
e814ac16f9 Merge pull request #2219 from neondatabase/main
Release 2022-08-04
2022-08-04 20:06:34 +03:00
Heikki Linnakangas
ad3055d386 Merge pull request #2203 from neondatabase/release-uuid-ossp
Deploy new storage and compute version to production

Release 2022-08-02
2022-08-02 15:08:14 +03:00
Heikki Linnakangas
94e03eb452 Merge remote-tracking branch 'origin/main' into 'release'
Release 2022-08-01
2022-08-02 12:43:49 +03:00
Sergey Melnikov
380f26ef79 Merge pull request #2170 from neondatabase/main (Release 2022-07-28)
Release 2022-07-28
2022-07-28 14:16:52 +03:00
Arthur Petukhovsky
3c5b7f59d7 Merge pull request #2119 from neondatabase/main
Release 2022-07-19
2022-07-19 11:58:48 +03:00
Arthur Petukhovsky
fee89f80b5 Merge pull request #2115 from neondatabase/main-2022-07-18
Release 2022-07-18
2022-07-18 19:21:11 +03:00
Arthur Petukhovsky
41cce8eaf1 Merge remote-tracking branch 'origin/release' into main-2022-07-18 2022-07-18 18:21:20 +03:00
Alexey Kondratov
f88fe0218d Merge pull request #1842 from neondatabase/release-deploy-hotfix
[HOTFIX] Release deploy fix

This PR uses this branch neondatabase/postgres#171 and several required commits from the main to use only locally built compute-tools. This should allow us to rollout safekeepers sync issue fix on prod
2022-06-01 11:04:30 +03:00
Alexey Kondratov
cc856eca85 Install missing openssl packages in the Github Actions workflow 2022-05-31 21:31:31 +02:00
Alexey Kondratov
cf350c6002 Use :local compute-tools tag to build compute-node image 2022-05-31 21:31:16 +02:00
Arseny Sher
0ce6b6a0a3 Merge pull request #1836 from neondatabase/release-hotfix-basebackup-lsn-page-boundary
Bump vendor/postgres to hotfix basebackup LSN comparison.
2022-05-31 16:54:03 +04:00
Arseny Sher
73f247d537 Bump vendor/postgres to hotfix basebackup LSN comparison. 2022-05-31 16:00:50 +04:00
Andrey Taranik
960be82183 Merge pull request #1792 from neondatabase/main
Release 2202-05-25 (second)
2022-05-25 16:37:57 +03:00
Andrey Taranik
806e5a6c19 Merge pull request #1787 from neondatabase/main
Release 2022-05-25
2022-05-25 13:34:11 +03:00
Alexey Kondratov
8d5df07cce Merge pull request #1385 from zenithdb/main
Release main 2022-03-22
2022-03-22 05:04:34 -05:00
Andrey Taranik
df7a9d1407 release fix 2022-03-16 (#1375) 2022-03-17 00:43:28 +03:00
78 changed files with 1186 additions and 4014 deletions

View File

@@ -21,5 +21,4 @@
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf

View File

@@ -209,4 +209,4 @@ runs:
uses: ./.github/actions/allure-report-store
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
unique-key: ${{ inputs.build_type }}

View File

@@ -955,15 +955,22 @@ jobs:
version: [ v14, v15 ]
env:
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}
- name: Create postgres-extensions container
id: create-container
@@ -971,23 +978,46 @@ jobs:
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT
CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT
- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/
# In compute image we have a bit different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib
# Delete Neon extensitons (they always present on compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*
# Delete leftovers from the extension build step
rm -rf ./extensions-to-upload/lib/pgxs
rm -rf ./extensions-to-upload/lib/pkgconfig
docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share
mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
- name: Upload postgres-extensions to S3
# TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json)
if: false
run: |
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
for BUCKET in $(echo ${S3_BUCKETS}); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done
- name: Cleanup
if: ${{ always() && steps.create-container.outputs.EID }}
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true
deploy:

24
Cargo.lock generated
View File

@@ -2654,6 +2654,16 @@ dependencies = [
"windows-sys 0.45.0",
]
[[package]]
name = "pbkdf2"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
dependencies = [
"digest",
"hmac",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -2772,7 +2782,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2785,7 +2795,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"native-tls",
"tokio",
@@ -2796,7 +2806,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -2814,7 +2824,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3030,7 +3040,6 @@ dependencies = [
"chrono",
"clap",
"consumption_metrics",
"fallible-iterator",
"futures",
"git-version",
"hashbrown 0.13.2",
@@ -3048,9 +3057,9 @@ dependencies = [
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
"postgres-protocol",
"postgres_backend",
"pq_proto",
"prometheus",
@@ -3074,7 +3083,6 @@ dependencies = [
"thiserror",
"tls-listener",
"tokio",
"tokio-native-tls",
"tokio-postgres",
"tokio-postgres-rustls",
"tokio-rustls 0.23.4",
@@ -4306,7 +4314,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"async-trait",
"byteorder",

View File

@@ -88,6 +88,7 @@ opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
pbkdf2 = "0.12.1"
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
@@ -143,11 +144,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -182,7 +183,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
################# Binary contents sections

View File

@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
#########################################################################################
#
@@ -77,7 +77,6 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
./autogen.sh && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -90,28 +89,17 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
mkdir -p /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && cd build && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
#########################################################################################
#
@@ -431,16 +419,12 @@ RUN apt-get update && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
mkdir build && cd build && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
#########################################################################################
#
@@ -551,8 +535,10 @@ FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -567,17 +553,16 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.ta
FROM build-deps AS pg-anon-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
find /usr/local/pgsql -type f | sort > /before.txt && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
find /usr/local/pgsql -type f | sort > /after.txt && \
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
#########################################################################################
#
@@ -769,23 +754,16 @@ RUN rm /usr/local/pgsql/lib/lib*.a
# Extenstion only
#
#########################################################################################
FROM python:3.9-slim-bullseye AS generate-ext-index
ARG PG_VERSION
ARG BUILD_TAG
RUN apt update && apt install -y zstd
# copy the control files here
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
COPY --from=pg-anon-pg-build /extensions/ /extensions/
COPY --from=postgis-build /extensions/ /extensions/
COPY scripts/combine_control_files.py ./combine_control_files.py
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
FROM scratch AS postgres-extensions
# After the transition this layer will include all extensitons.
# As for now, it's only a couple for testing purposses
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
COPY --from=generate-ext-index /ext_index.json /ext_index.json
# As for now, it's only for new custom ones
#
# # Default extensions
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
# Custom extensions
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
#########################################################################################
#

View File

@@ -108,8 +108,6 @@ postgres-%: postgres-configure-% \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install
+@echo "Compiling pageinspect $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+@echo "Compiling amcheck $*"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
.PHONY: postgres-clean-%
postgres-clean-%:

View File

@@ -193,13 +193,6 @@ fn main() -> Result<()> {
if !spec_set {
// No spec provided, hang waiting for it.
info!("no compute spec provided, waiting");
// TODO this can stall startups in the unlikely event that we bind
// this compute node while it's busy prewarming. It's not too
// bad because it's just 100ms and unlikely, but it's an
// avoidable problem.
compute.prewarm_postgres()?;
let mut state = compute.state.lock().unwrap();
while state.status != ComputeStatus::ConfigurationPending {
state = compute.state_changed.wait(state).unwrap();

View File

@@ -8,11 +8,9 @@ use std::sync::{Condvar, Mutex};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio_postgres;
use tracing::{error, info, instrument, warn};
use tracing::{info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -23,7 +21,6 @@ use utils::measured_stream::MeasuredReader;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
@@ -89,7 +86,6 @@ pub struct ParsedSpec {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
}
@@ -107,21 +103,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
} else {
vec![]
}
} else {
spec.safekeeper_connstrings.clone()
};
let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
@@ -147,7 +128,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
Ok(ParsedSpec {
spec,
pageserver_connstr,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
timeline_id,
@@ -329,102 +309,6 @@ impl ComputeNode {
Ok(())
}
pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
) -> Result<Option<Lsn>> {
// Construct a connection config for each safekeeper
let pspec: ParsedSpec = compute_state
.pspec
.as_ref()
.expect("spec must be set")
.clone();
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
// Format connstr
let id = connstr.clone();
let connstr = format!("postgresql://no_user@{}", connstr);
let options = format!(
"-c timeline_id={} tenant_id={}",
pspec.timeline_id, pspec.tenant_id
);
// Construct client
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
config.options(&options);
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
config.password(storage_auth_token);
}
(id, config)
});
// Create task set to query all safekeepers
let mut tasks = FuturesUnordered::new();
let quorum = sk_configs.len() / 2 + 1;
for (id, config) in sk_configs {
let timeout = tokio::time::Duration::from_millis(100);
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
tasks.push(tokio::spawn(task));
}
// Get a quorum of responses or errors
let mut responses = Vec::new();
let mut join_errors = Vec::new();
let mut task_errors = Vec::new();
let mut timeout_errors = Vec::new();
while let Some(response) = tasks.next().await {
match response {
Ok(Ok(Ok(r))) => responses.push(r),
Ok(Ok(Err(e))) => task_errors.push(e),
Ok(Err(e)) => timeout_errors.push(e),
Err(e) => join_errors.push(e),
};
if responses.len() >= quorum {
break;
}
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
break;
}
}
// In case of error, log and fail the check, but don't crash.
// We're playing it safe because these errors could be transient
// and we don't yet retry. Also being careful here allows us to
// be backwards compatible with safekeepers that don't have the
// TIMELINE_STATUS API yet.
if responses.len() < quorum {
error!(
"failed sync safekeepers check {:?} {:?} {:?}",
join_errors, task_errors, timeout_errors
);
return Ok(None);
}
Ok(check_if_synced(responses))
}
// Fast path for sync_safekeepers. If they're already synced we get the lsn
// in one roundtrip. If not, we should do a full sync_safekeepers.
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
let start_time = Utc::now();
// Run actual work with new tokio runtime
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));
// Record runtime
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
result
}
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip_all)]
@@ -487,14 +371,10 @@ impl ComputeNode {
// cannot sync safekeepers.
let lsn = match spec.mode {
ComputeMode::Primary => {
info!("checking if safekeepers are synced");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
lsn
}
@@ -532,50 +412,6 @@ impl ComputeNode {
Ok(())
}
/// Start and stop a postgres process to warm up the VM for startup.
pub fn prewarm_postgres(&self) -> Result<()> {
info!("prewarming");
// Create pgdata
let pgdata = &format!("{}.warmup", self.pgdata);
create_pgdata(pgdata)?;
// Run initdb to completion
info!("running initdb");
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
Command::new(initdb_bin)
.args(["-D", pgdata])
.output()
.expect("cannot start initdb process");
// Write conf
use std::io::Write;
let conf_path = Path::new(pgdata).join("postgresql.conf");
let mut file = std::fs::File::create(conf_path)?;
writeln!(file, "shared_buffers=65536")?;
writeln!(file, "port=51055")?; // Nobody should be connecting
writeln!(file, "shared_preload_libraries = 'neon'")?;
// Start postgres
info!("starting postgres");
let mut pg = Command::new(&self.pgbin)
.args(["-D", pgdata])
.spawn()
.expect("cannot start postgres process");
// Stop it when it's ready
info!("waiting for postgres");
wait_for_postgres(&mut pg, Path::new(pgdata))?;
pg.kill()?;
info!("sent kill signal");
pg.wait()?;
info!("done prewarming");
// clean up
let _ok = fs::remove_dir_all(pgdata);
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]

View File

@@ -13,4 +13,3 @@ pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod spec;
pub mod sync_sk;

View File

@@ -1,98 +0,0 @@
// Utils for running sync_safekeepers
use anyhow::Result;
use tracing::info;
use utils::lsn::Lsn;
#[derive(Copy, Clone, Debug)]
pub enum TimelineStatusResponse {
NotFound,
Ok(TimelineStatusOkResponse),
}
#[derive(Copy, Clone, Debug)]
pub struct TimelineStatusOkResponse {
flush_lsn: Lsn,
commit_lsn: Lsn,
}
/// Get a safekeeper's metadata for our timeline. The id is only used for logging
pub async fn ping_safekeeper(
id: String,
config: tokio_postgres::Config,
) -> Result<TimelineStatusResponse> {
// TODO add retries
// Connect
info!("connecting to {}", id);
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});
// Query
info!("querying {}", id);
let result = client.simple_query("TIMELINE_STATUS").await?;
// Parse result
info!("done with {}", id);
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
use std::str::FromStr;
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
});
Ok(response)
} else {
// Timeline doesn't exist
Ok(TimelineStatusResponse::NotFound)
}
}
/// Given a quorum of responses, check if safekeepers are synced at some Lsn
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
// Check if all responses are ok
let ok_responses: Vec<TimelineStatusOkResponse> = responses
.iter()
.filter_map(|r| match r {
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
_ => None,
})
.cloned()
.collect();
if ok_responses.len() < responses.len() {
info!(
"not synced. Only {} out of {} know about this timeline",
ok_responses.len(),
responses.len()
);
return None;
}
// Get the min and the max of everything
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
let commit_max = commit.iter().max().unwrap();
let commit_min = commit.iter().min().unwrap();
let flush_max = flush.iter().max().unwrap();
let flush_min = flush.iter().min().unwrap();
// Check that all values are equal
if commit_min != commit_max {
info!("not synced. {:?} {:?}", commit_min, commit_max);
return None;
}
if flush_min != flush_max {
info!("not synced. {:?} {:?}", flush_min, flush_max);
return None;
}
// Check that commit == flush
if commit_max != flush_max {
info!("not synced. {:?} {:?}", commit_max, flush_max);
return None;
}
Some(*commit_max)
}

View File

@@ -564,7 +564,9 @@ impl Endpoint {
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
return Err(e).context(
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
);
}
}
}

View File

@@ -70,7 +70,6 @@ where
pub struct ComputeMetrics {
pub wait_for_spec_ms: u64,
pub sync_safekeepers_ms: u64,
pub sync_sk_check_ms: u64,
pub basebackup_ms: u64,
pub basebackup_bytes: u64,
pub start_postgres_ms: u64,

View File

@@ -5,7 +5,7 @@ use chrono::{DateTime, Utc};
use rand::Rng;
use serde::Serialize;
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -17,32 +17,6 @@ pub enum EventType {
},
}
impl EventType {
pub fn absolute_time(&self) -> Option<&DateTime<Utc>> {
use EventType::*;
match self {
Absolute { time } => Some(time),
_ => None,
}
}
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
// these can most likely be thought of as Range or RangeFull
use EventType::*;
match self {
Incremental {
start_time,
stop_time,
} => Some(start_time..stop_time),
_ => None,
}
}
pub fn is_incremental(&self) -> bool {
matches!(self, EventType::Incremental { .. })
}
}
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> {
#[serde(flatten)]

View File

@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
#[derive(Debug)]
pub struct FeCloseMessage;
/// An error occurred while parsing or serializing raw stream into Postgres
/// An error occured while parsing or serializing raw stream into Postgres
/// messages.
#[derive(thiserror::Error, Debug)]
pub enum ProtocolError {

View File

@@ -200,17 +200,13 @@ impl S3Bucket {
)
}
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path
.get_path()
.to_string_lossy()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
.to_string();
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + &path_string,
None => path_string,
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
for segment in path.0.iter() {
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
full_path.push_str(segment.to_str().unwrap_or_default());
}
full_path
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
@@ -431,12 +427,10 @@ impl RemoteStorage for S3Bucket {
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
range: None,
..GetObjectRequest::default()
})
.await
}
@@ -529,63 +523,3 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::num::NonZeroUsize;
use std::path::Path;
use crate::{RemotePath, S3Bucket, S3Config};
#[test]
fn relative_path() {
let all_paths = vec!["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
.map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
.collect();
let prefixes = [
None,
Some(""),
Some("test/prefix"),
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path",
],
];
for (prefix_idx, prefix) in prefixes.iter().enumerate() {
let config = S3Config {
bucket_name: "bucket".to_owned(),
bucket_region: "region".to_owned(),
prefix_in_bucket: prefix.map(str::to_string),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
};
let storage = S3Bucket::new(&config).expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];
assert_eq!(result, expected);
}
}
}
}

View File

@@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new();
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test";
const BASE_PREFIX: &str = "test/";
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.

View File

@@ -24,29 +24,12 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
Ok(dir.next_entry().await?.is_none())
}
pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
if e.kind() == io::ErrorKind::NotFound {
Ok(())
} else {
Err(e)
}
}
pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
F: Fn() -> io::Result<()>,
{
fs_operation().or_else(ignore_not_found)
}
#[cfg(test)]
mod test {
use std::path::PathBuf;
use crate::fs_ext::is_directory_empty;
use super::ignore_absent_files;
#[test]
fn is_empty_dir() {
use super::PathExt;
@@ -92,21 +75,4 @@ mod test {
std::fs::remove_file(&file_path).unwrap();
assert!(is_directory_empty(file_path).await.is_err());
}
#[test]
fn ignore_absent_files_works() {
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
let file_path: PathBuf = dir_path.join("testfile");
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
assert!(!file_path.exists());
}
}

View File

@@ -1,7 +1,5 @@
use std::ffi::OsStr;
use std::{fmt, str::FromStr};
use anyhow::Context;
use hex::FromHex;
use rand::Rng;
use serde::{Deserialize, Serialize};
@@ -215,18 +213,6 @@ pub struct TimelineId(Id);
id_newtype!(TimelineId);
impl TryFrom<Option<&OsStr>> for TimelineId {
type Error = anyhow::Error;
fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
value
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| format!("Could not parse timeline id from {:?}", value))
}
}
/// Neon Tenant Id represents identifiar of a particular tenant.
/// Is used for distinguishing requests and data belonging to different users.
///

View File

@@ -33,8 +33,7 @@ use crate::tenant::config::TenantConf;
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
TIMELINE_UNINIT_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
};
pub mod defaults {
@@ -602,17 +601,6 @@ impl PageServerConf {
)
}
pub fn timeline_delete_mark_file_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces")
}

View File

@@ -7,7 +7,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use anyhow;
use chrono::{DateTime, Utc};
use chrono::Utc;
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use pageserver_api::models::TenantState;
use reqwest::Url;
@@ -18,6 +18,12 @@ use std::time::Duration;
use tracing::*;
use utils::id::{NodeId, TenantId, TimelineId};
const WRITTEN_SIZE: &str = "written_size";
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
const RESIDENT_SIZE: &str = "resident_size";
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
#[serde_as]
@@ -38,121 +44,6 @@ pub struct PageserverConsumptionMetricsKey {
pub metric: &'static str,
}
impl PageserverConsumptionMetricsKey {
const fn absolute_values(self) -> AbsoluteValueFactory {
AbsoluteValueFactory(self)
}
const fn incremental_values(self) -> IncrementalValueFactory {
IncrementalValueFactory(self)
}
}
/// Helper type which each individual metric kind can return to produce only absolute values.
struct AbsoluteValueFactory(PageserverConsumptionMetricsKey);
impl AbsoluteValueFactory {
fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
let key = self.0;
let time = Utc::now();
(key, (EventType::Absolute { time }, val))
}
}
/// Helper type which each individual metric kind can return to produce only incremental values.
struct IncrementalValueFactory(PageserverConsumptionMetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
fn from_previous_up_to(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
val: u64,
) -> (PageserverConsumptionMetricsKey, (EventType, u64)) {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
(
key,
(
EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
},
val,
),
)
}
fn key(&self) -> &PageserverConsumptionMetricsKey {
&self.0
}
}
// the static part of a PageserverConsumptionMetricsKey
impl PageserverConsumptionMetricsKey {
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "written_size",
}
.absolute_values()
}
/// Values will be the difference of the latest written_size (last_record_lsn) to what we
/// previously sent.
const fn written_size_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> IncrementalValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "written_size_bytes_delta",
}
.incremental_values()
}
const fn timeline_logical_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "timeline_logical_size",
}
.absolute_values()
}
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: "remote_storage_size",
}
.absolute_values()
}
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: "resident_size",
}
.absolute_values()
}
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: "synthetic_storage_size",
}
.absolute_values()
}
}
/// Main thread that serves metrics collection
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
@@ -188,7 +79,7 @@ pub async fn collect_metrics(
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
.build()
.expect("Failed to create http client with timeout");
let mut cached_metrics = HashMap::new();
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
loop {
@@ -230,13 +121,13 @@ pub async fn collect_metrics(
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
pub async fn collect_metrics_iteration(
client: &reqwest::Client,
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>,
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
ctx: &RequestContext,
send_cached: bool,
) {
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new();
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
trace!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
@@ -275,80 +166,27 @@ pub async fn collect_metrics_iteration(
if timeline.is_active() {
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
let (key, written_size_now) =
PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id)
.now(timeline_written_size);
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
// features might change this.
let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta(
tenant_id,
timeline.timeline_id,
);
// use this when available, because in a stream of incremental values, it will be
// accurate where as when last_record_lsn stops moving, we will only cache the last
// one of those.
let last_stop_time =
cached_metrics
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});
// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cached_metrics
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});
// written_size_delta_bytes
current_metrics.extend(
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let up_to = written_size_now
.0
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let key_value =
written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
Some(key_value)
} else {
None
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: Some(timeline.timeline_id),
metric: WRITTEN_SIZE,
},
);
// written_size
current_metrics.push((key, written_size_now));
timeline_written_size,
));
let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
// Only send timeline logical size when it is fully calculated.
Ok((size, is_exact)) if is_exact => {
current_metrics.push(
PageserverConsumptionMetricsKey::timeline_logical_size(
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline.timeline_id,
)
.now(size),
);
timeline_id: Some(timeline.timeline_id),
metric: TIMELINE_LOGICAL_SIZE,
},
size,
));
}
Ok((_, _)) => {}
Err(err) => {
@@ -367,10 +205,14 @@ pub async fn collect_metrics_iteration(
match tenant.get_remote_size().await {
Ok(tenant_remote_size) => {
current_metrics.push(
PageserverConsumptionMetricsKey::remote_storage_size(tenant_id)
.now(tenant_remote_size),
);
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: REMOTE_STORAGE_SIZE,
},
tenant_remote_size,
));
}
Err(err) => {
error!(
@@ -380,9 +222,14 @@ pub async fn collect_metrics_iteration(
}
}
current_metrics.push(
PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size),
);
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: RESIDENT_SIZE,
},
tenant_resident_size,
));
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
@@ -390,27 +237,23 @@ pub async fn collect_metrics_iteration(
if tenant_synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
current_metrics.push(
PageserverConsumptionMetricsKey::synthetic_size(tenant_id)
.now(tenant_synthetic_size),
);
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: SYNTHETIC_STORAGE_SIZE,
},
tenant_synthetic_size,
));
}
}
// Filter metrics, unless we want to send all metrics, including cached ones.
// See: https://github.com/neondatabase/neon/issues/3485
if !send_cached {
current_metrics.retain(|(curr_key, (kind, curr_val))| {
if kind.is_incremental() {
// incremental values (currently only written_size_delta) should not get any cache
// deduplication because they will be used by upstream for "is still alive."
true
} else {
match cached_metrics.get(curr_key) {
Some((_, val)) => val != curr_val,
None => true,
}
}
current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) {
Some(val) => val != curr_val,
None => true,
});
}
@@ -429,8 +272,8 @@ pub async fn collect_metrics_iteration(
chunk_to_send.clear();
// enrich metrics with type,timestamp and idempotency key before sending
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
kind: EventType::Absolute { time: Utc::now() },
metric: curr_key.metric,
idempotency_key: idempotency_key(node_id.to_string()),
value: *curr_val,

View File

@@ -545,12 +545,12 @@ async fn collect_eviction_candidates(
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
// That's what's typically used by the various background loops.
//
// The default can be overridden with a fixed value in the tenant conf.
// The default can be overriden with a fixed value in the tenant conf.
// A default override can be put in the default tenant conf in the pageserver.toml.
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
debug!(
tenant_id=%tenant.tenant_id(),
overridden_size=s,
overriden_size=s,
"using overridden min resident size for tenant"
);
s

View File

@@ -109,8 +109,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
@@ -125,30 +123,15 @@ pub fn is_temporary(path: &Path) -> bool {
}
}
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
pub fn is_uninit_mark(path: &Path) -> bool {
match path.file_name() {
Some(name) => name.to_string_lossy().ends_with(suffix),
Some(name) => name
.to_string_lossy()
.ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
None => false,
}
}
pub fn is_uninit_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}
pub fn is_delete_mark(path: &Path) -> bool {
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
if let Some(e) = e.io_error() {
if e.kind() == std::io::ErrorKind::NotFound {
return true;
}
}
false
}
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///

View File

@@ -73,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
// Buckets for background operations like compaction, GC, size calculation
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_storage_operations_seconds_global",
"Time spent on storage operations",
@@ -93,7 +93,7 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
});
// Metrics collected on operations on the storage repository.
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value (reconstruct a page from deltas)",
@@ -102,7 +102,7 @@ pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
@@ -119,7 +119,7 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
@@ -280,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_layers_total",
"Total on-demand downloaded layers"
@@ -288,7 +288,7 @@ pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::ne
.unwrap()
});
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_remote_ondemand_downloaded_bytes_total",
"Total bytes of layers on-demand downloaded",
@@ -327,7 +327,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("Failed to register pageserver_tenant_states_count metric")
});
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_synthetic_cached_size_bytes",
"Synthetic size of each tenant in bytes",
@@ -385,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
.expect("failed to define a metric")
});
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_unexpected_ondemand_downloads_count",
"Number of unexpected on-demand downloads. \
@@ -690,7 +690,7 @@ pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_tenant_task_events",
"Number of task start/stop/fail events.",
@@ -699,7 +699,7 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_period_overrun_count",
"Incremented whenever warn_when_period_overrun() logs a warning.",
@@ -710,7 +710,7 @@ pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = La
// walreceiver metrics
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_walreceiver_started_connections_total",
"Number of started walreceiver connections"
@@ -718,7 +718,7 @@ pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_walreceiver_active_managers",
"Number of active walreceiver managers"
@@ -726,7 +726,7 @@ pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_switches_total",
"Number of walreceiver manager change_connection calls",
@@ -735,7 +735,7 @@ pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_walreceiver_broker_updates_total",
"Number of received broker updates in walreceiver"
@@ -743,7 +743,7 @@ pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_walreceiver_candidates_events_total",
"Number of walreceiver candidate events",
@@ -752,10 +752,10 @@ pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new
.expect("failed to define a metric")
});
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
// Metrics collected on WAL redo operations
@@ -802,7 +802,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
};
}
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
"Time spent on WAL redo",
@@ -811,7 +811,7 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the Postgres WAL redo process",
@@ -820,7 +820,7 @@ pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
@@ -829,7 +829,7 @@ pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_bytes_histogram",
"Histogram of number of records replayed per redo sent to Postgres",
@@ -838,8 +838,7 @@ pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
.expect("failed to define a metric")
});
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_replayed_wal_records_total",
"Number of WAL records replayed in WAL redo process"
@@ -1395,51 +1394,15 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
// order:
// - global metrics reside in a Lazy<PageserverMetrics>
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
// - could move the statics into TimelineMetrics::new()?
// We want to alert on this metric increasing.
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
// counters
[
&MATERIALIZED_PAGE_CACHE_HIT,
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
&UNEXPECTED_ONDEMAND_DOWNLOADS,
&WALRECEIVER_STARTED_CONNECTIONS,
&WALRECEIVER_BROKER_UPDATES,
&WALRECEIVER_CANDIDATES_ADDED,
&WALRECEIVER_CANDIDATES_REMOVED,
]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
.for_each(|c| {
Lazy::force(c);
});
// gauges
WALRECEIVER_ACTIVE_MANAGERS.get();
// histograms
[
&READ_NUM_FS_LAYERS,
&RECONSTRUCT_TIME,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_WAIT_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
]
.into_iter()
.for_each(|h| {
Lazy::force(h);
});
// Python tests need these.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
}

View File

@@ -18,6 +18,7 @@ use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
use storage_broker::BrokerClientChannel;
use tokio::sync::watch;
use tokio::sync::OwnedMutexGuard;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -28,6 +29,7 @@ use std::cmp::min;
use std::collections::hash_map::Entry;
use std::collections::BTreeSet;
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
@@ -46,7 +48,6 @@ use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
use self::config::TenantConf;
use self::metadata::LoadMetadataError;
use self::metadata::TimelineMetadata;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineUninitMark;
@@ -64,12 +65,12 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::tenant::storage_layer::Layer;
use crate::InitializationOrder;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
use crate::walredo::PostgresRedoManager;
@@ -265,14 +266,6 @@ pub enum GetTimelineError {
},
}
#[derive(Debug, thiserror::Error)]
pub enum LoadLocalTimelineError {
#[error("FailedToLoad")]
Load(#[source] anyhow::Error),
#[error("FailedToResumeDeletion")]
ResumeDeletion(#[source] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum DeleteTimelineError {
#[error("NotFound")]
@@ -326,6 +319,14 @@ impl std::fmt::Display for WaitToBecomeActiveError {
}
}
struct DeletionGuard(OwnedMutexGuard<bool>);
impl DeletionGuard {
fn is_deleted(&self) -> bool {
*self.0
}
}
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("a timeline with the given ID already exists")]
@@ -336,16 +337,6 @@ pub enum CreateTimelineError {
Other(#[from] anyhow::Error),
}
struct TenantDirectoryScan {
sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
}
enum CreateTimelineCause {
Load,
Delete,
}
impl Tenant {
/// Yet another helper for timeline initialization.
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
@@ -384,7 +375,6 @@ impl Tenant {
ancestor.clone(),
remote_client,
init_order,
CreateTimelineCause::Load,
)?;
let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn();
anyhow::ensure!(
@@ -813,14 +803,11 @@ impl Tenant {
tenant
}
fn scan_and_sort_timelines_dir(self: Arc<Tenant>) -> anyhow::Result<TenantDirectoryScan> {
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
// Note timelines_to_resume_deletion needs to be separate because it can be not sortable
// from the point of `tree_sort_timelines`. I e some parents can be missing because deletion
// completed in non topological order (for example because parent has smaller number of layer files in it)
let mut timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)> = vec![];
pub fn scan_and_sort_timelines_dir(
self: Arc<Tenant>,
) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
let timelines_dir = self.conf.timelines_path(&self.tenant_id);
let mut timelines_to_load: HashMap<TimelineId, TimelineMetadata> = HashMap::new();
for entry in
std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
@@ -848,13 +835,16 @@ impl Tenant {
);
continue;
}
let timeline_uninit_mark_file = &timeline_dir;
info!(
"Found an uninit mark file {}, removing the timeline and its uninit mark",
timeline_uninit_mark_file.display()
);
let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem())
let timeline_id = timeline_uninit_mark_file
.file_stem()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
@@ -867,47 +857,6 @@ impl Tenant {
{
error!("Failed to clean up uninit marked timeline: {e:?}");
}
} else if crate::is_delete_mark(&timeline_dir) {
// If metadata exists, load as usual, continue deletion
let timeline_id =
TimelineId::try_from(timeline_dir.file_stem()).with_context(|| {
format!(
"Could not parse timeline id out of the timeline uninit mark name {}",
timeline_dir.display()
)
})?;
match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
Ok(metadata) => {
timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
}
Err(e) => match &e {
LoadMetadataError::Read(r) => {
if r.kind() != io::ErrorKind::NotFound {
return Err(anyhow::anyhow!(e)).with_context(|| {
format!("Failed to load metadata for timeline_id {timeline_id}")
});
}
// If metadata doesnt exist it means that we've crashed without
// completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow.
// So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`.
// We cant do it here because the method is async so we'd need block_on
// and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations
// so that basically results in a cycle:
// spawn_blocking
// - block_on
// - spawn_blocking
// which can lead to running out of threads in blocing pool.
timelines_to_resume_deletion.push((timeline_id, None));
}
_ => {
return Err(anyhow::anyhow!(e)).with_context(|| {
format!("Failed to load metadata for timeline_id {timeline_id}")
})
}
},
}
} else {
if !timeline_dir.exists() {
warn!(
@@ -916,8 +865,12 @@ impl Tenant {
);
continue;
}
let timeline_id =
TimelineId::try_from(timeline_dir.file_name()).with_context(|| {
let timeline_id = timeline_dir
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TimelineId>()
.with_context(|| {
format!(
"Could not parse timeline id out of the timeline dir name {}",
timeline_dir.display()
@@ -939,14 +892,6 @@ impl Tenant {
continue;
}
let timeline_delete_mark_file = self
.conf
.timeline_delete_mark_file_path(self.tenant_id, timeline_id);
if timeline_delete_mark_file.exists() {
// Cleanup should be done in `is_delete_mark` branch above
continue;
}
let file_name = entry.file_name();
if let Ok(timeline_id) =
file_name.to_str().unwrap_or_default().parse::<TimelineId>()
@@ -966,10 +911,7 @@ impl Tenant {
// Sort the array of timeline IDs into tree-order, so that parent comes before
// all its children.
tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
sorted_timelines_to_load: sorted_timelines,
timelines_to_resume_deletion,
})
tree_sort_timelines(timelines_to_load)
}
///
@@ -995,7 +937,7 @@ impl Tenant {
let span = info_span!("blocking");
let cloned = Arc::clone(self);
let scan = tokio::task::spawn_blocking(move || {
let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || {
let _g = span.entered();
cloned.scan_and_sort_timelines_dir()
})
@@ -1006,60 +948,10 @@ impl Tenant {
// FIXME original collect_timeline_files contained one more check:
// 1. "Timeline has no ancestor and no layer files"
// Process loadable timelines first
for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
if let Err(e) = self
.load_local_timeline(timeline_id, local_metadata, init_order, ctx, false)
for (timeline_id, local_metadata) in sorted_timelines {
self.load_local_timeline(timeline_id, local_metadata, init_order, ctx)
.await
{
match e {
LoadLocalTimelineError::Load(source) => {
return Err(anyhow::anyhow!(source)
.context("Failed to load local timeline: {timeline_id}"))
}
LoadLocalTimelineError::ResumeDeletion(source) => {
// Make sure resumed deletion wont fail loading for entire tenant.
error!("Failed to resume timeline deletion: {source:#}")
}
}
}
}
// Resume deletion ones with deleted_mark
for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion {
match maybe_local_metadata {
None => {
// See comment in `scan_and_sort_timelines_dir`.
if let Err(e) =
DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id)
.await
{
warn!(
"cannot clean up deleted timeline dir timeline_id: {} error: {:#}",
timeline_id, e
);
}
}
Some(local_metadata) => {
if let Err(e) = self
.load_local_timeline(timeline_id, local_metadata, init_order, ctx, true)
.await
{
match e {
LoadLocalTimelineError::Load(source) => {
// We tried to load deleted timeline, this is a bug.
return Err(anyhow::anyhow!(source).context(
"This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
));
}
LoadLocalTimelineError::ResumeDeletion(source) => {
// Make sure resumed deletion wont fail loading for entire tenant.
error!("Failed to resume timeline deletion: {source:#}")
}
}
}
}
}
.with_context(|| format!("load local timeline {timeline_id}"))?;
}
trace!("Done");
@@ -1077,8 +969,7 @@ impl Tenant {
local_metadata: TimelineMetadata,
init_order: Option<&InitializationOrder>,
ctx: &RequestContext,
found_delete_mark: bool,
) -> Result<(), LoadLocalTimelineError> {
) -> anyhow::Result<()> {
span::debug_assert_current_span_has_tenant_id();
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
@@ -1090,6 +981,14 @@ impl Tenant {
)
});
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
Some(ancestor_timeline)
} else {
None
};
let (remote_startup_data, remote_client) = match remote_client {
Some(remote_client) => match remote_client.download_index_file().await {
Ok(index_part) => {
@@ -1109,29 +1008,45 @@ impl Tenant {
info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler");
remote_client
.init_upload_queue_stopped_to_continue_deletion(&index_part)
.context("init queue stopped")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
.init_upload_queue_stopped_to_continue_deletion(&index_part)?;
DeleteTimelineFlow::resume_deletion(
let timeline = self
.create_timeline_struct(
timeline_id,
&local_metadata,
ancestor,
Some(remote_client),
init_order,
)
.context("create_timeline_struct")?;
let guard = DeletionGuard(
Arc::clone(&timeline.delete_lock)
.try_lock_owned()
.expect("cannot happen because we're the only owner"),
);
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
timeline.set_state(TimelineState::Stopping);
// We meed to do this because when console retries delete request we shouldnt answer with 404
// because 404 means successful deletion.
// FIXME consider TimelineState::Deleting.
let mut locked = self.timelines.lock().unwrap();
locked.insert(timeline_id, Arc::clone(&timeline));
Tenant::schedule_delete_timeline(
Arc::clone(self),
timeline_id,
&local_metadata,
Some(remote_client),
init_order,
)
.await
.context("resume deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
timeline,
guard,
);
return Ok(());
}
};
let remote_metadata = index_part
.parse_metadata()
.context("parse_metadata")
.map_err(LoadLocalTimelineError::Load)?;
let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
(
Some(RemoteStartupData {
index_part,
@@ -1141,54 +1056,12 @@ impl Tenant {
)
}
Err(DownloadError::NotFound) => {
info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
if found_delete_mark {
// We could've resumed at a point where remote index was deleted, but metadata file wasnt.
// Cleanup:
return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(
self,
timeline_id,
)
.await
.context("cleanup_remaining_timeline_fs_traces")
.map_err(LoadLocalTimelineError::ResumeDeletion);
}
// We're loading fresh timeline that didnt yet make it into remote.
info!("no index file was found on the remote");
(None, Some(remote_client))
}
Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
Err(e) => return Err(anyhow::anyhow!(e)),
},
None => {
// No remote client
if found_delete_mark {
// There is no remote client, we found local metadata.
// Continue cleaning up local disk.
DeleteTimelineFlow::resume_deletion(
Arc::clone(self),
timeline_id,
&local_metadata,
None,
init_order,
)
.await
.context("resume deletion")
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
return Ok(());
}
(None, remote_client)
}
};
let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
.with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))
.map_err(LoadLocalTimelineError::Load)?;
Some(ancestor_timeline)
} else {
None
None => (None, remote_client),
};
self.timeline_init_and_sync(
@@ -1202,7 +1075,6 @@ impl Tenant {
ctx,
)
.await
.map_err(LoadLocalTimelineError::Load)
}
pub fn tenant_id(&self) -> TenantId {
@@ -1563,6 +1435,269 @@ impl Tenant {
}
}
/// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
/// data from both disk and s3.
async fn delete_timeline(
&self,
timeline_id: TimelineId,
timeline: Arc<Timeline>,
guard: DeletionGuard,
) -> anyhow::Result<()> {
{
// Grab the layer_removal_cs lock, and actually perform the deletion.
//
// This lock prevents prevents GC or compaction from running at the same time.
// The GC task doesn't register itself with the timeline it's operating on,
// so it might still be running even though we called `shutdown_tasks`.
//
// Note that there are still other race conditions between
// GC, compaction and timeline deletion. See
// https://github.com/neondatabase/neon/issues/2671
//
// No timeout here, GC & Compaction should be responsive to the
// `TimelineState::Stopping` change.
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: remote_timeline_client upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = self
.conf
.timeline_path(&self.tenant_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we tolerate that we're not crash-safe here.
// The timeline may come up Active but with missing layer files, in such setups.
// See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
match std::fs::remove_dir_all(&local_timeline_directory) {
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because,
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
let tenant_state = self.current_state();
warn!(
timeline_dir=?local_timeline_directory,
?tenant_state,
"timeline directory not found, proceeding anyway"
);
// continue with the rest of the deletion
}
res => res.with_context(|| {
format!(
"Failed to remove local timeline directory '{}'",
local_timeline_directory.display()
)
})?,
}
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);
}
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
pausable_failpoint!("in_progress_delete");
{
// Remove the timeline from the map.
let mut timelines = self.timelines.lock().unwrap();
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
if children_exist {
panic!("Timeline grew children while we removed layer files");
}
timelines.remove(&timeline_id).expect(
"timeline that we were deleting was concurrently removed from 'timelines' map",
);
drop(timelines);
}
drop(guard);
Ok(())
}
/// Removes timeline-related in-memory data and schedules removal from remote storage.
#[instrument(skip(self, _ctx))]
pub async fn prepare_and_schedule_delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
//
// Also grab the Timeline's delete_lock to prevent another deletion from starting.
let timeline;
let delete_lock_guard;
{
let mut timelines = self.timelines.lock().unwrap();
// Ensure that there are no child timelines **attached to that pageserver**,
// because detach removes files, which will break child branches
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
Some(*id)
} else {
None
}
})
.collect();
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
let timeline_entry = match timelines.entry(timeline_id) {
Entry::Occupied(e) => e,
Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound),
};
timeline = Arc::clone(timeline_entry.get());
// Prevent two tasks from trying to delete the timeline at the same time.
delete_lock_guard = DeletionGuard(
Arc::clone(&timeline.delete_lock)
.try_lock_owned()
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
);
// If another task finished the deletion just before we acquired the lock,
// return success.
if delete_lock_guard.is_deleted() {
return Ok(());
}
timeline.set_state(TimelineState::Stopping);
drop(timelines);
}
// Now that the Timeline is in Stopping state, request all the related tasks to
// shut down.
//
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");
// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}
// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
// Mark timeline as deleted in S3 so we won't pick it up next time
// during attach or pageserver restart.
// See comment in persist_index_part_with_deleted_flag.
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard);
Ok(())
}
fn schedule_delete_timeline(
self: Arc<Self>,
timeline_id: TimelineId,
timeline: Arc<Timeline>,
guard: DeletionGuard,
) {
let tenant_id = self.tenant_id;
let timeline_clone = Arc::clone(&timeline);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(self.tenant_id),
Some(timeline_id),
"timeline_delete",
false,
async move {
if let Err(err) = self.delete_timeline(timeline_id, timeline, guard).await {
error!("Error: {err:#}");
timeline_clone.set_broken(err.to_string())
};
Ok(())
}
.instrument({
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
);
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -2019,10 +2154,6 @@ impl Tenant {
/// The returned Timeline is in Loading state. The caller is responsible for
/// initializing any on-disk state, and for inserting the Timeline to the 'timelines'
/// map.
///
/// `validate_ancestor == false` is used when a timeline is created for deletion
/// and we might not have the ancestor present anymore which is fine for to be
/// deleted timelines.
fn create_timeline_struct(
&self,
new_timeline_id: TimelineId,
@@ -2030,26 +2161,19 @@ impl Tenant {
ancestor: Option<Arc<Timeline>>,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
cause: CreateTimelineCause,
) -> anyhow::Result<Arc<Timeline>> {
let state = match cause {
CreateTimelineCause::Load => {
let ancestor_id = new_metadata.ancestor_timeline();
anyhow::ensure!(
ancestor_id == ancestor.as_ref().map(|t| t.timeline_id),
"Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found"
);
TimelineState::Loading
}
CreateTimelineCause::Delete => TimelineState::Stopping,
};
if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
anyhow::ensure!(
ancestor.is_some(),
"Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
)
}
let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
let pg_version = new_metadata.pg_version();
let timeline = Timeline::new(
Ok(Timeline::new(
self.conf,
Arc::clone(&self.tenant_conf),
new_metadata,
@@ -2061,10 +2185,7 @@ impl Tenant {
pg_version,
initial_logical_size_can_start.cloned(),
initial_logical_size_attempt.cloned(),
state,
);
Ok(timeline)
))
}
fn new(
@@ -2731,14 +2852,7 @@ impl Tenant {
};
let timeline_struct = self
.create_timeline_struct(
new_timeline_id,
new_metadata,
ancestor,
remote_client,
None,
CreateTimelineCause::Load,
)
.create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None)
.context("Failed to create timeline data structure")?;
timeline_struct.init_empty_layer_map(start_lsn);
@@ -3156,6 +3270,19 @@ pub async fn dump_layerfile_from_path(
Ok(())
}
fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
F: Fn() -> io::Result<()>,
{
fs_operation().or_else(|e| {
if e.kind() == io::ErrorKind::NotFound {
Ok(())
} else {
Err(e)
}
})
}
#[cfg(test)]
pub mod harness {
use bytes::{Bytes, BytesMut};
@@ -3834,9 +3961,9 @@ mod tests {
.await
.err()
.expect("should fail");
// get all the stack with all .context, not only the last one
// get all the stack with all .context, not tonly the last one
let message = format!("{err:#}");
let expected = "failed to load metadata";
let expected = "Failed to parse metadata bytes from path";
assert!(
message.contains(expected),
"message '{message}' expected to contain {expected}"
@@ -3853,8 +3980,7 @@ mod tests {
}
assert!(
found_error_message,
"didn't find the corrupted metadata error in {}",
message
"didn't find the corrupted metadata error"
);
Ok(())

View File

@@ -390,42 +390,39 @@ where
}
#[allow(dead_code)]
pub async fn dump(&self) -> Result<()> {
let mut stack = Vec::new();
pub fn dump(&self) -> Result<()> {
self.dump_recurse(self.root_blk, &[], 0)
}
stack.push((self.root_blk, String::new(), 0, 0, 0));
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
let blk = self.reader.read_blk(self.start_blk + blknum)?;
let buf: &[u8] = blk.as_ref();
let node = OnDiskNode::<L>::deparse(buf)?;
let node = OnDiskNode::<L>::deparse(buf)?;
if child_idx == 0 {
print!("{:indent$}", "", indent = depth * 2);
let path_prefix = stack
.iter()
.map(|(_blknum, path, ..)| path.as_str())
.collect::<String>();
println!(
"blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}",
hex::encode(node.prefix),
node.suffix_len
);
}
print!("{:indent$}", "", indent = depth * 2);
println!(
"blk #{}: path {}: prefix {}, suffix_len {}",
blknum,
hex::encode(path),
hex::encode(node.prefix),
node.suffix_len
);
if child_idx + 1 < node.num_children {
let key_off = key_off + node.suffix_len as usize;
stack.push((blknum, path.clone(), depth, child_idx + 1, key_off));
}
let mut idx = 0;
let mut key_off = 0;
while idx < node.num_children {
let key = &node.keys[key_off..key_off + node.suffix_len as usize];
let val = node.value(child_idx as usize);
let val = node.value(idx as usize);
print!("{:indent$}", "", indent = depth * 2 + 2);
println!("{}: {}", hex::encode(key), hex::encode(val.0));
if node.level > 0 {
stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0));
let child_path = [path, node.prefix].concat();
self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?;
}
idx += 1;
key_off += node.suffix_len as usize;
}
Ok(())
}
@@ -757,8 +754,8 @@ mod tests {
}
}
#[tokio::test]
async fn basic() -> Result<()> {
#[test]
fn basic() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
@@ -778,7 +775,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump().await?;
reader.dump()?;
// Test the `get` function on all the keys.
for (key, val) in all_data.iter() {
@@ -838,8 +835,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn lots_of_keys() -> Result<()> {
#[test]
fn lots_of_keys() -> Result<()> {
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
@@ -859,7 +856,7 @@ mod tests {
let reader = DiskBtreeReader::new(0, root_offset, disk);
reader.dump().await?;
reader.dump()?;
use std::sync::Mutex;
@@ -997,8 +994,8 @@ mod tests {
///
/// This test contains a particular data set, see disk_btree_test_data.rs
///
#[tokio::test]
async fn particular_data() -> Result<()> {
#[test]
fn particular_data() -> Result<()> {
// Build a tree from it
let mut disk = TestDisk::new();
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
@@ -1025,7 +1022,7 @@ mod tests {
})?;
assert_eq!(count, disk_btree_test_data::TEST_DATA.len());
reader.dump().await?;
reader.dump()?;
Ok(())
}

View File

@@ -9,11 +9,10 @@
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::fs::{File, OpenOptions};
use std::io::{self, Write};
use std::io::Write;
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::{
@@ -268,24 +267,24 @@ pub fn save_metadata(
Ok(())
}
#[derive(Error, Debug)]
pub enum LoadMetadataError {
#[error(transparent)]
Read(#[from] io::Error),
#[error(transparent)]
Decode(#[from] anyhow::Error),
}
pub fn load_metadata(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
) -> Result<TimelineMetadata, LoadMetadataError> {
) -> anyhow::Result<TimelineMetadata> {
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
let metadata_bytes = std::fs::read(metadata_path)?;
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
format!(
"Failed to read metadata bytes from path {}",
metadata_path.display()
)
})?;
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
format!(
"Failed to parse metadata bytes from path {}",
metadata_path.display()
)
})
}
#[cfg(test)]

View File

@@ -26,8 +26,6 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
use super::timeline::delete::DeleteTimelineFlow;
/// The tenants known to the pageserver.
/// The enum variants are used to distinguish the different states that the pageserver can be in.
enum TenantsMap {
@@ -423,10 +421,12 @@ pub enum DeleteTimelineError {
pub async fn delete_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
_ctx: &RequestContext,
ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = get_tenant(tenant_id, true).await?;
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
tenant
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
.await?;
Ok(())
}

View File

@@ -514,7 +514,7 @@ impl RemoteTimelineClient {
/// updated metadata.
///
/// The upload will be added to the queue immediately, but it
/// won't be performed until all previously scheduled layer file
/// won't be performed until all previosuly scheduled layer file
/// upload operations have completed successfully. This is to
/// ensure that when the index file claims that layers X, Y and Z
/// exist in remote storage, they really do. To wait for the upload
@@ -625,7 +625,7 @@ impl RemoteTimelineClient {
/// Note: This schedules an index file upload before the deletions. The
/// deletion won't actually be performed, until any previously scheduled
/// upload operations, and the index file upload, have completed
/// successfully.
/// succesfully.
pub fn schedule_layer_file_deletion(
self: &Arc<Self>,
names: &[LayerFileName],
@@ -827,7 +827,7 @@ impl RemoteTimelineClient {
)
};
receiver.changed().await.context("upload queue shut down")?;
receiver.changed().await?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
@@ -855,23 +855,11 @@ impl RemoteTimelineClient {
self.storage_impl.delete_objects(&remaining).await?;
}
fail::fail_point!("timeline-delete-before-index-delete", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-delete"
))?
});
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("deleting index part");
self.storage_impl.delete(&index_file_path).await?;
fail::fail_point!("timeline-delete-after-index-delete", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-after-index-delete"
))?
});
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
Ok(())
@@ -1117,7 +1105,7 @@ impl RemoteTimelineClient {
debug!("remote task {} completed successfully", task.op);
}
// The task has completed successfully. Remove it from the in-progress list.
// The task has completed succesfully. Remove it from the in-progress list.
{
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {

View File

@@ -223,45 +223,6 @@ mod tests {
assert_eq!(part, expected);
}
#[test]
fn v2_indexpart_is_parsed_with_deleted_at() {
let example = r#"{
"version":2,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"missing_layers":["This shouldn't fail deserialization"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
"deleted_at": "2023-07-31T09:00:00.123"
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 2,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]),
layer_metadata: HashMap::from([
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
file_size: 25600000,
}),
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: 9007199254741001,
})
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn empty_layers_are_parsed() {
let empty_layers_json = r#"{

View File

@@ -256,7 +256,7 @@ impl Layer for DeltaLayer {
file,
);
tree_reader.dump().await?;
tree_reader.dump()?;
let mut cursor = file.block_cursor();

View File

@@ -38,7 +38,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{bail, ensure, Context, Result};
use bytes::Bytes;
use hex;
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -48,6 +47,7 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::prelude::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard};
use tracing::*;
use utils::{
@@ -117,7 +117,7 @@ pub struct ImageLayer {
access_stats: LayerAccessStats,
inner: OnceCell<ImageLayerInner>,
inner: RwLock<ImageLayerInner>,
}
impl std::fmt::Debug for ImageLayer {
@@ -134,17 +134,21 @@ impl std::fmt::Debug for ImageLayer {
}
pub struct ImageLayerInner {
/// If false, the 'index' has not been loaded into memory yet.
loaded: bool,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
}
impl std::fmt::Debug for ImageLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ImageLayerInner")
.field("loaded", &self.loaded)
.field("index_start_blk", &self.index_start_blk)
.field("index_root_blk", &self.index_root_blk)
.finish()
@@ -171,11 +175,11 @@ impl Layer for ImageLayer {
}
let inner = self.load(LayerAccessKind::Dump, ctx)?;
let file = &inner.file;
let file = inner.file.as_ref().unwrap();
let tree_reader =
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
tree_reader.dump().await?;
tree_reader.dump()?;
tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| {
println!("key: {} offset {}", hex::encode(key), value);
@@ -199,7 +203,7 @@ impl Layer for ImageLayer {
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
let file = &inner.file;
let file = inner.file.as_ref().unwrap();
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
@@ -318,26 +322,52 @@ impl ImageLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<RwLockReadGuard<ImageLayerInner>> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
loop {
if let Some(inner) = self.inner.get() {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
self.inner
.get_or_try_init(|| self.load_inner())
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let mut inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(&mut inner).with_context(|| {
format!("Failed to load image layer {}", self.path().display())
})?
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. There's no function to do
// that in the std library RwLock, so we have to release and re-lock
// in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
drop(inner);
}
}
fn load_inner(&self) -> Result<ImageLayerInner> {
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
let path = self.path();
// Open the file if it's not open already.
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -365,11 +395,10 @@ impl ImageLayer {
}
}
Ok(ImageLayerInner {
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
file,
})
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
inner.loaded = true;
Ok(())
}
/// Create an ImageLayer struct representing an existing file on disk
@@ -393,7 +422,12 @@ impl ImageLayer {
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
access_stats,
inner: OnceCell::new(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
}
}
@@ -420,7 +454,12 @@ impl ImageLayer {
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: RwLock::new(ImageLayerInner {
file: None,
loaded: false,
index_start_blk: 0,
index_root_blk: 0,
}),
})
}
@@ -581,7 +620,12 @@ impl ImageLayerWriterInner {
desc,
lsn: self.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
inner: RwLock::new(ImageLayerInner {
loaded: false,
file: None,
index_start_blk,
index_root_blk,
}),
};
// fsync the file

View File

@@ -1,4 +1,3 @@
pub mod delete;
mod eviction_task;
pub mod layer_manager;
mod logical_size;
@@ -80,7 +79,6 @@ use crate::METADATA_FILE_NAME;
use crate::ZERO_PAGE;
use crate::{is_temporary, task_mgr};
use self::delete::DeleteTimelineFlow;
pub(super) use self::eviction_task::EvictionTaskTenantState;
use self::eviction_task::EvictionTaskTimelineState;
use self::layer_manager::LayerManager;
@@ -239,10 +237,11 @@ pub struct Timeline {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
/// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
/// This is an `Arc<Mutex>` lock because we need an owned
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
/// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
///
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
@@ -284,7 +283,7 @@ pub struct Timeline {
/// Prevent two tasks from deleting the timeline at the same time. If held, the
/// timeline is being deleted. If 'true', the timeline has already been deleted.
pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
@@ -294,10 +293,6 @@ pub struct Timeline {
/// Completion shared between all timelines loaded during startup; used to delay heavier
/// background tasks until some logical sizes have been calculated.
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
/// Load or creation time information about the disk_consistent_lsn and when the loading
/// happened. Used for consumption metrics.
pub(crate) loaded_at: (Lsn, SystemTime),
}
pub struct WalReceiverInfo {
@@ -1365,10 +1360,9 @@ impl Timeline {
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
state: TimelineState,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(state);
let (state, _) = watch::channel(TimelineState::Loading);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
@@ -1408,8 +1402,6 @@ impl Timeline {
last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0),
last_freeze_ts: RwLock::new(Instant::now()),
loaded_at: (disk_consistent_lsn, SystemTime::now()),
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
@@ -1461,7 +1453,7 @@ impl Timeline {
eviction_task_timeline_state: tokio::sync::Mutex::new(
EvictionTaskTimelineState::default(),
),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
@@ -1606,7 +1598,7 @@ impl Timeline {
if let Some(imgfilename) = ImageFileName::parse_str(&fname) {
// create an ImageLayer struct for each image file.
if imgfilename.lsn > disk_consistent_lsn {
info!(
warn!(
"found future image layer {} on timeline {} disk_consistent_lsn is {}",
imgfilename, self.timeline_id, disk_consistent_lsn
);
@@ -1638,7 +1630,7 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
info!(
warn!(
"found future delta layer {} on timeline {} disk_consistent_lsn is {}",
deltafilename, self.timeline_id, disk_consistent_lsn
);
@@ -1780,7 +1772,7 @@ impl Timeline {
match remote_layer_name {
LayerFileName::Image(imgfilename) => {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
info!(
warn!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
@@ -1805,7 +1797,7 @@ impl Timeline {
// is 102, then it might not have been fully flushed to disk
// before crash.
if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 {
info!(
warn!(
"found future delta layer {} on timeline {} remote_consistent_lsn is {}",
deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn
);
@@ -1926,15 +1918,6 @@ impl Timeline {
}
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
let state = self.current_state();
if matches!(
state,
TimelineState::Broken { .. } | TimelineState::Stopping
) {
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
return;
}
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
.try_acquire_owned()
{

View File

@@ -1,576 +0,0 @@
use std::{
ops::{Deref, DerefMut},
sync::Arc,
};
use anyhow::Context;
use pageserver_api::models::TimelineState;
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
use utils::{
crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind},
tenant::{
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
InitializationOrder,
};
use super::Timeline;
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
if let Some(walreceiver) = maybe_started_walreceiver {
walreceiver.stop().await;
}
debug!("wal receiver shutdown confirmed");
// Prevent new uploads from starting.
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.stop();
match res {
Ok(()) => {}
Err(e) => match e {
remote_timeline_client::StopError::QueueUninitialized => {
// This case shouldn't happen currently because the
// load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
// That is, before we declare the Tenant as Active.
// But we only allow calls to delete_timeline on Active tenants.
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
}
},
}
}
// Stop & wait for the remaining timeline tasks, including upload tasks.
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
Ok(())
}
/// Mark timeline as deleted in S3 so we won't pick it up next time
/// during attach or pageserver restart.
/// See comment in persist_index_part_with_deleted_flag.
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
if let Some(remote_client) = timeline.remote_client.as_ref() {
match remote_client.persist_index_part_with_deleted_flag().await {
// If we (now, or already) marked it successfully as deleted, we can proceed
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
// Bail out otherwise
//
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
// two tasks from performing the deletion at the same time. The first task
// that starts deletion should run it to completion.
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
}
}
}
Ok(())
}
// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
// So we can just remove the mark file.
async fn create_delete_mark(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-delete-mark"
))?
});
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
Ok(())
}
/// Grab the layer_removal_cs lock, and actually perform the deletion.
///
/// This lock prevents prevents GC or compaction from running at the same time.
/// The GC task doesn't register itself with the timeline it's operating on,
/// so it might still be running even though we called `shutdown_tasks`.
///
/// Note that there are still other race conditions between
/// GC, compaction and timeline deletion. See
/// <https://github.com/neondatabase/neon/issues/2671>
///
/// No timeout here, GC & Compaction should be responsive to the
/// `TimelineState::Stopping` change.
async fn delete_local_layer_files(
conf: &PageServerConf,
tenant_id: TenantId,
timeline: &Timeline,
) -> anyhow::Result<()> {
info!("waiting for layer_removal_cs.lock()");
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
info!("got layer_removal_cs.lock(), deleting layer files");
// NB: storage_sync upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
});
// NB: This need not be atomic because the deleted flag in the IndexPart
// will be observed during tenant/timeline load. The deletion will be resumed there.
//
// For configurations without remote storage, we guarantee crash-safety by persising delete mark file.
//
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
// This can happen if we're called a second time, e.g.,
// because of a previous failure/cancellation at/after
// failpoint timeline-delete-after-rm.
//
// It can also happen if we race with tenant detach, because,
// it doesn't grab the layer_removal_cs lock.
//
// For now, log and continue.
// warn! level is technically not appropriate for the
// first case because we should expect retries to happen.
// But the error is so rare, it seems better to get attention if it happens.
//
// Note that metadata removal is skipped, this is not technically needed,
// but allows to reuse timeline loading code during resumed deletion.
// (we always expect that metadata is in place when timeline is being loaded)
#[cfg(feature = "testing")]
let mut counter = 0;
// Timeline directory may not exist if we failed to delete mark file and request was retried.
if !local_timeline_directory.exists() {
return Ok(());
}
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
#[cfg(feature = "testing")]
{
counter += 1;
if counter == 2 {
fail::fail_point!("timeline-delete-during-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
});
}
}
let entry = entry?;
if entry.path() == metadata_path {
debug!("found metadata, skipping");
continue;
}
if entry.path() == local_timeline_directory {
// Keeping directory because metedata file is still there
debug!("found timeline dir itself, skipping");
continue;
}
let metadata = match entry.metadata() {
Ok(metadata) => metadata,
Err(e) => {
if crate::is_walkdir_io_not_found(&e) {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(e);
}
};
let r = if metadata.is_dir() {
// There shouldnt be any directories inside timeline dir as of current layout.
tokio::fs::remove_dir(entry.path()).await
} else {
tokio::fs::remove_file(entry.path()).await
};
if let Err(e) = r {
if e.kind() == std::io::ErrorKind::NotFound {
warn!(
timeline_dir=?local_timeline_directory,
path=?entry.path().display(),
"got not found err while removing timeline dir, proceeding anyway"
);
continue;
}
anyhow::bail!(anyhow::anyhow!(
"Failed to remove: {}. Error: {e}",
entry.path().display()
));
}
}
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
drop(layer_removal_guard);
fail::fail_point!("timeline-delete-after-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
});
Ok(())
}
/// Removes remote layers and an index file after them.
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
if let Some(remote_client) = &timeline.remote_client {
remote_client.delete_all().await.context("delete_all")?
};
Ok(())
}
// This function removs remaining traces of a timeline on disk.
// Namely: metadata file, timeline directory, delete mark.
// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir.
// delete mark should be present because it is the last step during deletion.
// (nothing can fail after its deletion)
async fn cleanup_remaining_timeline_fs_traces(
conf: &PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
// Remove local metadata
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("remove metadata")?;
fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-after-rm-metadata"
))?
});
// Remove timeline dir
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("timeline dir")?;
fail::fail_point!("timeline-delete-after-rm-dir", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
});
// Remove delete mark
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
.await
.context("remove delete mark")
}
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_timeline_from_tenant(
tenant: &Tenant,
timeline_id: TimelineId,
_: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
// Remove the timeline from the map.
let mut timelines = tenant.timelines.lock().unwrap();
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
if children_exist {
panic!("Timeline grew children while we removed layer files");
}
timelines
.remove(&timeline_id)
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
drop(timelines);
Ok(())
}
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Set deleted_at in remote index part.
/// 2. Create local mark file.
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
/// 4. Delete remote layers
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly neeed to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
/// index but still have local metadata, timeline directory and delete mark.
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
#[default]
NotStarted,
InProgress,
Finished,
}
impl DeleteTimelineFlow {
// These steps are run in the context of management api request handler.
// Long running steps are continued to run in the background.
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
guard.mark_in_progress()?;
stop_tasks(&timeline).await?;
set_deleted_in_remote_index(&timeline).await?;
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
fail::fail_point!("timeline-delete-before-schedule", |_| {
Err(anyhow::anyhow!(
"failpoint: timeline-delete-before-schedule"
))?
});
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
Ok(())
}
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
match self {
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
Self::InProgress { .. } => { /* We're in a retry */ }
Self::NotStarted => { /* Fresh start */ }
}
*self = Self::InProgress;
Ok(())
}
/// Shortcut to create Timeline in stopping state and spawn deletion task.
pub async fn resume_deletion(
tenant: Arc<Tenant>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
let timeline = tenant
.create_timeline_struct(
timeline_id,
local_metadata,
None, // Ancestor is not needed for deletion.
remote_client,
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
)
.context("create_timeline_struct")?;
let mut guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.expect("cannot happen because we're the only owner"),
);
// We meed to do this because when console retries delete request we shouldnt answer with 404
// because 404 means successful deletion.
{
let mut locked = tenant.timelines.lock().unwrap();
locked.insert(timeline_id, Arc::clone(&timeline));
}
guard.mark_in_progress()?;
// Note that delete mark can be missing on resume
// because we create delete mark after we set deleted_at in the index part.
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
Self::schedule_background(guard, tenant.conf, tenant, timeline);
Ok(())
}
pub async fn cleanup_remaining_timeline_fs_traces(
tenant: &Tenant,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
}
fn prepare(
tenant: &Tenant,
timeline_id: TimelineId,
) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
// Note the interaction between this guard and deletion guard.
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
// This is important because when you take into account `remove_timeline_from_tenant`
// we remove timeline from memory when we still hold the deletion guard.
// So here when timeline deletion is finished timeline wont be present in timelines map at all
// which makes the following sequence impossible:
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
let timelines = tenant.timelines.lock().unwrap();
let timeline = match timelines.get(&timeline_id) {
Some(t) => t,
None => return Err(DeleteTimelineError::NotFound),
};
// Ensure that there are no child timelines **attached to that pageserver**,
// because detach removes files, which will break child branches
let children: Vec<TimelineId> = timelines
.iter()
.filter_map(|(id, entry)| {
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
Some(*id)
} else {
None
}
})
.collect();
if !children.is_empty() {
return Err(DeleteTimelineError::HasChildren(children));
}
// Note that using try_lock here is important to avoid a deadlock.
// Here we take lock on timelines and then the deletion guard.
// At the end of the operation we're holding the guard and need to lock timelines map
// to remove the timeline from it.
// Always if you have two locks that are taken in different order this can result in a deadlock.
let delete_lock_guard = DeletionGuard(
Arc::clone(&timeline.delete_progress)
.try_lock_owned()
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
);
timeline.set_state(TimelineState::Stopping);
Ok((Arc::clone(timeline), delete_lock_guard))
}
fn schedule_background(
guard: DeletionGuard,
conf: &'static PageServerConf,
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
) {
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_id),
Some(timeline_id),
"timeline_delete",
false,
async move {
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
error!("Error: {err:#}");
timeline.set_broken(format!("{err:#}"))
};
Ok(())
}
.instrument({
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
);
}
async fn background(
mut guard: DeletionGuard,
conf: &PageServerConf,
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
delete_remote_layers_and_index(timeline).await?;
pausable_failpoint!("in_progress_delete");
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
*guard.0 = Self::Finished;
Ok(())
}
}
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
impl Deref for DeletionGuard {
type Target = DeleteTimelineFlow;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for DeletionGuard {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}

View File

@@ -2,9 +2,13 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
use anyhow::Context;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
use utils::{crashsafe, id::TimelineId, lsn::Lsn};
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
use crate::{
context::RequestContext,
import_datadir,
tenant::{ignore_absent_files, Tenant},
};
use super::Timeline;
@@ -137,7 +141,7 @@ impl Drop for UninitializedTimeline<'_> {
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
@@ -181,7 +185,7 @@ impl TimelineUninitMark {
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;

View File

@@ -1123,7 +1123,7 @@ mod tests {
}
#[tokio::test]
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
@@ -1189,8 +1189,8 @@ mod tests {
}
#[tokio::test]
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
@@ -1252,8 +1252,8 @@ mod tests {
}
#[tokio::test]
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
let mut state = dummy_state(&harness).await;
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();

View File

@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
/*
* The docs for PQgetCopyData list the return values as: 0 if the copy is
* still in progress, but no "complete row" is available -1 if the copy is
* done -2 if an error occurred (> 0) if it was successful; that value is
* done -2 if an error occured (> 0) if it was successful; that value is
* the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
/*
* The docs for PQputcopyData list the return values as: 1 if the data was
* queued, 0 if it was not queued because of full buffers, or -1 if an
* error occurred
* error occured
*/
result = PQputCopyData(conn->pg_conn, buf, size);

View File

@@ -788,7 +788,7 @@ ReconnectSafekeepers(void)
/*
* Performs the logic for advancing the state machine of the specified safekeeper,
* given that a certain set of events has occurred.
* given that a certain set of events has occured.
*/
static void
AdvancePollState(Safekeeper *sk, uint32 events)

View File

@@ -23,7 +23,7 @@
* message header */
/*
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
*/
#define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
/* this is a criterion for walproposer --sync mode exit */
XLogRecPtr commitLsn;
HotStandbyFeedback hs;
/* Feedback received from pageserver includes standby_status_update fields */
/* Feedback recieved from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;

6
poetry.lock generated
View File

@@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0"
[[package]]
name = "certifi"
version = "2023.7.22"
version = "2022.12.7"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
{file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
]
[[package]]

View File

@@ -29,9 +29,9 @@ metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
pbkdf2.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
postgres-protocol.workspace = true
pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true
@@ -65,13 +65,10 @@ webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
tokio-native-tls = "0.3.1"
workspace_hack.workspace = true
tokio-util.workspace = true
fallible-iterator = "0.2.0"
[dev-dependencies]
rcgen.workspace = true
rstest.workspace = true

View File

@@ -53,12 +53,6 @@ pub enum BackendType<'a, T> {
Postgres(Cow<'a, console::provider::mock::Api>, T),
/// Authentication via a web browser.
Link(Cow<'a, url::ApiUrl>),
/// Test backend.
Test(&'a dyn TestBackend),
}
pub trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
}
impl std::fmt::Display for BackendType<'_, ()> {
@@ -68,7 +62,6 @@ impl std::fmt::Display for BackendType<'_, ()> {
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
Test(_) => fmt.debug_tuple("Test").finish(),
}
}
}
@@ -82,7 +75,6 @@ impl<T> BackendType<'_, T> {
Console(c, x) => Console(Cow::Borrowed(c), x),
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
Link(c) => Link(Cow::Borrowed(c)),
Test(x) => Test(*x),
}
}
}
@@ -97,7 +89,6 @@ impl<'a, T> BackendType<'a, T> {
Console(c, x) => Console(c, f(x)),
Postgres(c, x) => Postgres(c, f(x)),
Link(c) => Link(c),
Test(x) => Test(x),
}
}
}
@@ -111,7 +102,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
Console(c, x) => x.map(|x| Console(c, x)),
Postgres(c, x) => x.map(|x| Postgres(c, x)),
Link(c) => Ok(Link(c)),
Test(x) => Ok(Test(x)),
}
}
}
@@ -157,7 +147,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Console(_, creds) => creds.project.clone(),
Postgres(_, creds) => creds.project.clone(),
Link(_) => Some("link".to_owned()),
Test(_) => Some("test".to_owned()),
}
}
/// Authenticate the client via the requested backend, possibly using credentials.
@@ -199,9 +188,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
.await?
.map(CachedNodeInfo::new_uncached)
}
Test(_) => {
unreachable!("this function should never be called in the test backend")
}
};
info!("user successfully authenticated");
@@ -220,7 +206,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Link(_) => Ok(None),
Test(x) => x.wake_compute().map(Some),
}
}
}

View File

@@ -1,11 +1,8 @@
use std::ops::ControlFlow;
use super::AuthSuccess;
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
proxy::handle_try_wake,
sasl, scram,
stream::PqStream,
};
@@ -51,16 +48,7 @@ pub(super) async fn authenticate(
}
};
info!("compute node's state has likely changed; requesting a wake-up");
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries)? {
ControlFlow::Continue(_) => num_retries += 1,
ControlFlow::Break(n) => break n,
}
info!(num_retries, "retrying wake compute");
};
let mut node = api.wake_compute(extra, creds).await?;
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
node.config.auth_keys(AuthKeys::ScramSha256(keys));

View File

@@ -14,7 +14,6 @@ pub mod errors {
use crate::{
error::{io_error, UserFacingError},
http,
proxy::ShouldRetry,
};
use thiserror::Error;
@@ -73,24 +72,6 @@ pub mod errors {
}
}
impl ShouldRetry for ApiError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
Self::Console {
status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
..
} => true,
// retry server errors
Self::Console { status, .. } if status.is_server_error() => true,
_ => false,
}
}
}
impl From<reqwest::Error> for ApiError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()

View File

@@ -6,7 +6,7 @@ use std::fmt;
use std::{collections::HashMap, sync::Arc};
use tokio::time;
use crate::{auth, console, pg_client};
use crate::{auth, console};
use crate::{compute, config};
use super::sql_over_http::MAX_RESPONSE_SIZE;
@@ -41,10 +41,8 @@ impl fmt::Display for ConnInfo {
}
}
type PgConn =
pg_client::connection::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>;
struct ConnPoolEntry {
conn: PgConn,
conn: tokio_postgres::Client,
_last_access: std::time::Instant,
}
@@ -80,8 +78,12 @@ impl GlobalConnPool {
})
}
pub async fn get(&self, conn_info: &ConnInfo, force_new: bool) -> anyhow::Result<PgConn> {
let mut client: Option<PgConn> = None;
pub async fn get(
&self,
conn_info: &ConnInfo,
force_new: bool,
) -> anyhow::Result<tokio_postgres::Client> {
let mut client: Option<tokio_postgres::Client> = None;
if !force_new {
let pool = self.get_endpoint_pool(&conn_info.hostname).await;
@@ -112,7 +114,11 @@ impl GlobalConnPool {
}
}
pub async fn put(&self, conn_info: &ConnInfo, client: PgConn) -> anyhow::Result<()> {
pub async fn put(
&self,
conn_info: &ConnInfo,
client: tokio_postgres::Client,
) -> anyhow::Result<()> {
let pool = self.get_endpoint_pool(&conn_info.hostname).await;
// return connection to the pool
@@ -185,7 +191,7 @@ struct TokioMechanism<'a> {
#[async_trait]
impl ConnectMechanism for TokioMechanism<'_> {
type Connection = PgConn;
type Connection = tokio_postgres::Client;
type ConnectError = tokio_postgres::Error;
type Error = anyhow::Error;
@@ -207,7 +213,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
async fn connect_to_compute(
config: &config::ProxyConfig,
conn_info: &ConnInfo,
) -> anyhow::Result<PgConn> {
) -> anyhow::Result<tokio_postgres::Client> {
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -245,7 +251,7 @@ async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
timeout: time::Duration,
) -> Result<PgConn, tokio_postgres::Error> {
) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
let mut config = (*node_info.config).clone();
let (client, connection) = config
@@ -257,13 +263,11 @@ async fn connect_to_compute_once(
.connect(tokio_postgres::NoTls)
.await?;
let stream = connection.stream.into_inner();
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
// tokio::spawn(async move {
// if let Err(e) = connection.await {
// error!("connection error: {}", e);
// }
// });
Ok(PgConn::new(stream))
Ok(client)
}

View File

@@ -1,38 +1,19 @@
use std::io::ErrorKind;
use std::sync::Arc;
use anyhow::bail;
use bytes::BufMut;
use fallible_iterator::FallibleIterator;
use futures::pin_mut;
use futures::StreamExt;
use hashbrown::HashMap;
use hyper::body::HttpBody;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::{Body, HeaderMap, Request};
use postgres_protocol::message::backend::DataRowBody;
use postgres_protocol::message::backend::ReadyForQueryBody;
use serde_json::json;
use serde_json::Map;
use serde_json::Value;
use tokio::io::AsyncRead;
use tokio::io::AsyncWrite;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::IsolationLevel;
use tokio_postgres::Row;
use tokio_postgres::RowStream;
use tokio_postgres::Statement;
use url::Url;
use crate::pg_client;
use crate::pg_client::codec::FrontendMessage;
use crate::pg_client::connection;
use crate::pg_client::connection::RequestMessages;
use crate::pg_client::prepare::TypeinfoPreparedQueries;
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;
@@ -42,21 +23,12 @@ struct QueryData {
params: Vec<serde_json::Value>,
}
#[derive(serde::Deserialize)]
#[serde(untagged)]
enum Payload {
Single(QueryData),
Batch(Vec<QueryData>),
}
pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
@@ -190,7 +162,7 @@ pub async fn handle(
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
) -> anyhow::Result<Value> {
//
// Determine the destination and connection params
//
@@ -205,23 +177,6 @@ pub async fn handle(
// Allow connection pooling only if explicitly requested
let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
// isolation level and read only
let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
let txn_isolation_level = match txn_isolation_level_raw {
Some(ref x) => Some(match x.as_bytes() {
b"Serializable" => IsolationLevel::Serializable,
b"ReadUncommitted" => IsolationLevel::ReadUncommitted,
b"ReadCommitted" => IsolationLevel::ReadCommitted,
b"RepeatableRead" => IsolationLevel::RepeatableRead,
_ => bail!("invalid isolation level"),
}),
None => None,
};
let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
None => MAX_REQUEST_SIZE + 1,
@@ -237,75 +192,15 @@ pub async fn handle(
// Read the query and query params from the request body
//
let body = hyper::body::to_bytes(request.into_body()).await?;
let payload: Payload = serde_json::from_slice(&body)?;
let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
let QueryData { query, params } = serde_json::from_slice(&body)?;
let query_params = json_to_pg_text(params)?;
//
// Now execute the query and return the result
//
let result = match payload {
Payload::Single(query) => query_raw_txt_as_json(&mut client, query, raw_output, array_mode)
.await
.map(|x| (x, HashMap::default())),
Payload::Batch(queries) => {
let mut results = Vec::new();
let client = conn_pool.get(&conn_info, !allow_pool).await?;
client
.start_tx(txn_isolation_level, Some(txn_read_only))
.await?;
for query in queries {
let result =
query_raw_txt_as_json(&mut client, query, raw_output, array_mode).await;
match result {
// TODO: check this tag to see if the client has executed a commit during the non-interactive transactions...
Ok((r, _ready_tag)) => results.push(r),
Err(e) => {
let tag = client.rollback().await?;
if allow_pool && tag.status() == b'I' {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}
return Err(e);
}
}
}
let ready_tag = client.commit().await?;
let mut headers = HashMap::default();
headers.insert(
TXN_READ_ONLY.clone(),
HeaderValue::try_from(txn_read_only.to_string())?,
);
if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
}
Ok(((json!({ "results": results }), ready_tag), headers))
}
};
if allow_pool && ready_tag.status() == b'I' {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}
result
}
async fn query_to_json<T: GenericClient>(
client: &T,
data: QueryData,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<Value> {
let query_params = json_to_pg_text(data.params)?;
let row_stream = client
.query_raw_txt::<String, _>(data.query, query_params)
.await?;
let row_stream = client.query_raw_txt(query, query_params).await?;
// Manually drain the stream into a vector to leave row_stream hanging
// around to get a command tag. Also check that the response is not too
@@ -361,6 +256,13 @@ async fn query_to_json<T: GenericClient>(
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}
// resulting JSON format is based on the format of node-postgres result
Ok(json!({
"command": command_tag_name,
@@ -371,99 +273,6 @@ async fn query_to_json<T: GenericClient>(
}))
}
async fn query_raw_txt_as_json<'a, St, T>(
conn: &mut connection::Connection<St, T>,
data: QueryData,
raw_output: bool,
array_mode: bool,
) -> anyhow::Result<(Value, ReadyForQueryBody)>
where
St: AsyncRead + AsyncWrite + Unpin + Send,
T: AsyncRead + AsyncWrite + Unpin + Send,
{
let params = json_to_pg_text(data.params)?;
let params = params.into_iter();
let stmt_name = conn.statement_name();
let row_description = conn.prepare(&stmt_name, &data.query).await?;
let mut fields = vec![];
let mut columns = vec![];
let mut it = row_description.fields();
while let Some(field) = it.next().map_err(pg_client::error::Error::parse)? {
fields.push(json!({
"name": Value::String(field.name().to_owned()),
"dataTypeID": Value::Number(field.type_oid().into()),
"tableID": field.table_oid(),
"columnID": field.column_id(),
"dataTypeSize": field.type_size(),
"dataTypeModifier": field.type_modifier(),
"format": "text",
}));
let type_ = match Type::from_oid(field.type_oid()) {
Some(t) => t,
None => TypeinfoPreparedQueries::get_type(conn, field.type_oid()).await?,
};
columns.push(Column {
name: field.name().to_string(),
type_,
});
}
conn.execute("", &stmt_name, params)?;
conn.sync().await?;
let mut rows = vec![];
let mut row_stream = conn.stream_query_results().await?;
let mut curret_size = 0;
while let Some(row) = row_stream.next().await.transpose()? {
// let row = row.map_err(Error::db)?;
curret_size += row.buffer().len();
if curret_size > MAX_RESPONSE_SIZE {
return Err(anyhow::anyhow!("response too large"));
}
rows.push(pg_text_row_to_json2(&row, &columns, raw_output, array_mode).unwrap());
}
let command_tag = row_stream.tag();
let command_tag = command_tag.tag()?;
let mut command_tag_split = command_tag.split(' ');
let command_tag_name = command_tag_split.next().unwrap_or_default();
let command_tag_count = if command_tag_name == "INSERT" {
// INSERT returns OID first and then number of rows
command_tag_split.nth(1)
} else {
// other commands return number of rows (if any)
command_tag_split.next()
}
.and_then(|s| s.parse::<i64>().ok());
let ready_tag = conn.wait_for_ready().await?;
// resulting JSON format is based on the format of node-postgres result
Ok((
json!({
"command": command_tag_name,
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
}),
ready_tag,
))
}
struct Column {
name: String,
type_: Type,
}
//
// Convert postgres row with text-encoded values to JSON object
//
@@ -483,7 +292,7 @@ pub fn pg_text_row_to_json(
} else {
pg_text_to_json(pg_value, column.type_())?
};
Ok((name, json_value))
Ok((name.to_string(), json_value))
});
if array_mode {
@@ -493,55 +302,7 @@ pub fn pg_text_row_to_json(
.collect::<Result<Vec<Value>, anyhow::Error>>()?;
Ok(Value::Array(arr))
} else {
let obj = iter
.map(|r| r.map(|(key, val)| (key.to_owned(), val)))
.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
Ok(Value::Object(obj))
}
}
//
// Convert postgres row with text-encoded values to JSON object
//
fn pg_text_row_to_json2(
row: &DataRowBody,
columns: &[Column],
raw_output: bool,
array_mode: bool,
) -> Result<Value, anyhow::Error> {
let ranges: Vec<Option<std::ops::Range<usize>>> = row.ranges().collect()?;
let iter = std::iter::zip(ranges, columns)
.enumerate()
.map(|(i, (range, column))| {
let name = &column.name;
let pg_value = range
.map(|r| {
std::str::from_utf8(&row.buffer()[r])
.map_err(|e| pg_client::error::Error::from_sql(e.into(), i))
})
.transpose()?;
// let pg_value = row.as_text(i)?;
let json_value = if raw_output {
match pg_value {
Some(v) => Value::String(v.to_string()),
None => Value::Null,
}
} else {
pg_text_to_json(pg_value, &column.type_)?
};
Ok((name, json_value))
});
if array_mode {
// drop keys and aggregate into array
let arr = iter
.map(|r| r.map(|(_key, val)| val))
.collect::<Result<Vec<Value>, anyhow::Error>>()?;
Ok(Value::Array(arr))
} else {
let obj = iter
.map(|r| r.map(|(key, val)| (key.to_owned(), val)))
.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
Ok(Value::Object(obj))
}
}
@@ -552,16 +313,16 @@ fn pg_text_row_to_json2(
pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
if let Some(val) = pg_value {
if let Kind::Array(elem_type) = pg_type.kind() {
return pg_array_parse(val, &elem_type);
return pg_array_parse(val, elem_type);
}
match pg_type {
&Type::BOOL => Ok(Value::Bool(val == "t")),
&Type::INT2 | &Type::INT4 => {
match *pg_type {
Type::BOOL => Ok(Value::Bool(val == "t")),
Type::INT2 | Type::INT4 => {
let val = val.parse::<i32>()?;
Ok(Value::Number(serde_json::Number::from(val)))
}
&Type::FLOAT4 | &Type::FLOAT8 => {
Type::FLOAT4 | Type::FLOAT8 => {
let fval = val.parse::<f64>()?;
let num = serde_json::Number::from_f64(fval);
if let Some(num) = num {
@@ -573,7 +334,7 @@ pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value,
Ok(Value::String(val.to_string()))
}
}
&Type::JSON | &Type::JSONB => Ok(serde_json::from_str(val)?),
Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
_ => Ok(Value::String(val.to_string())),
}
} else {

View File

@@ -6,7 +6,6 @@ use crate::{
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream, StreamExt};
use hashbrown::HashMap;
use hyper::{
server::{
accept,
@@ -182,15 +181,13 @@ async fn ws_handler(
// Check if the request is a websocket upgrade request.
if hyper_tungstenite::is_upgrade_request(&request) {
info!(session_id = ?session_id, "performing websocket upgrade");
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
tokio::spawn(async move {
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
{
error!(session_id = ?session_id, "error in websocket connection: {e:?}");
error!("error in websocket connection: {e:?}");
}
});
@@ -206,7 +203,7 @@ async fn ws_handler(
Ok(_) => StatusCode::OK,
Err(_) => StatusCode::BAD_REQUEST,
};
let (json, headers) = match result {
let json = match result {
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
@@ -217,10 +214,7 @@ async fn ws_handler(
},
None => Value::Null,
};
(
json!({ "message": message, "code": code }),
HashMap::default(),
)
json!({ "message": message, "code": code })
}
};
json_response(status_code, json).map(|mut r| {
@@ -228,9 +222,6 @@ async fn ws_handler(
"Access-Control-Allow-Origin",
hyper::http::HeaderValue::from_static("*"),
);
for (k, v) in headers {
r.headers_mut().insert(k, v);
}
r
})
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {

View File

@@ -22,7 +22,6 @@ pub mod scram;
pub mod stream;
pub mod url;
pub mod waiters;
pub mod pg_client;
/// Handle unix signals appropriately.
pub async fn handle_signals(token: CancellationToken) -> anyhow::Result<Infallible> {

View File

@@ -1,43 +0,0 @@
use bytes::{Bytes, BytesMut};
use fallible_iterator::FallibleIterator;
use postgres_protocol::message::backend::{self, Message};
use std::io;
use tokio_util::codec::{Decoder, Encoder};
pub struct FrontendMessage(pub Bytes);
pub struct BackendMessages(pub BytesMut);
impl BackendMessages {
pub fn empty() -> BackendMessages {
BackendMessages(BytesMut::new())
}
}
impl FallibleIterator for BackendMessages {
type Item = backend::Message;
type Error = io::Error;
fn next(&mut self) -> io::Result<Option<backend::Message>> {
backend::Message::parse(&mut self.0)
}
}
pub struct PostgresCodec;
impl Encoder<FrontendMessage> for PostgresCodec {
type Error = io::Error;
fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
dst.extend_from_slice(&item.0);
Ok(())
}
}
impl Decoder for PostgresCodec {
type Item = Message;
type Error = io::Error;
fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Message>, io::Error> {
Message::parse(src)
}
}

View File

@@ -1,369 +0,0 @@
use super::codec::{BackendMessages, FrontendMessage, PostgresCodec};
use super::error::Error;
use super::prepare::TypeinfoPreparedQueries;
use bytes::{BufMut, BytesMut};
use futures::channel::mpsc;
use futures::{Sink, StreamExt};
use futures::{SinkExt, Stream};
use hashbrown::HashMap;
use postgres_protocol::message::backend::{
BackendKeyDataBody, CommandCompleteBody, DataRowBody, ErrorResponseBody, Message,
ReadyForQueryBody, RowDescriptionBody,
};
use postgres_protocol::message::frontend;
use postgres_protocol::Oid;
use std::collections::VecDeque;
use std::future::poll_fn;
use std::pin::Pin;
use std::task::{ready, Context, Poll};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::maybe_tls_stream::MaybeTlsStream;
use tokio_postgres::types::Type;
use tokio_postgres::IsolationLevel;
use tokio_util::codec::Framed;
pub enum RequestMessages {
Single(FrontendMessage),
}
pub struct Request {
pub messages: RequestMessages,
pub sender: mpsc::Sender<BackendMessages>,
}
pub struct Response {
sender: mpsc::Sender<BackendMessages>,
}
/// A connection to a PostgreSQL database.
pub struct RawConnection<S, T> {
stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
pending_responses: VecDeque<Message>,
pub buf: BytesMut,
}
impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> RawConnection<S, T> {
pub fn new(
stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
buf: BytesMut,
) -> RawConnection<S, T> {
RawConnection {
stream,
pending_responses: VecDeque::new(),
buf,
}
}
pub async fn send(&mut self) -> Result<(), Error> {
poll_fn(|cx| self.poll_send(cx)).await?;
let request = FrontendMessage(self.buf.split().freeze());
self.stream.start_send_unpin(request).map_err(Error::io)?;
poll_fn(|cx| self.poll_flush(cx)).await
}
pub async fn next_message(&mut self) -> Result<Message, Error> {
match self.pending_responses.pop_front() {
Some(message) => Ok(message),
None => poll_fn(|cx| self.poll_read(cx)).await,
}
}
fn poll_read(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
let message = match ready!(self.stream.poll_next_unpin(cx)?) {
Some(message) => message,
None => return Poll::Ready(Err(Error::closed())),
};
Poll::Ready(Ok(message))
}
fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
Pin::new(&mut self.stream).poll_close(cx).map_err(Error::io)
}
fn poll_send(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
if let Poll::Ready(msg) = self.poll_read(cx)? {
self.pending_responses.push_back(msg);
};
self.stream.poll_ready_unpin(cx).map_err(Error::io)
}
fn poll_flush(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
if let Poll::Ready(msg) = self.poll_read(cx)? {
self.pending_responses.push_back(msg);
};
self.stream.poll_flush_unpin(cx).map_err(Error::io)
}
}
pub struct Connection<S, T> {
stmt_counter: usize,
pub typeinfo: Option<TypeinfoPreparedQueries>,
pub typecache: HashMap<Oid, Type>,
pub raw: RawConnection<S, T>,
// key: BackendKeyDataBody,
}
impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> Connection<S, T> {
pub fn new(stream: MaybeTlsStream<S, T>) -> Connection<S, T> {
Connection {
stmt_counter: 0,
typeinfo: None,
typecache: HashMap::new(),
raw: RawConnection::new(Framed::new(stream, PostgresCodec), BytesMut::new()),
}
}
pub async fn start_tx(
&mut self,
isolation_level: Option<IsolationLevel>,
read_only: Option<bool>,
) -> Result<ReadyForQueryBody, Error> {
let mut query = "START TRANSACTION".to_string();
let mut first = true;
if let Some(level) = isolation_level {
first = false;
query.push_str(" ISOLATION LEVEL ");
let level = match level {
IsolationLevel::ReadUncommitted => "READ UNCOMMITTED",
IsolationLevel::ReadCommitted => "READ COMMITTED",
IsolationLevel::RepeatableRead => "REPEATABLE READ",
IsolationLevel::Serializable => "SERIALIZABLE",
_ => return Err(Error::unexpected_message()),
};
query.push_str(level);
}
if let Some(read_only) = read_only {
if !first {
query.push(',');
}
first = false;
let s = if read_only {
" READ ONLY"
} else {
" READ WRITE"
};
query.push_str(s);
}
self.execute_simple(&query).await
}
pub async fn rollback(&mut self) -> Result<ReadyForQueryBody, Error> {
self.execute_simple("ROLLBACK").await
}
pub async fn commit(&mut self) -> Result<ReadyForQueryBody, Error> {
self.execute_simple("COMMIT").await
}
// pub async fn auth_sasl_scram<'a, I>(
// mut raw: RawConnection<S, T>,
// params: I,
// password: &[u8],
// ) -> Result<Self, Error>
// where
// I: IntoIterator<Item = (&'a str, &'a str)>,
// {
// // send a startup message
// frontend::startup_message(params, &mut raw.buf).unwrap();
// raw.send().await?;
// // expect sasl authentication message
// let Message::AuthenticationSasl(body) = raw.next_message().await? else { return Err(Error::expecting("sasl authentication")) };
// // expect support for SCRAM_SHA_256
// if body
// .mechanisms()
// .find(|&x| Ok(x == authentication::sasl::SCRAM_SHA_256))?
// .is_none()
// {
// return Err(Error::expecting("SCRAM-SHA-256 auth"));
// }
// // initiate SCRAM_SHA_256 authentication without channel binding
// let auth = authentication::sasl::ChannelBinding::unrequested();
// let mut scram = authentication::sasl::ScramSha256::new(password, auth);
// frontend::sasl_initial_response(
// authentication::sasl::SCRAM_SHA_256,
// scram.message(),
// &mut raw.buf,
// )
// .unwrap();
// raw.send().await?;
// // expect sasl continue
// let Message::AuthenticationSaslContinue(b) = raw.next_message().await? else { return Err(Error::expecting("auth continue")) };
// scram.update(b.data()).unwrap();
// // continue sasl
// frontend::sasl_response(scram.message(), &mut raw.buf).unwrap();
// raw.send().await?;
// // expect sasl final
// let Message::AuthenticationSaslFinal(b) = raw.next_message().await? else { return Err(Error::expecting("auth final")) };
// scram.finish(b.data()).unwrap();
// // expect auth ok
// let Message::AuthenticationOk = raw.next_message().await? else { return Err(Error::expecting("auth ok")) };
// // expect connection accepted
// let key = loop {
// match raw.next_message().await? {
// Message::BackendKeyData(key) => break key,
// Message::ParameterStatus(_) => {}
// _ => return Err(Error::expecting("backend ready")),
// }
// };
// let Message::ReadyForQuery(b) = raw.next_message().await? else { return Err(Error::expecting("ready for query")) };
// // assert_eq!(b.status(), b'I');
// Ok(Self { raw, key })
// }
// pub fn prepare_and_execute(
// &mut self,
// portal: &str,
// name: &str,
// query: &str,
// params: impl IntoIterator<Item = Option<impl AsRef<str>>>,
// ) -> std::io::Result<()> {
// self.prepare(name, query)?;
// self.execute(portal, name, params)
// }
pub fn statement_name(&mut self) -> String {
self.stmt_counter += 1;
format!("s{}", self.stmt_counter)
}
async fn execute_simple(&mut self, query: &str) -> Result<ReadyForQueryBody, Error> {
frontend::query(query, &mut self.raw.buf)?;
self.raw.send().await?;
loop {
match self.raw.next_message().await? {
Message::ReadyForQuery(q) => return Ok(q),
Message::CommandComplete(_)
| Message::EmptyQueryResponse
| Message::RowDescription(_)
| Message::DataRow(_) => {}
_ => return Err(Error::unexpected_message()),
}
}
}
pub async fn prepare(&mut self, name: &str, query: &str) -> Result<RowDescriptionBody, Error> {
frontend::parse(name, query, std::iter::empty(), &mut self.raw.buf)?;
frontend::describe(b'S', name, &mut self.raw.buf)?;
self.sync().await?;
self.wait_for_prepare().await
}
pub fn execute(
&mut self,
portal: &str,
name: &str,
params: impl IntoIterator<Item = Option<impl AsRef<str>>>,
) -> std::io::Result<()> {
frontend::bind(
portal,
name,
std::iter::empty(), // all parameters use the default format (text)
params,
|param, buf| match param {
Some(param) => {
buf.put_slice(param.as_ref().as_bytes());
Ok(postgres_protocol::IsNull::No)
}
None => Ok(postgres_protocol::IsNull::Yes),
},
Some(0), // all text
&mut self.raw.buf,
)
.map_err(|e| match e {
frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
frontend::BindError::Serialization(io) => io,
})?;
frontend::execute(portal, 0, &mut self.raw.buf)
}
pub async fn sync(&mut self) -> Result<(), Error> {
frontend::sync(&mut self.raw.buf);
self.raw.send().await
}
pub async fn wait_for_prepare(&mut self) -> Result<RowDescriptionBody, Error> {
let Message::ParseComplete = self.raw.next_message().await? else { return Err(Error::expecting("parse")) };
let Message::ParameterDescription(_) = self.raw.next_message().await? else { return Err(Error::expecting("param description")) };
let Message::RowDescription(desc) = self.raw.next_message().await? else { return Err(Error::expecting("row description")) };
self.wait_for_ready().await?;
Ok(desc)
}
pub async fn stream_query_results(&mut self) -> Result<RowStream<'_, S, T>, Error> {
// let Message::ParseComplete = self.raw.next_message().await? else { return Err(Error::expecting("parse")) };
let Message::BindComplete = self.raw.next_message().await? else { return Err(Error::expecting("bind")) };
Ok(RowStream::Stream(&mut self.raw))
}
pub async fn wait_for_ready(&mut self) -> Result<ReadyForQueryBody, Error> {
loop {
match self.raw.next_message().await.unwrap() {
Message::ReadyForQuery(b) => break Ok(b),
_ => continue,
}
}
}
}
pub enum RowStream<'a, S, T> {
Stream(&'a mut RawConnection<S, T>),
Complete(Option<CommandCompleteBody>),
}
impl<S, T> Unpin for RowStream<'_, S, T> {}
impl<S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin> Stream
for RowStream<'_, S, T>
{
// this is horrible - first result is for transport/protocol errors errors
// second result is for sql errors.
type Item = Result<Result<DataRowBody, ErrorResponseBody>, Error>;
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
match &mut *self {
RowStream::Stream(raw) => match ready!(raw.poll_read(cx)?) {
Message::DataRow(row) => Poll::Ready(Some(Ok(Ok(row)))),
Message::CommandComplete(tag) => {
*self = Self::Complete(Some(tag));
Poll::Ready(None)
}
Message::EmptyQueryResponse | Message::PortalSuspended => {
*self = Self::Complete(None);
Poll::Ready(None)
}
Message::ErrorResponse(error) => {
*self = Self::Complete(None);
Poll::Ready(Some(Ok(Err(error))))
}
_ => Poll::Ready(Some(Err(Error::expecting("command completion")))),
},
RowStream::Complete(_) => Poll::Ready(None),
}
}
}
impl<S, T> RowStream<'_, S, T> {
pub fn tag(self) -> Option<CommandCompleteBody> {
match self {
RowStream::Stream(_) => panic!("should not get tag unless row stream is exhausted"),
RowStream::Complete(tag) => tag,
}
}
}

View File

@@ -1,447 +0,0 @@
use std::{error, fmt, io};
use fallible_iterator::FallibleIterator;
use postgres_protocol::message::backend::{ErrorFields, ErrorResponseBody};
use tokio_native_tls::native_tls;
use tokio_postgres::error::{ErrorPosition, SqlState};
#[derive(Debug, PartialEq)]
enum Kind {
Io,
Tls,
UnexpectedMessage,
FromSql(usize),
Closed,
Db,
Parse,
Encode,
}
struct ErrorInner {
kind: Kind,
cause: Option<Box<dyn error::Error + Sync + Send>>,
}
/// An error communicating with the Postgres server.
pub struct Error(ErrorInner);
impl fmt::Debug for Error {
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt.debug_struct("Error")
.field("kind", &self.0.kind)
.field("cause", &self.0.cause)
.finish()
}
}
impl fmt::Display for Error {
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
match &self.0.kind {
Kind::Io => fmt.write_str("error communicating with the server")?,
Kind::Tls => fmt.write_str("error establishing tls")?,
Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?,
Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?,
Kind::Closed => fmt.write_str("connection closed")?,
Kind::Db => fmt.write_str("db error")?,
Kind::Parse => fmt.write_str("error parsing response from server")?,
Kind::Encode => fmt.write_str("error encoding message to server")?,
};
if let Some(ref cause) = self.0.cause {
write!(fmt, ": {}", cause)?;
}
Ok(())
}
}
impl error::Error for Error {
fn source(&self) -> Option<&(dyn error::Error + 'static)> {
self.0.cause.as_ref().map(|e| &**e as _)
}
}
impl From<io::Error> for Error {
fn from(value: io::Error) -> Self {
Self::io(value)
}
}
impl Error {
/// Consumes the error, returning its cause.
pub fn into_source(self) -> Option<Box<dyn error::Error + Sync + Send>> {
self.0.cause
}
/// Returns the source of this error if it was a `DbError`.
///
/// This is a simple convenience method.
pub fn as_db_error(&self) -> Option<&DbError> {
error::Error::source(self).and_then(|e| e.downcast_ref::<DbError>())
}
/// Determines if the error was associated with closed connection.
pub fn is_closed(&self) -> bool {
self.0.kind == Kind::Closed
}
/// Returns the SQLSTATE error code associated with the error.
///
/// This is a convenience method that downcasts the cause to a `DbError` and returns its code.
pub fn code(&self) -> Option<&SqlState> {
self.as_db_error().map(DbError::code)
}
fn new(kind: Kind, cause: Option<Box<dyn error::Error + Sync + Send>>) -> Error {
Error(ErrorInner { kind, cause })
}
#[allow(clippy::needless_pass_by_value)]
pub(crate) fn db(error: ErrorResponseBody) -> Error {
match DbError::parse(&mut error.fields()) {
Ok(e) => Error::new(Kind::Db, Some(Box::new(e))),
Err(e) => Error::new(Kind::Parse, Some(Box::new(e))),
}
}
pub(crate) fn from_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
Error::new(Kind::FromSql(idx), Some(e))
}
pub(crate) fn closed() -> Error {
Error::new(Kind::Closed, None)
}
pub(crate) fn unexpected_message() -> Error {
Error::new(Kind::UnexpectedMessage, None)
}
pub(crate) fn expecting(expected: &str) -> Error {
Error::new(Kind::UnexpectedMessage, Some(expected.into()))
}
pub(crate) fn parse(e: io::Error) -> Error {
Error::new(Kind::Parse, Some(Box::new(e)))
}
pub(crate) fn encode(e: io::Error) -> Error {
Error::new(Kind::Encode, Some(Box::new(e)))
}
pub(crate) fn io(e: io::Error) -> Error {
Error::new(Kind::Io, Some(Box::new(e)))
}
pub(crate) fn tls(e: native_tls::Error) -> Error {
Error::new(Kind::Tls, Some(Box::new(e)))
}
}
/// The severity of a Postgres error or notice.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Severity {
/// PANIC
Panic,
/// FATAL
Fatal,
/// ERROR
Error,
/// WARNING
Warning,
/// NOTICE
Notice,
/// DEBUG
Debug,
/// INFO
Info,
/// LOG
Log,
}
impl fmt::Display for Severity {
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match *self {
Severity::Panic => "PANIC",
Severity::Fatal => "FATAL",
Severity::Error => "ERROR",
Severity::Warning => "WARNING",
Severity::Notice => "NOTICE",
Severity::Debug => "DEBUG",
Severity::Info => "INFO",
Severity::Log => "LOG",
};
fmt.write_str(s)
}
}
impl Severity {
fn from_str(s: &str) -> Option<Severity> {
match s {
"PANIC" => Some(Severity::Panic),
"FATAL" => Some(Severity::Fatal),
"ERROR" => Some(Severity::Error),
"WARNING" => Some(Severity::Warning),
"NOTICE" => Some(Severity::Notice),
"DEBUG" => Some(Severity::Debug),
"INFO" => Some(Severity::Info),
"LOG" => Some(Severity::Log),
_ => None,
}
}
}
/// A Postgres error or notice.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DbError {
severity: String,
parsed_severity: Option<Severity>,
code: SqlState,
message: String,
detail: Option<String>,
hint: Option<String>,
position: Option<ErrorPosition>,
where_: Option<String>,
schema: Option<String>,
table: Option<String>,
column: Option<String>,
datatype: Option<String>,
constraint: Option<String>,
file: Option<String>,
line: Option<u32>,
routine: Option<String>,
}
impl DbError {
pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
let mut severity = None;
let mut parsed_severity = None;
let mut code = None;
let mut message = None;
let mut detail = None;
let mut hint = None;
let mut normal_position = None;
let mut internal_position = None;
let mut internal_query = None;
let mut where_ = None;
let mut schema = None;
let mut table = None;
let mut column = None;
let mut datatype = None;
let mut constraint = None;
let mut file = None;
let mut line = None;
let mut routine = None;
while let Some(field) = fields.next()? {
match field.type_() {
b'S' => severity = Some(field.value().to_owned()),
b'C' => code = Some(SqlState::from_code(field.value())),
b'M' => message = Some(field.value().to_owned()),
b'D' => detail = Some(field.value().to_owned()),
b'H' => hint = Some(field.value().to_owned()),
b'P' => {
normal_position = Some(field.value().parse::<u32>().map_err(|_| {
io::Error::new(
io::ErrorKind::InvalidInput,
"`P` field did not contain an integer",
)
})?);
}
b'p' => {
internal_position = Some(field.value().parse::<u32>().map_err(|_| {
io::Error::new(
io::ErrorKind::InvalidInput,
"`p` field did not contain an integer",
)
})?);
}
b'q' => internal_query = Some(field.value().to_owned()),
b'W' => where_ = Some(field.value().to_owned()),
b's' => schema = Some(field.value().to_owned()),
b't' => table = Some(field.value().to_owned()),
b'c' => column = Some(field.value().to_owned()),
b'd' => datatype = Some(field.value().to_owned()),
b'n' => constraint = Some(field.value().to_owned()),
b'F' => file = Some(field.value().to_owned()),
b'L' => {
line = Some(field.value().parse::<u32>().map_err(|_| {
io::Error::new(
io::ErrorKind::InvalidInput,
"`L` field did not contain an integer",
)
})?);
}
b'R' => routine = Some(field.value().to_owned()),
b'V' => {
parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidInput,
"`V` field contained an invalid value",
)
})?);
}
_ => {}
}
}
Ok(DbError {
severity: severity
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?,
parsed_severity,
code: code
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?,
message: message
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?,
detail,
hint,
position: match normal_position {
Some(position) => Some(ErrorPosition::Original(position)),
None => match internal_position {
Some(position) => Some(ErrorPosition::Internal {
position,
query: internal_query.ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidInput,
"`q` field missing but `p` field present",
)
})?,
}),
None => None,
},
},
where_,
schema,
table,
column,
datatype,
constraint,
file,
line,
routine,
})
}
/// The field contents are ERROR, FATAL, or PANIC (in an error message),
/// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a
/// localized translation of one of these.
pub fn severity(&self) -> &str {
&self.severity
}
/// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+)
pub fn parsed_severity(&self) -> Option<Severity> {
self.parsed_severity
}
/// The SQLSTATE code for the error.
pub fn code(&self) -> &SqlState {
&self.code
}
/// The primary human-readable error message.
///
/// This should be accurate but terse (typically one line).
pub fn message(&self) -> &str {
&self.message
}
/// An optional secondary error message carrying more detail about the
/// problem.
///
/// Might run to multiple lines.
pub fn detail(&self) -> Option<&str> {
self.detail.as_deref()
}
/// An optional suggestion what to do about the problem.
///
/// This is intended to differ from `detail` in that it offers advice
/// (potentially inappropriate) rather than hard facts. Might run to
/// multiple lines.
pub fn hint(&self) -> Option<&str> {
self.hint.as_deref()
}
/// An optional error cursor position into either the original query string
/// or an internally generated query.
pub fn position(&self) -> Option<&ErrorPosition> {
self.position.as_ref()
}
/// An indication of the context in which the error occurred.
///
/// Presently this includes a call stack traceback of active procedural
/// language functions and internally-generated queries. The trace is one
/// entry per line, most recent first.
pub fn where_(&self) -> Option<&str> {
self.where_.as_deref()
}
/// If the error was associated with a specific database object, the name
/// of the schema containing that object, if any. (PostgreSQL 9.3+)
pub fn schema(&self) -> Option<&str> {
self.schema.as_deref()
}
/// If the error was associated with a specific table, the name of the
/// table. (Refer to the schema name field for the name of the table's
/// schema.) (PostgreSQL 9.3+)
pub fn table(&self) -> Option<&str> {
self.table.as_deref()
}
/// If the error was associated with a specific table column, the name of
/// the column.
///
/// (Refer to the schema and table name fields to identify the table.)
/// (PostgreSQL 9.3+)
pub fn column(&self) -> Option<&str> {
self.column.as_deref()
}
/// If the error was associated with a specific data type, the name of the
/// data type. (Refer to the schema name field for the name of the data
/// type's schema.) (PostgreSQL 9.3+)
pub fn datatype(&self) -> Option<&str> {
self.datatype.as_deref()
}
/// If the error was associated with a specific constraint, the name of the
/// constraint.
///
/// Refer to fields listed above for the associated table or domain.
/// (For this purpose, indexes are treated as constraints, even if they
/// weren't created with constraint syntax.) (PostgreSQL 9.3+)
pub fn constraint(&self) -> Option<&str> {
self.constraint.as_deref()
}
/// The file name of the source-code location where the error was reported.
pub fn file(&self) -> Option<&str> {
self.file.as_deref()
}
/// The line number of the source-code location where the error was
/// reported.
pub fn line(&self) -> Option<u32> {
self.line
}
/// The name of the source-code routine reporting the error.
pub fn routine(&self) -> Option<&str> {
self.routine.as_deref()
}
}
impl fmt::Display for DbError {
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(fmt, "{}: {}", self.severity, self.message)?;
if let Some(detail) = &self.detail {
write!(fmt, "\nDETAIL: {}", detail)?;
}
if let Some(hint) = &self.hint {
write!(fmt, "\nHINT: {}", hint)?;
}
Ok(())
}
}
impl error::Error for DbError {}

View File

@@ -1,5 +0,0 @@
pub mod codec;
pub mod connection;
pub mod error;
pub mod prepare;

View File

@@ -1,293 +0,0 @@
use fallible_iterator::FallibleIterator;
use futures::StreamExt;
use postgres_protocol::message::backend::{DataRowRanges, Message};
use postgres_protocol::message::frontend;
use std::future::Future;
use std::pin::Pin;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::types::{Field, Kind, Oid, ToSql, Type};
use super::connection::Connection;
use super::error::Error;
const TYPEINFO_QUERY: &str = "\
SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
FROM pg_catalog.pg_type t
LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid
INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
WHERE t.oid = $1
";
const TYPEINFO_ENUM_QUERY: &str = "\
SELECT enumlabel
FROM pg_catalog.pg_enum
WHERE enumtypid = $1
ORDER BY enumsortorder
";
const TYPEINFO_COMPOSITE_QUERY: &str = "\
SELECT attname, atttypid
FROM pg_catalog.pg_attribute
WHERE attrelid = $1
AND NOT attisdropped
AND attnum > 0
ORDER BY attnum
";
#[derive(Clone)]
pub struct TypeinfoPreparedQueries {
query: String,
enum_query: String,
composite_query: String,
}
fn map_is_null(x: tokio_postgres::types::IsNull) -> postgres_protocol::IsNull {
match x {
tokio_postgres::types::IsNull::Yes => postgres_protocol::IsNull::Yes,
tokio_postgres::types::IsNull::No => postgres_protocol::IsNull::No,
}
}
fn read_column<'a, T: tokio_postgres::types::FromSql<'a>>(
buffer: &'a [u8],
type_: &Type,
ranges: &mut DataRowRanges<'a>,
) -> Result<T, Error> {
let range = ranges.next()?;
match range {
Some(range) => T::from_sql_nullable(type_, range.map(|r| &buffer[r])),
None => T::from_sql_null(type_),
}
.map_err(|e| Error::from_sql(e, 0))
}
impl TypeinfoPreparedQueries {
pub async fn new<
S: AsyncRead + AsyncWrite + Unpin + Send,
T: AsyncRead + AsyncWrite + Unpin + Send,
>(
c: &mut Connection<S, T>,
) -> Result<Self, Error> {
if let Some(ti) = &c.typeinfo {
return Ok(ti.clone());
}
let query = c.statement_name();
let enum_query = c.statement_name();
let composite_query = c.statement_name();
frontend::parse(&query, TYPEINFO_QUERY, [Type::OID.oid()], &mut c.raw.buf)?;
frontend::parse(
&enum_query,
TYPEINFO_ENUM_QUERY,
[Type::OID.oid()],
&mut c.raw.buf,
)?;
c.sync().await?;
frontend::parse(
&composite_query,
TYPEINFO_COMPOSITE_QUERY,
[Type::OID.oid()],
&mut c.raw.buf,
)?;
c.sync().await?;
let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
let Message::ParseComplete = c.raw.next_message().await? else { return Err(Error::expecting("parse")) };
c.wait_for_ready().await?;
Ok(c.typeinfo
.insert(TypeinfoPreparedQueries {
query,
enum_query,
composite_query,
})
.clone())
}
fn get_type_rec<
S: AsyncRead + AsyncWrite + Unpin + Send,
T: AsyncRead + AsyncWrite + Unpin + Send,
>(
c: &mut Connection<S, T>,
oid: Oid,
) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + '_>> {
Box::pin(Self::get_type(c, oid))
}
pub async fn get_type<
S: AsyncRead + AsyncWrite + Unpin + Send,
T: AsyncRead + AsyncWrite + Unpin + Send,
>(
c: &mut Connection<S, T>,
oid: Oid,
) -> Result<Type, Error> {
if let Some(type_) = Type::from_oid(oid) {
return Ok(type_);
}
if let Some(type_) = c.typecache.get(&oid) {
return Ok(type_.clone());
}
let queries = Self::new(c).await?;
frontend::bind(
"",
&queries.query,
[1], // the only parameter is in binary format
[oid],
|param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
Some(1), // binary return type
&mut c.raw.buf,
)
.map_err(|e| match e {
frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
frontend::BindError::Serialization(io) => io,
})?;
frontend::execute("", 0, &mut c.raw.buf)?;
c.sync().await?;
let mut stream = c.stream_query_results().await?;
let Some(row) = stream.next().await.transpose()? else {
todo!()
};
let row = row.map_err(Error::db)?;
let b = row.buffer();
let mut ranges = row.ranges();
let name: String = read_column(b, &Type::NAME, &mut ranges)?;
let type_: i8 = read_column(b, &Type::CHAR, &mut ranges)?;
let elem_oid: Oid = read_column(b, &Type::OID, &mut ranges)?;
let rngsubtype: Option<Oid> = read_column(b, &Type::OID, &mut ranges)?;
let basetype: Oid = read_column(b, &Type::OID, &mut ranges)?;
let schema: String = read_column(b, &Type::NAME, &mut ranges)?;
let relid: Oid = read_column(b, &Type::OID, &mut ranges)?;
{
// should be none
let None = stream.next().await.transpose()? else {
todo!()
};
drop(stream);
}
let kind = if type_ == b'e' as i8 {
let variants = Self::get_enum_variants(c, oid).await?;
Kind::Enum(variants)
} else if type_ == b'p' as i8 {
Kind::Pseudo
} else if basetype != 0 {
let type_ = Self::get_type_rec(c, basetype).await?;
Kind::Domain(type_)
} else if elem_oid != 0 {
let type_ = Self::get_type_rec(c, elem_oid).await?;
Kind::Array(type_)
} else if relid != 0 {
let fields = Self::get_composite_fields(c, relid).await?;
Kind::Composite(fields)
} else if let Some(rngsubtype) = rngsubtype {
let type_ = Self::get_type_rec(c, rngsubtype).await?;
Kind::Range(type_)
} else {
Kind::Simple
};
let type_ = Type::new(name, oid, kind, schema);
c.typecache.insert(oid, type_.clone());
Ok(type_)
}
async fn get_enum_variants<
S: AsyncRead + AsyncWrite + Unpin + Send,
T: AsyncRead + AsyncWrite + Unpin + Send,
>(
c: &mut Connection<S, T>,
oid: Oid,
) -> Result<Vec<String>, Error> {
let queries = Self::new(c).await?;
frontend::bind(
"",
&queries.enum_query,
[1], // the only parameter is in binary format
[oid],
|param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
Some(1), // binary return type
&mut c.raw.buf,
)
.map_err(|e| match e {
frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
frontend::BindError::Serialization(io) => io,
})?;
frontend::execute("", 0, &mut c.raw.buf)?;
c.sync().await?;
let mut stream = c.stream_query_results().await?;
let mut variants = Vec::new();
while let Some(row) = stream.next().await.transpose()? {
let row = row.map_err(Error::db)?;
let variant: String = read_column(row.buffer(), &Type::NAME, &mut row.ranges())?;
variants.push(variant);
}
c.wait_for_ready().await?;
Ok(variants)
}
async fn get_composite_fields<
S: AsyncRead + AsyncWrite + Unpin + Send,
T: AsyncRead + AsyncWrite + Unpin + Send,
>(
c: &mut Connection<S, T>,
oid: Oid,
) -> Result<Vec<Field>, Error> {
let queries = Self::new(c).await?;
frontend::bind(
"",
&queries.composite_query,
[1], // the only parameter is in binary format
[oid],
|param, buf| param.to_sql(&Type::OID, buf).map(map_is_null),
Some(1), // binary return type
&mut c.raw.buf,
)
.map_err(|e| match e {
frontend::BindError::Conversion(e) => std::io::Error::new(std::io::ErrorKind::Other, e),
frontend::BindError::Serialization(io) => io,
})?;
frontend::execute("", 0, &mut c.raw.buf)?;
c.sync().await?;
let mut stream = c.stream_query_results().await?;
let mut fields = Vec::new();
while let Some(row) = stream.next().await.transpose()? {
let row = row.map_err(Error::db)?;
let mut ranges = row.ranges();
let name: String = read_column(row.buffer(), &Type::NAME, &mut ranges)?;
let oid: Oid = read_column(row.buffer(), &Type::OID, &mut ranges)?;
fields.push((name, oid));
}
c.wait_for_ready().await?;
let mut output_fields = Vec::with_capacity(fields.len());
for (name, oid) in fields {
let type_ = Self::get_type_rec(c, oid).await?;
output_fields.push(Field::new(name, type_))
}
Ok(output_fields)
}
}

View File

@@ -6,15 +6,18 @@ use crate::{
cancellation::{self, CancelMap},
compute::{self, PostgresConnection},
config::{ProxyConfig, TlsConfig},
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
console::{
self,
errors::{ApiError, WakeComputeError},
messages::MetricsAuxInfo,
},
stream::{PqStream, Stream},
};
use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use metrics::{
exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
};
use hyper::StatusCode;
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use std::{error::Error, io, ops::ControlFlow, sync::Arc};
@@ -28,37 +31,25 @@ use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
pub const NUM_RETRIES_CONNECT: u32 = 10;
const NUM_RETRIES_CONNECT: u32 = 10;
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_accepted_connections_total",
"Number of TCP client connections accepted.",
&["protocol"],
"Number of TCP client connections accepted."
)
.unwrap()
});
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_closed_connections_total",
"Number of TCP client connections closed.",
&["protocol"],
)
.unwrap()
});
static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
"Number of TCP client connections closed."
)
.unwrap()
});
@@ -146,13 +137,6 @@ pub enum ClientMode {
/// Abstracts the logic of handling TCP vs WS clients
impl ClientMode {
fn protocol_label(&self) -> &'static str {
match self {
ClientMode::Tcp => "tcp",
ClientMode::Websockets { .. } => "ws",
}
}
fn allow_cleartext(&self) -> bool {
match self {
ClientMode::Tcp => false,
@@ -191,17 +175,10 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mode: ClientMode,
) -> anyhow::Result<()> {
info!(
protocol = mode.protocol_label(),
"handling interactive connection from client"
);
// The `closed` counter will increase when this future is destroyed.
NUM_CONNECTIONS_ACCEPTED_COUNTER
.with_label_values(&[mode.protocol_label()])
.inc();
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
scopeguard::defer! {
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
}
let tls = config.tls_config.as_ref();
@@ -347,6 +324,11 @@ async fn connect_to_compute_once(
.await
}
enum ConnectionState<E> {
Cached(console::CachedNodeInfo),
Invalid(compute::ConnCfg, E),
}
#[async_trait]
pub trait ConnectMechanism {
type Connection;
@@ -398,91 +380,88 @@ where
M::ConnectError: ShouldRetry + std::fmt::Debug,
M::Error: From<WakeComputeError>,
{
let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();
mechanism.update_connect_config(&mut node_info.config);
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
}
};
let mut num_retries = 0;
let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);
let mut num_retries = 1;
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
auth::BackendType::Test(x) => x.wake_compute(),
};
match handle_try_wake(wake_res, num_retries)? {
// failed to wake up but we can continue to retry
ControlFlow::Continue(_) => {}
// successfully woke up a compute node and can break the wakeup loop
ControlFlow::Break(mut node_info) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
break node_info;
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying wake compute");
};
// now that we have a new node, try connect to it repeatedly.
// this can error for a few reasons, for instance:
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
match state {
ConnectionState::Invalid(config, err) => {
match try_wake(&config, extra, creds).await {
// we can't wake up the compute node
Ok(None) => return Err(err.into()),
// there was an error communicating with the control plane
Err(e) => return Err(e.into()),
// failed to wake up but we can continue to retry
Ok(Some(ControlFlow::Continue(()))) => {
state = ConnectionState::Invalid(config, err);
let wait_duration = retry_after(num_retries);
num_retries += 1;
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
continue;
}
// successfully woke up a compute node and can break the wakeup loop
Ok(Some(ControlFlow::Break(mut node_info))) => {
mechanism.update_connect_config(&mut node_info.config);
state = ConnectionState::Cached(node_info)
}
}
}
ConnectionState::Cached(node_info) => {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}
// after the first connect failure,
// we should invalidate the cache and wake up a new compute node
if num_retries == 0 {
state = ConnectionState::Invalid(invalidate_cache(node_info), e);
} else {
state = ConnectionState::Cached(node_info);
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
}
}
}
}
let wait_duration = retry_after(num_retries);
num_retries += 1;
time::sleep(wait_duration).await;
info!(num_retries, "retrying connect_once");
}
}
/// Attempts to wake up the compute node.
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub fn handle_try_wake(
result: Result<console::CachedNodeInfo, WakeComputeError>,
num_retries: u32,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
match result {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
Ok(ControlFlow::Continue(err))
}
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
/// * Returns Ok(Some(false)) if the wakeup succeeded
/// * Returns Ok(None) or Err(e) if there was an error
async fn try_wake(
config: &compute::ConnCfg,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
) -> Result<Option<ControlFlow<console::CachedNodeInfo>>, WakeComputeError> {
info!("compute node's state has likely changed; requesting a wake-up");
match creds.wake_compute(extra).await {
// retry wake if the compute was in an invalid state
Err(WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
})) => Ok(Some(ControlFlow::Continue(()))),
// Update `node_info` and try again.
Ok(Some(mut new)) => {
new.config.reuse_password(config);
Ok(Some(ControlFlow::Break(new)))
}
Err(e) => Err(e),
Ok(None) => Ok(None),
}
}
@@ -490,6 +469,8 @@ pub trait ShouldRetry {
fn could_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
// retry all errors at least once
_ if num_retries == 0 => true,
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
@@ -541,9 +522,14 @@ impl ShouldRetry for compute::ConnectionError {
}
}
fn retry_after(num_retries: u32) -> time::Duration {
// 1.5 seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
pub fn retry_after(num_retries: u32) -> time::Duration {
match num_retries {
0 => time::Duration::ZERO,
_ => {
// 3/2 = 1.5 which seems to be an ok growth factor heuristic
BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)
}
}
}
/// Finish client connection initialization: confirm auth success, send params, etc.

View File

@@ -1,10 +1,10 @@
//! A group of high-level tests for connection establishing logic and auth.
//!
use std::borrow::Cow;
use super::*;
use crate::auth::backend::TestBackend;
use crate::auth::ClientCredentials;
use crate::console::{CachedNodeInfo, NodeInfo};
use crate::{auth, http, sasl, scram};
use crate::{auth, sasl, scram};
use async_trait::async_trait;
use rstest::rstest;
use tokio_postgres::config::SslMode;
@@ -99,8 +99,9 @@ struct Scram(scram::ServerSecret);
impl Scram {
fn new(password: &str) -> anyhow::Result<Self> {
let secret =
scram::ServerSecret::build(password).context("failed to generate scram secret")?;
let salt = rand::random::<[u8; 16]>();
let secret = scram::ServerSecret::build(password, &salt, 256)
.context("failed to generate scram secret")?;
Ok(Scram(secret))
}
@@ -301,18 +302,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
#[test]
fn connect_compute_total_wait() {
let mut total_wait = tokio::time::Duration::ZERO;
for num_retries in 1..10 {
for num_retries in 0..10 {
total_wait += retry_after(num_retries);
}
assert!(total_wait < tokio::time::Duration::from_secs(12));
assert!(total_wait > tokio::time::Duration::from_secs(10));
}
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy)]
enum ConnectAction {
Wake,
WakeFail,
WakeRetry,
Connect,
Retry,
Fail,
@@ -323,17 +321,6 @@ struct TestConnectMechanism {
sequence: Vec<ConnectAction>,
}
impl TestConnectMechanism {
fn verify(&self) {
let counter = self.counter.lock().unwrap();
assert_eq!(
*counter,
self.sequence.len(),
"sequence does not proceed to the end"
);
}
}
impl TestConnectMechanism {
fn new(sequence: Vec<ConnectAction>) -> Self {
Self {
@@ -383,63 +370,30 @@ impl ConnectMechanism for TestConnectMechanism {
ConnectAction::Connect => Ok(TestConnection),
ConnectAction::Retry => Err(TestConnectError { retryable: true }),
ConnectAction::Fail => Err(TestConnectError { retryable: false }),
x => panic!("expecting action {:?}, connect is called instead", x),
}
}
fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
}
impl TestBackend for TestConnectMechanism {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];
*counter += 1;
match action {
ConnectAction::Wake => Ok(helper_create_cached_node_info()),
ConnectAction::WakeFail => {
let err = console::errors::ApiError::Console {
status: http::StatusCode::FORBIDDEN,
text: "TEST".into(),
};
assert!(!err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}
ConnectAction::WakeRetry => {
let err = console::errors::ApiError::Console {
status: http::StatusCode::INTERNAL_SERVER_ERROR,
text: "TEST".into(),
};
assert!(err.could_retry());
Err(console::errors::WakeComputeError::ApiError(err))
}
x => panic!("expecting action {:?}, wake_compute is called instead", x),
}
}
}
fn helper_create_cached_node_info() -> CachedNodeInfo {
fn helper_create_connect_info() -> (
CachedNodeInfo,
console::ConsoleReqExtra<'static>,
auth::BackendType<'static, ClientCredentials<'static>>,
) {
let node = NodeInfo {
config: compute::ConnCfg::new(),
aux: Default::default(),
allow_self_signed_compute: false,
};
CachedNodeInfo::new_uncached(node)
}
fn helper_create_connect_info(
mechanism: &TestConnectMechanism,
) -> (
CachedNodeInfo,
console::ConsoleReqExtra<'static>,
auth::BackendType<'_, ClientCredentials<'static>>,
) {
let cache = helper_create_cached_node_info();
let cache = CachedNodeInfo::new_uncached(node);
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some("TEST"),
};
let creds = auth::BackendType::Test(mechanism);
let url = "https://TEST_URL".parse().unwrap();
let api = console::provider::mock::Api::new(url);
let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop());
(cache, extra, creds)
}
@@ -447,46 +401,42 @@ fn helper_create_connect_info(
async fn connect_to_compute_success() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
#[tokio::test]
async fn connect_to_compute_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
/// Test that we don't retry if the error is not retryable.
#[tokio::test]
async fn connect_to_compute_non_retry_1() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
}
/// Even for non-retryable errors, we should retry at least once.
#[tokio::test]
async fn connect_to_compute_non_retry_2() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
/// Retry for at most `NUM_RETRIES_CONNECT` times.
@@ -495,36 +445,11 @@ async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 10);
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
/* the 11th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
}
/// Should retry wake compute.
#[tokio::test]
async fn wake_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
}
/// Wake failed with a non-retryable error.
#[tokio::test]
async fn wake_non_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
}

View File

@@ -12,6 +12,9 @@ mod messages;
mod secret;
mod signature;
#[cfg(any(test, doc))]
mod password;
pub use exchange::Exchange;
pub use key::ScramKey;
pub use secret::ServerSecret;
@@ -54,21 +57,27 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
#[cfg(test)]
mod tests {
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
use crate::sasl::{Mechanism, Step};
use super::{Exchange, ServerSecret};
use super::{password::SaltedPassword, Exchange, ServerSecret};
#[test]
fn snapshot() {
fn happy_path() {
let iterations = 4096;
let salt = "QSXCR+Q6sek8bf92";
let stored_key = "FO+9jBb3MUukt6jJnzjPZOWc5ow/Pu6JtPyju0aqaE8=";
let server_key = "qxJ1SbmSAi5EcS0J5Ck/cKAm/+Ixa+Kwp63f4OHDgzo=";
let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",);
let secret = ServerSecret::parse(&secret).unwrap();
let salt_base64 = "QSXCR+Q6sek8bf92";
let pw = SaltedPassword::new(
b"pencil",
base64::decode(salt_base64).unwrap().as_slice(),
iterations,
);
let secret = ServerSecret {
iterations,
salt_base64: salt_base64.to_owned(),
stored_key: pw.client_key().sha256(),
server_key: pw.server_key(),
doomed: false,
};
const NONCE: [u8; 18] = [
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
];
@@ -106,40 +115,4 @@ mod tests {
]
);
}
fn run_round_trip_test(client_password: &str) {
let secret = ServerSecret::build("pencil").unwrap();
let mut exchange = Exchange::new(&secret, rand::random, None);
let mut client =
ScramSha256::new(client_password.as_bytes(), ChannelBinding::unsupported());
let client_first = std::str::from_utf8(client.message()).unwrap();
exchange = match exchange.exchange(client_first).unwrap() {
Step::Continue(exchange, message) => {
client.update(message.as_bytes()).unwrap();
exchange
}
Step::Success(_, _) => panic!("expected continue, got success"),
Step::Failure(f) => panic!("{f}"),
};
let client_final = std::str::from_utf8(client.message()).unwrap();
match exchange.exchange(client_final).unwrap() {
Step::Success(_, message) => client.finish(message.as_bytes()).unwrap(),
Step::Continue(_, _) => panic!("expected success, got continue"),
Step::Failure(f) => panic!("{f}"),
};
}
#[test]
fn round_trip() {
run_round_trip_test("pencil")
}
#[test]
#[should_panic(expected = "password doesn't match")]
fn failure() {
run_round_trip_test("eraser")
}
}

View File

@@ -3,7 +3,7 @@
/// Faithfully taken from PostgreSQL.
pub const SCRAM_KEY_LEN: usize = 32;
/// One of the keys derived from the user's password.
/// One of the keys derived from the [password](super::password::SaltedPassword).
/// We use the same structure for all keys, i.e.
/// `ClientKey`, `StoredKey`, and `ServerKey`.
#[derive(Default, PartialEq, Eq)]

View File

@@ -0,0 +1,74 @@
//! Password hashing routines.
use super::key::ScramKey;
pub const SALTED_PASSWORD_LEN: usize = 32;
/// Salted hashed password is essential for [key](super::key) derivation.
#[repr(transparent)]
pub struct SaltedPassword {
bytes: [u8; SALTED_PASSWORD_LEN],
}
impl SaltedPassword {
/// See `scram-common.c : scram_SaltedPassword` for details.
/// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
pbkdf2::pbkdf2_hmac_array::<sha2::Sha256, 32>(password, salt, iterations).into()
}
/// Derive `ClientKey` from a salted hashed password.
pub fn client_key(&self) -> ScramKey {
super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
}
/// Derive `ServerKey` from a salted hashed password.
pub fn server_key(&self) -> ScramKey {
super::hmac_sha256(&self.bytes, [b"Server Key".as_ref()]).into()
}
}
impl From<[u8; SALTED_PASSWORD_LEN]> for SaltedPassword {
#[inline(always)]
fn from(bytes: [u8; SALTED_PASSWORD_LEN]) -> Self {
Self { bytes }
}
}
#[cfg(test)]
mod tests {
use super::SaltedPassword;
fn legacy_pbkdf2_impl(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
let one = 1_u32.to_be_bytes(); // magic
let mut current = super::super::hmac_sha256(password, [salt, &one]);
let mut result = current;
for _ in 1..iterations {
current = super::super::hmac_sha256(password, [current.as_ref()]);
// TODO: result = current.zip(result).map(|(x, y)| x ^ y), issue #80094
for (i, x) in current.iter().enumerate() {
result[i] ^= x;
}
}
result.into()
}
#[test]
fn pbkdf2() {
let password = "a-very-secure-password";
let salt = "such-a-random-salt";
let iterations = 4096;
let output = [
203, 18, 206, 81, 4, 154, 193, 100, 147, 41, 211, 217, 177, 203, 69, 210, 194, 211,
101, 1, 248, 156, 96, 0, 8, 223, 30, 87, 158, 41, 20, 42,
];
let actual = SaltedPassword::new(password.as_bytes(), salt.as_bytes(), iterations);
let expected = legacy_pbkdf2_impl(password.as_bytes(), salt.as_bytes(), iterations);
assert_eq!(actual.bytes, output);
assert_eq!(actual.bytes, expected.bytes);
}
}

View File

@@ -3,7 +3,7 @@
use super::base64_decode_array;
use super::key::ScramKey;
/// Server secret is produced from user's password,
/// Server secret is produced from [password](super::password::SaltedPassword)
/// and is used throughout the authentication process.
pub struct ServerSecret {
/// Number of iterations for `PBKDF2` function.
@@ -58,10 +58,21 @@ impl ServerSecret {
/// Build a new server secret from the prerequisites.
/// XXX: We only use this function in tests.
#[cfg(test)]
pub fn build(password: &str) -> Option<Self> {
Self::parse(&postgres_protocol::password::scram_sha_256(
password.as_bytes(),
))
pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
// TODO: implement proper password normalization required by the RFC
if !password.is_ascii() {
return None;
}
let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
Some(Self {
iterations,
salt_base64: base64::encode(salt),
stored_key: password.client_key().sha256(),
server_key: password.server_key(),
doomed: false,
})
}
}
@@ -91,4 +102,20 @@ mod tests {
assert_eq!(base64::encode(parsed.stored_key), stored_key);
assert_eq!(base64::encode(parsed.server_key), server_key);
}
#[test]
fn build_scram_secret() {
let salt = b"salt";
let secret = ServerSecret::build("password", salt, 4096).unwrap();
assert_eq!(secret.iterations, 4096);
assert_eq!(secret.salt_base64, base64::encode(salt));
assert_eq!(
base64::encode(secret.stored_key.as_ref()),
"lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
);
assert_eq!(
base64::encode(secret.server_key.as_ref()),
"ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
);
}
}

View File

@@ -234,10 +234,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
listen_pg_addr_tenant_only
);
let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
error!(
"failed to bind to address {}: {}",
listen_pg_addr_tenant_only, e
);
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
Some(listener)

View File

@@ -11,7 +11,6 @@ use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
use crate::{GlobalTimelines, SafeKeeperConf};
use postgres_backend::QueryError;
@@ -46,7 +45,6 @@ enum SafekeeperPostgresCommand {
StartWalPush,
StartReplication { start_lsn: Lsn },
IdentifySystem,
TimelineStatus,
JSONCtrl { cmd: AppendLogicalMessage },
}
@@ -66,8 +64,6 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
} else if cmd.starts_with("IDENTIFY_SYSTEM") {
Ok(SafekeeperPostgresCommand::IdentifySystem)
} else if cmd.starts_with("TIMELINE_STATUS") {
Ok(SafekeeperPostgresCommand::TimelineStatus)
} else if cmd.starts_with("JSON_CTRL") {
let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?;
Ok(SafekeeperPostgresCommand::JSONCtrl {
@@ -82,7 +78,6 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str {
match cmd {
SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH",
SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION",
SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS",
SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM",
SafekeeperPostgresCommand::JSONCtrl { .. } => "JSON_CTRL",
}
@@ -224,7 +219,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
.await
}
SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await,
SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
handle_json_ctrl(self, pgb, cmd).await
}
@@ -269,38 +263,6 @@ impl SafekeeperPostgresHandler {
check_permission(claims, tenant_id)
}
async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
&mut self,
pgb: &mut PostgresBackend<IO>,
) -> Result<(), QueryError> {
// Get timeline, handling "not found" error
let tli = match GlobalTimelines::get(self.ttid) {
Ok(tli) => Ok(Some(tli)),
Err(TimelineError::NotFound(_)) => Ok(None),
Err(e) => Err(QueryError::Other(e.into())),
}?;
// Write row description
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"flush_lsn"),
RowDescriptor::text_col(b"commit_lsn"),
]))?;
// Write row if timeline exists
if let Some(tli) = tli {
let (inmem, _state) = tli.get_state().await;
let flush_lsn = tli.get_flush_lsn().await;
let commit_lsn = inmem.commit_lsn;
pgb.write_message_noflush(&BeMessage::DataRow(&[
Some(flush_lsn.to_string().as_bytes()),
Some(commit_lsn.to_string().as_bytes()),
]))?;
}
pgb.write_message_noflush(&BeMessage::CommandComplete(b"TIMELINE_STATUS"))?;
Ok(())
}
///
/// Handle IDENTIFY_SYSTEM replication command
///

View File

@@ -1,80 +0,0 @@
#! /usr/bin/env python3
# Script to generate ext_index.json metadata file
# that stores content of the control files and location of extension archives
# for all extensions in extensions subdir.
import argparse
import json
import subprocess
from pathlib import Path
"""
# ext_index.json example:
{
"public_extensions": [
"anon"
],
"library_index": {
"anon": "anon",
"kq_imcx": "kq_imcx"
// would be more complicated for something like postgis where multiple library names all map to postgis
},
"extension_data": {
"kq_imcx": {
"control_data": {
"kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n"
},
"archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst"
},
"anon": {
"control_data": {
"anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
},
"archive_path": "5648391853/v15/extensions/anon.tar.zst"
}
}
}
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="generate ext_index.json")
parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version")
parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image")
parser.add_argument("--public_extensions", type=str, help="list of public extensions")
args = parser.parse_args()
pg_version = args.pg_version
BUILD_TAG = args.BUILD_TAG
public_ext_list = args.public_extensions.split(",")
ext_index = {}
library_index = {}
EXT_PATH = Path("extensions")
for extension in EXT_PATH.iterdir():
if extension.is_dir():
control_data = {}
for control_file in extension.glob("*.control"):
if control_file.suffix != ".control":
continue
with open(control_file, "r") as f:
control_data[control_file.name] = f.read()
ext_index[extension.name] = {
"control_data": control_data,
"archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst",
}
elif extension.suffix == ".zst":
file_list = (
str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8")
.strip()
.split("\n")
)
for file in file_list:
if file.endswith(".so") and file.startswith("lib/"):
lib_name = file[4:-3]
library_index[lib_name] = extension.name.replace(".tar.zst", "")
all_data = {
"public_extensions": public_ext_list,
"library_index": library_index,
"extension_data": ext_index,
}
with open("ext_index.json", "w") as f:
json.dump(all_data, f)

View File

@@ -542,7 +542,7 @@ class S3Storage:
access_key: str
secret_key: str
endpoint: Optional[str] = None
prefix_in_bucket: Optional[str] = ""
prefix_in_bucket: Optional[str] = None
def access_env_vars(self) -> Dict[str, str]:
return {
@@ -1504,7 +1504,6 @@ class NeonCli(AbstractNeonCli):
safekeepers: Optional[List[int]] = None,
tenant_id: Optional[TenantId] = None,
lsn: Optional[Lsn] = None,
branch_name: Optional[str] = None,
) -> "subprocess.CompletedProcess[str]":
args = [
"endpoint",
@@ -1518,11 +1517,8 @@ class NeonCli(AbstractNeonCli):
args.append(f"--lsn={lsn}")
args.extend(["--pg-port", str(pg_port)])
args.extend(["--http-port", str(http_port)])
if safekeepers is not None:
args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
if branch_name is not None:
args.extend(["--branch-name", branch_name])
if endpoint_id is not None:
args.append(endpoint_id)

View File

@@ -194,18 +194,14 @@ def wait_for_upload_queue_empty(
def wait_timeline_detail_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
wait_longer: bool = False,
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
):
last_exc = None
iterations = 10 if wait_longer else 2
for _ in range(iterations):
for _ in range(2):
time.sleep(0.250)
try:
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
log.info(f"detail {data}")
log.error(f"detail {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
@@ -220,8 +216,7 @@ def timeline_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3
**delete_args,
):
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)

View File

@@ -61,7 +61,6 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
durations = {
"wait_for_spec_ms": f"{i}_wait_for_spec",
"sync_safekeepers_ms": f"{i}_sync_safekeepers",
"sync_sk_check_ms": f"{i}_sync_sk_check",
"basebackup_ms": f"{i}_basebackup",
"start_postgres_ms": f"{i}_start_postgres",
"config_ms": f"{i}_config",

View File

@@ -123,7 +123,7 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests):
@pytest.mark.parametrize("content_type", [None, "application/json"])
def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]):
"""
For backwards-compatibility: if we send an empty body,
For backwards-compatiblity: if we send an empty body,
the request should be accepted and the config should be the default config.
"""
env = positive_env

View File

@@ -4,7 +4,7 @@ import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Any, List, Optional
from typing import Any, Optional
import pytest
import toml # TODO: replace with tomllib for Python >= 3.11
@@ -14,6 +14,7 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
PortDistributor,
parse_project_git_version_output,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
@@ -62,6 +63,7 @@ def test_create_snapshot(
neon_env_builder.pg_version = pg_version
neon_env_builder.num_safekeepers = 3
neon_env_builder.enable_local_fs_remote_storage()
neon_env_builder.preserve_database_files = True
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start("main")
@@ -257,15 +259,36 @@ def prepare_snapshot(
shutil.rmtree(repo_dir / "pgdatadirs")
os.mkdir(repo_dir / "endpoints")
# Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
# them anymore, but old versions did.
for tenant in (repo_dir / "tenants").glob("*"):
wal_redo_dir = tenant / "wal-redo-datadir.___temp"
if wal_redo_dir.exists() and wal_redo_dir.is_dir():
shutil.rmtree(wal_redo_dir)
# Update paths and ports in config files
pageserver_toml = repo_dir / "pageserver.toml"
pageserver_config = toml.load(pageserver_toml)
pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param])
pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_http_addr"]
)
pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port(
pageserver_config["listen_pg_addr"]
)
# since storage_broker these are overriden by neon_local during pageserver
# start; remove both to prevent unknown options during etcd ->
# storage_broker migration. TODO: remove once broker is released
pageserver_config.pop("broker_endpoint", None)
pageserver_config.pop("broker_endpoints", None)
etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"]
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version
# We don't use authentication in compatibility tests
# so just remove authentication related settings.
# Older pageserver versions had just one `auth_type` setting. Now there
# are separate settings for pg and http ports. We don't use authentication
# in compatibility tests so just remove authentication related settings.
pageserver_config.pop("auth_type", None)
pageserver_config.pop("pg_auth_type", None)
pageserver_config.pop("http_auth_type", None)
@@ -277,16 +300,31 @@ def prepare_snapshot(
snapshot_config_toml = repo_dir / "config"
snapshot_config = toml.load(snapshot_config_toml)
for param in ("listen_http_addr", "listen_pg_addr"):
snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"][param]
)
snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
snapshot_config["broker"]["listen_addr"]
# Provide up/downgrade etcd <-> storage_broker to make forward/backward
# compatibility test happy. TODO: leave only the new part once broker is released.
if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0":
# old etcd version
snapshot_config["etcd_broker"] = {
"etcd_binary_path": shutil.which("etcd"),
"broker_endpoints": etcd_broker_endpoints,
}
snapshot_config.pop("broker", None)
else:
# new storage_broker version
broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}"
snapshot_config["broker"] = {"listen_addr": broker_listen_addr}
snapshot_config.pop("etcd_broker", None)
snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_http_addr"]
)
snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port(
snapshot_config["pageserver"]["listen_pg_addr"]
)
for sk in snapshot_config["safekeepers"]:
for param in ("http_port", "pg_port", "pg_tenant_only_port"):
sk[param] = port_distributor.replace_with_new_port(sk[param])
sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
if pg_distrib_dir:
snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
@@ -312,6 +350,12 @@ def prepare_snapshot(
), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
# get git SHA of neon binary
def get_neon_version(neon_binpath: Path):
out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8")
return parse_project_git_version_output(out)
def check_neon_works(
repo_dir: Path,
neon_target_binpath: Path,
@@ -337,6 +381,7 @@ def check_neon_works(
config.pg_version = pg_version
config.initial_tenant = snapshot_config["default_tenant_id"]
config.pg_distrib_dir = pg_distrib_dir
config.preserve_database_files = True
# Use the "target" binaries to launch the storage nodes
config_target = config
@@ -393,14 +438,6 @@ def check_neon_works(
test_output_dir / "dump-from-wal.filediff",
)
# TODO: Run pg_amcheck unconditionally after the next release
try:
pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
except subprocess.CalledProcessError:
log.info("Extension amcheck is not available, skipping pg_amcheck")
else:
pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
# Check that we can interract with the data
pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
@@ -408,15 +445,10 @@ def check_neon_works(
assert not initial_dump_differs, "initial dump differs"
def dump_differs(
first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None
) -> bool:
def dump_differs(first: Path, second: Path, output: Path) -> bool:
"""
Runs diff(1) command on two SQL dumps and write the output to the given output file.
The function supports allowed diffs, if the diff is in the allowed_diffs list, it's not considered as a difference.
See the example of it in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6
Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False).
Returns True if the dumps differ, False otherwise.
"""
with output.open("w") as stdout:
@@ -434,30 +466,51 @@ def dump_differs(
differs = res.returncode != 0
allowed_diffs = allowed_diffs or []
if differs and len(allowed_diffs) > 0:
for allowed_diff in allowed_diffs:
with tempfile.NamedTemporaryFile(mode="w") as tmp:
tmp.write(allowed_diff)
tmp.flush()
# TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made
if differs:
with tempfile.NamedTemporaryFile(mode="w") as tmp:
tmp.write(PR4425_ALLOWED_DIFF)
tmp.flush()
allowed = subprocess.run(
[
"diff",
"--unified", # Make diff output more readable
r"--ignore-matching-lines=^---", # Ignore diff headers
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
"--ignore-matching-lines=^@@", # Ignore diff blocks location
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
"--ignore-matching-lines=^ --.*", # Ignore SQL comments in diff
"--ignore-blank-lines",
str(output),
str(tmp.name),
],
)
allowed = subprocess.run(
[
"diff",
"--unified", # Make diff output more readable
r"--ignore-matching-lines=^---", # Ignore diff headers
r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers
"--ignore-matching-lines=^@@", # Ignore diff blocks location
"--ignore-matching-lines=^ *$", # Ignore lines with only spaces
"--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14
"--ignore-blank-lines",
str(output),
str(tmp.name),
],
)
differs = allowed.returncode != 0
if not differs:
break
differs = allowed.returncode != 0
return differs
PR4425_ALLOWED_DIFF = """
--- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000
+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000
@@ -13,12 +13,20 @@
CREATE ROLE cloud_admin;
ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS;
+CREATE ROLE neon_superuser;
+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS;
--
-- User Configurations
--
+--
+-- Role memberships
+--
+
+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin;
+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin;
"""

View File

@@ -14,6 +14,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future image layer.*")
env.pageserver.allowed_errors.append(".*found future delta layer.*")
pageserver_http = env.pageserver.http_client()
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test

View File

@@ -16,13 +16,11 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por
endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port
)
branch_name = "migration-check"
env.neon_cli.create_branch(new_branch_name=branch_name)
env.neon_cli.create_branch(new_branch_name="migration_check")
pg_port = port_distributor.get_port()
http_port = port_distributor.get_port()
env.neon_cli.endpoint_start(
f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name
endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port
)
finally:
env.neon_cli.stop()

View File

@@ -690,6 +690,10 @@ def test_ondemand_download_failure_to_replace(
pageserver_http = env.pageserver.http_client()
lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
# remove layers so that they will be redownloaded
pageserver_http.tenant_detach(tenant_id)
pageserver_http.tenant_attach(tenant_id)
@@ -700,10 +704,8 @@ def test_ondemand_download_failure_to_replace(
# requesting details with non-incremental size should trigger a download of the only layer
# this will need to be adjusted if an index for logical sizes is ever implemented
with pytest.raises(PageserverApiException):
# PageserverApiException is expected because of the failpoint (timeline_detail building does something)
# ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping
# but should it be added back, we would wait for 15s here.
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15)
# error message is not useful
pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
actual_message = ".* ERROR .*layermap-replace-notfound"
assert env.pageserver.log_contains(actual_message) is not None

View File

@@ -72,6 +72,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future image layer.*")
env.pageserver.allowed_errors.append(".*found future delta layer.*")
# Use a tiny checkpoint distance, to create a lot of layers quickly.
# That allows us to stress the compaction and layer flushing logic more.
tenant, _ = env.neon_cli.create_tenant(

View File

@@ -1,6 +1,6 @@
import json
import subprocess
from typing import Any, List, Optional, Tuple
from typing import Any, List, Optional
import psycopg2
import pytest
@@ -260,73 +260,3 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"]
assert rows == [["1", "a", "{1,2,3}"]]
def test_sql_over_http_batch(static_proxy: NeonProxy):
static_proxy.safe_psql("create role http with login password 'http' superuser")
def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
response = requests.post(
f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Batch-Isolation-Level": "Serializable",
"Neon-Batch-Read-Only": "true" if read_only else "false",
},
verify=str(static_proxy.test_output_dir / "proxy.crt"),
)
assert response.status_code == 200
return response.json()["results"], response.headers
result, headers = qq(
[
("select 42 as answer", None),
("select $1 as answer", [42]),
("select $1 * 1 as answer", [42]),
("select $1::int[] as answer", [[1, 2, 3]]),
("select $1::json->'a' as answer", [{"a": {"b": 42}}]),
("select * from pg_class limit 1", None),
("create table t(id serial primary key, val int)", None),
("insert into t(val) values (10), (20), (30) returning id", None),
("select * from t", None),
("drop table t", None),
]
)
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
assert headers["Neon-Batch-Read-Only"] == "false"
assert result[0]["rows"] == [{"answer": 42}]
assert result[1]["rows"] == [{"answer": "42"}]
assert result[2]["rows"] == [{"answer": 42}]
assert result[3]["rows"] == [{"answer": [1, 2, 3]}]
assert result[4]["rows"] == [{"answer": {"b": 42}}]
assert len(result[5]["rows"]) == 1
res = result[6]
assert res["command"] == "CREATE"
assert res["rowCount"] is None
res = result[7]
assert res["command"] == "INSERT"
assert res["rowCount"] == 3
assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}]
res = result[8]
assert res["command"] == "SELECT"
assert res["rowCount"] == 3
res = result[9]
assert res["command"] == "DROP"
assert res["rowCount"] is None
assert len(result) == 10
result, headers = qq(
[
("select 42 as answer", None),
],
True,
)
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
assert headers["Neon-Batch-Read-Only"] == "true"
assert result[0]["rows"] == [{"answer": 42}]

View File

@@ -15,6 +15,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
env.pageserver.is_testing_enabled_or_skip()
# These warnings are expected, when the pageserver is restarted abruptly
env.pageserver.allowed_errors.append(".*found future delta layer.*")
env.pageserver.allowed_errors.append(".*found future image layer.*")
# Create a branch for us
env.neon_cli.create_branch("test_pageserver_recovery", "main")

View File

@@ -38,12 +38,6 @@ def test_threshold_based_eviction(
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(metrics_refused_log_line)
# these can happen whenever we run consumption metrics collection
env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled")
env.pageserver.allowed_errors.append(
r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes"
)
tenant_id, timeline_id = env.initial_tenant, env.initial_timeline
assert isinstance(timeline_id, TimelineId)

View File

@@ -1,4 +1,3 @@
import enum
import os
import queue
import shutil
@@ -12,12 +11,9 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
RemoteStorageKind,
S3Storage,
available_remote_storages,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
@@ -121,183 +117,59 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
class Check(enum.Enum):
RETRY_WITHOUT_RESTART = enum.auto()
RETRY_WITH_RESTART = enum.auto()
DELETE_FAILPOINTS = [
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-schedule",
"timeline-delete-before-rm",
"timeline-delete-during-rm",
"timeline-delete-after-rm",
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
"timeline-delete-after-rm-metadata",
"timeline-delete-after-rm-dir",
]
def combinations():
result = []
remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in DELETE_FAILPOINTS:
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
"timeline-delete-before-index-delete",
"timeline-delete-after-index-delete",
):
# the above failpoints are not relevant for config without remote storage
continue
result.append((remote_storage_kind, delete_failpoint))
return result
# cover the two cases: remote storage configured vs not configured
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
@pytest.mark.parametrize("check", list(Check))
def test_delete_timeline_exercise_crash_safety_failpoints(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
failpoint: str,
check: Check,
pg_bin: PgBin,
@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_delete_timeline_post_rm_failure(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
"""
If there is a failure during deletion in one of the associated failpoints (or crash restart happens at this point) the delete operation
should be retryable and should be successfully resumed.
We iterate over failpoints list, changing failpoint to the next one.
1. Set settings to generate many layers
2. Create branch.
3. Insert something
4. Go with the test.
5. Iterate over failpoints
6. Execute delete for each failpoint
7. Ensure failpoint is hit
8. Retry or restart without the failpoint and check the result.
If there is a failure after removing the timeline directory, the delete operation
should be retryable.
"""
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
remote_storage_kind, "test_delete_timeline_post_rm_failure"
)
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "0s",
"compaction_period": "0s",
"checkpoint_distance": f"{1024 ** 2}",
"image_creation_threshold": "100",
}
)
env = neon_env_builder.init_start()
assert env.initial_timeline
env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm")
env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
ps_http = env.pageserver.http_client()
timeline_id = env.neon_cli.create_timeline("delete")
with env.endpoints.create_start("delete") as endpoint:
# generate enough layers
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
if remote_storage_kind is RemoteStorageKind.NOOP:
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id)
else:
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
failpoint_name = "timeline-delete-after-rm"
ps_http.configure_failpoints((failpoint_name, "return"))
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
# It appears when we stopped flush loop during deletion and then pageserver is stopped
env.pageserver.allowed_errors.append(
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
timeline_id=env.initial_timeline,
expected_state="Broken",
iterations=2, # effectively try immediately and retry once in one second
)
# This happens when we fail before scheduling background operation.
# Timeline is left in stopping state and retry tries to stop it again.
# FIXME: #4719
# timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
env.pageserver.allowed_errors.append(at_failpoint_log_message)
env.pageserver.allowed_errors.append(
".*Ignoring new state, equal to the existing one: Stopping"
f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
)
# This happens when we retry delete requests for broken timelines
env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
# This happens when timeline remains are cleaned up during loading
env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*")
# In one of the branches we poll for tenant to become active. Polls can generate this log message:
env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*")
ps_http.configure_failpoints((failpoint, "return"))
# retry without failpoint, it should succeed
ps_http.configure_failpoints((failpoint_name, "off"))
# These failpoints are earlier than background task is spawned.
# so they result in api request failure.
if failpoint in (
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-schedule",
):
with pytest.raises(PageserverApiException, match=failpoint):
ps_http.timeline_delete(env.initial_tenant, timeline_id)
else:
ps_http.timeline_delete(env.initial_tenant, timeline_id)
timeline_info = wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
timeline_id=timeline_id,
expected_state="Broken",
iterations=2, # effectively try immediately and retry once in one second
)
reason = timeline_info["state"]["Broken"]["reason"]
log.info(f"timeline broken: {reason}")
# failpoint may not be the only error in the stack
assert reason.endswith(f"failpoint: {failpoint}"), reason
wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
if check is Check.RETRY_WITH_RESTART:
env.pageserver.stop()
env.pageserver.start()
if failpoint == "timeline-delete-before-index-deleted-at":
# We crashed before persisting this to remote storage, need to retry delete request
# Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading)
wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
else:
# Pageserver should've resumed deletion after restart.
wait_timeline_detail_404(
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
)
elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state
ps_http.configure_failpoints((failpoint, "off"))
timeline_delete_wait_completed(
ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
)
# Check remote is impty
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(env.initial_tenant),
"timelines",
str(timeline_id),
)
),
)
timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
# Check local is empty
assert not timeline_dir.exists()
# Check no delete mark present
assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
# this should succeed
# this also checks that delete can be retried even when timeline is in Broken state
timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline)
env.pageserver.allowed_errors.append(
f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
)
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -455,7 +327,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
timeline_info = wait_until_timeline_state(
wait_until_timeline_state(
pageserver_http=ps_http,
tenant_id=env.initial_tenant,
timeline_id=leaf_timeline_id,
@@ -463,7 +335,8 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
iterations=2, # effectively try immediately and retry once in one second
)
assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm"
# FIXME: #4719
# timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
assert leaf_timeline_path.exists(), "the failpoint didn't work"
@@ -715,7 +588,6 @@ def test_timeline_delete_works_for_remote_smoke(
assert tenant_id == env.initial_tenant
assert main_timeline_id == env.initial_timeline
assert env.initial_timeline is not None
timeline_ids = [env.initial_timeline]
for i in range(2):
branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")

View File

@@ -1,4 +1,4 @@
{
"postgres-v15": "770c6dffc5ef6aac05bf049693877fb377eea6fc",
"postgres-v14": "da3885c34db312afd555802be2ce985fafd1d8ad"
"postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e",
"postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591"
}