Compare commits

...

140 Commits

Author SHA1 Message Date
Anastasia Lubennikova
a4ddad92c1 WIP: half-done refactoring of download_extension logic.
this breaks tests
2023-06-23 16:34:48 +03:00
Alek Westover
fd3dfe9d52 fix typo 2023-06-23 08:04:37 -04:00
Alek Westover
384e3ab1a8 fix code style 2023-06-22 15:10:08 -04:00
Alek Westover
4259464f72 fix typo 2023-06-22 15:07:28 -04:00
Alek Westover
152206211b turn remote extensions off by default 2023-06-22 15:02:51 -04:00
Alek Westover
9c35c06c58 small refactor 2023-06-22 14:24:59 -04:00
Alek Westover
245b4c9d72 remove env vars 2023-06-22 13:48:21 -04:00
Alek Westover
3d0f74fc0c fixing paths 2023-06-22 13:47:47 -04:00
Alek Westover
ce55f70cac added env vars 2023-06-22 12:23:30 -04:00
Alek Westover
053d592ddb fix minor bug 2023-06-22 12:08:28 -04:00
Alek Westover
b85416b58d satisfying mypy 2023-06-22 11:15:33 -04:00
Alek Westover
195838436c Merge branch 'extension_server' of github.com:neondatabase/neon into extension_server 2023-06-22 11:01:04 -04:00
Alek Westover
9313045de6 finished making the configuration optional 2023-06-22 11:00:38 -04:00
Alek Westover
44ac7a45be Merge branch 'main' into extension_server 2023-06-22 10:30:19 -04:00
Alek Westover
e35e8a7dcb None 2023-06-22 10:25:54 -04:00
Alek Westover
a79b0d69c4 made remote_ext_config an optional parameter 2023-06-22 10:21:07 -04:00
Anastasia Lubennikova
2f618f46be Use BUILD_TAG in compute_ctl binary. (#4541)
Pass BUILD_TAG to compute_ctl binary. 
We need it to access versioned extension storage.
2023-06-22 17:06:16 +03:00
Alek Westover
d475e901e5 fix more code style 2023-06-22 09:45:43 -04:00
Alek Westover
bf3b83b504 fix code style for clippy 2023-06-22 09:37:07 -04:00
Alek Westover
94781e8710 remote useless return 2023-06-21 17:26:20 -04:00
Alek Westover
4b83a206bf download extension from rust works 2023-06-21 17:22:21 -04:00
Alek Westover
f984f9e7d3 seems close to working 2023-06-21 15:25:06 -04:00
Alek Westover
6b42464c23 delete useless file 2023-06-21 14:34:37 -04:00
Alek Westover
605c30e5c5 fixed an issue where pgconfig was pointing at global installation of postgres rather than the correct local version 2023-06-21 14:34:24 -04:00
Alek Westover
0b11d8e836 replaced download_files function with more appropriate download_extensions function 2023-06-21 14:01:44 -04:00
Alek Westover
7602483af9 also need separate bucket if real s3 2023-06-21 13:25:35 -04:00
Alek Westover
5e1e859ab8 separate bucket for extensions 2023-06-21 13:18:35 -04:00
Alek Westover
85a7511700 remove extraneous argv passing 2023-06-21 12:42:17 -04:00
Alek Westover
89b8ea132e refactor more 2023-06-21 11:39:24 -04:00
Alek Westover
bfbae98f24 refactor 2023-06-21 11:32:44 -04:00
Alek Westover
02a1d4d8c1 refactoring a bit 2023-06-21 11:32:04 -04:00
Alexander Bayandin
d3aa8a48ea Update client libs for test_runner/pg_clients to their latest versions (#4547)
Resolves https://github.com/neondatabase/neon/security/dependabot/27
2023-06-21 16:20:35 +01:00
Alek Westover
4a35f29301 code style 2023-06-21 11:07:27 -04:00
Alek Westover
559e318328 remote dead imports 2023-06-21 11:01:45 -04:00
Alek Westover
a4d236b02f finishing cleanup debugging 2023-06-21 11:00:36 -04:00
Christian Schwarz
e4da76f021 update_gc_info: fix typo in timeline_id tracing field (#4546)
Commit

```
commit 472cc17b7a
Author: Dmitry Rodionov <dmitry@neon.tech>
Date:   Thu Jun 15 17:30:12 2023 +0300

    propagate lock guard to background deletion task (#4495)
```

did a drive-by fix, but, the drive-by had a typo.

```
gc_loop{tenant_id=2e2f2bff091b258ac22a4c4dd39bd25d}:update_gc_info{timline_id=837c688fd37c903639b9aa0a6dd3f1f1}:download_remote_layer{layer=000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000024DA0D1-000000000443FB51}:panic{thread=background op worker location=pageserver/src/tenant/timeline.rs:4843:25}: missing extractors: ["TimelineId"]

Stack backtrace:
   0: utils::logging::tracing_panic_hook
             at /libs/utils/src/logging.rs:166:21
   1: <alloc::boxed::Box<F,A> as core::ops::function::Fn<Args>>::call
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/alloc/src/boxed.rs:2002:9
   2: std::panicking::rust_panic_with_hook
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:692:13
   3: std::panicking::begin_panic_handler::{{closure}}
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:579:13
   4: std::sys_common::backtrace::__rust_end_short_backtrace
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/sys_common/backtrace.rs:137:18
   5: rust_begin_unwind
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/std/src/panicking.rs:575:5
   6: core::panicking::panic_fmt
             at /rustc/9eb3afe9ebe9c7d2b84b71002d44f4a0edac95e0/library/core/src/panicking.rs:64:14
   7: pageserver::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id
             at /pageserver/src/tenant/timeline.rs:4843:25
   8: <pageserver::tenant::timeline::Timeline>::download_remote_layer::{closure#0}::{closure#0}
             at /pageserver/src/tenant/timeline.rs:4368:9
   9: <tracing::instrument::Instrumented<<pageserver::tenant::timeline::Timeline>::download_remote_layer::{closure#0}::{closure#0}> as core::future::future::Future>::poll
             at /.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9
  10: <pageserver::tenant::timeline::Timeline>::download_remote_layer::{closure#0}
             at /pageserver/src/tenant/timeline.rs:4363:5
  11: <pageserver::tenant::timeline::Timeline>::get_reconstruct_data::{closure#0}
             at /pageserver/src/tenant/timeline.rs:2618:69
  12: <pageserver::tenant::timeline::Timeline>::get::{closure#0}
             at /pageserver/src/tenant/timeline.rs:565:13
  13: <pageserver::tenant::timeline::Timeline>::list_slru_segments::{closure#0}
             at /pageserver/src/pgdatadir_mapping.rs:427:42
  14: <pageserver::tenant::timeline::Timeline>::is_latest_commit_timestamp_ge_than::{closure#0}
             at /pageserver/src/pgdatadir_mapping.rs:390:13
  15: <pageserver::tenant::timeline::Timeline>::find_lsn_for_timestamp::{closure#0}
             at /pageserver/src/pgdatadir_mapping.rs:338:17
  16: <pageserver::tenant::timeline::Timeline>::update_gc_info::{closure#0}::{closure#0}
             at /pageserver/src/tenant/timeline.rs:3967:71
  17: <tracing::instrument::Instrumented<<pageserver::tenant::timeline::Timeline>::update_gc_info::{closure#0}::{closure#0}> as core::future::future::Future>::poll
             at /.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9
  18: <pageserver::tenant::timeline::Timeline>::update_gc_info::{closure#0}
             at /pageserver/src/tenant/timeline.rs:3948:5
  19: <pageserver::tenant::Tenant>::refresh_gc_info_internal::{closure#0}
             at /pageserver/src/tenant.rs:2687:21
  20: <pageserver::tenant::Tenant>::gc_iteration_internal::{closure#0}
             at /pageserver/src/tenant.rs:2551:13
  21: <pageserver::tenant::Tenant>::gc_iteration::{closure#0}
             at /pageserver/src/tenant.rs:1490:13
  22: pageserver::tenant::tasks::gc_loop::{closure#0}::{closure#0}
             at /pageserver/src/tenant/tasks.rs:187:21
  23: pageserver::tenant::tasks::gc_loop::{closure#0}
             at /pageserver/src/tenant/tasks.rs:208:5
```

## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
2023-06-21 18:00:14 +03:00
Alek Westover
8b9f72e117 removing debugging 2023-06-21 10:51:35 -04:00
Alek Westover
bb414e5a0a removing debugging 2023-06-21 10:45:37 -04:00
Alek Westover
32c03bc784 cleaning up some comments 2023-06-21 10:32:28 -04:00
Alek Westover
c99e203094 I think it's working 2023-06-21 10:10:02 -04:00
Christian Schwarz
870740c949 cargo update -p openssl (#4542)
To unblock release
https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054

Context: https://rustsec.org/advisories/RUSTSEC-2023-0044
2023-06-21 15:50:52 +03:00
Dmitry Rodionov
75d583c04a Tenant::load: fix uninit timeline marker processing (#4458)
## Problem

During timeline creation we create special mark file which presense
indicates that initialization didnt complete successfully. In case of a
crash restart we can remove such half-initialized timeline and following
retry from control plane side should perform another attempt.

So in case of a possible crash restart during initial loading we have
following picture:

```
timelines
| - <timeline_id>___uninit
| - <timeline_id>
| - | <timeline files>
```

We call `std::fs::read_dir` to walk files in `timelines` directory one
by one. If we see uninit file
we proceed with deletion of both, timeline directory and uninit file. If
we see timeline we check if uninit file exists and do the same cleanup.
But in fact its possible to get both branches to be true at the same
time. Result of readdir doesnt reflect following directory state
modifications. So you can still get "valid" entry on the next iteration
of the loop despite the fact that it was deleted in one of the previous
iterations of the loop.

To see that you can apply the following patch (it disables uninit mark
cleanup on successful timeline creation):
```diff
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 4beb2664..b3cdad8f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -224,11 +224,6 @@ impl UninitializedTimeline<'_> {
                             )
                         })?;
                 }
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
-                    )
-                })?;
                 v.insert(Arc::clone(&new_timeline));
 
                 new_timeline.maybe_spawn_flush_loop();
```
And perform the following steps:

```bash
neon_local init
neon_local start
neon_local tenant create
neon_local stop
neon_local start
```

The error is:
```log
INFO load{tenant_id=X}:blocking: Found an uninit mark file .neon/tenants/X/timelines/Y.___uninit, removing the timeline and its uninit mark
2023-06-09T18:43:41.664247Z ERROR load{tenant_id=X}: load failed, setting tenant state to Broken: failed to load metadata

Caused by:
    0: Failed to read metadata bytes from path .neon/tenants/X/timelines/Y/metadata
    1: No such file or directory (os error 2)
```

So uninit mark got deleted together with timeline directory but we still
got directory entry for it and tried to load it.

The bug prevented tenant from being successfully loaded.

## Summary of changes

Ideally I think we shouldnt place uninit marks in the same directory as timeline directories but move them to separate directory and
gather them as an input to actual listing, but that would be sort of an
on-disk format change, so just check whether entries are still valid
before operating on them.
2023-06-21 14:25:58 +03:00
Alek Westover
e7b9259675 more debugging, didn't find the problem 2023-06-20 22:48:43 -04:00
Alek Westover
356f7d3a7e more debugging 2023-06-20 22:39:02 -04:00
Alek Westover
0f6b05337e fixed minor issue with merge 2023-06-20 22:08:07 -04:00
Alek Westover
2e81d280c8 delete comment 2023-06-20 21:50:52 -04:00
Alek Westover
f9700c8bb9 Merge branch 'main' into extension_server 2023-06-20 21:49:41 -04:00
Alek Westover
e6137d45d2 Merge branch 'main' into extension_server 2023-06-20 21:36:15 -04:00
Alek Westover
ab1d903600 remote extra files 2023-06-20 21:28:00 -04:00
Alek Westover
bfd670b9a7 fixed an issue with the wrong path 2023-06-20 21:24:51 -04:00
Alek Westover
5e96ab43ea more debugginh 2023-06-20 20:02:24 -04:00
Alek Westover
890061d371 arg passing is mostly working 2023-06-20 19:35:13 -04:00
Alek Westover
b4c5beff9f list_files function in remote_storage (#4522) 2023-06-20 15:36:28 -04:00
bojanserafimov
90e1f629e8 Add test for skip_pg_catalog_updates (#4530) 2023-06-20 11:38:59 -04:00
Alek Westover
6b74d1a76a partils 2023-06-19 15:25:53 -04:00
Alek Westover
2023e22ed3 Add RelationError error type to pageserver rather than string parsing error messages (#4508) 2023-06-19 13:14:20 -04:00
Christian Schwarz
036fda392f log timings for compact_level0_phase1 (#4527)
The data will help decide whether it's ok
to keep holding Timeline::layers in shared mode until
after we've calculated the holes.

Other timings are to understand the general breakdown
of timings in that function.

Context: https://github.com/neondatabase/neon/issues/4492
2023-06-19 17:25:57 +03:00
Arseny Sher
557abc18f3 Fix test_s3_wal_replay assertion flakiness.
Supposedly fixes https://github.com/neondatabase/neon/issues/4277
2023-06-19 16:08:20 +04:00
Arseny Sher
3b06a5bc54 Raise pageserver walreceiver timeouts.
I observe sporadic reconnections with ~10k idle computes. It looks like a
separate issue, probably walreceiver runtime gets blocked somewhere, but in any
case 2-3 seconds is too small.
2023-06-19 15:59:38 +04:00
Alek Westover
a936b8a92b add ext cli args 2023-06-16 17:05:39 -04:00
Alek Westover
c7bea52849 adding command line argument 2023-06-16 16:58:13 -04:00
Alek Westover
1b7ab6d468 successfully upload and download the test_load extension 2023-06-16 16:51:08 -04:00
Alek Westover
e07d5d00e9 actually write correct data 2023-06-16 16:20:05 -04:00
Alek Westover
15d3d007eb added several imports so that extension_server compiles 2023-06-16 16:09:02 -04:00
Alek Westover
77157c7741 merge tiny suggestion 2023-06-16 15:49:18 -04:00
Alek Westover
b9b1b3596c started working on the tests, ran into some issues 2023-06-16 15:47:49 -04:00
Alexander Bayandin
1b947fc8af test_runner: workaround rerunfailures and timeout incompatibility (#4469)
## Problem

`pytest-timeout` and `pytest-rerunfailures` are incompatible (or rather
not fully compatible). Timeouts aren't set for reruns.

Ref https://github.com/pytest-dev/pytest-rerunfailures/issues/99

## Summary of changes
- Dynamically make timeouts `func_only` for tests that we're going to
retry. It applies timeouts for reruns as well.
2023-06-16 18:08:11 +01:00
Christian Schwarz
78082d0b9f create_delta_layer: avoid needless stat (#4489)
We already do it inside `frozen_layer.write_to_disk()`.

Context:
https://github.com/neondatabase/neon/pull/4441#discussion_r1228083959
2023-06-16 16:54:41 +02:00
Alexander Bayandin
190c3ba610 Add tags for releases (#4524)
## Problem

It's not a trivial task to find corresponding changes for a particular
release (for example, for 3371 — 🤷)

Ref:
https://neondb.slack.com/archives/C04BLQ4LW7K/p1686761537607649?thread_ts=1686736854.174559&cid=C04BLQ4LW7K

## Summary of changes
- Tag releases
- Add a manual trigger for the release workflow
2023-06-16 14:17:37 +01:00
Christian Schwarz
14d495ae14 create_delta_layer: improve misleading TODO comment (#4488)
Context:
https://github.com/neondatabase/neon/pull/4441#discussion_r1228086608
2023-06-16 14:23:55 +03:00
Alek Westover
91a809332f Update libs/remote_storage/src/s3_bucket.rs
Co-authored-by: Alex Chi Z. <iskyzh@gmail.com>
2023-06-15 13:00:07 -04:00
Dmitry Rodionov
472cc17b7a propagate lock guard to background deletion task (#4495)
## Problem

1. During the rollout we got a panic: "timeline that we were deleting
was concurrently removed from 'timelines' map" that was caused by lock
guard not being propagated to the background part of the deletion.
Existing test didnt catch it because failpoint that was used for
verification was placed earlier prior to background task spawning.
2. When looking at surrounding code one more bug was detected. We
removed timeline from the map before deletion is finished, which breaks
client retry logic, because it will indicate 404 before actual deletion
is completed which can lead to client stopping its retry poll earlier.

## Summary of changes

1. Carry the lock guard over to background deletion. Ensure existing
test case fails without applied patch (second deletion becomes stuck
without it, which eventually leads to a test failure).
2. Move delete_all call earlier so timeline is removed from the map is
the last thing done during deletion.

Additionally I've added timeline_id to the `update_gc_info` span,
because `debug_assert_current_span_has_tenant_and_timeline_id` in
`download_remote_layer` was firing when `update_gc_info` lead to
on-demand downloads via `find_lsn_for_timestamp` (caught by @problame).
This is not directly related to the PR but fixes possible flakiness.

Another smaller set of changes involves deletion wrapper used in python
tests. Now there is a simpler wrapper that waits for deletions to
complete `timeline_delete_wait_completed`. Most of the
test_delete_timeline.py tests make negative tests, i.e., "does
ps_http.timeline_delete() fail in this and that scenario".
These can be left alone. Other places when we actually do the deletions,
we need to use the helper that polls for completion.

Discussion
https://neondb.slack.com/archives/C03F5SM1N02/p1686668007396639

resolves #4496

---------

Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-06-15 17:30:12 +03:00
Alek Westover
214ecacfc4 clippy 2023-06-15 10:20:01 -04:00
Arthur Petukhovsky
76413a0fb8 Revert reconnect_timeout to improve performance (#4512)
Default value for `wal_acceptor_reconnect_timeout` was changed in
https://github.com/neondatabase/neon/pull/4428 and it affected
performance up to 20% in some cases.

Revert the value back.
2023-06-15 15:26:59 +03:00
Alexander Bayandin
e60b70b475 Fix data ingestion scripts (#4515)
## Problem

When I switched `psycopg2.connect` from context manager to a regular
function call in https://github.com/neondatabase/neon/pull/4382 I 
embarrassingly forgot about commit, so it doesn't really put data into DB 😞

## Summary of changes
- Enable autocommit for data ingestion scripts
2023-06-15 15:01:06 +03:00
Alex Chi Z
2252c5c282 metrics: convert some metrics to pageserver-level (#4490)
## Problem

Some metrics are better to be observed at page-server level. Otherwise,
as we have a lot of tenants in production, we cannot do a sum b/c
Prometheus has limit on how many time series we can aggregate. This also
helps reduce metrics scraping size.

## Summary of changes

Some integration tests are likely not to pass as it will check the
existence of some metrics. Waiting for CI complete and fix them.

Metrics downgraded: page cache hit (where we are likely to have a
page-server level page cache in the future instead of per-tenant), and
reconstruct time (this would better be tenant-level, as we have one pg
replayer for each tenant, but now we make it page-server level as we do
not need that fine-grained data).

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-14 17:12:34 -04:00
Alek Westover
7465c644b9 started adding a test 2023-06-14 14:54:36 -04:00
Alexander Bayandin
94f315d490 Remove neon-image-depot job (#4506)
## Problem

`neon-image-depot` is an experimental job we use to compare with the
main `neon-image` job.
But it's not stable and right now we don't have the capacity to properly
fix and evaluate it.
We can come back to this later.

## Summary of changes

Remove `neon-image-depot` job
2023-06-14 19:03:09 +01:00
Christian Schwarz
cd3faa8c0c test_basic_eviction: avoid some sources of flakiness (#4504)
We've seen the download_layer() call return 304 in prod because of a
spurious on-demand download caused by a GetPage request from compute.

Avoid these and some other sources of on-demand downloads by shutting
down compute, SKs, and by disabling background loops.

CF
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4498/5258914461/index.html#suites/2599693fa27db8427603ba822bcf2a20/357808fd552fede3
2023-06-14 19:04:22 +02:00
Arthur Petukhovsky
a7a0c3cd27 Invalidate proxy cache in http-over-sql (#4500)
HTTP queries failed with errors `error connecting to server: failed to
lookup address information: Name or service not known\n\nCaused by:\n
failed to lookup address information: Name or service not known`

The fix reused cache invalidation logic in proxy from usual postgres
connections and added it to HTTP-over-SQL queries.

Also removed a timeout for HTTP request, because it almost never worked
on staging (50s+ time just to start the compute), and we can have the
similar case in production. Should be ok, since we have a limits for the
requests and responses.
2023-06-14 19:24:46 +03:00
Dmitry Rodionov
ee9a5bae43 Filter only active timelines for compaction (#4487)
Previously we may've included Stopping/Broken timelines here, which
leads to errors in logs -> causes tests to sporadically fail

resolves #4467
2023-06-14 19:07:42 +03:00
Alek Westover
bb931f2ce0 alek: add download files changes 2023-06-14 10:50:25 -04:00
Alexander Bayandin
9484b96d7c GitHub Autocomment: do not fail the job (#4478)
## Problem

If the script fails to generate a test summary, the step also fails the
job/workflow (despite this could be a non-fatal problem).

## Summary of changes
- Separate JSON parsing and summarisation into separate functions
- Wrap functions calling into try..catch block, add an error message to
GitHub comment and do not fail the step
- Make `scripts/comment-test-report.js` a CLI script that can be run
locally (mock GitHub calls) to make it easier to debug issues locally
2023-06-14 15:07:30 +01:00
Shany Pozin
ebee8247b5 Move s3 delete_objects to use chunks of 1000 OIDs (#4463)
See
https://github.com/neondatabase/neon/pull/4461#pullrequestreview-1474240712
2023-06-14 15:38:01 +03:00
bojanserafimov
3164ad7052 compute_ctl: Spec parser forward compatibility test (#4494) 2023-06-13 21:48:09 -04:00
Alexander Bayandin
a0b3990411 Retry data ingestion scripts on connection errors (#4382)
## Problem

From time to time, we're catching a race condition when trying to upload
perf or regression test results.

Ref:
- https://neondb.slack.com/archives/C03H1K0PGKH/p1685462717870759
- https://github.com/neondatabase/cloud/issues/3686

## Summary of changes

Wrap `psycopg2.connect` method with `@backoff.on_exception`
contextmanager
2023-06-13 22:33:42 +01:00
Stas Kelvich
4385e0c291 Return more RowDescription fields via proxy json endpoint
As we aim to align client-side behavior with node-postgres, it's necessary for us to return these fields, because node-postgres does so as well.
2023-06-13 22:31:18 +03:00
Christian Schwarz
3693d1f431 turn Timeline::layers into tokio::sync::RwLock (#4441)
This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`).

# Full Stack Of Preliminary PRs

Thanks to the countless preliminary PRs, this conversion is relatively
straight-forward.

1. Clean-ups
  * https://github.com/neondatabase/neon/pull/4316
  * https://github.com/neondatabase/neon/pull/4317
  * https://github.com/neondatabase/neon/pull/4318
  * https://github.com/neondatabase/neon/pull/4319
  * https://github.com/neondatabase/neon/pull/4321
* Note: these were mostly to find an alternative to #4291, which I
   thought we'd need in my original plan where we would need to convert
   `Tenant::timelines` into an async locking primitive (#4333). In reviews,
   we walked away from that, but these cleanups were still quite useful.
2. https://github.com/neondatabase/neon/pull/4364
3. https://github.com/neondatabase/neon/pull/4472
4. https://github.com/neondatabase/neon/pull/4476
5. https://github.com/neondatabase/neon/pull/4477
6. https://github.com/neondatabase/neon/pull/4485

# Significant Changes In This PR

## `compact_level0_phase1` & `create_delta_layer`

This commit partially reverts

   "pgserver: spawn_blocking in compaction (#4265)"
    4e359db4c7.

Specifically, it reverts the `spawn_blocking`-ificiation of
`compact_level0_phase1`.
If we didn't revert it, we'd have to use `Timeline::layers.blocking_read()`
inside `compact_level0_phase1`. That would use up a thread in the
`spawn_blocking` thread pool, which is hard-capped.

I considered wrapping the code that follows the second
`layers.read().await` into `spawn_blocking`, but there are lifetime
issues with `deltas_to_compact`.

Also, this PR switches the `create_delta_layer` _function_ back to
async, and uses `spawn_blocking` inside to run the code that does sync
IO, while keeping the code that needs to lock `Timeline::layers` async.

## `LayerIter` and `LayerKeyIter` `Send` bounds

I had to add a `Send` bound on the `dyn` type that `LayerIter`
and `LayerKeyIter` wrap. Why? Because we now have the second
`layers.read().await` inside `compact_level0_phase`, and these
iterator instances are held across that await-point.

More background:
https://github.com/neondatabase/neon/pull/4462#issuecomment-1587376960

## `DatadirModification::flush`

Needed to replace the `HashMap::retain` with a hand-rolled variant
because `TimelineWriter::put` is now async.
2023-06-13 18:38:41 +02:00
Anastasia Lubennikova
8013f9630d code cleanup 2023-06-13 18:23:15 +02:00
Anastasia Lubennikova
34f22e9b12 Request extension files from compute_ctl 2023-06-13 16:52:37 +02:00
Christian Schwarz
fdf7a67ed2 init_empty_layer_map: use try_write (#4485)
This is preliminary work for/from #4220 (async
`Layer::get_value_reconstruct_data`).
Or more specifically, #4441, where we turn Timeline::layers into a
tokio::sync::RwLock.

By using try_write() here, we can avoid turning init_empty_layer_map
async,
which is nice because much of its transitive call(er) graph isn't async.
2023-06-13 13:49:40 +02:00
Alexey Kondratov
1299df87d2 [compute_ctl] Fix logging if catalog updates are skipped (#4480)
Otherwise, it wasn't clear from the log when Postgres started up
completely if catalog updates were skipped.

Follow-up for 4936ab6
2023-06-13 13:34:56 +02:00
Christian Schwarz
754ceaefac make TimelineWriter Send by using tokio::sync Mutex internally (#4477)
This is preliminary work for/from #4220 (async
`Layer::get_value_reconstruct_data`).

There, we want to switch `Timeline::layers` to be a
`tokio::sync::RwLock`.

That will require the `TimelineWriter` to become async, because at times
its functions need to lock `Timeline::layers` in order to freeze the
open layer.

While doing that, rustc complains that we're now holding
`Timeline::write_lock` across await points (lock order is that
`write_lock` must be acquired before `Timelines::layers`).

So, we need to switch it over to an async primitive.
2023-06-13 10:15:25 +02:00
Arseny Sher
143fa0da42 Remove timeout on test_close_on_connections_exit
We have 300s timeout on all tests, and doubling logic in popen.wait sometimes
exceeds 5s, making the test flaky.

ref https://github.com/neondatabase/neon/issues/4211
2023-06-13 06:26:03 +04:00
bojanserafimov
4936ab6842 compute_ctl: add flag to avoid config step (#4457)
Add backwards-compatible flag that cplane can use to speed up startup time
2023-06-12 13:57:02 -04:00
Christian Schwarz
939593d0d3 refactor check_checkpoint_distance to prepare for async Timeline::layers (#4476)
This is preliminary work for/from #4220 (async
`Layer::get_value_reconstruct_data`).

There, we want to switch `Timeline::layers` to be a
`tokio::sync::RwLock`.

That will require the `TimelineWriter` to become async.

That will require `freeze_inmem_layer` to become async.

So, inside check_checkpoint_distance, we will have
`freeze_inmem_layer().await`.

But current rustc isn't smart enough to understand that we
`drop(layers)` earlier, and hence, will complain about the `!Send`
`layers` being held across the `freeze_inmem_layer().await`-point.

This patch puts the guard into a scope, so rustc will shut up in the
next patch where we make the transition for `TimelineWriter`.

obsoletes https://github.com/neondatabase/neon/pull/4474
2023-06-12 17:45:56 +01:00
Christian Schwarz
2011cc05cd make Delta{Value,Key}Iter Send (#4472)
... by switching the internal RwLock to a OnceCell.

This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`).

See https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
for more context.

fixes https://github.com/neondatabase/neon/issues/4471
2023-06-12 17:45:56 +01:00
Arthur Petukhovsky
b0286e3c46 Always truncate WAL after restart (#4464)
c058e1cec2 skipped `truncate_wal()` it if `write_lsn` is equal to
truncation position, but didn't took into account that `write_lsn` is
reset on restart.

Fixes regression looking like:

```
ERROR WAL acceptor{cid=22 ...}:panic{thread=WAL acceptor 19b6c1743666ec02991a7633c57178db/b07db8c88f4c76ea5ed0954c04cc1e74 location=safekeeper/src/wal_storage.rs:230:13}: unexpected write into non-partial segment file
```

This fix will prevent skipping WAL truncation when we are running for
the first time after restart.
2023-06-12 13:42:28 +00:00
Matthias van de Meent
4f3f817384 Amendments to the RFC 2023-06-12 14:09:20 +02:00
Matthias van de Meent
b50475b567 Add RFC 024 - Extension Loading 2023-06-12 14:09:20 +02:00
Heikki Linnakangas
e4f05ce0a2 Enable sanity check that disk_consistent_lsn is valid on created timeline.
Commit `create_test_timeline: always put@initdb_lsn the minimum required keys`
already switched us over to using valid initdb_lsns.

All that's left to do is to actually flush the minimum keys so that
we move from disk_consistent_lsn=Lsn(0) to disk_consistent_lsn=initdb_lsn.

Co-authored-by: Christian Schwarz <christian@neon.tech>

Part of https://github.com/neondatabase/neon/pull/4364
2023-06-12 11:56:49 +01:00
Heikki Linnakangas
8d106708d7 Clean up timeline initialization code.
Clarify who's responsible for initializing the layer map. There were
previously two different ways to do it:

- create_empty_timeline and bootstrap_timeline let prepare_timeline()
  initialize an empty layer map.

- branch_timeline passed a flag to initialize_with_lock() to tell
  initialize_with_lock to call load_layer_map(). Because it was a
  newly created timeline, load_layer_map() never found any layer
  files, so it just initialized an empty layer map.

With this commit, prepare_new_timeline() always does it. The LSN to
initialize it with is passed as argument.

Other changes per function:

prepare_timeline:
- rename to 'prepare_new_timeline' to make it clear that it's only used
  when creating a new timeline, not when loading an existing timeline
- always initialize an empty layer map. The caller can pass the LSN to
  initialize it with. (Previously, prepare_timeline would optionally
  load the layer map at 'initdb_lsn'. Some caller used that, while others
  let initialize_with_lock do it

initialize_with_lock:
- As mentioned above, remove the option to load the layer map
- Acquire the 'timelines' lock in the function itself. None of the callers
  did any other work while holding the lock.
- Rename it to finish_creation() to make its intent more clear. It's only
  used when creating a new timeline now.

create_timeline_data:
- Rename to create_timeline_struct() for clarity. It just initializes
  the Timeline struct, not any other "data"

create_timeline_files:
- use create_dir rather than create_dir_all, to be a little more strict.
  We know that the parent directory should already exist, and the timeline
  directory should not exist.
- Move the call to create_timeline_struct() to the caller. It was just
  being "passed through"

Part of https://github.com/neondatabase/neon/pull/4364
2023-06-12 11:56:49 +01:00
Christian Schwarz
f450369b20 timeline_init_and_sync: don't hold Tenant::timelines while load_layer_map
This patch inlines `initialize_with_lock` and then reorganizes the code
such that we can `load_layer_map` without holding the
`Tenant::timelines` lock.

As a nice aside, we can get rid of the dummy() uninit mark, which has
always been a terrible hack.

Part of https://github.com/neondatabase/neon/pull/4364
2023-06-12 11:56:49 +01:00
Christian Schwarz
aad918fb56 create_test_timeline: tests for put@initdb_lsn optimization code 2023-06-12 11:04:49 +01:00
Christian Schwarz
86dd8c96d3 add infrastructure to expect use of initdb_lsn flush optimization 2023-06-12 11:04:49 +01:00
Christian Schwarz
6a65c4a4fe create_test_timeline: always put@initdb_lsn the minimum required keys (#4451)
See the added comment on `create_empty_timeline`.

The various test cases now need to set a valid `Lsn` instead of
`Lsn(0)`.

Rough context:
https://github.com/neondatabase/neon/pull/4364#discussion_r1221995691
2023-06-12 09:28:34 +00:00
Vadim Kharitonov
e9072ee178 Compile rdkit (#4442)
`rdkit` extension

```
postgres=# create extension rdkit;
CREATE EXTENSION
postgres=# select 'c1[o,s]ncn1'::qmol;
    qmol
-------------
 c1[o,s]ncn1
(1 row)
```
2023-06-12 11:13:33 +02:00
Joonas Koivunen
7e17979d7a feat: http request logging on safekeepers.
With RequestSpan, successfull GETs are not logged, but all others, errors and
warns on cancellations are.
2023-06-11 22:53:08 +04:00
Arseny Sher
227271ccad Switch safekeepers to async.
This is a full switch, fs io operations are also tokio ones, working through
thread pool. Similar to pageserver, we have multiple runtimes for easier `top`
usage and isolation.

Notable points:
- Now that guts of safekeeper.rs are full of .await's, we need to be very
  careful not to drop task at random point, leaving timeline in unclear
  state. Currently the only writer is walreceiver and we don't have top
  level cancellation there, so we are good. But to be safe probably we should
  add a fuse panicking if task is being dropped while operation on a timeline
  is in progress.
- Timeline lock is Tokio one now, as we do disk IO under it.
- Collecting metrics got a crutch: since prometheus Collector is
  synchronous, it spawns a thread with current thread runtime collecting data.
- Anything involving closures becomes significantly more complicated, as
  async fns are already kinda closures + 'async closures are unstable'.
- Main thread now tracks other main tasks, which got much easier.
- The only sync place left is initial data loading, as otherwise clippy
  complains on timeline map lock being held across await points -- which is
  not bad here as it happens only in single threaded runtime of main thread.
  But having it sync doesn't hurt either.

I'm concerned about performance of thread pool io offloading, async traits and
many await points; but we can try and see how it goes.

fixes https://github.com/neondatabase/neon/issues/3036
fixes https://github.com/neondatabase/neon/issues/3966
2023-06-11 22:53:08 +04:00
dependabot[bot]
fbf0367e27 build(deps): bump cryptography from 39.0.1 to 41.0.0 (#4409) 2023-06-11 19:14:30 +01:00
Arthur Petukhovsky
a21b55fe0b Use connect_timeout for broker::connect (#4452)
Use `storage_broker::connect` everywhere. Add a default 5 seconds
timeout for opening new connection.
2023-06-09 17:38:53 +03:00
Shany Pozin
add51e1372 Add delete_objects to storage api (#4449)
## Summary of changes
Add missing delete_objects API to support bulk deletes
2023-06-09 13:23:12 +03:00
Alex Chi Z
cdce04d721 pgserver: add local manifest for atomic operation (#4422)
## Problem

Part of https://github.com/neondatabase/neon/issues/4418

## Summary of changes

This PR implements the local manifest interfaces. After the refactor of
timeline is done, we can integrate this with the current storage. The
reader will stop at the first corrupted record.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
2023-06-08 19:34:25 -04:00
bojanserafimov
6bac770811 Add cold start test (#4436) 2023-06-08 18:11:33 -04:00
Stas Kelvich
c82d19d8d6 Fix NULLs handling in proxy json endpoint
There were few problems with null handling:

* query_raw_txt() accepted vector of string so it always (erroneously)
treated "null" as a string instead of null. Change rust pg client
to accept the vector of Option<String> instead of just Strings. Adopt
coding here to pass nulls as None.

* pg_text_to_json() had a check that always interpreted "NULL" string
as null. That is wrong and nulls were already handled by match None.
This bug appeared as a bad attempt to parse arrays containing NULL
elements. Fix coding by checking presence of quotes while parsing an
array (no quotes -> null, quoted -> "null" string).

Array parser fix also slightly changes behavior by always cleaning
current entry when pushing to the resulting vector. This seems to be
an omission by previous coding, however looks like it was harmless
as entry was not cleared only at the end of the nested or to-level
array.
2023-06-08 16:00:18 +03:00
Stas Kelvich
d73639646e Add more output options to proxy json endpoint
With this commit client can pass following optional headers:

`Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement it's own parsing or reuse parsing libraries from e.g. node-postgres.

`Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge
cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
2023-06-08 16:00:18 +03:00
Dmitry Rodionov
d53f9ab3eb delete timelines from s3 (#4384)
Delete data from s3 when timeline deletion is requested

## Summary of changes

UploadQueue is altered to support scheduling of delete operations in
stopped state. This looks weird, and I'm thinking whether there are
better options/refactorings for upload client to make it look better.

Probably can be part of https://github.com/neondatabase/neon/issues/4378

Deletion is implemented directly in existing endpoint because changes are not
that significant. If we want more safety we can separate those or create
feature flag for new behavior.

resolves [#4193](https://github.com/neondatabase/neon/issues/4193)

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
2023-06-08 15:01:22 +03:00
Dmitry Rodionov
8560a98d68 fix openapi spec to pass swagger editor validation (#4445)
There shouldnt be a dash before `type: object`. Also added description.
2023-06-08 13:25:30 +03:00
Alex Chi Z
2e687bca5b refactor: use LayerDesc in layer map (part 1) (#4408)
## Problem

part of https://github.com/neondatabase/neon/issues/4392

## Summary of changes

This PR adds a new HashMap that maps persistent layer desc to the layer
object *inside* LayerMap. Originally I directly went towards adding such
layer cache in Timeline, but the changes are too many and cannot be
reviewed as a reasonably-sized PR. Therefore, we take this intermediate
step to change part of the codebase to use persistent layer desc, and
come up with other PRs to move this hash map of layer desc to the
timeline struct.

Also, file_size is now part of the layer desc.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
2023-06-07 18:28:18 +03:00
Dmitry Rodionov
1a1019990a map TenantState::Broken to TenantAttachmentStatus::Failed (#4371)
## Problem

Attach failures are not reported in public part of the api (in
`attachment_status` field of TenantInfo).

## Summary of changes

Expose TenantState::Broken as TenantAttachmentStatus::Failed

In the way its written Failed status will be reported even if no
attachment happened. (I e if tenant become broken on startup). This is
in line with other members. I e Active will be resolved to Attached even
if no actual attach took place.

This can be tweaked if needed. At the current stage it would be overengineering without clear motivation

resolves #4344
2023-06-07 18:25:30 +03:00
Alex Chi Z
1c200bd15f fix: break dev dependencies between wal_craft and pg_ffi (#4424)
## Problem

close https://github.com/neondatabase/neon/issues/4266

## Summary of changes

With this PR, rust-analyzer should be able to give lints and auto
complete in `mod tests`, and this makes writing tests easier.
Previously, rust-analyzer cannot do auto completion.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
2023-06-07 17:51:13 +03:00
Arseny Sher
37bf2cac4f Persist safekeeper control file once in a while.
It should make remote_consistent_lsn commonly up-to-date on non actively writing
projects, which removes spike or pageserver -> safekeeper reconnections on
storage nodes restart.
2023-06-07 17:23:37 +04:00
Joonas Koivunen
5761190e0d feat: three phased startup order (#4399)
Initial logical size calculation could still hinder our fast startup
efforts in #4397. See #4183. In deployment of 2023-06-06
about a 200 initial logical sizes were calculated on hosts which
took the longest to complete initial load (12s).

Implements the three step/tier initialization ordering described in
#4397:
1. load local tenants
2. do initial logical sizes per walreceivers for 10s
3. background tasks

Ordering is controlled by:
- waiting on `utils::completion::Barrier`s on background tasks
- having one attempt for each Timeline to do initial logical size
calculation
- `pageserver/src/bin/pageserver.rs` releasing background jobs after
timeout or completion of initial logical size calculation

The timeout is there just to safeguard in case a legitimate non-broken
timeline initial logical size calculation goes long. The timeout is
configurable, by default 10s, which I think would be fine for production
systems. In the test cases I've been looking at, it seems that these
steps are completed as fast as possible.

Co-authored-by: Christian Schwarz <christian@neon.tech>
2023-06-07 14:29:23 +03:00
Vadim Kharitonov
88f0cfc575 Fix pgx_ulid extension (#4431)
The issue was in the wrong `control` file name
2023-06-07 11:41:53 +02:00
Arseny Sher
6b3c020cd9 Don't warn on system id = 0 in walproposer greeting.
sync-safekeepers doesn't know it and sends 0.
2023-06-07 12:39:20 +04:00
Arseny Sher
c058e1cec2 Quick exit in truncate_wal if nothing to do.
ref https://github.com/neondatabase/neon/issues/4414
2023-06-07 12:39:20 +04:00
Arseny Sher
dc6a382873 Increase timeouts on compute -> sk connections.
context: https://github.com/neondatabase/neon/issues/4414

And improve messages/comments here and there.
2023-06-07 12:39:20 +04:00
Heikki Linnakangas
df3bae2ce3 Use compute_ctl to manage Postgres in tests. (#3886)
This adds test coverage for 'compute_ctl', as it is now used by all
the python tests.
    
There are a few differences in how 'compute_ctl' is called in the
tests, compared to the real web console:
    
- In the tests, the postgresql.conf file is included as one large
  string in the spec file, and it is written out as it is to the data
  directory.  I added a new field for that to the spec file. The real
  web console, however, sets all the necessary settings in the
  'settings' field, and 'compute_ctl' creates the postgresql.conf from
  those settings.

- In the tests, the information needed to connect to the storage, i.e.
  tenant_id, timeline_id, connection strings to pageserver and
  safekeepers, are now passed as new fields in the spec file. The real
  web console includes them as the GUCs in the 'settings' field. (Both
  of these are different from what the test control plane used to do:
  It used to write the GUCs directly in the postgresql.conf file). The
  plan is to change the control plane to use the new method, and
  remove the old method, but for now, support both.

Some tests that were sensitive to the amount of WAL generated needed
small changes, to accommodate that compute_ctl runs the background
health monitor which makes a few small updates. Also some tests shut
down the pageserver, and now that the background health check can run
some queries while the pageserver is down, that can produce a few
extra errors in the logs, which needed to be allowlisted.

Other changes:
- remove obsolete comments about PostgresNode;
- create standby.signal file for Static compute node;
- log output of `compute_ctl` and `postgres` is merged into
`endpoints/compute.log`.

---------

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
2023-06-06 14:59:36 +01:00
Joonas Koivunen
0cef7e977d refactor: just one way to shutdown a tenant (#4407)
We have 2 ways of tenant shutdown, we should have just one.

Changes are mostly mechanical simple refactorings.

Added `warn!` on the "shutdown all remaining tasks" should trigger test
failures in the between time of not having solved the "tenant/timeline
owns all spawned tasks" issue.

Cc: #4327.
2023-06-06 15:30:55 +03:00
Joonas Koivunen
18a9d47f8e test: restore NotConnected being allowed globally (#4426)
Flakyness introduced by #4402 evidence [^1].

I had assumed the NotConnected would had been an expected io error, but
it's not. Restore the global `allowed_error`.

[^1]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4407/5185897757/index.html#suites/82004ab4e3720b47bf78f312dabe7c55/14f636d0ecd3939d/
2023-06-06 13:51:39 +03:00
Sasha Krassovsky
ac11e7c32d Remove arch-specific stuff from HNSW extension (#4423) 2023-06-05 22:04:15 -08:00
Konstantin Knizhnik
8e1b5e1224 Remove -ftree-vectorizer-verbose=0 option notrecognized by MaxOS/X c… (#4412)
…ompiler

## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
2023-06-05 20:10:19 +03:00
Joonas Koivunen
e0bd81ce1f test: fix flaky warning on attach (#4415)
added the `allowed_error` to the `positive_env` so any tests completing
the attach are allowed have this print out. they are allowed to do so,
because the `random_init_delay` can produce close to zero and thus the
first run will be near attach. Though... Unsure if we ever really need
the eviction task to run **before** it can evict something, as in after
20min or 24h.

in the failed test case however period is 20s so interesting that we
didn't run into this sooner.

evidence of flaky:
https://github.com/neondatabase/neon/actions/runs/5175677035/jobs/9323705929?pr=4399#step:4:38536
2023-06-05 18:12:58 +03:00
Joonas Koivunen
77598f5d0a Better walreceiver logging (#4402)
walreceiver logs are a bit hard to understand because of partial span
usage, extra messages, ignored errors popping up as huge stacktraces.

Fixes #3330 (by spans, also demote info -> debug).

- arrange walreceivers spans into a hiearchy: 
    - `wal_connection_manager{tenant_id, timeline_id}` ->
      `connection{node_id}` -> `poller`
- unifies the error reporting inside `wal_receiver`:
- All ok errors are now `walreceiver connection handling ended: {e:#}`
- All unknown errors are still stacktraceful task_mgr reported errors
  with context `walreceiver connection handling failure`
- Remove `connect` special casing, was: `DB connection stream finished`
  for ok errors
- Remove `done replicating` special casing, was `Replication stream
  finished` for ok errors
- lowered log levels for (non-exhaustive list):
    - `WAL receiver manager started, connecting to broker` (at startup)
    - `WAL receiver shutdown requested, shutting down` (at shutdown)
    - `Connection manager loop ended, shutting down` (at shutdown)
    - `sender is dropped while join handle is still alive` (at lucky
      shutdown, see #2885)
    - `timeline entered terminal state {:?}, stopping wal connection manager
      loop` (at shutdown)
    - `connected!` (at startup)
- `Walreceiver db connection closed` (at disconnects?, was without span)
    - `Connection cancelled` (at shutdown, was without span)
- `observed timeline state change, new state is {new_state:?}` (never
  after Timeline::activate was made infallible)
- changed:
    - `Timeline dropped state updates sender, stopping wal connection
      manager loop`
    - was out of date; sender is not dropped but `Broken | Stopping` state
      transition
        - also made `debug!`
    - `Timeline dropped state updates sender before becoming active,
      stopping wal connection manager loop`
    - was out of date: sender is again not dropped but `Broken | Stopping`
      state transition
        - also made `debug!`
- log fixes:
    - stop double reporting panics via JoinError
2023-06-05 17:35:23 +03:00
Joonas Koivunen
8142edda01 test: Less flaky gc (#4416)
Solves a flaky test error in the wild[^1] by:

- Make the gc shutdown signal reading an `allowed_error`
- Note the gc shutdown signal readings as being in `allowed_error`s
- Allow passing tenant conf to init_start to avoid unncessary tenants

[^1]:
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4399/5176432780/index.html#suites/b97efae3a617afb71cb8142f5afa5224/2cd76021ea011f93
2023-06-05 15:43:52 +03:00
Vadim Kharitonov
b9871158ba Compile PGX ULID extension (#4413)
Create pgx_ulid extension

```
postgres=# create extension ulid;
CREATE EXTENSION
postgres=# CREATE TABLE users (
  id ulid NOT NULL DEFAULT gen_ulid() PRIMARY KEY,
  name text NOT NULL
);
CREATE TABLE
postgres=# insert into users (name) values ('vadim');
INSERT 0 1
postgres=# select * from users;
             id             | name
----------------------------+-------
 01H25DDG3KYMYZTNR41X38E256 | vadim
```
2023-06-05 12:52:13 +03:00
Joonas Koivunen
8caef2c0c5 fix: delay eviction_task as well (#4397)
As seen on deployment of 2023-06-01 release, times were improving but
there were some outliers caused by:
- timelines `eviction_task` starting while activating and running
imitation
- timelines `initial logical size` calculation

This PR fixes it so that `eviction_task` is delayed like other
background tasks fixing an oversight from earlier #4372.

After this PR activation will be two phases:
1. load and activate tenants AND calculate some initial logical sizes
2. rest of initial logical sizes AND background tasks
- compaction, gc, disk usage based eviction, timelines `eviction_task`,
consumption metrics
2023-06-05 09:37:53 +03:00
Konstantin Knizhnik
04542826be Add HNSW extension (#4227)
## Describe your changes

Port HNSW implementation for ANN search top Postgres 

## Issue ticket number and link

https://www.pinecone.io/learn/hnsw

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above
checklist
2023-06-04 11:41:38 +03:00
bojanserafimov
4ba950a35a Add libcurl as dependency to readme (#4405) 2023-06-02 18:07:45 -04:00
Joonas Koivunen
a55c663848 chore: comment marker fixes (#4406)
Upgrading to rust 1.70 will require these.
2023-06-02 21:03:12 +03:00
164 changed files with 8502 additions and 3413 deletions

View File

@@ -264,7 +264,7 @@ jobs:
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
- name: Install rust binaries
run: |
@@ -623,51 +623,6 @@ jobs:
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
neon-image-depot:
# For testing this will run side-by-side for a few merges.
# This action is not really optimized yet, but gets the job done
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
permissions:
contents: read
id-token: write
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Setup go
uses: actions/setup-go@v3
with:
go-version: '1.19'
- name: Set up Depot CLI
uses: depot/setup-action@v1
- name: Install Crane & ECR helper
run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
- name: Configure ECR login
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
- name: Build and push
uses: depot/build-push-action@v1
with:
# if no depot.json file is at the root of your repo, you must specify the project id
project: nrdv0s4kcs
push: true
tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
build-args: |
GIT_VERSION=${{ github.sha }}
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
compute-tools-image:
runs-on: [ self-hosted, gen3, large ]
needs: [ tag ]
@@ -704,6 +659,7 @@ jobs:
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
--context .
--build-arg GIT_VERSION=${{ github.sha }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-tools
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
@@ -761,6 +717,7 @@ jobs:
--context .
--build-arg GIT_VERSION=${{ github.sha }}
--build-arg PG_VERSION=${{ matrix.version }}
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
--dockerfile Dockerfile.compute-node
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
@@ -959,6 +916,20 @@ jobs:
exit 1
fi
- name: Create tag "release-${{ needs.tag.outputs.build-tag }}"
if: github.ref_name == 'release'
uses: actions/github-script@v6
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
github.rest.git.createRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: "refs/tags/release-${{ needs.tag.outputs.build-tag }}",
sha: context.sha,
})
promote-compatibility-data:
runs-on: [ self-hosted, gen3, small ]
container:

View File

@@ -3,6 +3,7 @@ name: Create Release Branch
on:
schedule:
- cron: '0 10 * * 2'
workflow_dispatch:
jobs:
create_release_branch:

24
Cargo.lock generated
View File

@@ -924,12 +924,14 @@ dependencies = [
"opentelemetry",
"postgres",
"regex",
"remote_storage",
"reqwest",
"serde",
"serde_json",
"tar",
"tokio",
"tokio-postgres",
"toml_edit",
"tracing",
"tracing-opentelemetry",
"tracing-subscriber",
@@ -997,6 +999,7 @@ dependencies = [
"tar",
"thiserror",
"toml",
"tracing",
"url",
"utils",
"workspace_hack",
@@ -2349,9 +2352,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.52"
version = "0.10.55"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
dependencies = [
"bitflags",
"cfg-if",
@@ -2381,9 +2384,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.87"
version = "0.9.90"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
dependencies = [
"cc",
"libc",
@@ -2770,7 +2773,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2783,7 +2786,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"native-tls",
"tokio",
@@ -2794,7 +2797,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -2812,7 +2815,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"bytes",
"fallible-iterator",
@@ -2874,7 +2877,6 @@ dependencies = [
"serde",
"thiserror",
"utils",
"wal_craft",
"workspace_hack",
]
@@ -4273,7 +4275,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
dependencies = [
"async-trait",
"byteorder",
@@ -4894,7 +4896,9 @@ dependencies = [
"once_cell",
"postgres",
"postgres_ffi",
"regex",
"tempfile",
"utils",
"workspace_hack",
]

View File

@@ -9,7 +9,20 @@ members = [
"storage_broker",
"workspace_hack",
"trace",
"libs/*",
"libs/compute_api",
"libs/pageserver_api",
"libs/postgres_ffi",
"libs/safekeeper_api",
"libs/utils",
"libs/consumption_metrics",
"libs/postgres_backend",
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
"libs/postgres_ffi/wal_craft",
]
[workspace.package]
@@ -127,11 +140,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
## Other git libraries
@@ -167,7 +180,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,

View File

@@ -2,6 +2,7 @@ ARG PG_VERSION
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG
#########################################################################################
#
@@ -67,7 +68,7 @@ RUN apt update && \
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
@@ -95,7 +96,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake .. && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
@@ -355,7 +356,7 @@ RUN apt-get update && \
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
cd build && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make install -j $(getconf _NPROCESSORS_ONLN) && \
@@ -410,7 +411,7 @@ RUN apt-get update && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake .. && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
@@ -432,6 +433,54 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
#########################################################################################
#
# Layer "rdkit-pg-build"
# compile rdkit extension
#
#########################################################################################
FROM build-deps AS rdkit-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y \
cmake \
libboost-iostreams1.74-dev \
libboost-regex1.74-dev \
libboost-serialization1.74-dev \
libboost-system1.74-dev \
libeigen3-dev \
libfreetype6-dev
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
cmake \
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
-D RDK_BUILD_INCHI_SUPPORT=ON \
-D RDK_BUILD_AVALON_SUPPORT=ON \
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
-D RDK_BUILD_DESCRIPTORS3D=OFF \
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
-D RDK_USE_URF=OFF \
-D RDK_BUILD_PGSQL=ON \
-D RDK_PGSQL_STATIC=ON \
-D PostgreSQL_CONFIG=pg_config \
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
-D RDK_INSTALL_INTREE=OFF \
-D CMAKE_BUILD_TYPE=Release \
. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
#########################################################################################
#
# Layer "rust extensions"
@@ -517,6 +566,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
#########################################################################################
#
# Layer "pg-pgx-ulid-build"
# Compile "pgx_ulid" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-pgx-ulid-build
RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
cargo pgx install --release && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -547,6 +612,8 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -556,6 +623,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon_utils \
-s install && \
make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/hnsw \
-s install
#########################################################################################
@@ -564,6 +635,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
#
#########################################################################################
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
@@ -616,14 +690,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
# libzstd1 for zstd
# libboost*, libfreetype6, and zlib1g for rdkit
RUN apt update && \
apt install --no-install-recommends -y \
gdb \
locales \
libicu67 \
liblz4-1 \
libreadline8 \
libboost-iostreams1.74.0 \
libboost-regex1.74.0 \
libboost-serialization1.74.0 \
libboost-system1.74.0 \
libossp-uuid16 \
libfreetype6 \
libgeos-c1v5 \
libgdal28 \
libproj19 \
@@ -633,7 +712,9 @@ RUN apt update && \
libxslt1.1 \
libzstd1 \
libcurl4-openssl-dev \
procps && \
locales \
procps \
zlib1g && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -3,6 +3,7 @@
ARG REPOSITORY=neondatabase
ARG IMAGE=rust
ARG TAG=pinned
ARG BUILD_TAG
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
@@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
ARG CACHEPOT_BUCKET=neon-github-dev
#ARG AWS_ACCESS_KEY_ID
#ARG AWS_SECRET_ACCESS_KEY
ARG BUILD_TAG
ENV BUILD_TAG=$BUILD_TAG
COPY . .

View File

@@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-%
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
+@echo "Compiling hnsw $*"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$*
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install
.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
@@ -153,6 +158,9 @@ neon-pg-ext-clean-%:
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
-C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \
-f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean
.PHONY: neon-pg-ext
neon-pg-ext: \

View File

@@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler
libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \
libcurl4-openssl-dev
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel
protobuf-devel libcurl-devel
```
* On Arch based systems, these packages are needed:
```bash
pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf
postgresql-libs cmake postgresql protobuf curl
```
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).

View File

@@ -30,3 +30,5 @@ url.workspace = true
compute_api.workspace = true
utils.workspace = true
workspace_hack.workspace = true
toml_edit.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }

View File

@@ -27,7 +27,8 @@
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres
//! -b /usr/local/bin/postgres \
//! -r {"bucket": "my-bucket", "region": "eu-central-1"}
//! ```
//!
use std::collections::HashMap;
@@ -48,17 +49,47 @@ use compute_api::responses::ComputeStatus;
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
use compute_tools::configurator::launch_configurator;
use compute_tools::extension_server::{
download_extension, get_availiable_extensions, init_remote_storage,
};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use tokio::runtime::Runtime;
const BUILD_TAG_DEFAULT: &str = "local";
fn main() -> Result<()> {
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
let matches = cli().get_matches();
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
info!("build_tag: {build_tag}");
let matches = cli().get_matches();
let pgbin_default = String::from("postgres");
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
let ext_remote_storage = match remote_ext_config {
Some(x) => Some(init_remote_storage(x)?),
None => None,
};
let rt = Runtime::new().unwrap();
let copy_remote_storage = ext_remote_storage.clone();
// rt.block_on(async move {
// download_extension(&copy_remote_storage, ExtensionType::Shared, pgbin)
// .await
// .expect("download extension should work");
// });
let http_port = *matches
.get_one::<u16>("http-port")
.expect("http-port is required");
let pgdata = matches
.get_one::<String>("pgdata")
.expect("PGDATA path is required");
@@ -119,9 +150,6 @@ fn main() -> Result<()> {
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
let spec;
let mut live_config_allowed = false;
match spec_json {
@@ -173,12 +201,31 @@ fn main() -> Result<()> {
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
ext_remote_storage,
availiable_extensions: Vec::new(),
availiable_libraries: Vec::new(),
};
let compute = Arc::new(compute_node);
// Launch http service first, so we were able to serve control-plane
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
let _http_handle =
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
let extension_server_port: u16 = http_port;
// exen before we have spec, we can get public availiable extensions
// TODO turn get_availiable_extensions() & other functions into ComputeNode method,
// we pass to many params from it anyways..
compute_node.availiable_extensions = get_availiable_extensions(
ext_remote_storage,
pg_version, //TODO
pgbin,
None,
);
// TODO same for libraries
if !spec_set {
// No spec provided, hang waiting for it.
@@ -217,10 +264,21 @@ fn main() -> Result<()> {
let _configurator_handle =
launch_configurator(&compute).expect("cannot launch configurator thread");
// download private tenant extensions before postgres start
// TODO
// compute_node.availiable_extensions = get_availiable_extensions(ext_remote_storage,
// pg_version, //TODO
// pgbin,
// tenant_id); //TODO get tenant_id from spec
// download preload shared libraries before postgres start (if any)
// TODO
// download_library_file();
// Start Postgres
let mut delay_exit = false;
let mut exit_code = None;
let pg = match compute.start_compute() {
let pg = match compute.start_compute(extension_server_port) {
Ok(pg) => Some(pg),
Err(err) => {
error!("could not start the compute node: {:?}", err);
@@ -286,6 +344,14 @@ fn cli() -> clap::Command {
let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown");
clap::Command::new("compute_ctl")
.version(version)
.arg(
Arg::new("http-port")
.long("http-port")
.value_name("HTTP_PORT")
.default_value("3080")
.value_parser(clap::value_parser!(u16))
.required(false),
)
.arg(
Arg::new("connstr")
.short('C')
@@ -331,6 +397,12 @@ fn cli() -> clap::Command {
.long("control-plane-uri")
.value_name("CONTROL_PLANE_API_BASE_URI"),
)
.arg(
Arg::new("remote-ext-config")
.short('r')
.long("remote-ext-config")
.value_name("REMOTE_EXT_CONFIG"),
)
}
#[test]

View File

@@ -1,19 +1,3 @@
//
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
// but there are several things that makes `PostgresNode` usage inconvenient in the
// cloud:
// - it inherits from `LocalEnv`, which contains **all-all** the information about
// a complete service running
// - it uses `PageServerNode` with information about http endpoint, which we do not
// need in the cloud again
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
//
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
// attributes (not required for the cloud). Yet, it is still tempting to unify these
// `PostgresNode` and `ComputeNode` and use one in both places.
//
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
//
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
@@ -32,6 +16,8 @@ use utils::lsn::Lsn;
use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeMode, ComputeSpec};
use remote_storage::{GenericRemoteStorage, RemotePath};
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
@@ -61,6 +47,10 @@ pub struct ComputeNode {
pub state: Mutex<ComputeState>,
/// `Condvar` to allow notifying waiters about state changes.
pub state_changed: Condvar,
/// S3 extensions configuration variables
pub ext_remote_storage: Option<GenericRemoteStorage>,
pub availiable_extensions: Vec<RemotePath>,
pub availiable_libraries: Vec<RemotePath>,
}
#[derive(Clone, Debug)]
@@ -106,26 +96,38 @@ pub struct ParsedSpec {
impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = String;
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
// Extract the options from the spec file that are needed to connect to
// the storage system.
//
// For backwards-compatibility, the top-level fields in the spec file
// may be empty. In that case, we need to dig them from the GUCs in the
// cluster.settings field.
let pageserver_connstr = spec
.cluster
.settings
.find("neon.pageserver_connstring")
.pageserver_connstring
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = spec
.cluster
.settings
.find("neon.tenant_id")
.ok_or("tenant id should be provided")
.map(|s| TenantId::from_str(&s))?
.or(Err("invalid tenant id"))?;
let timeline_id: TimelineId = spec
.cluster
.settings
.find("neon.timeline_id")
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?;
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
} else {
spec.cluster
.settings
.find("neon.tenant_id")
.ok_or("tenant id should be provided")
.map(|s| TenantId::from_str(&s))?
.or(Err("invalid tenant id"))?
};
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
timeline_id
} else {
spec.cluster
.settings
.find("neon.timeline_id")
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?
};
Ok(ParsedSpec {
spec,
@@ -249,14 +251,22 @@ impl ComputeNode {
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip(self, compute_state))]
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
pub fn prepare_pgdata(
&self,
compute_state: &ComputeState,
extension_server_port: u16,
) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
let spec = &pspec.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
config::write_postgres_conf(
&pgdata_path.join("postgresql.conf"),
&pspec.spec,
Some(extension_server_port),
)?;
// Syncing safekeepers is only safe with primary nodes: if a primary
// is already connected it will be kicked out, so a secondary (standby)
@@ -295,8 +305,8 @@ impl ComputeNode {
update_pg_hba(pgdata_path)?;
match spec.mode {
ComputeMode::Primary | ComputeMode::Static(..) => {}
ComputeMode::Replica => {
ComputeMode::Primary => {}
ComputeMode::Replica | ComputeMode::Static(..) => {
add_standby_signal(pgdata_path)?;
}
}
@@ -374,11 +384,6 @@ impl ComputeNode {
// 'Close' connection
drop(client);
info!(
"finished configuration of compute for project {}",
spec.cluster.cluster_id
);
Ok(())
}
@@ -399,7 +404,7 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
self.pg_reload_conf(&mut client)?;
@@ -429,24 +434,24 @@ impl ComputeNode {
}
#[instrument(skip(self))]
pub fn start_compute(&self) -> Result<std::process::Child> {
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
let compute_state = self.state.lock().unwrap().clone();
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
spec.spec.cluster.cluster_id,
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
spec.tenant_id,
spec.timeline_id,
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
pspec.tenant_id,
pspec.timeline_id,
);
self.prepare_pgdata(&compute_state)?;
self.prepare_pgdata(&compute_state, extension_server_port)?;
let start_time = Utc::now();
let pg = self.start_postgres(spec.storage_auth_token.clone())?;
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
if spec.spec.mode == ComputeMode::Primary {
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
self.apply_config(&compute_state)?;
}
@@ -466,6 +471,11 @@ impl ComputeNode {
}
self.set_status(ComputeStatus::Running);
info!(
"finished configuration of compute for project {}",
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
);
Ok(pg)
}

View File

@@ -5,6 +5,7 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec};
@@ -32,14 +33,52 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
}
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
pub fn write_postgres_conf(
path: &Path,
spec: &ComputeSpec,
extension_server_port: Option<u16>,
) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut file = File::create(path)?;
writeln!(file, "# Managed by compute_ctl: begin")?;
// Write the postgresql.conf content from the spec file as is.
if let Some(conf) = &spec.cluster.postgresql_conf {
writeln!(file, "{}", conf)?;
}
write!(file, "{}", &spec.cluster.settings.as_pg_settings())?;
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {
writeln!(
file,
"neon.pageserver_connstring='{}'",
escape_conf_value(s)
)?;
}
if !spec.safekeeper_connstrings.is_empty() {
writeln!(
file,
"neon.safekeepers='{}'",
escape_conf_value(&spec.safekeeper_connstrings.join(","))
)?;
}
if let Some(s) = &spec.tenant_id {
writeln!(
file,
"neon.tenant_id='{}'",
escape_conf_value(&s.to_string())
)?;
}
if let Some(s) = &spec.timeline_id {
writeln!(
file,
"neon.timeline_id='{}'",
escape_conf_value(&s.to_string())
)?;
}
match spec.mode {
ComputeMode::Primary => {}
ComputeMode::Static(lsn) => {
@@ -53,7 +92,16 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
}
}
writeln!(file, "# Managed by compute_ctl: end")?;
// If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;
write!(file, "{}", spec.cluster.settings.as_pg_settings())?;
writeln!(file, "# Managed by compute_ctl: end")?;
}
if let Some(port) = extension_server_port {
writeln!(file, "neon.extension_server_port={}", port)?;
}
Ok(())
}

View File

@@ -0,0 +1,182 @@
use anyhow::{self, bail, Result};
use remote_storage::*;
use serde_json::{self, Value};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::num::{NonZeroU32, NonZeroUsize};
use std::path::{Path, PathBuf};
use std::str;
use tokio::io::AsyncReadExt;
use tracing::info;
use utils::id::TenantId;
fn get_pg_config(argument: &str, pgbin: &str) -> String {
// gives the result of `pg_config [argument]`
// where argument is a flag like `--version` or `--sharedir`
let pgconfig = pgbin.replace("postgres", "pg_config");
let config_output = std::process::Command::new(pgconfig)
.arg(argument)
.output()
.expect("pg_config error");
std::str::from_utf8(&config_output.stdout)
.expect("pg_config error")
.trim()
.to_string()
}
fn get_pg_version(pgbin: &str) -> String {
// pg_config --version returns a (platform specific) human readable string
// such as "PostgreSQL 15.4". We parse this to v14/v15
let human_version = get_pg_config("--version", pgbin);
if human_version.contains("v15") {
return "v15".to_string();
}
"v14".to_string()
}
async fn download_helper(
remote_storage: &GenericRemoteStorage,
remote_from_path: &RemotePath,
download_location: &Path,
) -> anyhow::Result<()> {
// downloads file at remote_from_path to download_location/[file_name]
let local_path = download_location.join(remote_from_path.object_name().expect("bad object"));
info!(
"Downloading {:?} to location {:?}",
&remote_from_path, &local_path
);
let mut download = remote_storage.download(remote_from_path).await?;
let mut write_data_buffer = Vec::new();
download
.download_stream
.read_to_end(&mut write_data_buffer)
.await?;
let mut output_file = BufWriter::new(File::create(local_path)?);
output_file.write_all(&write_data_buffer)?;
Ok(())
}
// download extension control files
//
// return list of all extension files to use it in the future searches
//
// if tenant_id is provided - search in a private per-tenant extension path,
// otherwise - in public extension path
//
pub async fn get_availiable_extensions(
remote_storage: &GenericRemoteStorage,
pg_version: &str,
pgbin: &str,
tenant_id: Option<TenantId>,
) -> anyhow::Result<Vec<RemotePath>> {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
let remote_sharedir = match tenant_id {
None => RemotePath::new(&Path::new(&pg_version).join("share/postgresql/extension"))?,
Some(tenant_id) => RemotePath::new(
&Path::new(&pg_version)
.join(&tenant_id.to_string())
.join("share/postgresql/extension"),
)?,
};
let from_paths = remote_storage.list_files(Some(&remote_sharedir)).await?;
// download all found control files
for remote_from_path in &from_paths {
if remote_from_path.extension() == Some("control") {
download_helper(remote_storage, &remote_from_path, &local_sharedir).await?;
}
}
Ok(from_paths)
}
// download all sql files for a given extension name
//
pub async fn download_extension_sql_files(
ext_name: &str,
availiable_extensions: Vec<RemotePath>,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
) -> Result<()> {
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
// check if extension files exist
let files_to_download: Vec<&RemotePath> = availiable_extensions
.iter()
.filter(|ext| {
ext.extension() == Some("sql") && ext.object_name().unwrap().starts_with(ext_name)
})
.collect();
if files_to_download.is_empty() {
bail!("Files for extension {ext_name} are not found in the extension store");
}
for remote_from_path in files_to_download {
download_helper(remote_storage, &remote_from_path, &local_sharedir).await?;
}
Ok(())
}
// download shared library file
pub async fn download_library_file(
lib_name: &str,
availiable_libraries: Vec<RemotePath>,
remote_storage: &GenericRemoteStorage,
pgbin: &str,
) -> Result<()> {
let local_libdir: PathBuf = Path::new(&get_pg_config("--libdir", pgbin)).into();
// check if the library file exists
let lib = availiable_libraries
.iter()
.find(|lib: &&RemotePath| lib.object_name().unwrap() == lib_name);
match lib {
None => bail!("Shared library file {lib_name} is not found in the extension store"),
Some(lib) => {
download_helper(remote_storage, &lib, &local_libdir).await?;
}
}
Ok(())
}
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
let remote_ext_bucket = match &remote_ext_config["bucket"] {
Value::String(x) => x,
_ => bail!("remote_ext_config missing bucket"),
};
let remote_ext_region = match &remote_ext_config["region"] {
Value::String(x) => x,
_ => bail!("remote_ext_config missing region"),
};
let remote_ext_endpoint = match &remote_ext_config["endpoint"] {
Value::String(x) => Some(x.clone()),
_ => None,
};
let remote_ext_prefix = match &remote_ext_config["prefix"] {
Value::String(x) => Some(x.clone()),
_ => None,
};
// load will not be large, so default parameters are fine
let config = S3Config {
bucket_name: remote_ext_bucket.to_string(),
bucket_region: remote_ext_region.to_string(),
prefix_in_bucket: remote_ext_prefix,
endpoint: remote_ext_endpoint,
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
max_keys_per_list_response: None,
};
let config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
storage: RemoteStorageKind::AwsS3(config),
};
GenericRemoteStorage::from_config(&config)
}

View File

@@ -16,6 +16,8 @@ use tokio::task;
use tracing::{error, info};
use tracing_utils::http::OtelName;
use crate::extension_server::{download_extension_sql_files, download_library_file};
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
ComputeStatusResponse {
start_time: state.start_time,
@@ -121,8 +123,68 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
}
}
// download extension files from S3 on demand
(&Method::POST, route) if route.starts_with("/extension_server/") => {
info!("serving {:?} POST request", route);
let is_library = false;
let filename = route.split('/').last().unwrap();
info!(
"serving /extension_server POST request, filename: {:?}",
filename
);
if compute.ext_remote_storage.is_none() {
error!("Remote extension storage is not set up");
let mut resp = Response::new(Body::from("Remote extension storage is not set up"));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
return resp;
}
let ext_storage = &compute.ext_remote_storage.unwrap();
if !is_library {
match download_extension_sql_files(
filename,
&compute.availiable_extensions,
&ext_storage,
&compute.pgbin,
)
.await
{
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("extension download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
} else {
match download_library_file(
filename,
&compute.availiable_libraries,
&ext_storage,
&compute.pgbin,
)
.await
{
Ok(_) => Response::new(Body::from("OK")),
Err(e) => {
error!("library download failed: {}", e);
let mut resp = Response::new(Body::from(e.to_string()));
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
resp
}
}
}
}
// Return the `404 Not Found` for any other routes.
_ => {
method => {
info!("404 Not Found for {:?}", method);
let mut not_found = Response::new(Body::from("404 Not Found"));
*not_found.status_mut() = StatusCode::NOT_FOUND;
not_found
@@ -220,8 +282,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(state: Arc<ComputeNode>) {
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
async fn serve(port: u16, state: Arc<ComputeNode>) {
let addr = SocketAddr::from(([0, 0, 0, 0], port));
let make_service = make_service_fn(move |_conn| {
let state = state.clone();
@@ -256,10 +318,10 @@ async fn serve(state: Arc<ComputeNode>) {
}
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_http_server(port: u16, state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()
.name("http-endpoint".into())
.spawn(move || serve(state))?)
.spawn(move || serve(port, state))?)
}

View File

@@ -139,6 +139,34 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
/extension_server:
post:
tags:
- Extension
summary: Download extension from S3 to local folder.
description: ""
operationId: downloadExtension
responses:
200:
description: Extension downloaded
content:
text/plain:
schema:
type: string
description: Error text or 'OK' if download succeeded.
example: "OK"
400:
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:

View File

@@ -9,6 +9,7 @@ pub mod http;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod extension_server;
pub mod monitor;
pub mod params;
pub mod pg_helpers;

View File

@@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String {
/// Escape a string so that it can be used in postgresql.conf.
/// Same as escape_literal, currently.
fn escape_conf_value(s: &str) -> String {
pub fn escape_conf_value(s: &str) -> String {
s.replace('\'', "''").replace('\\', "\\\\")
}

View File

@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
update_pg_hba(pgdata_path)?;

View File

@@ -32,3 +32,4 @@ utils.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true
tracing.workspace = true

View File

@@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
println!("Creating endpoint for imported timeline ...");
cplane.new_endpoint(
tenant_id,
name,
tenant_id,
timeline_id,
None,
None,
pg_version,
ComputeMode::Primary,
)?;
@@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
table.add_row([
endpoint_id.as_str(),
&endpoint.address.to_string(),
&endpoint.pg_address.to_string(),
&endpoint.timeline_id.to_string(),
branch_name,
lsn_str.as_str(),
@@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?;
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let pg_version = sub_args
.get_one::<u32>("pg-version")
.copied()
@@ -639,14 +640,40 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
};
cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?;
cplane.new_endpoint(
&endpoint_id,
tenant_id,
timeline_id,
pg_port,
http_port,
pg_version,
mode,
)?;
}
"start" => {
let port: Option<u16> = sub_args.get_one::<u16>("port").copied();
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
// If --safekeepers argument is given, use only the listed safekeeper nodes.
let safekeepers =
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
let mut safekeepers: Vec<NodeId> = Vec::new();
for sk_id in safekeepers_str.split(',').map(str::trim) {
let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
})?);
safekeepers.push(sk_id);
}
safekeepers
} else {
env.safekeepers.iter().map(|sk| sk.id).collect()
};
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
@@ -673,7 +700,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token)?;
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
@@ -709,14 +736,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
let ep = cplane.new_endpoint(
tenant_id,
endpoint_id,
tenant_id,
timeline_id,
port,
pg_port,
http_port,
pg_version,
mode,
)?;
ep.start(&auth_token)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
}
"stop" => {
@@ -944,11 +972,22 @@ fn cli() -> Command {
.value_parser(value_parser!(u32))
.default_value(DEFAULT_PG_VERSION);
let port_arg = Arg::new("port")
.long("port")
let pg_port_arg = Arg::new("pg-port")
.long("pg-port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("port");
.value_name("pg-port");
let http_port_arg = Arg::new("http-port")
.long("http-port")
.required(false)
.value_parser(value_parser!(u16))
.value_name("http-port");
let safekeepers_arg = Arg::new("safekeepers")
.long("safekeepers")
.required(false)
.value_name("safekeepers");
let stop_mode_arg = Arg::new("stop-mode")
.short('m')
@@ -965,6 +1004,12 @@ fn cli() -> Command {
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
.required(false);
let remote_ext_config_args = Arg::new("remote-ext-config")
.long("remote-ext-config")
.num_args(1)
.help("Configure the S3 bucket that we search for extensions in.")
.required(false);
let lsn_arg = Arg::new("lsn")
.long("lsn")
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
@@ -1093,7 +1138,8 @@ fn cli() -> Command {
.arg(branch_name_arg.clone())
.arg(tenant_id_arg.clone())
.arg(lsn_arg.clone())
.arg(port_arg.clone())
.arg(pg_port_arg.clone())
.arg(http_port_arg.clone())
.arg(
Arg::new("config-only")
.help("Don't do basebackup, create endpoint directory with only config files")
@@ -1109,9 +1155,12 @@ fn cli() -> Command {
.arg(branch_name_arg)
.arg(timeline_id_arg)
.arg(lsn_arg)
.arg(port_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
.subcommand(
Command::new("stop")

View File

@@ -1,3 +1,9 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! .neon/safekeepers/<safekeeper id>
//!
use anyhow::Context;
use std::path::PathBuf;

View File

@@ -1,41 +1,73 @@
//! Code to manage compute endpoints
//!
//! In the local test environment, the data for each endpoint is stored in
//!
//! .neon/endpoints/<endpoint id>
//!
//! Some basic information about the endpoint, like the tenant and timeline IDs,
//! are stored in the `endpoint.json` file. The `endpoint.json` file is created
//! when the endpoint is created, and doesn't change afterwards.
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!
//! When an endpoint is created, a `postgresql.conf` file is also created in
//! the endpoint's directory. The file can be modified before starting PostgreSQL.
//! However, the `postgresql.conf` file in the endpoint directory is not used directly
//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another
//! copy of it in the data directory.
//!
//! Directory contents:
//!
//! ```ignore
//! .neon/endpoints/main/
//! compute.log - log output of `compute_ctl` and `postgres`
//! endpoint.json - serialized `EndpointConf` struct
//! postgresql.conf - postgresql settings
//! spec.json - passed to `compute_ctl`
//! pgdata/
//! postgresql.conf - copy of postgresql.conf created by `compute_ctl`
//! zenith.signal
//! <other PostgreSQL files>
//! ```
//!
use std::collections::BTreeMap;
use std::fs::{self, File};
use std::io::Write;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::path::PathBuf;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::process::Command;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{Context, Result};
use anyhow::{anyhow, bail, Context, Result};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use utils::id::{NodeId, TenantId, TimelineId};
use crate::local_env::LocalEnv;
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use compute_api::spec::ComputeMode;
use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf {
name: String,
endpoint_id: String,
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
timeline_id: TimelineId,
mode: ComputeMode,
port: u16,
pg_port: u16,
http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
}
//
@@ -57,11 +89,11 @@ impl ComputeControlPlane {
let pageserver = Arc::new(PageServerNode::from_env(&env));
let mut endpoints = BTreeMap::default();
for endpoint_dir in fs::read_dir(env.endpoints_path())
for endpoint_dir in std::fs::read_dir(env.endpoints_path())
.with_context(|| format!("failed to list {}", env.endpoints_path().display()))?
{
let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?;
endpoints.insert(ep.name.clone(), Arc::new(ep));
endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep));
}
Ok(ComputeControlPlane {
@@ -76,47 +108,58 @@ impl ComputeControlPlane {
1 + self
.endpoints
.values()
.map(|ep| ep.address.port())
.map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port()))
.max()
.unwrap_or(self.base_port)
}
#[allow(clippy::too_many_arguments)]
pub fn new_endpoint(
&mut self,
endpoint_id: &str,
tenant_id: TenantId,
name: &str,
timeline_id: TimelineId,
port: Option<u16>,
pg_port: Option<u16>,
http_port: Option<u16>,
pg_version: u32,
mode: ComputeMode,
) -> Result<Arc<Endpoint>> {
let port = port.unwrap_or_else(|| self.get_port());
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let ep = Arc::new(Endpoint {
name: name.to_owned(),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
timeline_id,
mode,
tenant_id,
pg_version,
skip_pg_catalog_updates: false,
});
ep.create_pgdata()?;
ep.create_endpoint_dir()?;
std::fs::write(
ep.endpoint_path().join("endpoint.json"),
serde_json::to_string_pretty(&EndpointConf {
name: name.to_string(),
endpoint_id: endpoint_id.to_string(),
tenant_id,
timeline_id,
mode,
port,
http_port,
pg_port,
pg_version,
skip_pg_catalog_updates: false,
})?,
)?;
ep.setup_pg_conf()?;
std::fs::write(
ep.endpoint_path().join("postgresql.conf"),
ep.setup_pg_conf()?.to_string(),
)?;
self.endpoints.insert(ep.name.clone(), Arc::clone(&ep));
self.endpoints
.insert(ep.endpoint_id.clone(), Arc::clone(&ep));
Ok(ep)
}
@@ -127,13 +170,15 @@ impl ComputeControlPlane {
#[derive(Debug)]
pub struct Endpoint {
/// used as the directory name
name: String,
endpoint_id: String,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub mode: ComputeMode,
// port and address of the Postgres server
pub address: SocketAddr,
// port and address of the Postgres server and `compute_ctl`'s HTTP API
pub pg_address: SocketAddr,
pub http_address: SocketAddr,
// postgres major version in the format: 14, 15, etc.
pg_version: u32,
@@ -141,6 +186,9 @@ pub struct Endpoint {
// the endpoint runs in.
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
// Optimizations
skip_pg_catalog_updates: bool,
}
impl Endpoint {
@@ -158,123 +206,37 @@ impl Endpoint {
// parse data directory name
let fname = entry.file_name();
let name = fname.to_str().unwrap().to_string();
let endpoint_id = fname.to_str().unwrap().to_string();
// Read the endpoint.json file
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
// ok now
Ok(Endpoint {
address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port),
name,
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
endpoint_id,
env: env.clone(),
pageserver: Arc::clone(pageserver),
timeline_id: conf.timeline_id,
mode: conf.mode,
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
})
}
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres");
let mut cmd = Command::new(pg_path);
cmd.arg("--sync-safekeepers")
.env_clear()
.env(
"LD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env(
"DYLD_LIBRARY_PATH",
self.env.pg_lib_dir(pg_version)?.to_str().unwrap(),
)
.env("PGDATA", self.pgdata().to_str().unwrap())
.stdout(Stdio::piped())
// Comment this to avoid capturing stderr (useful if command hangs)
.stderr(Stdio::piped());
if let Some(token) = auth_token {
cmd.env("NEON_AUTH_TOKEN", token);
}
let sync_handle = cmd
.spawn()
.expect("postgres --sync-safekeepers failed to start");
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() {
anyhow::bail!(
"sync-safekeepers failed: '{}'",
String::from_utf8_lossy(&sync_output.stderr)
);
}
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
println!("Safekeepers synced on {}", lsn);
Ok(lsn)
}
/// Get basebackup from the pageserver as a tar archive and extract it
/// to the `self.pgdata()` directory.
fn do_basebackup(&self, lsn: Option<Lsn>) -> Result<()> {
println!(
"Extracting base backup to create postgres instance: path={} port={}",
self.pgdata().display(),
self.address.port()
);
let sql = if let Some(lsn) = lsn {
format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn)
} else {
format!("basebackup {} {}", self.tenant_id, self.timeline_id)
};
let mut client = self
.pageserver
.page_server_psql_client()
.context("connecting to page server failed")?;
let copyreader = client
.copy_out(sql.as_str())
.context("page server 'basebackup' command failed")?;
// Read the archive directly from the `CopyOutReader`
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata())
.context("extracting base backup failed")?;
Ok(())
}
fn create_pgdata(&self) -> Result<()> {
fs::create_dir_all(self.pgdata()).with_context(|| {
fn create_endpoint_dir(&self) -> Result<()> {
std::fs::create_dir_all(self.endpoint_path()).with_context(|| {
format!(
"could not create data directory {}",
self.pgdata().display()
"could not create endpoint directory {}",
self.endpoint_path().display()
)
})?;
fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700))
.with_context(|| {
format!(
"could not set permissions in data directory {}",
self.pgdata().display()
)
})
})
}
// Write postgresql.conf with default configuration
// and PG_VERSION file to the data directory of a new endpoint.
fn setup_pg_conf(&self) -> Result<()> {
// Generate postgresql.conf with default configuration
fn setup_pg_conf(&self) -> Result<PostgresConf> {
let mut conf = PostgresConf::new();
conf.append("max_wal_senders", "10");
conf.append("wal_log_hints", "off");
@@ -287,25 +249,14 @@ impl Endpoint {
// wal_sender_timeout is the maximum time to wait for WAL replication.
// It also defines how often the walreciever will send a feedback message to the wal sender.
conf.append("wal_sender_timeout", "5s");
conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.address.port().to_string());
conf.append("listen_addresses", &self.pg_address.ip().to_string());
conf.append("port", &self.pg_address.port().to_string());
conf.append("wal_keep_size", "0");
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
conf.append("restart_after_crash", "off");
// Configure the Neon Postgres extension to fetch pages from pageserver
let pageserver_connstr = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
// Load the 'neon' extension
conf.append("shared_preload_libraries", "neon");
conf.append_line("");
conf.append("neon.pageserver_connstring", &pageserver_connstr);
conf.append("neon.tenant_id", &self.tenant_id.to_string());
conf.append("neon.timeline_id", &self.timeline_id.to_string());
conf.append_line("");
// Replication-related configurations, such as WAL sending
@@ -390,46 +341,11 @@ impl Endpoint {
}
}
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
file.write_all(conf.to_string().as_bytes())?;
let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
file.write_all(self.pg_version.to_string().as_bytes())?;
Ok(())
}
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
let backup_lsn = match &self.mode {
ComputeMode::Primary => {
if !self.env.safekeepers.is_empty() {
// LSN 0 means that it is bootstrap and we need to download just
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
// procedure evolves quite actively right now, so let's think about it again
// when things would be more stable (TODO).
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
if lsn == Lsn(0) {
None
} else {
Some(lsn)
}
} else {
None
}
}
ComputeMode::Static(lsn) => Some(*lsn),
ComputeMode::Replica => {
None // Take the latest snapshot available to start with
}
};
self.do_basebackup(backup_lsn)?;
Ok(())
Ok(conf)
}
pub fn endpoint_path(&self) -> PathBuf {
self.env.endpoints_path().join(&self.name)
self.env.endpoints_path().join(&self.endpoint_id)
}
pub fn pgdata(&self) -> PathBuf {
@@ -439,7 +355,7 @@ impl Endpoint {
pub fn status(&self) -> &str {
let timeout = Duration::from_millis(300);
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok();
match (has_pidfile, can_connect) {
(true, true) => "running",
@@ -457,8 +373,6 @@ impl Endpoint {
&[
"-D",
self.pgdata().to_str().unwrap(),
"-l",
self.pgdata().join("pg.log").to_str().unwrap(),
"-w", //wait till pg_ctl actually does what was asked
],
args,
@@ -494,36 +408,192 @@ impl Endpoint {
Ok(())
}
pub fn start(&self, auth_token: &Option<String>) -> Result<()> {
pub fn start(
&self,
auth_token: &Option<String>,
safekeepers: Vec<NodeId>,
remote_ext_config: Option<&String>,
) -> Result<()> {
if self.status() == "running" {
anyhow::bail!("The endpoint is already running");
}
// 1. We always start Postgres from scratch, so
// if old dir exists, preserve 'postgresql.conf' and drop the directory
let postgresql_conf_path = self.pgdata().join("postgresql.conf");
let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| {
format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)
})?;
fs::remove_dir_all(self.pgdata())?;
self.create_pgdata()?;
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
// memory. We will include it in the spec file that we pass to
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
// in the data directory.
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
Ok(content) => String::from_utf8(content)?,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
Err(e) => {
return Err(anyhow::Error::new(e).context(format!(
"failed to read config file in {}",
postgresql_conf_path.to_str().unwrap()
)))
}
};
// 2. Bring back config files
fs::write(&postgresql_conf_path, postgresql_conf)?;
// 3. Load basebackup
self.load_basebackup(auth_token)?;
if self.mode != ComputeMode::Primary {
File::create(self.pgdata().join("standby.signal"))?;
// We always start the compute node from scratch, so if the Postgres
// data dir exists from a previous launch, remove it first.
if self.pgdata().exists() {
std::fs::remove_dir_all(self.pgdata())?;
}
// 4. Finally start postgres
println!("Starting postgres at '{}'", self.connstr());
self.pg_ctl(&["start"], auth_token)
let pageserver_connstring = {
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
format!("postgresql://no_user@{host}:{port}")
};
let mut safekeeper_connstrings = Vec::new();
if self.mode == ComputeMode::Primary {
for sk_id in safekeepers {
let sk = self
.env
.safekeepers
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
}
}
// Create spec file
let spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used
state: None,
roles: vec![],
databases: vec![],
settings: None,
postgresql_conf: Some(postgresql_conf),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
timeline_id: Some(self.timeline_id),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
// Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
let logfile = std::fs::OpenOptions::new()
.create(true)
.append(true)
.open(self.endpoint_path().join("compute.log"))?;
// Launch compute_ctl
println!("Starting postgres node at '{}'", self.connstr());
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
cmd.args(["--http-port", &self.http_address.port().to_string()])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &self.connstr()])
.args([
"--spec-path",
self.endpoint_path().join("spec.json").to_str().unwrap(),
])
.args([
"--pgbin",
self.env
.pg_bin_dir(self.pg_version)?
.join("postgres")
.to_str()
.unwrap(),
])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);
if let Some(remote_ext_config) = remote_ext_config {
cmd.args(["--remote-ext-config", remote_ext_config]);
}
let _child = cmd.spawn()?;
// Wait for it to start
let mut attempt = 0;
const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s
loop {
attempt += 1;
match self.get_status() {
Ok(state) => {
match state.status {
ComputeStatus::Init => {
if attempt == MAX_ATTEMPTS {
bail!("compute startup timed out; still in Init state");
}
// keep retrying
}
ComputeStatus::Running => {
// All good!
break;
}
ComputeStatus::Failed => {
bail!(
"compute startup failed: {}",
state
.error
.as_deref()
.unwrap_or("<no error from compute_ctl>")
);
}
ComputeStatus::Empty
| ComputeStatus::ConfigurationPending
| ComputeStatus::Configuration => {
bail!("unexpected compute status: {:?}", state.status)
}
}
}
Err(e) => {
if attempt == MAX_ATTEMPTS {
return Err(e).context(
"timed out waiting to connect to compute_ctl HTTP; last error: {e}",
);
}
}
}
std::thread::sleep(ATTEMPT_INTERVAL);
}
Ok(())
}
// Call the /status HTTP API
pub fn get_status(&self) -> Result<ComputeState> {
let client = reqwest::blocking::Client::new();
let response = client
.request(
reqwest::Method::GET,
format!(
"http://{}:{}/status",
self.http_address.ip(),
self.http_address.port()
),
)
.send()?;
// Interpret the response
let status = response.status();
if !(status.is_client_error() || status.is_server_error()) {
Ok(response.json()?)
} else {
// reqwest does not export its error construction utility functions, so let's craft the message ourselves
let url = response.url().to_owned();
let msg = match response.text() {
Ok(err_body) => format!("Error: {}", err_body),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
};
Err(anyhow::anyhow!(msg))
}
}
pub fn stop(&self, destroy: bool) -> Result<()> {
@@ -540,7 +610,7 @@ impl Endpoint {
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(self.endpoint_path())?;
std::fs::remove_dir_all(self.endpoint_path())?;
} else {
self.pg_ctl(&["stop"], &None)?;
}
@@ -549,10 +619,10 @@ impl Endpoint {
pub fn connstr(&self) -> String {
format!(
"host={} port={} user={} dbname={}",
self.address.ip(),
self.address.port(),
"postgresql://{}@{}:{}/{}",
"cloud_admin",
self.pg_address.ip(),
self.pg_address.port(),
"postgres"
)
}

View File

@@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
// compute nodes).
// compute endpoints).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or

View File

@@ -1,3 +1,9 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//!
//! .neon/
//!
use std::borrow::Cow;
use std::collections::HashMap;
use std::fs::File;

View File

@@ -1,3 +1,9 @@
//! Code to manage safekeepers
//!
//! In the local test environment, the data for each safekeeper is stored in
//!
//! .neon/safekeepers/<safekeeper id>
//!
use std::io::Write;
use std::path::PathBuf;
use std::process::Child;

View File

@@ -0,0 +1,301 @@
# Supporting custom user Extensions
Created 2023-05-03
## Motivation
There are many extensions in the PostgreSQL ecosystem, and not all extensions
are of a quality that we can confidently support them. Additionally, our
current extension inclusion mechanism has several problems because we build all
extensions into the primary Compute image: We build the extensions every time
we build the compute image regardless of whether we actually need to rebuild
the image, and the inclusion of these extensions in the image adds a hard
dependency on all supported extensions - thus increasing the image size, and
with it the time it takes to download that image - increasing first start
latency.
This RFC proposes a dynamic loading mechanism that solves most of these
problems.
## Summary
`compute_ctl` is made responsible for loading extensions on-demand into
the container's file system for dynamically loaded extensions, and will also
make sure that the extensions in `shared_preload_libraries` are downloaded
before the compute node starts.
## Components
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
## Requirements
Compute nodes with no extra extensions should not be negatively impacted by
the existence of support for many extensions.
Installing an extension into PostgreSQL should be easy.
Non-preloaded extensions shouldn't impact startup latency.
Uninstalled extensions shouldn't impact query latency.
A small latency penalty for dynamically loaded extensions is acceptable in
the first seconds of compute startup, but not in steady-state operations.
## Proposed implementation
### On-demand, JIT-loading of extensions
TLDR; we download extensions as soon as we need them, or when we have spare
time.
That means, we first download the extensions required to start the PostMaster
(`shared_preload_libraries` and their dependencies), then the libraries required
before a backend can start processing user input (`preload_libraries` and
dependencies), and then (with network limits applied) the remainder of the
configured extensions, with prioritization for installed extensions.
If PostgreSQL tries to load a library that is not yet fully on disk, it will
ask `compute_ctl` first if the extension has been downloaded yet, and will wait
for `compute_ctl` to finish downloading that extension. `compute_ctl` will
prioritize downloading that extension over other extensions that were not yet
requested.
#### Workflow
```mermaid
sequenceDiagram
autonumber
participant EX as External (control plane, ...)
participant CTL as compute_ctl
participant ST as extension store
actor PG as PostgreSQL
EX ->>+ CTL: Start compute with config X
note over CTL: The configuration contains a list of all <br/>extensions available to that compute node, etc.
par Optionally parallel or concurrent
loop Available extensions
CTL ->>+ ST: Download control file of extension
activate CTL
ST ->>- CTL: Finish downloading control file
CTL ->>- CTL: Put control file in extensions directory
end
loop For each extension in shared_preload_libraries
CTL ->>+ ST: Download extension's data
activate CTL
ST ->>- CTL: Finish downloading
CTL ->>- CTL: Put extension's files in the right place
end
end
CTL ->>+ PG: Start PostgreSQL
note over CTL: PostgreSQL can now start accepting <br/>connections. However, users may still need to wait <br/>for preload_libraries extensions to get downloaded.
par Load preload_libraries
loop For each extension in preload_libraries
CTL ->>+ ST: Download extension's data
activate CTL
ST ->>- CTL: Finish downloading
CTL ->>- CTL: Put extension's files in the right place
end
end
note over CTL: After this, connections don't have any hard <br/>waits for extension files left, except for those <br/>connections that override preload_libraries <br/>in their startup packet
par PG's internal_load_library(library)
alt Library is not yet loaded
PG ->>+ CTL: Load library X
CTL ->>+ ST: Download the extension that provides X
ST ->>- CTL: Finish downloading
CTL ->> CTL: Put extension's files in the right place
CTL ->>- PG: Ready
else Library is already loaded
note over PG: No-op
end
and Download all remaining extensions
loop Extension X
CTL ->>+ ST: Download not-yet-downloaded extension X
activate CTL
ST ->>- CTL: Finish downloading
CTL ->>- CTL: Put extension's files in the right place
end
end
deactivate PG
deactivate CTL
```
#### Summary
Pros:
- Startup is only as slow as it takes to load all (shared_)preload_libraries
- Supports BYO Extension
Cons:
- O(sizeof(extensions)) IO requirement for loading all extensions.
### Alternative solutions
1. Allow users to add their extensions to the base image
Pros:
- Easy to deploy
Cons:
- Doesn't scale - first start size is dependent on image size;
- All extensions are shared across all users: It doesn't allow users to
bring their own restrictive-licensed extensions
2. Bring Your Own compute image
Pros:
- Still easy to deploy
- User can bring own patched version of PostgreSQL
Cons:
- First start latency is O(sizeof(extensions image))
- Warm instance pool for skipping pod schedule latency is not feasible with
O(n) custom images
- Support channels are difficult to manage
3. Download all user extensions in bulk on compute start
Pros:
- Easy to deploy
- No startup latency issues for "clean" users.
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- Downloading all extensions in advance takes a lot of time, thus startup
latency issues
4. Store user's extensions in persistent storage
Pros:
- Easy to deploy
- No startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- EC2 instances have only limited number of attachments shared between EBS
volumes, direct-attached NVMe drives, and ENIs.
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
the device is unavailable whilst moving the mount between instances).
- EBS can only mount on one instance at a time (except the expensive IO2
device type).
5. Store user's extensions in network drive
Pros:
- Easy to deploy
- Few startup latency issues
- Warm instance pool for skipping pod schedule latency is possible
Cons:
- We'd need networked drives, and a lot of them, which would store many
duplicate extensions.
- **UNCHECKED:** Compute instance migration may not work nicely with
networked IOs
### Idea extensions
The extension store does not have to be S3 directly, but could be a Node-local
caching service on top of S3. This would reduce the load on the network for
popular extensions.
## Extension Store implementation
Extension Store in our case is a private S3 bucket.
Extensions are stored as tarballs in the bucket. The tarball contains the extension's control file and all the files that the extension needs to run.
We may also store the control file separately from the tarball to speed up the extension loading.
`s3://<the-bucket>/extensions/ext-name/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar`
where `ext-name` is an extension name and `sha-256+1234abcd1234abcd1234abcd1234abcd` is a hash of a specific extension version tarball.
To ensure security, there is no direct access to the S3 bucket from compute node.
Control plane forms a list of extensions available to the compute node
and forms a short-lived [pre-signed URL](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html)
for each extension that is available to the compute node.
so, `compute_ctl` receives spec in the following format
```
"extensions": [{
"meta_format": 1,
"extension_name": "postgis",
"link": "https://<the-bucket>/extensions/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar?AWSAccessKeyId=1234abcd1234abcd1234abcd1234abcd&Expires=1234567890&Signature=1234abcd1234abcd1234abcd1234abcd",
...
}]
```
`compute_ctl` then downloads the extension from the link and unpacks it to the right place.
### How do we handle private extensions?
Private and public extensions are treated equally from the Extension Store perspective.
The only difference is that the private extensions are not listed in the user UI (managed by control plane).
### How to add new extension to the Extension Store?
Since we need to verify that the extension is compatible with the compute node and doesn't contain any malicious code,
we need to review the extension before adding it to the Extension Store.
I do not expect that we will have a lot of extensions to review, so we can do it manually for now.
Some admin UI may be added later to automate this process.
The list of extensions available to a compute node is stored in the console database.
### How is the list of available extensions managed?
We need to add new tables to the console database to store the list of available extensions, their versions and access rights.
something like this:
```
CREATE TABLE extensions (
id SERIAL PRIMARY KEY,
name VARCHAR(255) NOT NULL,
version VARCHAR(255) NOT NULL,
hash VARCHAR(255) NOT NULL, // this is the path to the extension in the Extension Store
supported_postgres_versions integer[] NOT NULL,
is_public BOOLEAN NOT NULL, // public extensions are available to all users
is_shared_preload BOOLEAN NOT NULL, // these extensions require postgres restart
is_preload BOOLEAN NOT NULL,
license VARCHAR(255) NOT NULL,
);
CREATE TABLE user_extensions (
user_id INTEGER NOT NULL,
extension_id INTEGER NOT NULL,
FOREIGN KEY (user_id) REFERENCES users (id),
FOREIGN KEY (extension_id) REFERENCES extensions (id)
);
```
When new extension is added to the Extension Store, we add a new record to the table and set permissions.
In UI, user may select the extensions that they want to use with their compute node.
NOTE: Extensions that require postgres restart will not be available until the next compute restart.
Also, currently user cannot force postgres restart. We should add this feature later.
For other extensions, we must communicate updates to `compute_ctl` and they will be downloaded in the background.
### How can user update the extension?
User can update the extension by selecting the new version of the extension in the UI.
### Alternatives
For extensions written on trusted languages we can also adopt
`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
This will increase the amount supported extensions and decrease the amount of work required to support them.

View File

@@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer};
use crate::spec::ComputeSpec;
#[derive(Serialize, Debug)]
#[derive(Serialize, Debug, Deserialize)]
pub struct GenericAPIError {
pub error: String,
}
/// Response of the /status API
#[derive(Serialize, Debug)]
#[derive(Serialize, Debug, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeStatusResponse {
pub start_time: DateTime<Utc>,
@@ -23,7 +23,7 @@ pub struct ComputeStatusResponse {
pub error: Option<String>,
}
#[derive(Serialize)]
#[derive(Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
@@ -33,7 +33,7 @@ pub struct ComputeState {
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)]
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
// Spec wasn't provided at start, waiting for it to be

View File

@@ -5,6 +5,7 @@
//! and connect it to the storage nodes.
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
/// String type alias representing Postgres identifier and
@@ -14,7 +15,7 @@ pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[serde_as]
#[derive(Clone, Debug, Default, Deserialize)]
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct ComputeSpec {
pub format_version: f32,
@@ -26,9 +27,38 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
/// An optinal hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
pub skip_pg_catalog_updates: bool,
// Information needed to connect to the storage layer.
//
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
//
// Depending on `mode`, this can be a primary read-write node, a read-only
// replica, or a read-only node pinned at an older LSN.
// `safekeeper_connstrings` must be set for a primary.
//
// For backwards compatibility, the control plane may leave out all of
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub pageserver_connstring: Option<String>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
#[serde(default)]
pub mode: ComputeMode,
/// If set, 'storage_auth_token' is used as the password to authenticate to
/// the pageserver and safekeepers.
pub storage_auth_token: Option<String>,
}
@@ -47,13 +77,19 @@ pub enum ComputeMode {
Replica,
}
#[derive(Clone, Debug, Default, Deserialize)]
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct Cluster {
pub cluster_id: String,
pub name: String,
pub cluster_id: Option<String>,
pub name: Option<String>,
pub state: Option<String>,
pub roles: Vec<Role>,
pub databases: Vec<Database>,
/// Desired contents of 'postgresql.conf' file. (The 'compute_ctl'
/// tool may add additional settings to the final file.)
pub postgresql_conf: Option<String>,
/// Additional settings that will be appended to the 'postgresql.conf' file.
pub settings: GenericOptions,
}
@@ -63,7 +99,7 @@ pub struct Cluster {
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Debug, Deserialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
@@ -72,7 +108,7 @@ pub struct DeltaOp {
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
@@ -81,7 +117,7 @@ pub struct Role {
/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Database {
pub name: PgIdent,
pub owner: PgIdent,
@@ -91,7 +127,7 @@ pub struct Database {
/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Debug, Deserialize)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct GenericOption {
pub name: String,
pub value: Option<String>,
@@ -112,4 +148,14 @@ mod tests {
let file = File::open("tests/cluster_spec.json").unwrap();
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
}
#[test]
fn parse_unknown_fields() {
// Forward compatibility test
let file = File::open("tests/cluster_spec.json").unwrap();
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
let ob = json.as_object_mut().unwrap();
ob.insert("unknown_field_123123123".into(), "hello".into());
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
}
}

View File

@@ -110,12 +110,11 @@ impl TenantState {
Self::Active => Attached,
// If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
// However, it also becomes Broken if the regular load fails.
// We would need a separate TenantState variant to distinguish these cases.
// However, there's no practical difference from Console's perspective.
// It will run a Postgres-level health check as soon as it observes Attached.
// That will fail on Broken tenants.
// Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
Self::Broken { .. } => Attached,
// From Console's perspective there's no practical difference
// because attachment_status is polled by console only during attach operation execution.
Self::Broken { reason, .. } => Failed {
reason: reason.to_owned(),
},
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
// we set the Stopping state irrespective of whether the tenant
// has finished attaching or not.
@@ -153,7 +152,7 @@ pub enum ActivatingFrom {
}
/// A state of a timeline in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum TimelineState {
/// The timeline is recognized by the pageserver but is not yet operational.
/// In particular, the walreceiver connection loop is not running for this timeline.
@@ -166,7 +165,7 @@ pub enum TimelineState {
/// It cannot transition back into any other state.
Stopping,
/// The timeline is broken and not operational (previous states: Loading or Active).
Broken,
Broken { reason: String, backtrace: String },
}
#[serde_as]
@@ -312,10 +311,11 @@ impl std::ops::Deref for TenantAttachConfig {
/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
#[derive(Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
pub enum TenantAttachmentStatus {
Maybe,
Attached,
Failed { reason: String },
}
#[serde_as]
@@ -809,7 +809,9 @@ mod tests {
"slug": "Active",
},
"current_physical_size": 42,
"attachment_status": "attached",
"attachment_status": {
"slug":"attached",
}
});
let original_broken = TenantInfo {
@@ -831,7 +833,9 @@ mod tests {
}
},
"current_physical_size": 42,
"attachment_status": "attached",
"attachment_status": {
"slug":"attached",
}
});
assert_eq!(

View File

@@ -24,7 +24,6 @@ workspace_hack.workspace = true
[dev-dependencies]
env_logger.workspace = true
postgres.workspace = true
wal_craft = { path = "wal_craft" }
[build-dependencies]
anyhow.workspace = true

View File

@@ -33,6 +33,7 @@ macro_rules! postgres_ffi {
}
pub mod controlfile_utils;
pub mod nonrelfile_utils;
pub mod wal_craft_test_export;
pub mod waldecoder_handler;
pub mod xlog_utils;
@@ -45,8 +46,15 @@ macro_rules! postgres_ffi {
};
}
postgres_ffi!(v14);
postgres_ffi!(v15);
#[macro_export]
macro_rules! for_all_postgres_versions {
($macro:tt) => {
$macro!(v14);
$macro!(v15);
};
}
for_all_postgres_versions! { postgres_ffi }
pub mod pg_constants;
pub mod relfile_utils;

View File

@@ -0,0 +1,6 @@
//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage.
pub use super::PG_MAJORVERSION;
pub use super::xlog_utils::*;
pub use super::bindings::*;
pub use crate::WAL_SEGMENT_SIZE;

View File

@@ -481,220 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
wal
}
#[cfg(test)]
mod tests {
use super::super::PG_MAJORVERSION;
use super::*;
use regex::Regex;
use std::cmp::min;
use std::fs;
use std::{env, str::FromStr};
use utils::const_assert;
fn init_logging() {
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
))
.is_test(true)
.try_init();
}
fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
use wal_craft::*;
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
// Craft some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
.join("..");
let cfg = Conf {
pg_version,
pg_distrib_dir: top_path.join("pg_install"),
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
};
if cfg.datadir.exists() {
fs::remove_dir_all(&cfg.datadir).unwrap();
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
let last_segment = cfg
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
{
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
// We assume that `start_lsn` is non-decreasing.
info!(
"Checking with start_lsn={}, erasing WAL before it",
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
continue;
}
let mut f = File::options().write(true).open(file.path()).unwrap();
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
f.write_all(
&ZEROS[0..min(
WAL_SEGMENT_SIZE,
(u64::from(*start_lsn) - seg_start_lsn) as usize,
)],
)
.unwrap();
}
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
}
}
fn check_pg_waldump_end_of_wal(
cfg: &wal_craft::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
let caps = match Regex::new(r"invalid record length at (.+):")
.unwrap()
.captures(waldump_output)
{
Some(caps) => caps,
None => {
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
panic!();
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
fn check_end_of_wal(
cfg: &wal_craft::Conf,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
) {
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
// info!(
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
// wal_end
// );
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(last_segment),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
info!(
"find_end_of_wal returned wal_end={} with partial WAL segment",
wal_end
);
assert_eq!(wal_end, expected_end_of_wal);
fs::rename(
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();
}
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
}
#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
init_logging();
test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
);
}
#[test]
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
"test_find_end_of_wal_last_crossing_segment",
);
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
checkpoint.nextXid = FullTransactionId { value: 10 };
assert_eq!(checkpoint.nextXid.value, 10);
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
// boundary
checkpoint.update_next_xid(100);
assert_eq!(checkpoint.nextXid.value, 1024);
// No change
checkpoint.update_next_xid(500);
assert_eq!(checkpoint.nextXid.value, 1024);
checkpoint.update_next_xid(1023);
assert_eq!(checkpoint.nextXid.value, 1024);
// The function returns the *next* XID, given the highest XID seen so
// far. So when we pass 1024, the nextXid gets bumped up to the next
// XID_CHECKPOINT_INTERVAL boundary.
checkpoint.update_next_xid(1024);
assert_eq!(checkpoint.nextXid.value, 2048);
}
#[test]
pub fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix", "message");
assert_eq!(expected, actual[..]);
}
}
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.

View File

@@ -15,3 +15,7 @@ postgres_ffi.workspace = true
tempfile.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
regex.workspace = true
utils.workspace = true

View File

@@ -10,6 +10,20 @@ use std::process::Command;
use std::time::{Duration, Instant};
use tempfile::{tempdir, TempDir};
macro_rules! xlog_utils_test {
($version:ident) => {
#[path = "."]
mod $version {
pub use postgres_ffi::$version::wal_craft_test_export::*;
#[allow(clippy::duplicate_mod)]
#[cfg(test)]
mod xlog_utils_test;
}
};
}
postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Conf {
pub pg_version: u32,

View File

@@ -0,0 +1,219 @@
//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
use super::*;
use crate::{error, info};
use regex::Regex;
use std::cmp::min;
use std::fs::{self, File};
use std::io::Write;
use std::{env, str::FromStr};
use utils::const_assert;
use utils::lsn::Lsn;
fn init_logging() {
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
))
.is_test(true)
.try_init();
}
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
use crate::*;
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
// Craft some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
.join("..")
.join("..");
let cfg = Conf {
pg_version,
pg_distrib_dir: top_path.join("pg_install"),
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
};
if cfg.datadir.exists() {
fs::remove_dir_all(&cfg.datadir).unwrap();
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
let last_segment = cfg
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
{
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
// We assume that `start_lsn` is non-decreasing.
info!(
"Checking with start_lsn={}, erasing WAL before it",
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
continue;
}
let mut f = File::options().write(true).open(file.path()).unwrap();
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
f.write_all(
&ZEROS[0..min(
WAL_SEGMENT_SIZE,
(u64::from(*start_lsn) - seg_start_lsn) as usize,
)],
)
.unwrap();
}
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
}
}
fn check_pg_waldump_end_of_wal(
cfg: &crate::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
.stderr;
let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
let caps = match Regex::new(r"invalid record length at (.+):")
.unwrap()
.captures(waldump_output)
{
Some(caps) => caps,
None => {
error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
panic!();
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
fn check_end_of_wal(
cfg: &crate::Conf,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
) {
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
// info!(
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
// wal_end
// );
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
cfg.wal_dir().join(last_segment),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
info!(
"find_end_of_wal returned wal_end={} with partial WAL segment",
wal_end
);
assert_eq!(wal_end, expected_end_of_wal);
fs::rename(
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();
}
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
}
#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
init_logging();
test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
);
}
#[test]
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
"test_find_end_of_wal_last_crossing_segment",
);
}
/// Check the math in update_next_xid
///
/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
/// currently 1024.
#[test]
pub fn test_update_next_xid() {
let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
checkpoint.nextXid = FullTransactionId { value: 10 };
assert_eq!(checkpoint.nextXid.value, 10);
// The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
// boundary
checkpoint.update_next_xid(100);
assert_eq!(checkpoint.nextXid.value, 1024);
// No change
checkpoint.update_next_xid(500);
assert_eq!(checkpoint.nextXid.value, 1024);
checkpoint.update_next_xid(1023);
assert_eq!(checkpoint.nextXid.value, 1024);
// The function returns the *next* XID, given the highest XID seen so
// far. So when we pass 1024, the nextXid gets bumped up to the next
// XID_CHECKPOINT_INTERVAL boundary.
checkpoint.update_next_xid(1024);
assert_eq!(checkpoint.nextXid.value, 2048);
}
#[test]
pub fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix", "message");
assert_eq!(expected, actual[..]);
}

View File

@@ -70,6 +70,14 @@ impl RemotePath {
pub fn join(&self, segment: &Path) -> Self {
Self(self.0.join(segment))
}
pub fn get_path(&self) -> &PathBuf {
&self.0
}
pub fn extension(&self) -> Option<&str> {
self.0.extension()?.to_str()
}
}
/// Storage (potentially remote) API to manage its state.
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static {
prefix: Option<&RemotePath>,
) -> Result<Vec<RemotePath>, DownloadError>;
/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
/// Streams the local file contents into remote into the remote storage entry.
async fn upload(
&self,
@@ -111,6 +132,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
) -> Result<Download, DownloadError>;
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
}
pub struct Download {
@@ -161,6 +184,20 @@ pub enum GenericRemoteStorage {
}
impl GenericRemoteStorage {
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
match self {
Self::LocalFs(s) => s.list_files(folder).await,
Self::AwsS3(s) => s.list_files(folder).await,
Self::Unreliable(s) => s.list_files(folder).await,
}
}
// lists common *prefixes*, if any of files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
@@ -223,6 +260,14 @@ impl GenericRemoteStorage {
Self::Unreliable(s) => s.delete(path).await,
}
}
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.delete_objects(paths).await,
Self::AwsS3(s) => s.delete_objects(paths).await,
Self::Unreliable(s) => s.delete_objects(paths).await,
}
}
}
impl GenericRemoteStorage {

View File

@@ -17,7 +17,7 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{Download, DownloadError, RemotePath};
@@ -48,6 +48,14 @@ impl LocalFs {
Ok(Self { storage_root })
}
// mirrors S3Bucket::s3_object_to_relative_path
fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
let relative_path = key
.strip_prefix(&self.storage_root)
.expect("relative path must contain storage_root as prefix");
RemotePath(relative_path.into())
}
async fn read_storage_metadata(
&self,
file_path: &Path,
@@ -101,19 +109,63 @@ impl RemoteStorage for LocalFs {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
Ok(get_all_files(path.as_ref(), false)
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?
.into_iter()
.map(|path| {
path.strip_prefix(&self.storage_root)
.context("Failed to strip preifix")
.map_err(DownloadError::Other)?;
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
prefixes.push(
prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
)
})
.collect())
),
)
}
Ok(prefixes)
}
// recursively lists all files in a directory,
// mirroring the `list_files` for `s3_bucket`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
let full_path = match folder {
Some(folder) => folder.with_base(&self.storage_root),
None => self.storage_root.clone(),
};
let mut files = vec![];
let mut directory_queue = vec![full_path.clone()];
while !directory_queue.is_empty() {
let cur_folder = directory_queue
.pop()
.expect("queue cannot be empty: we just checked");
let mut entries = fs::read_dir(cur_folder.clone()).await?;
while let Some(entry) = entries.next_entry().await? {
let file_name: PathBuf = entry.file_name().into();
let full_file_name = cur_folder.clone().join(&file_name);
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
files.push(file_remote_path.clone());
if full_file_name.is_dir() {
directory_queue.push(full_file_name);
}
}
}
Ok(files)
}
async fn upload(
@@ -291,11 +343,25 @@ impl RemoteStorage for LocalFs {
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let file_path = path.with_base(&self.storage_root);
if file_path.exists() && file_path.is_file() {
Ok(fs::remove_file(file_path).await?)
} else {
bail!("File {file_path:?} either does not exist or is not a file")
if !file_path.exists() {
// See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
// > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
return Ok(());
}
if !file_path.is_file() {
anyhow::bail!("{file_path:?} is not a file");
}
Ok(fs::remove_file(file_path)
.await
.map_err(|e| anyhow::anyhow!(e))?)
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
for path in paths {
self.delete(path).await?
}
Ok(())
}
}
@@ -320,7 +386,7 @@ where
let file_type = dir_entry.file_type().await?;
let entry_path = dir_entry.path();
if file_type.is_symlink() {
debug!("{entry_path:?} us a symlink, skipping")
debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -595,15 +661,11 @@ mod fs_tests {
storage.delete(&upload_target).await?;
assert!(storage.list().await?.is_empty());
match storage.delete(&upload_target).await {
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
let expected_path = upload_target.with_base(&storage.storage_root);
assert!(error_string.contains(expected_path.to_str().unwrap()));
}
}
storage
.delete(&upload_target)
.await
.expect("Should allow deleting non-existing storage files");
Ok(())
}

View File

@@ -17,6 +17,7 @@ use aws_sdk_s3::{
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_http::body::SdkBody;
@@ -33,6 +34,8 @@ use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
};
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
pub(super) mod metrics {
use metrics::{register_int_counter_vec, IntCounterVec};
use once_cell::sync::Lazy;
@@ -81,12 +84,24 @@ pub(super) mod metrics {
.inc();
}
pub fn inc_delete_objects(count: u64) {
S3_REQUESTS_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_delete_object_fail() {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc();
}
pub fn inc_delete_objects_fail(count: u64) {
S3_REQUESTS_FAIL_COUNT
.with_label_values(&["delete_object"])
.inc_by(count);
}
pub fn inc_list_objects() {
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
}
@@ -332,6 +347,52 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys)
}
/// See the doc for `RemoteStorage::list_files`
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
// TODO: if bucket prefix is empty, folder is prefixed with a "/" I think. Is this desired?
let folder_name = folder
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| self.prefix_in_bucket.clone());
// AWS may need to break the response into several parts
let mut continuation_token = None;
let mut all_files = vec![];
loop {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
metrics::inc_list_objects();
let response = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(folder_name.clone())
.set_continuation_token(continuation_token)
.set_max_keys(self.max_keys_per_list_response)
.send()
.await
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})
.context("Failed to list files in S3 bucket")?;
for object in response.contents().unwrap_or_default() {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
all_files.push(remote_path);
}
match response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(all_files)
}
async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -396,6 +457,50 @@ impl RemoteStorage for S3Bucket {
})
.await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 delete")?;
let mut delete_objects = Vec::with_capacity(paths.len());
for path in paths {
let obj_id = ObjectIdentifier::builder()
.set_key(Some(self.relative_path_to_s3_object(path)))
.build();
delete_objects.push(obj_id);
}
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
metrics::inc_delete_objects(chunk.len() as u64);
let resp = self
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
.send()
.await;
match resp {
Ok(resp) => {
if let Some(errors) = resp.errors {
metrics::inc_delete_objects_fail(errors.len() as u64);
return Err(anyhow::format_err!(
"Failed to delete {} objects",
errors.len()
));
}
}
Err(e) => {
metrics::inc_delete_objects_fail(chunk.len() as u64);
return Err(e.into());
}
}
}
Ok(())
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let _guard = self

View File

@@ -24,6 +24,7 @@ enum RemoteOp {
Upload(RemotePath),
Download(RemotePath),
Delete(RemotePath),
DeleteObjects(Vec<RemotePath>),
}
impl UnreliableWrapper {
@@ -82,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper {
self.inner.list_prefixes(prefix).await
}
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
self.inner.list_files(folder).await
}
async fn upload(
&self,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -119,4 +125,21 @@ impl RemoteStorage for UnreliableWrapper {
self.attempt(RemoteOp::Delete(path.clone()))?;
self.inner.delete(path).await
}
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
let mut error_counter = 0;
for path in paths {
if (self.delete(path).await).is_err() {
error_counter += 1;
}
}
if error_counter > 0 {
return Err(anyhow::anyhow!(
"failed to delete {} objects",
error_counter
));
}
Ok(())
}
}

View File

@@ -7,6 +7,7 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use once_cell::sync::OnceCell;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
};
@@ -14,8 +15,12 @@ use test_context::{test_context, AsyncTestContext};
use tokio::task::JoinSet;
use tracing::{debug, error, info};
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
const BASE_PREFIX: &str = "test/";
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
/// See the client creation in [`create_s3_client`] for details on the required env vars.
@@ -38,20 +43,20 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
#[test_context(MaybeEnabledS3)]
#[test_context(MaybeEnabledS3WithTestBlobs)]
#[tokio::test]
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
};
let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
let test_client = Arc::clone(&ctx.enabled.client);
let expected_remote_prefixes = ctx.remote_prefixes.clone();
let base_prefix =
RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list_prefixes(None)
.await
@@ -83,27 +88,174 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<(
Ok(())
}
/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
/// See `s3_pagination_should_work` for more information.
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
/// Then performs the following queries:
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
#[tokio::test]
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
anyhow::bail!("S3 init failed: {e:?}")
}
};
let test_client = Arc::clone(&ctx.enabled.client);
let base_prefix =
RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list_files(None)
.await
.context("client list root files failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list_files on root mismatches with the uploads."
);
let nested_remote_files = test_client
.list_files(Some(&base_prefix))
.await
.context("client list nested files failure")?
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
.remote_blobs
.iter()
.map(|x| x.get_path().to_str().expect("must be valid name"))
.filter(|x| x.starts_with("folder1"))
.map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path = RemotePath::new(&PathBuf::from(format!(
"{}/for_sure_there_is_nothing_there_really",
ctx.base_prefix,
)))
.with_context(|| "RemotePath conversion")?;
ctx.client.delete(&path).await.expect("should succeed");
Ok(())
}
#[test_context(MaybeEnabledS3)]
#[tokio::test]
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledS3::Enabled(ctx) => ctx,
MaybeEnabledS3::Disabled => return Ok(()),
};
let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
.with_context(|| "RemotePath conversion")?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
Ok(())
}
fn ensure_logging_ready() {
LOGGING_DONE.get_or_init(|| {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
)
.expect("logging init failed");
});
}
struct EnabledS3 {
client: Arc<GenericRemoteStorage>,
base_prefix: &'static str,
}
impl EnabledS3 {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_s3_client(max_keys_in_list_response)
.context("S3 client creation")
.expect("S3 client creation failed");
EnabledS3 {
client,
base_prefix: BASE_PREFIX,
}
}
}
enum MaybeEnabledS3 {
Enabled(EnabledS3),
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
info!(
"`{}` env variable is not set, skipping the test",
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
);
return Self::Disabled;
}
Self::Enabled(EnabledS3::setup(None).await)
}
}
enum MaybeEnabledS3WithTestBlobs {
Enabled(S3WithTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithTestBlobs),
}
struct S3WithTestBlobs {
client_with_excessive_pagination: Arc<GenericRemoteStorage>,
base_prefix_str: &'static str,
enabled: EnabledS3,
remote_prefixes: HashSet<RemotePath>,
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3 {
impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
async fn setup() -> Self {
utils::logging::init(
utils::logging::LogFormat::Test,
utils::logging::TracingErrorLayerEnablement::Disabled,
)
.expect("logging init failed");
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
info!(
"`{}` env variable is not set, skipping the test",
@@ -115,23 +267,14 @@ impl AsyncTestContext for MaybeEnabledS3 {
let max_keys_in_list_response = 10;
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
.context("S3 client creation")
.expect("S3 client creation failed");
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
let base_prefix_str = "test/";
match upload_s3_data(
&client_with_excessive_pagination,
base_prefix_str,
upload_tasks_count,
)
.await
{
match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
ControlFlow::Continue(uploads) => {
info!("Remote objects created successfully");
Self::Enabled(S3WithTestBlobs {
client_with_excessive_pagination,
base_prefix_str,
enabled,
remote_prefixes: uploads.prefixes,
remote_blobs: uploads.blobs,
})
@@ -139,8 +282,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
ControlFlow::Break(uploads) => Self::UploadsFailed(
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
S3WithTestBlobs {
client_with_excessive_pagination,
base_prefix_str,
enabled,
remote_prefixes: uploads.prefixes,
remote_blobs: uploads.blobs,
},
@@ -152,13 +294,75 @@ impl AsyncTestContext for MaybeEnabledS3 {
match self {
Self::Disabled => {}
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
}
}
}
}
fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledS3WithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
}
struct S3WithSimpleTestBlobs {
enabled: EnabledS3,
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
info!(
"`{}` env variable is not set, skipping the test",
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
);
return Self::Disabled;
}
let max_keys_in_list_response = 10;
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
ControlFlow::Continue(uploads) => {
info!("Remote objects created successfully");
Self::Enabled(S3WithSimpleTestBlobs {
enabled,
remote_blobs: uploads,
})
}
ControlFlow::Break(uploads) => Self::UploadsFailed(
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
S3WithSimpleTestBlobs {
enabled,
remote_blobs: uploads,
},
),
}
}
async fn teardown(self) {
match self {
Self::Disabled => {}
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
}
}
}
}
fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
@@ -166,7 +370,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
let random_prefix_part = std::time::SystemTime::now()
.duration_since(UNIX_EPOCH)
.context("random s3 test prefix part calculation")?
.as_millis();
.as_nanos();
let remote_storage_config = RemoteStorageConfig {
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
max_sync_errors: NonZeroU32::new(5).unwrap(),
@@ -176,7 +380,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(max_keys_per_list_response),
max_keys_per_list_response,
}),
};
Ok(Arc::new(
@@ -272,3 +476,52 @@ async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<
}
}
}
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
async fn upload_simple_s3_data(
client: &Arc<GenericRemoteStorage>,
upload_tasks_count: usize,
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
info!("Creating {upload_tasks_count} S3 files");
let mut upload_tasks = JoinSet::new();
for i in 1..upload_tasks_count + 1 {
let task_client = Arc::clone(client);
upload_tasks.spawn(async move {
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
let blob_path = RemotePath::new(&blob_path)
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>(blob_path)
});
}
let mut upload_tasks_failed = false;
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
while let Some(task_run_result) = upload_tasks.join_next().await {
match task_run_result
.context("task join failed")
.and_then(|task_result| task_result.context("upload task failed"))
{
Ok(upload_path) => {
uploaded_blobs.insert(upload_path);
}
Err(e) => {
error!("Upload task failed: {e:?}");
upload_tasks_failed = true;
}
}
}
if upload_tasks_failed {
ControlFlow::Break(uploaded_blobs)
} else {
ControlFlow::Continue(uploaded_blobs)
}
}

View File

@@ -1,6 +1,8 @@
/// Extensions to `std::fs` types.
use std::{fs, io, path::Path};
use anyhow::Context;
pub trait PathExt {
/// Returns an error if `self` is not a directory.
fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
}
}
pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
let mut dir = tokio::fs::read_dir(&path)
.await
.context(format!("read_dir({})", path.as_ref().display()))?;
Ok(dir.next_entry().await?.is_none())
}
#[cfg(test)]
mod test {
use std::path::PathBuf;
use crate::fs_ext::is_directory_empty;
#[test]
fn is_empty_dir() {
use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
std::fs::remove_file(&file_path).unwrap();
assert!(file_path.is_empty_dir().is_err());
}
#[tokio::test]
async fn is_empty_dir_async() {
let dir = tempfile::tempdir().unwrap();
let dir_path = dir.path();
// test positive case
assert!(
is_directory_empty(dir_path).await.expect("test failure"),
"new tempdir should be empty"
);
// invoke on a file to ensure it returns an error
let file_path: PathBuf = dir_path.join("testfile");
let f = std::fs::File::create(&file_path).unwrap();
drop(f);
assert!(is_directory_empty(&file_path).await.is_err());
// do it again on a path, we know to be nonexistent
std::fs::remove_file(&file_path).unwrap();
assert!(is_directory_empty(file_path).await.is_err());
}
}

View File

@@ -1,19 +1,18 @@
use crate::auth::{Claims, JwtAuth};
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
use anyhow::{anyhow, Context};
use anyhow::Context;
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
use hyper::Method;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tokio::task::JoinError;
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::net::TcpListener;
use std::str::FromStr;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -348,40 +347,6 @@ pub fn check_permission_with(
}
}
///
/// Start listening for HTTP requests on given socket.
///
/// 'shutdown_future' can be used to stop. If the Future becomes
/// ready, we stop listening for new requests, and the function returns.
///
pub fn serve_thread_main<S>(
router_builder: RouterBuilder<hyper::Body, ApiError>,
listener: TcpListener,
shutdown_future: S,
) -> anyhow::Result<()>
where
S: Future<Output = ()> + Send + Sync,
{
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
// Enter a single-threaded tokio runtime bound to the current thread
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
let _guard = runtime.enter();
let server = Server::from_tcp(listener)?
.serve(service)
.with_graceful_shutdown(shutdown_future);
runtime.block_on(server)?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -1,5 +1,6 @@
use hyper::{header, Body, Response, StatusCode};
use serde::{Deserialize, Serialize};
use std::error::Error as StdError;
use thiserror::Error;
use tracing::error;
@@ -15,13 +16,13 @@ pub enum ApiError {
Unauthorized(String),
#[error("NotFound: {0}")]
NotFound(anyhow::Error),
NotFound(Box<dyn StdError + Send + Sync + 'static>),
#[error("Conflict: {0}")]
Conflict(String),
#[error("Precondition failed: {0}")]
PreconditionFailed(&'static str),
PreconditionFailed(Box<str>),
#[error(transparent)]
InternalServerError(anyhow::Error),

View File

@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
min_lsn = min(min_lsn, lsn_range.start);
max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
updates.insert_historic(Arc::new(layer));
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
}
println!("min: {min_lsn}, max: {max_lsn}");
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
is_incremental: false,
short_id: format!("Layer {}", i),
};
updates.insert_historic(Arc::new(layer));
updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
}
updates.flush();
println!("Finished layer map init in {:?}", now.elapsed());

View File

@@ -335,31 +335,116 @@ fn start_pageserver(
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
// All tenant load operations carry this while they are ongoing; it will be dropped once those
// operations finish either successfully or in some other manner. However, the initial load
// will be then done, and we can start the global background tasks.
// Startup staging or optimizing:
//
// We want to minimize downtime for `page_service` connections, and trying not to overload
// BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
//
// init_done_rx will notify when all initial load operations have completed.
//
// background_jobs_can_start (same name used to hold off background jobs from starting at
// consumer side) will be dropped once we can start the background jobs. Currently it is behind
// completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
// (background_task_maximum_delay).
let (init_done_tx, init_done_rx) = utils::completion::channel();
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
let order = pageserver::InitializationOrder {
initial_tenant_load: Some(init_done_tx),
initial_logical_size_can_start: init_done_rx.clone(),
initial_logical_size_attempt: init_logical_size_done_tx,
background_jobs_can_start: background_jobs_barrier.clone(),
};
// Scan the local 'tenants/' directory and start loading the tenants
let init_started_at = std::time::Instant::now();
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
broker_client.clone(),
remote_storage.clone(),
(init_done_tx, init_done_rx.clone()),
order,
))?;
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx.clone();
async move {
init_done_rx.wait().await;
let init_done_rx = init_done_rx;
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
let elapsed = init_started_at.elapsed();
init_done_rx.wait().await;
// initial logical sizes can now start, as they were waiting on init_done_rx.
scopeguard::ScopeGuard::into_inner(guard);
let init_done = std::time::Instant::now();
let elapsed = init_done - init_started_at;
tracing::info!(
elapsed_millis = elapsed.as_millis(),
"Initial load completed."
"Initial load completed"
);
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
let timeout = conf.background_task_maximum_delay;
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
let init_sizes_done = tokio::select! {
_ = &mut init_sizes_done => {
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed"
);
None
}
_ = tokio::time::sleep(timeout) => {
tracing::info!(
timeout_millis = timeout.as_millis(),
"Initial logical size timeout elapsed; starting background jobs"
);
Some(init_sizes_done)
}
};
scopeguard::ScopeGuard::into_inner(guard);
// allow background jobs to start
drop(background_jobs_can_start);
if let Some(init_sizes_done) = init_sizes_done {
// ending up here is not a bug; at the latest logical sizes will be queried by
// consumption metrics.
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
init_sizes_done.await;
scopeguard::ScopeGuard::into_inner(guard);
let now = std::time::Instant::now();
tracing::info!(
from_init_done_millis = (now - init_done).as_millis(),
from_init_millis = (now - init_started_at).as_millis(),
"Initial logical sizes completed after timeout (background jobs already started)"
);
}
};
async move {
let mut drive_init = std::pin::pin!(drive_init);
// just race these tasks
tokio::select! {
_ = shutdown_pageserver.cancelled() => {},
_ = &mut drive_init => {},
}
}
});
@@ -374,7 +459,7 @@ fn start_pageserver(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
init_done_rx.clone(),
background_jobs_barrier.clone(),
)?;
}
@@ -412,7 +497,7 @@ fn start_pageserver(
);
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let init_done_rx = init_done_rx;
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
@@ -428,12 +513,17 @@ fn start_pageserver(
"consumption metrics collection",
true,
async move {
// first wait for initial load to complete before first iteration.
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
init_done_rx.wait().await;
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
pageserver::consumption_metrics::collect_metrics(
metric_collection_endpoint,
@@ -483,6 +573,8 @@ fn start_pageserver(
);
}
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
// All started up! Now just sit and wait for shutdown signal.
ShutdownSignals::handle(|signal| match signal {
Signal::Quit => {
@@ -498,6 +590,11 @@ fn start_pageserver(
"Got {}. Terminating gracefully in fast shutdown mode",
signal.name()
);
// This cancels the `shutdown_pageserver` cancellation tree.
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
unreachable!()
}

View File

@@ -63,6 +63,7 @@ pub mod defaults {
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
///
/// Default built-in configuration file.
@@ -91,9 +92,10 @@ pub mod defaults {
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -187,6 +189,15 @@ pub struct PageServerConf {
pub test_remote_failures: u64,
pub ondemand_download_behavior_treat_error_as_warn: bool,
/// How long will background tasks be delayed at most after initial load of tenants.
///
/// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
/// as we now isolate initial loading, initial logical size calculation and background tasks.
/// Smaller nodes will have background tasks "not running" for this long unless every timeline
/// has it's initial logical size calculated. Not running background tasks for some seconds is
/// not terrible.
pub background_task_maximum_delay: Duration,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -259,6 +270,8 @@ struct PageServerConfigBuilder {
test_remote_failures: BuilderValue<u64>,
ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
background_task_maximum_delay: BuilderValue<Duration>,
}
impl Default for PageServerConfigBuilder {
@@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder {
test_remote_failures: Set(0),
ondemand_download_behavior_treat_error_as_warn: Set(false),
background_task_maximum_delay: Set(humantime::parse_duration(
DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
)
.unwrap()),
}
}
}
@@ -440,6 +458,10 @@ impl PageServerConfigBuilder {
BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
}
pub fn background_task_maximum_delay(&mut self, delay: Duration) {
self.background_task_maximum_delay = BuilderValue::Set(delay);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
@@ -522,6 +544,9 @@ impl PageServerConfigBuilder {
.ok_or(anyhow!(
"missing ondemand_download_behavior_treat_error_as_warn"
))?,
background_task_maximum_delay: self
.background_task_maximum_delay
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
})
}
}
@@ -710,6 +735,7 @@ impl PageServerConf {
)
},
"ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
"background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -877,6 +903,7 @@ impl PageServerConf {
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::ZERO,
}
}
}
@@ -1036,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
synthetic_size_calculation_interval = '333 s'
log_format = 'json'
background_task_maximum_delay = '334 s'
"#;
@@ -1094,6 +1122,9 @@ log_format = 'json'
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: humantime::parse_duration(
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
)?,
},
"Correct defaults should be used when no config values are provided"
);
@@ -1148,6 +1179,7 @@ log_format = 'json'
disk_usage_based_eviction: None,
test_remote_failures: 0,
ondemand_download_behavior_treat_error_as_warn: false,
background_task_maximum_delay: Duration::from_secs(334),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -83,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
conf: &'static PageServerConf,
storage: GenericRemoteStorage,
state: Arc<State>,
init_done: completion::Barrier,
background_jobs_barrier: completion::Barrier,
) -> anyhow::Result<()> {
let Some(task_config) = &conf.disk_usage_based_eviction else {
info!("disk usage based eviction task not configured");
@@ -100,17 +100,16 @@ pub fn launch_disk_usage_global_eviction_task(
"disk usage based eviction",
false,
async move {
// wait until initial load is complete, because we cannot evict from loading tenants.
init_done.wait().await;
let cancel = task_mgr::shutdown_token();
disk_usage_eviction_task(
&state,
task_config,
storage,
&conf.tenants_path(),
task_mgr::shutdown_token(),
)
.await;
// wait until initial load is complete, because we cannot evict from loading tenants.
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => { }
};
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
.await;
info!("disk usage based eviction task finishing");
Ok(())
},
@@ -517,7 +516,7 @@ async fn collect_eviction_candidates(
if !tl.is_active() {
continue;
}
let info = tl.get_local_layers_for_disk_usage_eviction();
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers

View File

@@ -215,7 +215,7 @@ paths:
schema:
$ref: "#/components/schemas/NotFoundError"
"412":
description: Tenant is missing
description: Tenant is missing, or timeline has children
content:
application/json:
schema:
@@ -386,6 +386,7 @@ paths:
"202":
description: Tenant attaching scheduled
"400":
description: Bad Request
content:
application/json:
schema:
@@ -928,12 +929,28 @@ components:
writing to the tenant's S3 state, so, DO NOT ATTACH the
tenant to any other pageserver, or we risk split-brain.
- `attached` means that the attach operation has completed,
maybe successfully, maybe not. Perform a health check at
the Postgres level to determine healthiness of the tenant.
successfully
- `failed` means that attach has failed. For reason check corresponding `reason` failed.
`failed` is the terminal state, retrying attach call wont resolve the issue.
For example this can be caused by s3 being unreachable. The retry may be implemented
with call to detach, though it would be better to not automate it and inspec failed state
manually before proceeding with a retry.
See the tenant `/attach` endpoint for more information.
type: string
enum: [ "maybe", "attached" ]
type: object
required:
- slug
- data
properties:
slug:
type: string
enum: [ "maybe", "attached", "failed" ]
data:
type: object
properties:
reason:
type: string
TenantCreateRequest:
allOf:
- $ref: '#/components/schemas/TenantConfig'

View File

@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError {
match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
}
}
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
impl From<GetTenantError> for ApiError {
fn from(tse: GetTenantError) -> ApiError {
match tse {
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
e @ GetTenantError::NotActive(_) => {
// Why is this not `ApiError::NotFound`?
// Because we must be careful to never return 404 for a tenant if it does
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
fn from(e: SetNewTenantConfigError) -> ApiError {
match e {
SetNewTenantConfigError::GetTenant(tid) => {
ApiError::NotFound(anyhow!("tenant {}", tid))
ApiError::NotFound(anyhow!("tenant {}", tid).into())
}
e @ SetNewTenantConfigError::Persist(_) => {
ApiError::InternalServerError(anyhow::Error::new(e))
@@ -182,10 +182,11 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
use crate::tenant::DeleteTimelineError::*;
match value {
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
HasChildren => ApiError::BadRequest(anyhow::anyhow!(
"Cannot delete timeline which has child timelines"
)),
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
HasChildren(children) => ApiError::PreconditionFailed(
format!("Cannot delete timeline which has child timelines: {children:?}")
.into_boxed_str(),
),
Other(e) => ApiError::InternalServerError(e),
}
}
@@ -197,9 +198,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
match value {
// Report Precondition failed so client can distinguish between
// "tenant is missing" case from "timeline is missing"
Tenant(GetTenantError::NotFound(..)) => {
ApiError::PreconditionFailed("Requested tenant is missing")
}
Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
"Requested tenant is missing".to_owned().into_boxed_str(),
),
Tenant(t) => ApiError::from(t),
Timeline(t) => ApiError::from(t),
}
@@ -214,7 +215,7 @@ async fn build_timeline_info(
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
let mut info = build_timeline_info_common(timeline, ctx)?;
let mut info = build_timeline_info_common(timeline, ctx).await?;
if include_non_incremental_logical_size {
// XXX we should be using spawn_ondemand_logical_size_calculation here.
// Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -232,7 +233,7 @@ async fn build_timeline_info(
Ok(info)
}
fn build_timeline_info_common(
async fn build_timeline_info_common(
timeline: &Arc<Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
@@ -263,7 +264,7 @@ fn build_timeline_info_common(
None
}
};
let current_physical_size = Some(timeline.layer_size_sum());
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
@@ -329,6 +330,7 @@ async fn timeline_create_handler(
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
@@ -395,7 +397,7 @@ async fn timeline_detail_handler(
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline_info = build_timeline_info(
&timeline,
@@ -494,7 +496,8 @@ async fn timeline_delete_handler(
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
.await?;
json_response(StatusCode::OK, ())
// FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
json_response(StatusCode::ACCEPTED, ())
}
async fn tenant_detach_handler(
@@ -589,7 +592,7 @@ async fn tenant_status(
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
for timeline in tenant.list_timelines().iter() {
current_physical_size += timeline.layer_size_sum();
current_physical_size += timeline.layer_size_sum().await;
}
let state = tenant.current_state();
@@ -699,7 +702,7 @@ async fn layer_map_info_handler(
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let layer_map_info = timeline.layer_map_info(reset);
let layer_map_info = timeline.layer_map_info(reset).await;
json_response(StatusCode::OK, layer_map_info)
}
@@ -1058,7 +1061,7 @@ async fn timeline_download_remote_layers_handler_get(
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
json_response(StatusCode::OK, info)
}
@@ -1069,7 +1072,7 @@ async fn active_timeline_of_active_tenant(
let tenant = mgr::get_tenant(tenant_id, true).await?;
tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)
.map_err(|e| ApiError::NotFound(e.into()))
}
async fn always_panic_handler(

View File

@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
{
pg_control = Some(control_file);
}
modification.flush()?;
modification.flush().await?;
}
}
// We're done importing all the data files.
modification.commit()?;
modification.commit().await?;
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -148,17 +148,17 @@ async fn import_rel(
// because there is no guarantee about the order in which we are processing segments.
// ignore "relation already exists" error
//
// FIXME: use proper error type for this, instead of parsing the error message.
// Or better yet, keep track of which relations we've already created
// FIXME: Keep track of which relations we've already created?
// https://github.com/neondatabase/neon/issues/3309
if let Err(e) = modification
.put_rel_creation(rel, nblocks as u32, ctx)
.await
{
if e.to_string().contains("already exists") {
debug!("relation {} already exists. we must be extending it", rel);
} else {
return Err(e);
match e {
RelationError::AlreadyExists => {
debug!("Relation {} already exist. We must be extending it.", rel)
}
_ => return Err(e.into()),
}
}
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush()?;
modification.flush().await?;
}
tokio_tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
// sanity check: ensure that pg_control is loaded
let _pg_control = pg_control.context("pg_control file not found")?;
modification.commit()?;
modification.commit().await?;
Ok(())
}
@@ -594,7 +594,7 @@ async fn import_file(
// zenith.signal is not necessarily the last file, that we handle
// but it is ok to call `finish_write()`, because final `modification.commit()`
// will update lsn once more to the final one.
let writer = modification.tline.writer();
let writer = modification.tline.writer().await;
writer.finish_write(prev_lsn);
debug!("imported zenith signal {}", prev_lsn);

View File

@@ -58,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// the checkpoint and GC tasks.
tenant::mgr::shutdown_all_tenants().await;
// Stop syncing with remote storage.
//
// FIXME: Does this wait for the sync tasks to finish syncing what's queued up?
// Should it?
task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await;
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
@@ -138,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool {
}
}
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
/// blocking.
///
/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
/// delaying is needed.
#[derive(Clone)]
pub struct InitializationOrder {
/// Each initial tenant load task carries this until completion.
pub initial_tenant_load: Option<utils::completion::Completion>,
/// Barrier for when we can start initial logical size calculations.
pub initial_logical_size_can_start: utils::completion::Barrier,
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
/// attempt. It is important to drop this once the attempt has completed.
pub initial_logical_size_attempt: utils::completion::Completion,
/// Barrier for when we can start any background jobs.
///
/// This can be broken up later on, but right now there is just one class of a background job.
pub background_jobs_can_start: utils::completion::Barrier,
}
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;

View File

@@ -1,4 +1,3 @@
use metrics::core::{AtomicU64, GenericCounter};
use metrics::{
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
@@ -95,21 +94,19 @@ static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
});
// Metrics collected on operations on the storage repository.
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_getpage_reconstruct_seconds",
"Time spent in reconstruct_value",
&["tenant_id", "timeline_id"],
"Time spent in reconstruct_value (reconstruct a page from deltas)",
CRITICAL_OP_BUCKETS.into(),
)
.expect("failed to define a metric")
});
static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -124,11 +121,10 @@ static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
@@ -752,10 +748,7 @@ impl StorageTimeMetrics {
pub struct TimelineMetrics {
tenant_id: String,
timeline_id: String,
pub reconstruct_time_histo: Histogram,
pub get_reconstruct_data_time_histo: Histogram,
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
pub create_images_time_histo: StorageTimeMetrics,
@@ -783,15 +776,9 @@ impl TimelineMetrics {
) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
let compact_time_histo =
@@ -833,19 +820,18 @@ impl TimelineMetrics {
let read_num_fs_layers = READ_NUM_FS_LAYERS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
// TODO(chi): remove this once we remove Lazy for all metrics. Otherwise this will not appear in the exporter
// and integration test will error.
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
MATERIALIZED_PAGE_CACHE_HIT.get();
TimelineMetrics {
tenant_id,
timeline_id,
reconstruct_time_histo,
get_reconstruct_data_time_histo,
materialized_page_cache_hit_counter,
materialized_page_cache_hit_upon_request_counter,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
@@ -872,10 +858,7 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);

View File

@@ -390,7 +390,9 @@ impl PageServerHandler {
};
// Check that the timeline exists
let timeline = tenant.get_timeline(timeline_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
@@ -1230,6 +1232,6 @@ async fn get_active_tenant_timeline(
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(GetActiveTimelineError::Timeline)?;
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}

View File

@@ -43,6 +43,16 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
pub enum RelationError {
#[error("Relation Already Exists")]
AlreadyExists,
#[error("invalid relnode")]
InvalidRelnode,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
///
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
/// and other special kinds of files, in a versioned key-value store. The
@@ -101,9 +111,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(anyhow::anyhow!(
"invalid relnode"
)));
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
}
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
@@ -148,9 +158,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(anyhow::anyhow!(
"invalid relnode"
)));
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -193,9 +203,9 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
if tag.relnode == 0 {
return Err(PageReconstructError::Other(anyhow::anyhow!(
"invalid relnode"
)));
return Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
));
}
// first try to lookup relation in cache
@@ -699,6 +709,20 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
#[cfg(test)]
pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
self.init_empty()?;
self.put_control_file(bytes::Bytes::from_static(
b"control_file contents do not matter",
))
.context("put_control_file")?;
self.put_checkpoint(bytes::Bytes::from_static(
b"checkpoint_file contents do not matter",
))
.context("put_checkpoint_file")?;
Ok(())
}
/// Put a new page version that can be constructed from a WAL record
///
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
@@ -710,7 +734,7 @@ impl<'a> DatadirModification<'a> {
blknum: BlockNumber,
rec: NeonWalRecord,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
Ok(())
}
@@ -737,7 +761,7 @@ impl<'a> DatadirModification<'a> {
blknum: BlockNumber,
img: Bytes,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
Ok(())
}
@@ -861,32 +885,38 @@ impl<'a> DatadirModification<'a> {
rel: RelTag,
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
) -> Result<(), RelationError> {
if rel.relnode == 0 {
return Err(RelationError::AlreadyExists);
}
// It's possible that this is the first rel for this db in this
// tablespace. Create the reldir entry for it if so.
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
.context("deserialize db")?;
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
// Didn't exist. Update dbdir
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
let buf = DbDirectory::ser(&dbdir)?;
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
self.put(DBDIR_KEY, Value::Image(buf.into()));
// and create the RelDirectory
RelDirectory::default()
} else {
// reldir already exists, fetch it
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
.context("deserialize db")?
};
// Add the new relation to the rel directory entry, and write it back
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
anyhow::bail!("rel {rel} already exists");
return Err(RelationError::AlreadyExists);
}
self.put(
rel_dir_key,
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
Value::Image(Bytes::from(
RelDirectory::ser(&rel_dir).context("serialize")?,
)),
);
// Put size
@@ -911,7 +941,7 @@ impl<'a> DatadirModification<'a> {
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
let size_key = rel_size_to_key(rel);
@@ -942,7 +972,7 @@ impl<'a> DatadirModification<'a> {
nblocks: BlockNumber,
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
// Put size
let size_key = rel_size_to_key(rel);
@@ -963,7 +993,7 @@ impl<'a> DatadirModification<'a> {
/// Drop a relation.
pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
// Remove it from the directory entry
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
@@ -1108,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub fn flush(&mut self) -> anyhow::Result<()> {
pub async fn flush(&mut self) -> anyhow::Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
@@ -1116,19 +1146,20 @@ impl<'a> DatadirModification<'a> {
return Ok(());
}
let writer = self.tline.writer();
let writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let mut result: anyhow::Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
false
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value).await?;
} else {
true
retained_pending_updates.insert(key, value);
}
});
result?;
}
self.pending_updates.extend(retained_pending_updates);
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1143,17 +1174,17 @@ impl<'a> DatadirModification<'a> {
/// underlying timeline.
/// All the modifications in this atomic update are stamped by the specified LSN.
///
pub fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer();
pub async fn commit(&mut self) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value)?;
writer.put(key, lsn, &value).await?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn)?;
writer.delete(key_range, lsn).await?;
}
writer.finish_write(lsn);
@@ -1593,20 +1624,6 @@ fn is_slru_block_key(key: Key) -> bool {
&& key.field6 != 0xffffffff // and not SlruSegSize
}
#[cfg(test)]
pub fn create_test_timeline(
tenant: &crate::tenant::Tenant,
timeline_id: utils::id::TimelineId,
pg_version: u32,
ctx: &RequestContext,
) -> anyhow::Result<std::sync::Arc<Timeline>> {
let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;
m.commit()?;
Ok(tline)
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {

View File

@@ -257,6 +257,9 @@ pub enum TaskKind {
// task that handles attaching a tenant
Attach,
// Used mostly for background deletion from s3
TimelineDeletionWorker,
// task that handhes metrics collection
MetricsCollection,
@@ -476,18 +479,35 @@ pub async fn shutdown_tasks(
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
{
task.cancel.cancel();
victim_tasks.push(Arc::clone(task));
victim_tasks.push((
Arc::clone(task),
task.kind,
task_mut.tenant_id,
task_mut.timeline_id,
));
}
}
}
for task in victim_tasks {
let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
let join_handle = {
let mut task_mut = task.mutable.lock().unwrap();
task_mut.join_handle.take()
};
if let Some(mut join_handle) = join_handle {
if log_all {
if tenant_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
let completed = tokio::select! {
biased;
_ = &mut join_handle => { true },
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
// allow some time to elapse before logging to cut down the number of log

File diff suppressed because it is too large Load Diff

View File

@@ -38,8 +38,8 @@ pub mod defaults {
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
}

View File

@@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning;
use crate::repository::Key;
use crate::tenant::storage_layer::InMemoryLayer;
use crate::tenant::storage_layer::Layer;
use anyhow::Context;
use anyhow::Result;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::ops::Range;
use std::sync::Arc;
@@ -61,6 +63,8 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage;
pub use historic_layer_coverage::Replacement;
use super::storage_layer::range_eq;
use super::storage_layer::PersistentLayerDesc;
use super::storage_layer::PersistentLayerKey;
///
/// LayerMap tracks what layers exist on a timeline.
@@ -86,11 +90,16 @@ pub struct LayerMap<L: ?Sized> {
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
/// Index of the historic layers optimized for search
historic: BufferedHistoricLayerCoverage<Arc<L>>,
historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
/// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
/// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<L>>,
l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
/// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
/// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
/// RemoteLayer will be removed.
mapping: HashMap<PersistentLayerKey, Arc<L>>,
}
impl<L: ?Sized> Default for LayerMap<L> {
@@ -101,6 +110,7 @@ impl<L: ?Sized> Default for LayerMap<L> {
frozen_layers: VecDeque::default(),
l0_delta_layers: Vec::default(),
historic: BufferedHistoricLayerCoverage::default(),
mapping: HashMap::default(),
}
}
}
@@ -125,8 +135,9 @@ where
///
/// Insert an on-disk layer.
///
pub fn insert_historic(&mut self, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer)
// TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.insert_historic_noflush(layer_desc, layer)
}
///
@@ -134,8 +145,8 @@ where
///
/// This should be called when the corresponding file on disk has been deleted.
///
pub fn remove_historic(&mut self, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer)
pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.layer_map.remove_historic_noflush(layer_desc, layer)
}
/// Replaces existing layer iff it is the `expected`.
@@ -150,12 +161,15 @@ where
/// that we can replace values only by updating a hashmap.
pub fn replace_historic(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
self.layer_map.replace_historic_noflush(expected, new)
self.layer_map
.replace_historic_noflush(expected_desc, expected, new_desc, new)
}
// We will flush on drop anyway, but this method makes it
@@ -230,6 +244,7 @@ where
(None, None) => None,
(None, Some(image)) => {
let lsn_floor = image.get_lsn_range().start;
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor,
@@ -237,6 +252,7 @@ where
}
(Some(delta), None) => {
let lsn_floor = delta.get_lsn_range().start;
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -247,6 +263,7 @@ where
let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
let image_exact_match = img_lsn + 1 == end_lsn;
if image_is_newer || image_exact_match {
let image = self.get_layer_from_mapping(&image.key()).clone();
Some(SearchResult {
layer: image,
lsn_floor: img_lsn,
@@ -254,6 +271,7 @@ where
} else {
let lsn_floor =
std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
let delta = self.get_layer_from_mapping(&delta.key()).clone();
Some(SearchResult {
layer: delta,
lsn_floor,
@@ -273,16 +291,33 @@ where
///
/// Helper function for BatchedUpdates::insert_historic
///
pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
/// TODO(chi): remove L generic so that we do not need to pass layer object.
pub(self) fn insert_historic_noflush(
&mut self,
layer_desc: PersistentLayerDesc,
layer: Arc<L>,
) {
self.mapping.insert(layer_desc.key(), layer.clone());
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
self.historic.insert(
historic_layer_coverage::LayerKey::from(&*layer),
Arc::clone(&layer),
);
if Self::is_l0(&layer) {
self.l0_delta_layers.push(layer);
self.l0_delta_layers.push(layer_desc.clone().into());
}
self.historic.insert(
historic_layer_coverage::LayerKey::from(&*layer),
layer_desc.into(),
);
}
fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
let layer = self
.mapping
.get(key)
.with_context(|| format!("{key:?}"))
.expect("inconsistent layer mapping");
layer
}
///
@@ -290,14 +325,16 @@ where
///
/// Helper function for BatchedUpdates::remove_historic
///
pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
self.historic
.remove(historic_layer_coverage::LayerKey::from(&*layer));
if Self::is_l0(&layer) {
let len_before = self.l0_delta_layers.len();
self.l0_delta_layers
.retain(|other| !Self::compare_arced_layers(other, &layer));
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| {
!Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
});
self.l0_delta_layers = l0_delta_layers;
// this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
// there's a chance that the comparison fails at runtime due to it comparing (pointer,
// vtable) pairs.
@@ -307,11 +344,14 @@ where
"failed to locate removed historic layer from l0_delta_layers"
);
}
self.mapping.remove(&layer_desc.key());
}
pub(self) fn replace_historic_noflush(
&mut self,
expected_desc: PersistentLayerDesc,
expected: &Arc<L>,
new_desc: PersistentLayerDesc,
new: Arc<L>,
) -> anyhow::Result<Replacement<Arc<L>>> {
let key = historic_layer_coverage::LayerKey::from(&**expected);
@@ -332,10 +372,9 @@ where
let l0_index = if expected_l0 {
// find the index in case replace worked, we need to replace that as well
let pos = self
.l0_delta_layers
.iter()
.position(|slot| Self::compare_arced_layers(slot, expected));
let pos = self.l0_delta_layers.iter().position(|slot| {
Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
});
if pos.is_none() {
return Ok(Replacement::NotFound);
@@ -345,16 +384,28 @@ where
None
};
let replaced = self.historic.replace(&key, new.clone(), |existing| {
Self::compare_arced_layers(existing, expected)
let new_desc = Arc::new(new_desc);
let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
**existing == expected_desc
});
if let Replacement::Replaced { .. } = &replaced {
self.mapping.remove(&expected_desc.key());
self.mapping.insert(new_desc.key(), new);
if let Some(index) = l0_index {
self.l0_delta_layers[index] = new;
self.l0_delta_layers[index] = new_desc;
}
}
let replaced = match replaced {
Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
Replacement::NotFound => Replacement::NotFound,
Replacement::RemovalBuffered => Replacement::RemovalBuffered,
Replacement::Unexpected(x) => {
Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
}
};
Ok(replaced)
}
@@ -383,7 +434,7 @@ where
let start = key.start.to_i128();
let end = key.end.to_i128();
let layer_covers = |layer: Option<Arc<L>>| match layer {
let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
Some(layer) => layer.get_lsn_range().start >= lsn.start,
None => false,
};
@@ -404,7 +455,9 @@ where
}
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
self.historic.iter()
self.historic
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
}
///
@@ -436,14 +489,24 @@ where
// Loop through the change events and push intervals
for (change_key, change_val) in version.image_coverage.range(start..end) {
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
coverage.push((kr, current_val.take()));
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
current_key = change_key;
current_val = change_val.clone();
}
// Add the final interval
let kr = Key::from_i128(current_key)..Key::from_i128(end);
coverage.push((kr, current_val.take()));
coverage.push((
kr,
current_val
.take()
.map(|l| self.get_layer_from_mapping(&l.key()).clone()),
));
Ok(coverage)
}
@@ -532,7 +595,9 @@ where
let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath =
self.count_deltas(&kr, &lr, new_limit)?;
@@ -555,7 +620,9 @@ where
let lr = lsn.start..val.get_lsn_range().start;
if !kr.is_empty() {
let base_count = Self::is_reimage_worthy(&val, key) as usize;
let base_count =
Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
as usize;
let new_limit = limit.map(|l| l - base_count);
let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
max_stacked_deltas = std::cmp::max(
@@ -706,7 +773,11 @@ where
/// Return all L0 delta layers
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
Ok(self.l0_delta_layers.clone())
Ok(self
.l0_delta_layers
.iter()
.map(|x| self.get_layer_from_mapping(&x.key()).clone())
.collect())
}
/// debugging function to print out the contents of the layer map
@@ -809,12 +880,17 @@ mod tests {
let layer = LayerDescriptor::from(layer);
// same skeletan construction; see scenario below
let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
let new_version: Arc<dyn Layer> = Arc::new(layer);
let not_found = Arc::new(layer.clone());
let new_version = Arc::new(layer);
let mut map = LayerMap::default();
let res = map.batch_update().replace_historic(&not_found, new_version);
let res = map.batch_update().replace_historic(
not_found.get_persistent_layer_desc(),
&not_found,
new_version.get_persistent_layer_desc(),
new_version,
);
assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
}
@@ -823,8 +899,8 @@ mod tests {
let name = LayerFileName::from_str(layer_name).unwrap();
let skeleton = LayerDescriptor::from(name);
let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
let downloaded: Arc<dyn Layer> = Arc::new(skeleton);
let remote = Arc::new(skeleton.clone());
let downloaded = Arc::new(skeleton);
let mut map = LayerMap::default();
@@ -834,12 +910,18 @@ mod tests {
let expected_in_counts = (1, usize::from(expected_l0));
map.batch_update().insert_historic(remote.clone());
map.batch_update()
.insert_historic(remote.get_persistent_layer_desc(), remote.clone());
assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
let replaced = map
.batch_update()
.replace_historic(&remote, downloaded.clone())
.replace_historic(
remote.get_persistent_layer_desc(),
&remote,
downloaded.get_persistent_layer_desc(),
downloaded.clone(),
)
.expect("name derived attributes are the same");
assert!(
matches!(replaced, Replacement::Replaced { .. }),
@@ -847,11 +929,12 @@ mod tests {
);
assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
map.batch_update().remove_historic(downloaded.clone());
map.batch_update()
.remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
}
fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
let historic = map
.iter_historic_layers()
.filter(|x| LayerMap::compare_arced_layers(x, layer))

View File

@@ -0,0 +1,325 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations proceeding that snapshot. The file begins with a
//! header recording MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting the
//! startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//! like redundant image layer and preparing us for future compaction
//! strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
//! records all operations as logs, and therefore we can easily replay the
//! operations when recovering from crash, while ensuring those operations
//! are atomic upon restart.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! https://github.com/neondatabase/neon/issues/4418
use std::io::{self, Read, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::crc32c;
use serde::{Deserialize, Serialize};
use tracing::log::warn;
use utils::lsn::Lsn;
use super::storage_layer::PersistentLayerDesc;
pub struct Manifest {
file: VirtualFile,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct Snapshot {
pub layers: Vec<PersistentLayerDesc>,
}
/// serde by default encode this in tagged enum, and therefore it will be something
/// like `{ "AddLayer": { ... } }`.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Record {
AddLayer(PersistentLayerDesc),
RemoveLayer(PersistentLayerDesc),
}
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
const MANIFEST_VERSION: u64 = 1;
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct ManifestHeader {
magic_number: u64,
version: u64,
}
const MANIFEST_HEADER_LEN: usize = 16;
impl ManifestHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
buf.put_u64(self.magic_number);
buf.put_u64(self.version);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
Self {
magic_number: buf.get_u64(),
version: buf.get_u64(),
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Operation {
/// A snapshot of the current state.
///
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
Snapshot(Snapshot, Lsn),
/// An atomic operation that changes the state.
///
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
/// This will only change when new L0 is flushed to the disk.
Operation(Vec<Record>, Lsn),
}
struct RecordHeader {
size: u32,
checksum: u32,
}
const RECORD_HEADER_LEN: usize = 8;
impl RecordHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
buf.put_u32(self.size);
buf.put_u32(self.checksum);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
Self {
size: buf.get_u32(),
checksum: buf.get_u32(),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum ManifestLoadError {
#[error("manifest header is corrupted")]
CorruptedManifestHeader,
#[error("unsupported manifest version: got {0}, expected {1}")]
UnsupportedVersion(u64, u64),
#[error("error when decoding record: {0}")]
DecodeRecord(serde_json::Error),
#[error("I/O error: {0}")]
Io(io::Error),
}
#[must_use = "Should check if the manifest is partially corrupted"]
pub struct ManifestPartiallyCorrupted(bool);
impl Manifest {
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
let mut manifest = Self { file };
manifest.append_manifest_header(ManifestHeader {
magic_number: MANIFEST_MAGIC_NUMBER,
version: MANIFEST_VERSION,
})?;
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
Ok(manifest)
}
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
/// backup the current one.
pub fn load(
mut file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
if buf.remaining() < MANIFEST_HEADER_LEN {
return Err(ManifestLoadError::CorruptedManifestHeader);
}
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
buf.advance(MANIFEST_HEADER_LEN);
if header.version != MANIFEST_VERSION {
return Err(ManifestLoadError::UnsupportedVersion(
header.version,
MANIFEST_VERSION,
));
}
// Read operations
let mut operations = Vec::new();
let corrupted = loop {
if buf.remaining() == 0 {
break false;
}
if buf.remaining() < RECORD_HEADER_LEN {
warn!("incomplete header when decoding manifest, could be corrupted");
break true;
}
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
let size = size as usize;
buf.advance(RECORD_HEADER_LEN);
if buf.remaining() < size {
warn!("incomplete data when decoding manifest, could be corrupted");
break true;
}
let data = &buf[..size];
if crc32c(data) != checksum {
warn!("checksum mismatch when decoding manifest, could be corrupted");
break true;
}
// if the following decode fails, we cannot use the manifest or safely ignore any record.
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
buf.advance(size);
};
Ok((
Self { file },
operations,
ManifestPartiallyCorrupted(corrupted),
))
}
fn append_data(&mut self, data: &[u8]) -> Result<()> {
if data.len() >= u32::MAX as usize {
panic!("data too large");
}
let header = RecordHeader {
size: data.len() as u32,
checksum: crc32c(data),
};
let header = header.encode();
self.file.write_all(&header)?;
self.file.write_all(data)?;
self.file.sync_all()?;
Ok(())
}
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
let encoded = header.encode();
self.file.write_all(&encoded)?;
Ok(())
}
/// Add an operation to the manifest. The operation will be appended to the end of the file,
/// and the file will fsync.
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
let encoded = Vec::from(serde_json::to_string(&operation)?);
self.append_data(&encoded)
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use crate::repository::Key;
use super::*;
#[test]
fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
// Write a manifest with a snapshot and some operations
let snapshot = Snapshot {
layers: vec![layer1, layer2],
};
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
manifest
.append_operation(Operation::Operation(
vec![Record::AddLayer(layer3.clone())],
Lsn::from(1),
))
.unwrap();
drop(manifest);
// Open the second time and write
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
&operations[0],
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
);
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
manifest
.append_operation(Operation::Operation(
vec![
Record::RemoveLayer(layer3.clone()),
Record::AddLayer(layer4.clone()),
],
Lsn::from(2),
))
.unwrap();
drop(manifest);
// Open the third time and verify
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
assert_eq!(
&operations[2],
&Operation::Operation(
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
Lsn::from(2)
)
);
}
}

View File

@@ -20,12 +20,9 @@ use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::{
create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState,
};
use crate::IGNORED_TENANT_FILE_NAME;
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::completion;
use utils::fs_ext::PathExt;
use utils::id::{TenantId, TimelineId};
@@ -67,7 +64,7 @@ pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_done: (completion::Completion, completion::Barrier),
init_order: InitializationOrder,
) -> anyhow::Result<()> {
// Scan local filesystem for attached tenants
let tenants_dir = conf.tenants_path();
@@ -124,7 +121,7 @@ pub async fn init_tenant_mgr(
&tenant_dir_path,
broker_client.clone(),
remote_storage.clone(),
Some(init_done.clone()),
Some(init_order.clone()),
&ctx,
) {
Ok(tenant) => {
@@ -160,7 +157,7 @@ pub fn schedule_local_tenant_processing(
tenant_path: &Path,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: Option<GenericRemoteStorage>,
init_done: Option<(completion::Completion, completion::Barrier)>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
@@ -219,7 +216,7 @@ pub fn schedule_local_tenant_processing(
tenant_id,
broker_client,
remote_storage,
init_done,
init_order,
ctx,
)
};
@@ -253,46 +250,28 @@ pub async fn shutdown_all_tenants() {
tenants_clone
}
TenantsMap::ShuttingDown(_) => {
// TODO: it is possible that detach and shutdown happen at the same time. as a
// result, during shutdown we do not wait for detach.
error!("already shutting down, this function isn't supposed to be called more than once");
return;
}
}
};
// Set tenant (and its timlines) to Stoppping state.
//
// Since we can only transition into Stopping state after activation is complete,
// run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed.
//
// Transitioning tenants to Stopping state has a couple of non-obvious side effects:
// 1. Lock out any new requests to the tenants.
// 2. Signal cancellation to WAL receivers (we wait on it below).
// 3. Signal cancellation for other tenant background loops.
// 4. ???
//
// The waiting for the cancellation is not done uniformly.
// We certainly wait for WAL receivers to shut down.
// That is necessary so that no new data comes in before the freeze_and_flush.
// But the tenant background loops are joined-on in our caller.
// It's mesed up.
let mut join_set = JoinSet::new();
let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len());
for (tenant_id, tenant) in tenants_to_shut_down {
join_set.spawn(
async move {
match tenant.set_stopping().await {
let freeze_and_flush = true;
match tenant.shutdown(freeze_and_flush).await {
Ok(()) => debug!("tenant successfully stopped"),
Err(SetStoppingError::Broken) => {
info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well");
},
Err(SetStoppingError::AlreadyStopping) => {
// our task_mgr::shutdown_tasks are going to coalesce on that just fine
Err(super::ShutdownError::AlreadyStopping) => {
warn!("tenant was already shutting down")
}
}
tenant
}
.instrument(info_span!("set_stopping", %tenant_id)),
.instrument(info_span!("shutdown", %tenant_id)),
);
}
@@ -300,6 +279,7 @@ pub async fn shutdown_all_tenants() {
while let Some(res) = join_set.join_next().await {
match res {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures");
}
@@ -310,50 +290,11 @@ pub async fn shutdown_all_tenants() {
Err(join_error) => {
warn!("unknown kind of JoinError: {join_error}");
}
Ok(tenant) => tenants_to_freeze_and_flush.push(tenant),
}
}
if panicked > 0 {
warn!(panicked, "observed panicks while stopping tenants");
}
// Shut down all existing walreceiver connections and stop accepting the new ones.
task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await;
// Ok, no background tasks running anymore. Flush any remaining data in
// memory to disk.
//
// We assume that any incoming connections that might request pages from
// the tenant have already been terminated by the caller, so there
// should be no more activity in any of the repositories.
//
// On error, log it but continue with the shutdown for other tenants.
let mut join_set = tokio::task::JoinSet::new();
for tenant in tenants_to_freeze_and_flush {
let tenant_id = tenant.tenant_id();
join_set.spawn(
async move {
if let Err(err) = tenant.freeze_and_flush().await {
warn!("Could not checkpoint tenant during shutdown: {err:?}");
}
}
.instrument(info_span!("freeze_and_flush", %tenant_id)),
);
}
while let Some(next) = join_set.join_next().await {
match next {
Ok(()) => {}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("no cancelling")
}
Err(join_error) if join_error.is_panic() => { /* reported already */ }
Err(join_error) => warn!("unknown kind of JoinError: {join_error}"),
}
warn!(panicked, "observed panicks while shutting down tenants");
}
}
@@ -455,7 +396,9 @@ pub async fn delete_timeline(
ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
let tenant = get_tenant(tenant_id, true).await?;
tenant.delete_timeline(timeline_id, ctx).await?;
tenant
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
.await?;
Ok(())
}
@@ -669,35 +612,26 @@ where
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to
// avoid holding the lock for the entire process.
{
let tenants_accessor = TENANTS.write().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => {
let tenant = Arc::clone(tenant);
// don't hold TENANTS lock while set_stopping waits for activation to finish
drop(tenants_accessor);
match tenant.set_stopping().await {
Ok(()) => {
// we won, continue stopping procedure
}
Err(SetStoppingError::Broken) => {
// continue the procedure, let's hope the closure can deal with broken tenants
}
Err(SetStoppingError::AlreadyStopping) => {
// the tenant is already stopping or broken, don't do anything
return Err(TenantStateError::IsStopping(tenant_id));
}
}
}
None => return Err(TenantStateError::NotFound(tenant_id)),
let tenant = {
TENANTS
.write()
.await
.get(&tenant_id)
.cloned()
.ok_or(TenantStateError::NotFound(tenant_id))?
};
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match tenant.shutdown(freeze_and_flush).await {
Ok(()) => {}
Err(super::ShutdownError::AlreadyStopping) => {
return Err(TenantStateError::IsStopping(tenant_id))
}
}
// shutdown all tenant and timeline tasks: gc, compaction, page service)
// No new tasks will be started for this tenant because it's in `Stopping` state.
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
match tenant_cleanup
.await
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
@@ -741,7 +675,7 @@ pub async fn immediate_gc(
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
@@ -790,11 +724,11 @@ pub async fn immediate_compact(
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
.map_err(|e| ApiError::NotFound(e.into()))?;
// Run in task_mgr to avoid race with tenant_detach operation
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);

View File

@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
pub use download::{is_temp_download_file, list_remote_timelines};
use scopeguard::ScopeGuard;
use std::collections::{HashMap, VecDeque};
use std::path::Path;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use remote_storage::{DownloadError, GenericRemoteStorage};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use std::ops::DerefMut;
use tokio::runtime::Runtime;
use tracing::{debug, error, info, warn};
use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
@@ -225,7 +227,9 @@ use crate::metrics::{
RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
};
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
use crate::tenant::upload_queue::Delete;
use crate::{
config::PageServerConf,
task_mgr,
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted,
Deleted(IndexPart),
}
/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
Ok(())
}
/// Initialize the queue in stopped state. Used in startup path
/// to continue deletion operation interrupted by pageserver crash or restart.
pub fn init_upload_queue_stopped_to_continue_deletion(
&self,
index_part: &IndexPart,
) -> anyhow::Result<()> {
// FIXME: consider newtype for DeletedIndexPart.
let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
"bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
))?;
{
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(index_part)?;
self.update_remote_physical_size_gauge(Some(index_part));
}
// also locks upload queue, without dropping the guard above it will be a deadlock
self.stop().expect("initialized line above");
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue
.stopped_mut()
.expect("stopped above")
.deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
Ok(())
}
pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
match &*self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
UploadQueue::Stopped(q) => {
Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
}
}
}
@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
.await?;
if index_part.deleted_at.is_some() {
Ok(MaybeDeletedIndexPart::Deleted)
Ok(MaybeDeletedIndexPart::Deleted(index_part))
} else {
Ok(MaybeDeletedIndexPart::IndexPart(index_part))
}
@@ -622,7 +657,11 @@ impl RemoteTimelineClient {
// schedule the actual deletions
for name in names {
let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: false,
});
self.calls_unfinished_metric_begin(&op);
upload_queue.queued_operations.push_back(op);
info!("scheduled layer file deletion {}", name.file_name());
@@ -639,18 +678,11 @@ impl RemoteTimelineClient {
/// Wait for all previously scheduled uploads/deletions to complete
///
pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
let (sender, mut receiver) = tokio::sync::watch::channel(());
let barrier_op = UploadOp::Barrier(sender);
{
let mut receiver = {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
upload_queue.queued_operations.push_back(barrier_op);
// Don't count this kind of operation!
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
}
self.schedule_barrier(upload_queue)
};
if receiver.changed().await.is_err() {
anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -658,6 +690,22 @@ impl RemoteTimelineClient {
Ok(())
}
fn schedule_barrier(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
) -> tokio::sync::watch::Receiver<()> {
let (sender, receiver) = tokio::sync::watch::channel(());
let barrier_op = UploadOp::Barrier(sender);
upload_queue.queued_operations.push_back(barrier_op);
// Don't count this kind of operation!
// Launch the task immediately, if possible
self.launch_queued_tasks(upload_queue);
receiver
}
/// Set the deleted_at field in the remote index file.
///
/// This fails if the upload queue has not been `stop()`ed.
@@ -665,6 +713,7 @@ impl RemoteTimelineClient {
/// The caller is responsible for calling `stop()` AND for waiting
/// for any ongoing upload tasks to finish after `stop()` has succeeded.
/// Check method [`RemoteTimelineClient::stop`] for details.
#[instrument(skip_all)]
pub(crate) async fn persist_index_part_with_deleted_flag(
self: &Arc<Self>,
) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -674,15 +723,7 @@ impl RemoteTimelineClient {
// We must be in stopped state because otherwise
// we can have inprogress index part upload that can overwrite the file
// with missing is_deleted flag that we going to set below
let stopped = match &mut *locked {
UploadQueue::Uninitialized => {
return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
}
UploadQueue::Initialized(_) => {
return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
}
UploadQueue::Stopped(stopped) => stopped,
};
let stopped = locked.stopped_mut()?;
match stopped.deleted_at {
SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -696,48 +737,34 @@ impl RemoteTimelineClient {
let deleted_at = Utc::now().naive_utc();
stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
let mut index_part = IndexPart::new(
stopped.latest_files.clone(),
stopped.last_uploaded_consistent_lsn,
stopped
.latest_metadata
.to_bytes()
.context("serialize metadata")?,
);
let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
.context("IndexPart serialize")?;
index_part.deleted_at = Some(deleted_at);
index_part
};
let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
let mut locked = self_clone.upload_queue.lock().unwrap();
let stopped = match &mut *locked {
UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
"there's no way out of Stopping, and we checked it's Stopping above: {:?}",
locked.as_str(),
),
UploadQueue::Stopped(stopped) => stopped,
};
let stopped = locked
.stopped_mut()
.expect("there's no way out of Stopping, and we checked it's Stopping above");
stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
});
// Have a failpoint that can use the `pause` failpoint action.
// We don't want to block the executor thread, hence, spawn_blocking + await.
#[cfg(feature = "testing")]
tokio::task::spawn_blocking({
let current = tracing::Span::current();
move || {
let _entered = current.entered();
tracing::info!(
"at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
);
fail::fail_point!(
"persist_index_part_with_deleted_flag_after_set_before_upload_pause"
);
}
})
.await
.expect("spawn_blocking");
if cfg!(feature = "testing") {
tokio::task::spawn_blocking({
let current = tracing::Span::current();
move || {
let _entered = current.entered();
tracing::info!("at failpoint persist_deleted_index_part");
fail::fail_point!("persist_deleted_index_part");
}
})
.await
.expect("spawn_blocking");
}
upload::upload_index_part(
self.conf,
&self.storage_impl,
@@ -751,13 +778,10 @@ impl RemoteTimelineClient {
ScopeGuard::into_inner(undo_deleted_at);
{
let mut locked = self.upload_queue.lock().unwrap();
let stopped = match &mut *locked {
UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
"there's no way out of Stopping, and we checked it's Stopping above: {:?}",
locked.as_str(),
),
UploadQueue::Stopped(stopped) => stopped,
};
let stopped = locked
.stopped_mut()
.expect("there's no way out of Stopping, and we checked it's Stopping above");
stopped.deleted_at = SetDeletedFlagProgress::Successful(
index_part_with_deleted_at
.deleted_at
@@ -768,6 +792,92 @@ impl RemoteTimelineClient {
Ok(())
}
/// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
/// The function deletes layer files one by one, then lists the prefix to see if we leaked something
/// deletes leaked files if any and proceeds with deletion of index file at the end.
pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
let (mut receiver, deletions_queued) = {
let mut deletions_queued = 0;
let mut locked = self.upload_queue.lock().unwrap();
let stopped = locked.stopped_mut()?;
if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
anyhow::bail!("deleted_at is not set")
}
debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
stopped
.upload_queue_for_deletion
.queued_operations
.reserve(stopped.upload_queue_for_deletion.latest_files.len());
// schedule the actual deletions
for name in stopped.upload_queue_for_deletion.latest_files.keys() {
let op = UploadOp::Delete(Delete {
file_kind: RemoteOpFileKind::Layer,
layer_file_name: name.clone(),
scheduled_from_timeline_delete: true,
});
self.calls_unfinished_metric_begin(&op);
stopped
.upload_queue_for_deletion
.queued_operations
.push_back(op);
info!("scheduled layer file deletion {}", name.file_name());
deletions_queued += 1;
}
self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
(
self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
deletions_queued,
)
};
receiver.changed().await?;
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
let remaining = self
.storage_impl
.list_prefixes(Some(&timeline_storage_path))
.await?;
let remaining: Vec<RemotePath> = remaining
.into_iter()
.filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
.collect();
if !remaining.is_empty() {
warn!(
"Found {} files not bound to index_file.json, proceeding with their deletion",
remaining.len()
);
for file in remaining {
warn!("Removing {}", file.object_name().unwrap_or_default());
self.storage_impl.delete(&file).await?;
}
}
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
debug!("deleting index part");
self.storage_impl.delete(&index_file_path).await?;
info!(deletions_queued, "done deleting, including index_part.json");
Ok(())
}
///
/// Pick next tasks from the queue, and start as many of them as possible without violating
/// the ordering constraints.
@@ -786,7 +896,7 @@ impl RemoteTimelineClient {
// have finished.
upload_queue.inprogress_tasks.is_empty()
}
UploadOp::Delete(_, _) => {
UploadOp::Delete(_) => {
// Wait for preceding uploads to finish. Concurrent deletions are OK, though.
upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
}
@@ -817,7 +927,7 @@ impl RemoteTimelineClient {
UploadOp::UploadMetadata(_, _) => {
upload_queue.num_inprogress_metadata_uploads += 1;
}
UploadOp::Delete(_, _) => {
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions += 1;
}
UploadOp::Barrier(sender) => {
@@ -891,7 +1001,6 @@ impl RemoteTimelineClient {
unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
}
}
self.calls_unfinished_metric_end(&task.op);
return;
}
@@ -937,16 +1046,16 @@ impl RemoteTimelineClient {
}
res
}
UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
UploadOp::Delete(delete) => {
let path = &self
.conf
.timeline_path(&self.timeline_id, &self.tenant_id)
.join(layer_file_name.file_name());
.join(delete.layer_file_name.file_name());
delete::delete_layer(self.conf, &self.storage_impl, path)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
*metric_file_kind,
delete.file_kind,
RemoteOpKind::Delete,
Arc::clone(&self.metrics),
)
@@ -1012,11 +1121,24 @@ impl RemoteTimelineClient {
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
let upload_queue = match upload_queue_guard.deref_mut() {
UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
UploadQueue::Stopped(_) => {
UploadQueue::Stopped(stopped) => {
// Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
// then stop() took care of it so we just return.
// For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
match &task.op {
UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
_ => None
}
},
UploadQueue::Initialized(qi) => { Some(qi) }
};
let upload_queue = match upload_queue {
Some(upload_queue) => upload_queue,
None => {
info!("another concurrent task already stopped the queue");
return;
}, // nothing to do
UploadQueue::Initialized(qi) => { qi }
}
};
upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1029,7 +1151,7 @@ impl RemoteTimelineClient {
upload_queue.num_inprogress_metadata_uploads -= 1;
upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
}
UploadOp::Delete(_, _) => {
UploadOp::Delete(_) => {
upload_queue.num_inprogress_deletions -= 1;
}
UploadOp::Barrier(_) => unreachable!(),
@@ -1063,8 +1185,8 @@ impl RemoteTimelineClient {
reason: "metadata uploads are tiny",
},
),
UploadOp::Delete(file_kind, _) => (
*file_kind,
UploadOp::Delete(delete) => (
delete.file_kind,
RemoteOpKind::Delete,
DontTrackSize {
reason: "should we track deletes? positive or negative sign?",
@@ -1111,32 +1233,36 @@ impl RemoteTimelineClient {
info!("another concurrent task already shut down the queue");
Ok(())
}
UploadQueue::Initialized(UploadQueueInitialized {
latest_files,
latest_metadata,
last_uploaded_consistent_lsn,
..
}) => {
UploadQueue::Initialized(initialized) => {
info!("shutting down upload queue");
// Replace the queue with the Stopped state, taking ownership of the old
// Initialized queue. We will do some checks on it, and then drop it.
let qi = {
// take or clone what we need
let latest_files = std::mem::take(latest_files);
let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
// this could be Copy
let latest_metadata = latest_metadata.clone();
let stopped = UploadQueueStopped {
latest_files,
last_uploaded_consistent_lsn,
latest_metadata,
deleted_at: SetDeletedFlagProgress::NotRunning,
// Here we preserve working version of the upload queue for possible use during deletions.
// In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
// but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
// Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
let upload_queue_for_deletion = UploadQueueInitialized {
task_counter: 0,
latest_files: initialized.latest_files.clone(),
latest_files_changes_since_metadata_upload_scheduled: 0,
latest_metadata: initialized.latest_metadata.clone(),
last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::default(),
queued_operations: VecDeque::default(),
};
let upload_queue =
std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
let upload_queue = std::mem::replace(
&mut *guard,
UploadQueue::Stopped(UploadQueueStopped {
upload_queue_for_deletion,
deleted_at: SetDeletedFlagProgress::NotRunning,
}),
);
if let UploadQueue::Initialized(qi) = upload_queue {
qi
} else {
@@ -1144,8 +1270,6 @@ impl RemoteTimelineClient {
}
};
assert!(qi.latest_files.is_empty(), "do not use this anymore");
// consistency check
assert_eq!(
qi.num_inprogress_layer_uploads
@@ -1243,7 +1367,7 @@ mod tests {
struct TestSetup {
runtime: &'static tokio::runtime::Runtime,
entered_runtime: EnterGuard<'static>,
harness: TenantHarness<'static>,
harness: TenantHarness,
tenant: Arc<Tenant>,
tenant_ctx: RequestContext,
remote_fs_dir: PathBuf,
@@ -1264,7 +1388,12 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = runtime.block_on(harness.load());
// create an empty timeline directory
let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
let _ = runtime.block_on(tenant.create_test_timeline(
TIMELINE_ID,
Lsn(8),
DEFAULT_PG_VERSION,
&ctx,
))?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
@@ -1408,7 +1537,7 @@ mod tests {
// Download back the index.json, and check that the list of files is correct
let index_part = match runtime.block_on(client.download_index_file())? {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
};
assert_file_list(

View File

@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::bin_ser::SerializeError;
use crate::tenant::metadata::TimelineMetadata;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::upload_queue::UploadQueueInitialized;
use utils::lsn::Lsn;
@@ -115,6 +117,21 @@ impl IndexPart {
}
}
impl TryFrom<&UploadQueueInitialized> for IndexPart {
type Error = SerializeError;
fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
Ok(Self::new(
upload_queue.latest_files.clone(),
disk_consistent_lsn,
metadata_bytes,
))
}
}
/// Serialized form of [`LayerFileMetadata`].
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
pub struct IndexLayerMetadata {

View File

@@ -38,7 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
pub use image_layer::{ImageLayer, ImageLayerWriter};
pub use inmemory_layer::InMemoryLayer;
pub use layer_desc::PersistentLayerDesc;
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use remote_layer::RemoteLayer;
use super::layer_map::BatchedUpdates;
@@ -389,10 +389,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
}
/// Returned by [`Layer::iter`]
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
/// Returned by [`Layer::key_iter`]
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
/// range of LSNs.
@@ -454,7 +454,9 @@ pub trait PersistentLayer: Layer {
///
/// Should not change over the lifetime of the layer object because
/// current_physical_size is computed as the som of this value.
fn file_size(&self) -> u64;
fn file_size(&self) -> u64 {
self.layer_desc().file_size
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
@@ -483,6 +485,20 @@ pub struct LayerDescriptor {
pub short_id: String,
}
impl LayerDescriptor {
/// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
/// and the tenant / timeline id does not matter.
pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
self.key.clone(),
self.lsn.clone(),
233,
)
}
}
impl Layer for LayerDescriptor {
fn get_key_range(&self) -> Range<Key> {
self.key.clone()

View File

@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
use crate::{walrecord, TEMP_FILE_SUFFIX};
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use once_cell::sync::OnceCell;
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
@@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom};
use std::ops::Range;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
use tracing::*;
use utils::{
@@ -182,11 +182,9 @@ pub struct DeltaLayer {
pub desc: PersistentLayerDesc,
pub file_size: u64,
access_stats: LayerAccessStats,
inner: RwLock<DeltaLayerInner>,
inner: OnceCell<DeltaLayerInner>,
}
impl std::fmt::Debug for DeltaLayer {
@@ -196,28 +194,24 @@ impl std::fmt::Debug for DeltaLayer {
f.debug_struct("DeltaLayer")
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
.field("lsn_range", &self.desc.lsn_range)
.field("file_size", &self.file_size)
.field("file_size", &self.desc.file_size)
.field("inner", &self.inner)
.finish()
}
}
pub struct DeltaLayerInner {
/// If false, the fields below have not been loaded into memory yet.
loaded: bool,
// values copied from summary
index_start_blk: u32,
index_root_blk: u32,
/// Reader object for reading blocks from the file. (None if not loaded yet)
file: Option<FileBlockReader<VirtualFile>>,
/// Reader object for reading blocks from the file.
file: FileBlockReader<VirtualFile>,
}
impl std::fmt::Debug for DeltaLayerInner {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DeltaLayerInner")
.field("loaded", &self.loaded)
.field("index_start_blk", &self.index_start_blk)
.field("index_root_blk", &self.index_root_blk)
.finish()
@@ -248,7 +242,7 @@ impl Layer for DeltaLayer {
inner.index_start_blk, inner.index_root_blk
);
let file = inner.file.as_ref().unwrap();
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -317,7 +311,7 @@ impl Layer for DeltaLayer {
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
// Scan the page versions backwards, starting from `lsn`.
let file = inner.file.as_ref().unwrap();
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -439,10 +433,6 @@ impl PersistentLayer for DeltaLayer {
Ok(())
}
fn file_size(&self) -> u64 {
self.file_size
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
@@ -451,7 +441,7 @@ impl PersistentLayer for DeltaLayer {
HistoricLayerInfo::Delta {
layer_file_name,
layer_file_size: self.file_size,
layer_file_size: self.desc.file_size,
lsn_start: lsn_range.start,
lsn_end: lsn_range.end,
remote: false,
@@ -506,51 +496,22 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<RwLockReadGuard<DeltaLayerInner>> {
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
self.access_stats
.record_access(access_kind, ctx.task_kind());
loop {
// Quick exit if already loaded
let inner = self.inner.read().unwrap();
if inner.loaded {
return Ok(inner);
}
// Need to open the file and load the metadata. Upgrade our lock to
// a write lock. (Or rather, release and re-lock in write mode.)
drop(inner);
let inner = self.inner.write().unwrap();
if !inner.loaded {
self.load_inner(inner).with_context(|| {
format!("Failed to load delta layer {}", self.path().display())
})?;
} else {
// Another thread loaded it while we were not holding the lock.
}
// We now have the file open and loaded. There's no function to do
// that in the std library RwLock, so we have to release and re-lock
// in read mode. (To be precise, the lock guard was moved in the
// above call to `load_inner`, so it's already been released). And
// while we do that, another thread could unload again, so we have
// to re-check and retry if that happens.
}
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner())
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
}
fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
fn load_inner(&self) -> Result<DeltaLayerInner> {
let path = self.path();
// Open the file if it's not open already.
if inner.file.is_none() {
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
inner.file = Some(FileBlockReader::new(file));
}
let file = inner.file.as_mut().unwrap();
let file = VirtualFile::open(&path)
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0)?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -577,13 +538,13 @@ impl DeltaLayer {
}
}
inner.index_start_blk = actual_summary.index_start_blk;
inner.index_root_blk = actual_summary.index_root_blk;
debug!("loaded from {}", &path.display());
inner.loaded = true;
Ok(())
Ok(DeltaLayerInner {
file,
index_start_blk: actual_summary.index_start_blk,
index_root_blk: actual_summary.index_root_blk,
})
}
/// Create a DeltaLayer struct representing an existing file on disk.
@@ -602,15 +563,10 @@ impl DeltaLayer {
timeline_id,
filename.key_range.clone(),
filename.lsn_range.clone(),
file_size,
),
file_size,
access_stats,
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
inner: once_cell::sync::OnceCell::new(),
}
}
@@ -634,15 +590,10 @@ impl DeltaLayer {
summary.timeline_id,
summary.key_range,
summary.lsn_range,
metadata.len(),
),
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk: 0,
index_root_blk: 0,
}),
inner: once_cell::sync::OnceCell::new(),
})
}
@@ -803,15 +754,10 @@ impl DeltaLayerWriterInner {
self.timeline_id,
self.key_start..key_end,
self.lsn_range.clone(),
metadata.len(),
),
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(DeltaLayerInner {
loaded: false,
file: None,
index_start_blk,
index_root_blk,
}),
inner: once_cell::sync::OnceCell::new(),
};
// fsync the file
@@ -946,13 +892,13 @@ struct DeltaValueIter<'a> {
reader: BlockCursor<Adapter<'a>>,
}
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
struct Adapter<'a>(&'a DeltaLayerInner);
impl<'a> BlockReader for Adapter<'a> {
type BlockLease = PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
self.0.file.as_ref().unwrap().read_blk(blknum)
self.0.file.read_blk(blknum)
}
}
@@ -965,8 +911,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
}
impl<'a> DeltaValueIter<'a> {
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -1039,8 +985,8 @@ impl Iterator for DeltaKeyIter {
}
impl<'a> DeltaKeyIter {
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
let file = inner.file.as_ref().unwrap();
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
let file = &inner.file;
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
inner.index_start_blk,
inner.index_root_blk,
@@ -1080,3 +1026,21 @@ impl<'a> DeltaKeyIter {
Ok(iter)
}
}
#[cfg(test)]
mod test {
use super::DeltaKeyIter;
use super::DeltaLayer;
use super::DeltaValueIter;
// We will soon need the iters to be send in the compaction code.
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
// Cf https://github.com/neondatabase/neon/issues/4471
#[test]
fn is_send() {
fn assert_send<T: Send>() {}
assert_send::<DeltaLayer>();
assert_send::<DeltaValueIter>();
assert_send::<DeltaKeyIter>();
}
}

View File

@@ -109,8 +109,6 @@ pub struct ImageLayer {
// This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
pub lsn: Lsn,
pub file_size: u64,
access_stats: LayerAccessStats,
inner: RwLock<ImageLayerInner>,
@@ -122,7 +120,7 @@ impl std::fmt::Debug for ImageLayer {
f.debug_struct("ImageLayer")
.field("key_range", &RangeDisplayDebug(&self.desc.key_range))
.field("file_size", &self.file_size)
.field("file_size", &self.desc.file_size)
.field("lsn", &self.lsn)
.field("inner", &self.inner)
.finish()
@@ -258,17 +256,13 @@ impl PersistentLayer for ImageLayer {
Ok(())
}
fn file_size(&self) -> u64 {
self.file_size
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
HistoricLayerInfo::Image {
layer_file_name,
layer_file_size: self.file_size,
layer_file_size: self.desc.file_size,
lsn_start: lsn_range.start,
remote: false,
access_stats: self.access_stats.as_api_model(reset),
@@ -411,9 +405,9 @@ impl ImageLayer {
filename.key_range.clone(),
filename.lsn,
false,
file_size,
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: filename.lsn,
file_size,
access_stats,
inner: RwLock::new(ImageLayerInner {
loaded: false,
@@ -443,9 +437,9 @@ impl ImageLayer {
summary.key_range,
summary.lsn,
false,
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(ImageLayerInner {
file: None,
@@ -578,14 +572,6 @@ impl ImageLayerWriterInner {
file.write_all(buf.as_ref())?;
}
let desc = PersistentLayerDesc::new_img(
self.tenant_id,
self.timeline_id,
self.key_range.clone(),
self.lsn,
self.is_incremental, // for now, image layer ALWAYS covers the full range
);
// Fill in the summary on blk 0
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
@@ -604,6 +590,15 @@ impl ImageLayerWriterInner {
.metadata()
.context("get metadata to determine file size")?;
let desc = PersistentLayerDesc::new_img(
self.tenant_id,
self.timeline_id,
self.key_range.clone(),
self.lsn,
self.is_incremental, // for now, image layer ALWAYS covers the full range
metadata.len(),
);
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
// set inner.file here. The first read will have to re-open it.
@@ -611,7 +606,6 @@ impl ImageLayerWriterInner {
path_or_conf: PathOrConf::Conf(self.conf),
desc,
lsn: self.lsn,
file_size: metadata.len(),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: RwLock::new(ImageLayerInner {
loaded: false,

View File

@@ -304,7 +304,7 @@ impl InMemoryLayer {
Ok(())
}
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
// TODO: Currently, we just leak the storage for any deleted keys
Ok(())

View File

@@ -1,17 +1,20 @@
use anyhow::Result;
use std::ops::Range;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::repository::Key;
use crate::{context::RequestContext, repository::Key};
use super::{DeltaFileName, ImageFileName, LayerFileName};
use serde::{Deserialize, Serialize};
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone)]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
@@ -24,19 +27,51 @@ pub struct PersistentLayerDesc {
/// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
/// incremental.
pub is_incremental: bool,
/// File size
pub file_size: u64,
}
/// A unique identifier of a persistent layer within the context of one timeline.
#[derive(Debug, PartialEq, Eq, Clone, Hash)]
pub struct PersistentLayerKey {
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
pub is_delta: bool,
}
impl PersistentLayerDesc {
pub fn key(&self) -> PersistentLayerKey {
PersistentLayerKey {
key_range: self.key_range.clone(),
lsn_range: self.lsn_range.clone(),
is_delta: self.is_delta,
}
}
pub fn short_id(&self) -> String {
self.filename().file_name()
}
#[cfg(test)]
pub fn new_test(key_range: Range<Key>) -> Self {
Self {
tenant_id: TenantId::generate(),
timeline_id: TimelineId::generate(),
key_range,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
is_incremental: false,
file_size: 0,
}
}
pub fn new_img(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
is_incremental: bool,
file_size: u64,
) -> Self {
Self {
tenant_id,
@@ -45,6 +80,7 @@ impl PersistentLayerDesc {
lsn_range: Self::image_layer_lsn_range(lsn),
is_delta: false,
is_incremental,
file_size,
}
}
@@ -53,6 +89,7 @@ impl PersistentLayerDesc {
timeline_id: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
file_size: u64,
) -> Self {
Self {
tenant_id,
@@ -61,6 +98,7 @@ impl PersistentLayerDesc {
lsn_range,
is_delta: true,
is_incremental: true,
file_size,
}
}
@@ -106,4 +144,48 @@ impl PersistentLayerDesc {
self.image_file_name().into()
}
}
// TODO: remove this in the future once we refactor timeline APIs.
pub fn get_lsn_range(&self) -> Range<Lsn> {
self.lsn_range.clone()
}
pub fn get_key_range(&self) -> Range<Key> {
self.key_range.clone()
}
pub fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
pub fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
pub fn is_incremental(&self) -> bool {
self.is_incremental
}
pub fn is_delta(&self) -> bool {
self.is_delta
}
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
);
Ok(())
}
pub fn file_size(&self) -> u64 {
self.file_size
}
}

View File

@@ -142,10 +142,6 @@ impl PersistentLayer for RemoteLayer {
true
}
fn file_size(&self) -> u64 {
self.layer_metadata.file_size()
}
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.filename().file_name();
let lsn_range = self.get_lsn_range();
@@ -190,6 +186,7 @@ impl RemoteLayer {
fname.key_range.clone(),
fname.lsn,
false,
layer_metadata.file_size(),
),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
@@ -211,6 +208,7 @@ impl RemoteLayer {
timelineid,
fname.key_range.clone(),
fname.lsn_range.clone(),
layer_metadata.file_size(),
),
layer_metadata: layer_metadata.clone(),
ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),

View File

@@ -14,7 +14,11 @@ use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::completion;
pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
/// Start per tenant background loops: compaction and gc.
pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_id = tenant.tenant_id;
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
@@ -25,10 +29,14 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
false,
{
let tenant = Arc::clone(tenant);
let init_done = init_done.cloned();
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
completion::Barrier::maybe_wait(init_done).await;
compaction_loop(tenant)
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
compaction_loop(tenant, cancel)
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
.await;
Ok(())
@@ -44,10 +52,14 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
false,
{
let tenant = Arc::clone(tenant);
let init_done = init_done.cloned();
let background_jobs_can_start = background_jobs_can_start.cloned();
async move {
completion::Barrier::maybe_wait(init_done).await;
gc_loop(tenant)
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()) },
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
.await;
Ok(())
@@ -59,12 +71,11 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
///
/// Compaction task's main loop
///
async fn compaction_loop(tenant: Arc<Tenant>) {
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let cancel = task_mgr::shutdown_token();
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
loop {
@@ -129,12 +140,11 @@ async fn compaction_loop(tenant: Arc<Tenant>) {
///
/// GC task's main loop
///
async fn gc_loop(tenant: Arc<Tenant>) {
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
let wait_duration = Duration::from_secs(2);
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let cancel = task_mgr::shutdown_token();
// GC might require downloading, to find the cutoff LSN that corresponds to the
// cutoff specified as time.
let ctx =

File diff suppressed because it is too large Load Diff

View File

@@ -34,6 +34,8 @@ use crate::{
},
};
use utils::completion;
use super::Timeline;
#[derive(Default)]
@@ -47,8 +49,12 @@ pub struct EvictionTaskTenantState {
}
impl Timeline {
pub(super) fn launch_eviction_task(self: &Arc<Self>) {
pub(super) fn launch_eviction_task(
self: &Arc<Self>,
background_tasks_can_start: Option<&completion::Barrier>,
) {
let self_clone = Arc::clone(self);
let background_tasks_can_start = background_tasks_can_start.cloned();
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
@@ -57,7 +63,13 @@ impl Timeline {
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
false,
async move {
self_clone.eviction_task(task_mgr::shutdown_token()).await;
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); }
_ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
};
self_clone.eviction_task(cancel).await;
info!("eviction task finishing");
Ok(())
},
@@ -185,7 +197,7 @@ impl Timeline {
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let candidates: Vec<Arc<dyn PersistentLayer>> = {
let layers = self.layers.read().unwrap();
let layers = self.layers.read().await;
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
if hist_layer.is_remote_layer() {

View File

@@ -25,6 +25,7 @@ mod walreceiver_connection;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME};
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
};
@@ -85,7 +86,8 @@ impl WalReceiver {
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false,
async move {
info!("WAL receiver manager started, connecting to broker");
debug_assert_current_span_has_tenant_and_timeline_id();
debug!("WAL receiver manager started, connecting to broker");
let mut connection_manager_state = ConnectionManagerState::new(
timeline,
conf,
@@ -93,7 +95,7 @@ impl WalReceiver {
loop {
select! {
_ = task_mgr::shutdown_watcher() => {
info!("WAL receiver shutdown requested, shutting down");
trace!("WAL receiver shutdown requested, shutting down");
break;
},
loop_step_result = connection_manager_loop_step(
@@ -104,7 +106,7 @@ impl WalReceiver {
) => match loop_step_result {
ControlFlow::Continue(()) => continue,
ControlFlow::Break(()) => {
info!("Connection manager loop ended, shutting down");
trace!("Connection manager loop ended, shutting down");
break;
}
},
@@ -115,7 +117,7 @@ impl WalReceiver {
*loop_status.write().unwrap() = None;
Ok(())
}
.instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id))
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
);
Self {
@@ -198,29 +200,19 @@ impl<E: Clone> TaskHandle<E> {
TaskEvent::End(match self.join_handle.as_mut() {
Some(jh) => {
if !jh.is_finished() {
// Barring any implementation errors in this module, we can
// only arrive here while the task that executes the future
// passed to `Self::spawn()` is still execution. Cf the comment
// in Self::spawn().
//
// This was logging at warning level in earlier versions, presumably
// to leave some breadcrumbs in case we had an implementation
// error that would would make us get stuck in `jh.await`.
//
// There hasn't been such a bug so far.
// But in a busy system, e.g., during pageserver restart,
// we arrive here often enough that the warning-level logs
// became a distraction.
// So, tone them down to info-level.
//
// XXX: rewrite this module to eliminate the race condition.
info!("sender is dropped while join handle is still alive");
// See: https://github.com/neondatabase/neon/issues/2885
trace!("sender is dropped while join handle is still alive");
}
let res = jh
.await
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
.and_then(|x| x);
let res = match jh.await {
Ok(res) => res,
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// already logged
Ok(())
}
Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")),
};
// For cancellation-safety, drop join_handle only after successful .await.
self.join_handle = None;
@@ -243,12 +235,12 @@ impl<E: Clone> TaskHandle<E> {
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
// already logged
}
Err(je) => {
error!("Shutdown task join error: {je}")
}
}
}

View File

@@ -18,7 +18,7 @@ use crate::metrics::{
WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
};
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use pageserver_api::models::TimelineState;
@@ -55,8 +55,11 @@ pub(super) async fn connection_manager_loop_step(
.await
{
Ok(()) => {}
Err(_) => {
info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop");
Err(new_state) => {
debug!(
?new_state,
"state changed, stopping wal connection manager loop"
);
return ControlFlow::Break(());
}
}
@@ -79,7 +82,7 @@ pub(super) async fn connection_manager_loop_step(
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await;
info!("Subscribed for broker timeline updates");
debug!("Subscribed for broker timeline updates");
loop {
let time_until_next_retry = connection_manager_state.time_until_next_retry();
@@ -150,13 +153,13 @@ pub(super) async fn connection_manager_loop_step(
match new_state {
// we're already active as walreceiver, no need to reactivate
TimelineState::Active => continue,
TimelineState::Broken | TimelineState::Stopping => {
info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
TimelineState::Broken { .. } | TimelineState::Stopping => {
debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
return ControlFlow::Break(());
}
TimelineState::Loading => {
warn!("timeline transitioned back to Loading state, that should not happen");
return ControlFlow::Continue(new_state);
return ControlFlow::Continue(());
}
}
}
@@ -164,12 +167,11 @@ pub(super) async fn connection_manager_loop_step(
}
}
} => match new_event {
ControlFlow::Continue(new_state) => {
info!("observed timeline state change, new state is {new_state:?}");
ControlFlow::Continue(()) => {
return ControlFlow::Continue(());
}
ControlFlow::Break(()) => {
info!("Timeline dropped state updates sender, stopping wal connection manager loop");
debug!("Timeline is no longer active, stopping wal connection manager loop");
return ControlFlow::Break(());
}
},
@@ -390,7 +392,6 @@ impl ConnectionManagerState {
self.drop_old_connection(true).await;
let id = self.id;
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let timeline = Arc::clone(&self.timeline);
@@ -398,9 +399,13 @@ impl ConnectionManagerState {
TaskKind::WalReceiverConnectionHandler,
DownloadBehavior::Download,
);
let span = info_span!("connection", %node_id);
let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| {
async move {
super::walreceiver_connection::handle_walreceiver_connection(
debug_assert_current_span_has_tenant_and_timeline_id();
let res = super::walreceiver_connection::handle_walreceiver_connection(
timeline,
new_sk.wal_source_connconf,
events_sender,
@@ -409,12 +414,23 @@ impl ConnectionManagerState {
ctx,
node_id,
)
.await
.context("walreceiver connection handling failure")
.await;
match res {
Ok(()) => Ok(()),
Err(e) => {
use super::walreceiver_connection::ExpectedError;
if e.is_expected() {
info!("walreceiver connection handling ended: {e:#}");
Ok(())
} else {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
}
}
}
}
.instrument(
info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id),
)
.instrument(span)
});
let now = Utc::now().naive_utc();
@@ -1305,10 +1321,11 @@ mod tests {
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
.create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
.await
.expect("Failed to create an empty timeline for dummy wal connection manager");
ConnectionManagerState {

View File

@@ -21,16 +21,16 @@ use postgres_types::PgLsn;
use tokio::{select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, trace, warn};
use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS};
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
tenant::{Timeline, WalReceiverInfo},
tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
walingest::WalIngest,
walrecord::DecodedWALRecord,
};
@@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection(
config.application_name("pageserver");
config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
Ok(Ok(client_and_conn)) => client_and_conn,
Ok(Err(conn_err)) => {
let expected_error = ignore_expected_errors(conn_err)?;
info!("DB connection stream finished: {expected_error}");
return Ok(());
}
Err(_) => {
Ok(client_and_conn) => client_and_conn?,
Err(_elapsed) => {
// Timing out to connect to a safekeeper node could happen long time, due to
// many reasons that pageserver cannot control.
// Do not produce an error, but make it visible, that timeouts happen by logging the `event.
@@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection(
}
};
info!("connected!");
debug!("connected!");
let mut connection_status = WalConnectionStatus {
is_connected: true,
has_processed_wal: false,
@@ -127,20 +122,25 @@ pub(super) async fn handle_walreceiver_connection(
"walreceiver connection",
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
select! {
connection_result = connection => match connection_result {
Ok(()) => info!("Walreceiver db connection closed"),
Ok(()) => debug!("Walreceiver db connection closed"),
Err(connection_error) => {
if let Err(e) = ignore_expected_errors(connection_error) {
warn!("Connection aborted: {e:#}")
if connection_error.is_expected() {
// silence, because most likely we've already exited the outer call
// with a similar error.
} else {
warn!("Connection aborted: {connection_error:#}")
}
}
},
// Future: replace connection_cancellation with connection_ctx cancellation
_ = connection_cancellation.cancelled() => info!("Connection cancelled"),
_ = connection_cancellation.cancelled() => debug!("Connection cancelled"),
}
Ok(())
},
}
.instrument(tracing::info_span!("poller")),
);
// Immediately increment the gauge, then create a job to decrement it on task exit.
@@ -203,20 +203,13 @@ pub(super) async fn handle_walreceiver_connection(
while let Some(replication_message) = {
select! {
_ = cancellation.cancelled() => {
info!("walreceiver interrupted");
debug!("walreceiver interrupted");
None
}
replication_message = physical_stream.next() => replication_message,
}
} {
let replication_message = match replication_message {
Ok(message) => message,
Err(replication_error) => {
let expected_error = ignore_expected_errors(replication_error)?;
info!("Replication stream finished: {expected_error}");
return Ok(());
}
};
let replication_message = replication_message?;
let now = Utc::now().naive_utc();
let last_rec_lsn_before_msg = last_rec_lsn;
@@ -261,8 +254,6 @@ pub(super) async fn handle_walreceiver_connection(
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(endlsn);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// let _enter = info_span!("processing record", lsn = %lsn).entered();
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
@@ -313,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection(
}
}
timeline.check_checkpoint_distance().with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
timeline
.check_checkpoint_distance()
.await
.with_context(|| {
format!(
"Failed to check checkpoint distance for timeline {}",
timeline.timeline_id
)
})?;
if let Some(last_lsn) = status_update {
let timeline_remote_consistent_lsn =
@@ -421,31 +415,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result<IdentifySystem>
}
}
/// We don't want to report connectivity problems as real errors towards connection manager because
/// 1. they happen frequently enough to make server logs hard to read and
/// 2. the connection manager can retry other safekeeper.
///
/// If this function returns `Ok(pg_error)`, it's such an error.
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
/// Connection manager will then handle reconnections.
///
/// If this function returns an `Err()`, the caller can bubble it up using `?`.
/// The connection manager will log the error at ERROR level.
fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres::Error> {
if pg_error.is_closed()
|| pg_error
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
{
return Ok(pg_error);
} else if let Some(db_error) = pg_error.as_db_error() {
if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
{
return Ok(pg_error);
}
}
Err(pg_error).context("connection error")
/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors.
pub(super) trait ExpectedError {
/// Test if this error is an ok error.
///
/// We don't want to report connectivity problems as real errors towards connection manager because
/// 1. they happen frequently enough to make server logs hard to read and
/// 2. the connection manager can retry other safekeeper.
///
/// If this function returns `true`, it's such an error.
/// The caller should log it at info level and then report to connection manager that we're done handling this connection.
/// Connection manager will then handle reconnections.
///
/// If this function returns an `false` the error should be propagated and the connection manager
/// will log the error at ERROR level.
fn is_expected(&self) -> bool;
}
impl ExpectedError for postgres::Error {
fn is_expected(&self) -> bool {
self.is_closed()
|| self
.source()
.and_then(|source| source.downcast_ref::<std::io::Error>())
.map(is_expected_io_error)
.unwrap_or(false)
|| self
.as_db_error()
.filter(|db_error| {
db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
&& db_error.message().contains("ending streaming")
})
.is_some()
}
}
impl ExpectedError for anyhow::Error {
fn is_expected(&self) -> bool {
let head = self.downcast_ref::<postgres::Error>();
let tail = self
.chain()
.filter_map(|e| e.downcast_ref::<postgres::Error>());
// check if self or any of the chained/sourced errors are expected
head.into_iter().chain(tail).any(|e| e.is_expected())
}
}

View File

@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
pub(crate) queued_operations: VecDeque<UploadOp>,
}
impl UploadQueueInitialized {
pub(super) fn no_pending_work(&self) -> bool {
self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
}
}
#[derive(Clone, Copy)]
pub(super) enum SetDeletedFlagProgress {
NotRunning,
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
}
pub(super) struct UploadQueueStopped {
pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
pub(super) last_uploaded_consistent_lsn: Lsn,
pub(super) latest_metadata: TimelineMetadata,
pub(super) upload_queue_for_deletion: UploadQueueInitialized,
pub(super) deleted_at: SetDeletedFlagProgress,
}
@@ -187,6 +191,15 @@ impl UploadQueue {
UploadQueue::Initialized(x) => Ok(x),
}
}
pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
match self {
UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
anyhow::bail!("queue is in state {}", self.as_str())
}
UploadQueue::Stopped(stopped) => Ok(stopped),
}
}
}
/// An in-progress upload or delete task.
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
pub(crate) op: UploadOp,
}
#[derive(Debug)]
pub(crate) struct Delete {
pub(crate) file_kind: RemoteOpFileKind,
pub(crate) layer_file_name: LayerFileName,
pub(crate) scheduled_from_timeline_delete: bool,
}
#[derive(Debug)]
pub(crate) enum UploadOp {
/// Upload a layer file
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
/// Upload the metadata file
UploadMetadata(IndexPart, Lsn),
/// Delete a file.
Delete(RemoteOpFileKind, LayerFileName),
/// Delete a layer file
Delete(Delete),
/// Barrier. When the barrier operation is reached,
Barrier(tokio::sync::watch::Sender<()>),
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
)
}
UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
UploadOp::Delete(delete) => write!(
f,
"Delete(path: {}, scheduled_from_timeline_delete: {})",
delete.layer_file_name.file_name(),
delete.scheduled_from_timeline_delete
),
UploadOp::Barrier(_) => write!(f, "Barrier"),
}
}

View File

@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
use anyhow::Result;
use anyhow::{Context, Result};
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit()?;
modification.commit().await?;
Ok(())
}
@@ -1082,7 +1082,10 @@ impl<'a> WalIngest<'a> {
.await?
{
// create it with 0 size initially, the logic below will extend it
modification.put_rel_creation(rel, 0, ctx).await?;
modification
.put_rel_creation(rel, 0, ctx)
.await
.context("Relation Error")?;
0
} else {
self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
@@ -1171,7 +1174,6 @@ impl<'a> WalIngest<'a> {
#[cfg(test)]
mod tests {
use super::*;
use crate::pgdatadir_mapping::create_test_timeline;
use crate::tenant::harness::*;
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
@@ -1200,7 +1202,7 @@ mod tests {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
m.commit()?;
m.commit().await?;
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
Ok(walingest)
@@ -1209,7 +1211,9 @@ mod tests {
#[tokio::test]
async fn test_relsize() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut m = tline.begin_modification(Lsn(0x20));
@@ -1217,22 +1221,22 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x30));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x40));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
let mut m = tline.begin_modification(Lsn(0x50));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_current_logical_size(&tline, Lsn(0x50));
@@ -1318,7 +1322,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_current_logical_size(&tline, Lsn(0x60));
// Check reported size and contents after truncation
@@ -1360,7 +1364,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
@@ -1373,7 +1377,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
@@ -1398,7 +1402,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
@@ -1428,14 +1432,16 @@ mod tests {
#[tokio::test]
async fn test_drop_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut m = tline.begin_modification(Lsn(0x20));
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1454,7 +1460,7 @@ mod tests {
// Drop rel
let mut m = tline.begin_modification(Lsn(0x30));
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
m.commit()?;
m.commit().await?;
// Check that rel is not visible anymore
assert_eq!(
@@ -1472,7 +1478,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
.await?;
m.commit()?;
m.commit().await?;
// Check that rel exists and size is correct
assert_eq!(
@@ -1497,7 +1503,9 @@ mod tests {
#[tokio::test]
async fn test_truncate_extend() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
// Create a 20 MB relation (the size is arbitrary)
@@ -1509,7 +1517,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit()?;
m.commit().await?;
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
@@ -1554,7 +1562,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
.await?;
m.commit()?;
m.commit().await?;
// Check reported size and contents after truncation
assert_eq!(
@@ -1603,7 +1611,7 @@ mod tests {
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
.await?;
}
m.commit()?;
m.commit().await?;
assert_eq!(
tline
@@ -1637,7 +1645,9 @@ mod tests {
#[tokio::test]
async fn test_large_rel() -> Result<()> {
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let mut walingest = init_walingest_test(&tline, &ctx).await?;
let mut lsn = 0x10;
@@ -1648,7 +1658,7 @@ mod tests {
walingest
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
.await?;
m.commit()?;
m.commit().await?;
}
assert_current_logical_size(&tline, Lsn(lsn));
@@ -1664,7 +1674,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
@@ -1677,7 +1687,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
@@ -1693,7 +1703,7 @@ mod tests {
walingest
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
.await?;
m.commit()?;
m.commit().await?;
assert_eq!(
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber

26
pgxn/hnsw/Makefile Normal file
View File

@@ -0,0 +1,26 @@
EXTENSION = hnsw
EXTVERSION = 0.1.0
MODULE_big = hnsw
DATA = $(wildcard *--*.sql)
OBJS = hnsw.o hnswalg.o
TESTS = $(wildcard test/sql/*.sql)
REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
REGRESS_OPTS = --inputdir=test --load-extension=hnsw
# For auto-vectorization:
# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html
PG_CFLAGS += -O3
PG_CXXFLAGS += -O3 -std=c++11
PG_LDFLAGS += -lstdc++
all: $(EXTENSION)--$(EXTVERSION).sql
PG_CONFIG ?= pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
dist:
mkdir -p dist
git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master

25
pgxn/hnsw/README.md Normal file
View File

@@ -0,0 +1,25 @@
# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors
This ANN extension of Postgres is based
on [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw),
the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper:
[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
<br>
Dmitry Baranchuk, Artem Babenko, Yury Malkov
# Postgres extension
HNSW index is hold in memory (built on demand) and it's maxial size is limited
by `maxelements` index parameter. Another required parameter is nubmer of dimensions (if it is not specified in column type).
Optional parameter `ef` specifies number of neighbors which are considered during index construction and search (corresponds `efConstruction` and `efSearch` parameters
described in the article).
# Example of usage:
```
create extension hnsw;
create table embeddings(id integer primary key, payload real[]);
create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32);
select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100;
```

29
pgxn/hnsw/hnsw--0.1.0.sql Normal file
View File

@@ -0,0 +1,29 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION hnsw" to load this file. \quit
-- functions
CREATE FUNCTION l2_distance(real[], real[]) RETURNS real
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- operators
CREATE OPERATOR <-> (
LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance,
COMMUTATOR = '<->'
);
-- access method
CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler
AS 'MODULE_PATHNAME' LANGUAGE C;
CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler;
COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
-- opclasses
CREATE OPERATOR CLASS knn_ops
DEFAULT FOR TYPE real[] USING hnsw AS
OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops;

551
pgxn/hnsw/hnsw.c Normal file
View File

@@ -0,0 +1,551 @@
#include "postgres.h"
#include "access/amapi.h"
#include "access/generic_xlog.h"
#include "access/relation.h"
#include "access/reloptions.h"
#include "access/tableam.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "nodes/execnodes.h"
#include "storage/bufmgr.h"
#include "utils/guc.h"
#include "utils/selfuncs.h"
#include <math.h>
#include <float.h>
#include "hnsw.h"
PG_MODULE_MAGIC;
typedef struct {
int32 vl_len_; /* varlena header (do not touch directly!) */
int dims;
int maxelements;
int efConstruction;
int efSearch;
int M;
} HnswOptions;
static relopt_kind hnsw_relopt_kind;
typedef struct {
HierarchicalNSW* hnsw;
size_t curr;
size_t n_results;
ItemPointer results;
} HnswScanOpaqueData;
typedef HnswScanOpaqueData* HnswScanOpaque;
typedef struct {
Oid relid;
uint32 status;
HierarchicalNSW* hnsw;
} HnswHashEntry;
#define SH_PREFIX hnsw_index
#define SH_ELEMENT_TYPE HnswHashEntry
#define SH_KEY_TYPE Oid
#define SH_KEY relid
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->relid)
#define SH_HASH_KEY(tb, key) (key)
#define SH_EQUAL(tb, a, b) ((a) == (b))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
#define INDEX_HASH_SIZE 11
#define DEFAULT_EF_SEARCH 64
PGDLLEXPORT void _PG_init(void);
static hnsw_index_hash *hnsw_indexes;
/*
* Initialize index options and variables
*/
void
_PG_init(void)
{
hnsw_relopt_kind = add_reloption_kind();
add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements",
0, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex",
100, 0, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction",
16, 1, INT_MAX, AccessExclusiveLock);
add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search",
64, 1, INT_MAX, AccessExclusiveLock);
hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL);
}
static void
hnsw_build_callback(Relation index, ItemPointer tid, Datum *values,
bool *isnull, bool tupleIsAlive, void *state)
{
HierarchicalNSW* hnsw = (HierarchicalNSW*) state;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return;
array = DatumGetArrayTypeP(values[0]);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, tid, sizeof(*tid));
hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label);
}
static void
hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel)
{
IndexInfo* indexInfo = BuildIndexInfo(indexRel);
Assert(indexInfo->ii_NumIndexAttrs == 1);
table_index_build_scan(heapRel, indexRel, indexInfo,
true, true, hnsw_build_callback, (void *) hnsw, NULL);
}
static HierarchicalNSW*
hnsw_get_index(Relation indexRel, Relation heapRel)
{
HierarchicalNSW* hnsw;
Oid indexoid = RelationGetRelid(indexRel);
HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid);
if (entry == NULL)
{
size_t dims, maxelements;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t size_data_per_element;
size_t data_size;
dsm_handle handle = indexoid << 1; /* make it even */
void* impl_private = NULL;
void* mapped_address = NULL;
Size mapped_size = 0;
Size shmem_size;
bool exists = true;
bool found;
HnswOptions *opts = (HnswOptions *) indexRel->rd_options;
if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) {
elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified");
}
dims = opts->dims;
maxelements = opts->maxelements;
M = opts->M;
maxM = M * 2;
data_size = dims * sizeof(coord_t);
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
shmem_size = hnsw_sizeof() + maxelements * size_data_per_element;
/* first try to attach to existed index */
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* index doesn't exists: try to create it */
if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private,
&mapped_address, &mapped_size, DEBUG1))
{
/* We can do it under shared lock, so some other backend may
* try to initialize index. If create is failed because index already
* created by somebody else, then try to attach to it once again
*/
if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private,
&mapped_address, &mapped_size, ERROR))
{
return NULL;
}
}
else
{
exists = false;
}
}
Assert(mapped_size == shmem_size);
hnsw = (HierarchicalNSW*)mapped_address;
if (!exists)
{
hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction);
hnsw_populate(hnsw, indexRel, heapRel);
}
entry = hnsw_index_insert(hnsw_indexes, indexoid, &found);
Assert(!found);
entry->hnsw = hnsw;
}
else
{
hnsw = entry->hnsw;
}
return hnsw;
}
/*
* Start or restart an index scan
*/
static IndexScanDesc
hnsw_beginscan(Relation index, int nkeys, int norderbys)
{
IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys);
HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData));
Relation heap = relation_open(index->rd_index->indrelid, NoLock);
so->hnsw = hnsw_get_index(index, heap);
relation_close(heap, NoLock);
so->curr = 0;
so->n_results = 0;
so->results = NULL;
scan->opaque = so;
return scan;
}
/*
* Start or restart an index scan
*/
static void
hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
{
pfree(so->results);
so->results = NULL;
}
so->curr = 0;
if (orderbys && scan->numberOfOrderBys > 0)
memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData));
}
/*
* Fetch the next tuple in the given scan
*/
static bool
hnsw_gettuple(IndexScanDesc scan, ScanDirection dir)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
/*
* Index can be used to scan backward, but Postgres doesn't support
* backward scan on operators
*/
Assert(ScanDirectionIsForward(dir));
if (so->curr == 0)
{
Datum value;
ArrayType* array;
int n_items;
size_t n_results;
label_t* results;
HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options;
size_t efSearch = opts ? opts->efSearch : DEFAULT_EF_SEARCH;
/* Safety check */
if (scan->orderByData == NULL)
elog(ERROR, "cannot scan HNSW index without order");
/* No items will match if null */
if (scan->orderByData->sk_flags & SK_ISNULL)
return false;
value = scan->orderByData->sk_argument;
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(so->hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(so->hnsw));
}
if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results))
elog(ERROR, "HNSW index search failed");
so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData));
so->n_results = n_results;
for (size_t i = 0; i < n_results; i++)
{
memcpy(&so->results[i], &results[i], sizeof(so->results[i]));
}
free(results);
}
if (so->curr >= so->n_results)
{
return false;
}
else
{
scan->xs_heaptid = so->results[so->curr++];
scan->xs_recheckorderby = false;
return true;
}
}
/*
* End a scan and release resources
*/
static void
hnsw_endscan(IndexScanDesc scan)
{
HnswScanOpaque so = (HnswScanOpaque) scan->opaque;
if (so->results)
pfree(so->results);
pfree(so);
scan->opaque = NULL;
}
/*
* Estimate the cost of an index scan
*/
static void
hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count,
Cost *indexStartupCost, Cost *indexTotalCost,
Selectivity *indexSelectivity, double *indexCorrelation
,double *indexPages
)
{
GenericCosts costs;
/* Never use index without order */
if (path->indexorderbys == NULL)
{
*indexStartupCost = DBL_MAX;
*indexTotalCost = DBL_MAX;
*indexSelectivity = 0;
*indexCorrelation = 0;
*indexPages = 0;
return;
}
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
/* Startup cost and total cost are same */
*indexStartupCost = costs.indexTotalCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
*indexPages = costs.numIndexPages;
}
/*
* Parse and validate the reloptions
*/
static bytea *
hnsw_options(Datum reloptions, bool validate)
{
static const relopt_parse_elt tab[] = {
{"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)},
{"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)},
{"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)},
{"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)},
{"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)}
};
return (bytea *) build_reloptions(reloptions, validate,
hnsw_relopt_kind,
sizeof(HnswOptions),
tab, lengthof(tab));
}
/*
* Validate catalog entries for the specified operator class
*/
static bool
hnsw_validate(Oid opclassoid)
{
return true;
}
/*
* Build the index for a logged table
*/
static IndexBuildResult *
hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
result->heap_tuples = result->index_tuples = hnsw_count(hnsw);
return result;
}
/*
* Insert a tuple into the index
*/
static bool
hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid,
Relation heap, IndexUniqueCheck checkUnique,
bool indexUnchanged,
IndexInfo *indexInfo)
{
HierarchicalNSW* hnsw = hnsw_get_index(index, heap);
Datum value;
ArrayType* array;
int n_items;
label_t label = 0;
/* Skip nulls */
if (isnull[0])
return false;
/* Detoast value */
value = PointerGetDatum(PG_DETOAST_DATUM(values[0]));
array = DatumGetArrayTypeP(value);
n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array));
if (n_items != hnsw_dimensions(hnsw))
{
elog(ERROR, "Wrong number of dimensions: %d instead of %d expected",
n_items, hnsw_dimensions(hnsw));
}
memcpy(&label, heap_tid, sizeof(*heap_tid));
if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label))
elog(ERROR, "HNSW index insert failed");
return true;
}
/*
* Build the index for an unlogged table
*/
static void
hnsw_buildempty(Relation index)
{
/* index will be constructed on dema nd when accessed */
}
/*
* Clean up after a VACUUM operation
*/
static IndexBulkDeleteResult *
hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
Relation rel = info->index;
if (stats == NULL)
return NULL;
stats->num_pages = RelationGetNumberOfBlocks(rel);
return stats;
}
/*
* Bulk delete tuples from the index
*/
static IndexBulkDeleteResult *
hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
if (stats == NULL)
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
return stats;
}
/*
* Define index handler
*
* See https://www.postgresql.org/docs/current/index-api.html
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler);
Datum
hnsw_handler(PG_FUNCTION_ARGS)
{
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
amroutine->amstrategies = 0;
amroutine->amsupport = 0;
amroutine->amoptsprocnum = 0;
amroutine->amcanorder = false;
amroutine->amcanorderbyop = true;
amroutine->amcanbackward = false; /* can change direction mid-scan */
amroutine->amcanunique = false;
amroutine->amcanmulticol = false;
amroutine->amoptionalkey = true;
amroutine->amsearcharray = false;
amroutine->amsearchnulls = false;
amroutine->amstorage = false;
amroutine->amclusterable = false;
amroutine->ampredlocks = false;
amroutine->amcanparallel = false;
amroutine->amcaninclude = false;
amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */
amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL;
amroutine->amkeytype = InvalidOid;
/* Interface functions */
amroutine->ambuild = hnsw_build;
amroutine->ambuildempty = hnsw_buildempty;
amroutine->aminsert = hnsw_insert;
amroutine->ambulkdelete = hnsw_bulkdelete;
amroutine->amvacuumcleanup = hnsw_vacuumcleanup;
amroutine->amcanreturn = NULL; /* tuple not included in heapsort */
amroutine->amcostestimate = hnsw_costestimate;
amroutine->amoptions = hnsw_options;
amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */
amroutine->ambuildphasename = NULL;
amroutine->amvalidate = hnsw_validate;
amroutine->amadjustmembers = NULL;
amroutine->ambeginscan = hnsw_beginscan;
amroutine->amrescan = hnsw_rescan;
amroutine->amgettuple = hnsw_gettuple;
amroutine->amgetbitmap = NULL;
amroutine->amendscan = hnsw_endscan;
amroutine->ammarkpos = NULL;
amroutine->amrestrpos = NULL;
/* Interface functions to support parallel index scans */
amroutine->amestimateparallelscan = NULL;
amroutine->aminitparallelscan = NULL;
amroutine->amparallelrescan = NULL;
PG_RETURN_POINTER(amroutine);
}
/*
* Get the L2 distance between vectors
*/
PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance);
Datum
l2_distance(PG_FUNCTION_ARGS)
{
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a));
int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b));
dist_t distance = 0.0;
dist_t diff;
coord_t *ax = (coord_t*)ARR_DATA_PTR(a);
coord_t *bx = (coord_t*)ARR_DATA_PTR(b);
if (a_dim != b_dim)
{
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
}
for (int i = 0; i < a_dim; i++)
{
diff = ax[i] - bx[i];
distance += diff * diff;
}
PG_RETURN_FLOAT4((dist_t)sqrt(distance));
}

5
pgxn/hnsw/hnsw.control Normal file
View File

@@ -0,0 +1,5 @@
comment = 'hNsw index'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true
trusted = true

15
pgxn/hnsw/hnsw.h Normal file
View File

@@ -0,0 +1,15 @@
#pragma once
typedef float coord_t;
typedef float dist_t;
typedef uint32_t idx_t;
typedef uint64_t label_t;
typedef struct HierarchicalNSW HierarchicalNSW;
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results);
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label);
void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
int hnsw_dimensions(HierarchicalNSW* hnsw);
size_t hnsw_count(HierarchicalNSW* hnsw);
size_t hnsw_sizeof(void);

379
pgxn/hnsw/hnswalg.cpp Normal file
View File

@@ -0,0 +1,379 @@
#include "hnswalg.h"
#if defined(__GNUC__)
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint)
#else
#define PORTABLE_ALIGN32 __declspec(align(32))
#define PREFETCH(addr,hint)
#endif
HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_)
{
dim = dim_;
data_size = dim * sizeof(coord_t);
efConstruction = efConstruction_;
maxelements = maxelements_;
M = M_;
maxM = maxM_;
size_links_level0 = (maxM + 1) * sizeof(idx_t);
size_data_per_element = size_links_level0 + data_size + sizeof(label_t);
offset_data = size_links_level0;
offset_label = offset_data + data_size;
enterpoint_node = 0;
cur_element_count = 0;
#ifdef __x86_64__
use_avx2 = __builtin_cpu_supports("avx2");
#endif
}
std::priority_queue<std::pair<dist_t, idx_t>> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef)
{
std::vector<uint32_t> visited;
visited.resize((cur_element_count + 31) >> 5);
std::priority_queue<std::pair<dist_t, idx_t >> topResults;
std::priority_queue<std::pair<dist_t, idx_t >> candidateSet;
dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node));
topResults.emplace(dist, enterpoint_node);
candidateSet.emplace(-dist, enterpoint_node);
visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31);
dist_t lowerBound = dist;
while (!candidateSet.empty())
{
std::pair<dist_t, idx_t> curr_el_pair = candidateSet.top();
if (-curr_el_pair.first > lowerBound)
break;
candidateSet.pop();
idx_t curNodeNum = curr_el_pair.second;
idx_t* data = get_linklist0(curNodeNum);
size_t size = *data++;
PREFETCH(getDataByInternalId(*data), 0);
for (size_t j = 0; j < size; ++j) {
size_t tnum = *(data + j);
PREFETCH(getDataByInternalId(*(data + j + 1)), 0);
if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) {
visited[tnum >> 5] |= 1 << (tnum & 31);
dist = fstdistfunc(point, getDataByInternalId(tnum));
if (topResults.top().first > dist || topResults.size() < ef) {
candidateSet.emplace(-dist, tnum);
PREFETCH(get_linklist0(candidateSet.top().second), 0);
topResults.emplace(dist, tnum);
if (topResults.size() > ef)
topResults.pop();
lowerBound = topResults.top().first;
}
}
}
}
return topResults;
}
void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN)
{
if (topResults.size() < NN)
return;
std::priority_queue<std::pair<dist_t, idx_t>> resultSet;
std::vector<std::pair<dist_t, idx_t>> returnlist;
while (topResults.size() > 0) {
resultSet.emplace(-topResults.top().first, topResults.top().second);
topResults.pop();
}
while (resultSet.size()) {
if (returnlist.size() >= NN)
break;
std::pair<dist_t, idx_t> curen = resultSet.top();
dist_t dist_to_query = -curen.first;
resultSet.pop();
bool good = true;
for (std::pair<dist_t, idx_t> curen2 : returnlist) {
dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second),
getDataByInternalId(curen.second));
if (curdist < dist_to_query) {
good = false;
break;
}
}
if (good) returnlist.push_back(curen);
}
for (std::pair<dist_t, idx_t> elem : returnlist)
topResults.emplace(-elem.first, elem.second);
}
void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c,
std::priority_queue<std::pair<dist_t, idx_t>> topResults)
{
getNeighborsByHeuristic(topResults, M);
std::vector<idx_t> res;
res.reserve(M);
while (topResults.size() > 0) {
res.push_back(topResults.top().second);
topResults.pop();
}
{
idx_t* data = get_linklist0(cur_c);
if (*data)
throw std::runtime_error("Should be blank");
*data++ = res.size();
for (size_t idx = 0; idx < res.size(); idx++) {
if (data[idx])
throw std::runtime_error("Should be blank");
data[idx] = res[idx];
}
}
for (size_t idx = 0; idx < res.size(); idx++) {
if (res[idx] == cur_c)
throw std::runtime_error("Connection to the same element");
size_t resMmax = maxM;
idx_t *ll_other = get_linklist0(res[idx]);
idx_t sz_link_list_other = *ll_other;
if (sz_link_list_other > resMmax || sz_link_list_other < 0)
throw std::runtime_error("Bad sz_link_list_other");
if (sz_link_list_other < resMmax) {
idx_t *data = ll_other + 1;
data[sz_link_list_other] = cur_c;
*ll_other = sz_link_list_other + 1;
} else {
// finding the "weakest" element to replace it with the new one
idx_t *data = ll_other + 1;
dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx]));
// Heuristic:
std::priority_queue<std::pair<dist_t, idx_t>> candidates;
candidates.emplace(d_max, cur_c);
for (size_t j = 0; j < sz_link_list_other; j++)
candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]);
getNeighborsByHeuristic(candidates, resMmax);
size_t indx = 0;
while (!candidates.empty()) {
data[indx] = candidates.top().second;
candidates.pop();
indx++;
}
*ll_other = indx;
}
}
}
void HierarchicalNSW::addPoint(const coord_t *point, label_t label)
{
if (cur_element_count >= maxelements) {
throw std::runtime_error("The number of elements exceeds the specified limit");
}
idx_t cur_c = cur_element_count++;
memset((char *) get_linklist0(cur_c), 0, size_data_per_element);
memcpy(getDataByInternalId(cur_c), point, data_size);
memcpy(getExternalLabel(cur_c), &label, sizeof label);
// Do nothing for the first element
if (cur_c != 0) {
std::priority_queue <std::pair<dist_t, idx_t>> topResults = searchBaseLayer(point, efConstruction);
mutuallyConnectNewElement(point, cur_c, topResults);
}
};
std::priority_queue<std::pair<dist_t, label_t>> HierarchicalNSW::searchKnn(const coord_t *query, size_t k)
{
std::priority_queue<std::pair<dist_t, label_t>> topResults;
auto topCandidates = searchBaseLayer(query, k);
while (topCandidates.size() > k) {
topCandidates.pop();
}
while (!topCandidates.empty()) {
std::pair<dist_t, idx_t> rez = topCandidates.top();
label_t label;
memcpy(&label, getExternalLabel(rez.second), sizeof(label));
topResults.push(std::pair<dist_t, label_t>(rez.first, label));
topCandidates.pop();
}
return topResults;
};
dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
{
dist_t distance = 0.0;
for (size_t i = 0; i < n; i++)
{
dist_t diff = x[i] - y[i];
distance += diff * diff;
}
return distance;
}
#ifdef __x86_64__
#include <immintrin.h>
__attribute__((target("avx2")))
dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m256) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m256 diff, v1, v2;
__m256 sum = _mm256_set1_ps(0);
while (x < pEnd1) {
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
v1 = _mm256_loadu_ps(x);
x += 8;
v2 = _mm256_loadu_ps(y);
y += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
}
_mm256_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
return (res);
}
dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n)
{
const size_t TmpResSz = sizeof(__m128) / sizeof(float);
float PORTABLE_ALIGN32 TmpRes[TmpResSz];
size_t qty16 = n / 16;
const float *pEnd1 = x + (qty16 * 16);
__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
while (x < pEnd1) {
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
v1 = _mm_loadu_ps(x);
x += 4;
v2 = _mm_loadu_ps(y);
y += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
}
_mm_store_ps(TmpRes, sum);
float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
return res;
}
#endif
dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y)
{
#ifndef __x86_64__
return fstdistfunc_scalar(x, y, dim);
#else
if(use_avx2)
return fstdistfunc_avx2(x, y, dim);
return fstdistfunc_sse(x, y, dim);
#endif
}
bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results)
{
try
{
auto result = hnsw->searchKnn(point, efSearch);
size_t nResults = result.size();
*results = (label_t*)malloc(nResults*sizeof(label_t));
for (size_t i = nResults; i-- != 0;)
{
(*results)[i] = result.top().second;
result.pop();
}
*n_results = nResults;
return true;
}
catch (std::exception& x)
{
return false;
}
}
bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label)
{
try
{
hnsw->addPoint(point, label);
return true;
}
catch (std::exception& x)
{
fprintf(stderr, "Catch %s\n", x.what());
return false;
}
}
void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t maxelements, size_t M, size_t maxM, size_t efConstruction)
{
new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction);
}
int hnsw_dimensions(HierarchicalNSW* hnsw)
{
return (int)hnsw->dim;
}
size_t hnsw_count(HierarchicalNSW* hnsw)
{
return hnsw->cur_element_count;
}
size_t hnsw_sizeof(void)
{
return sizeof(HierarchicalNSW);
}

69
pgxn/hnsw/hnswalg.h Normal file
View File

@@ -0,0 +1,69 @@
#pragma once
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <unordered_map>
#include <unordered_set>
#include <map>
#include <cmath>
#include <queue>
#include <stdexcept>
extern "C" {
#include "hnsw.h"
}
struct HierarchicalNSW
{
size_t maxelements;
size_t cur_element_count;
idx_t enterpoint_node;
size_t dim;
size_t data_size;
size_t offset_data;
size_t offset_label;
size_t size_data_per_element;
size_t M;
size_t maxM;
size_t size_links_level0;
size_t efConstruction;
#ifdef __x86_64__
bool use_avx2;
#endif
char data_level0_memory[0]; // varying size
public:
HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction);
~HierarchicalNSW();
inline coord_t *getDataByInternalId(idx_t internal_id) const {
return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data];
}
inline idx_t *get_linklist0(idx_t internal_id) const {
return (idx_t*)&data_level0_memory[internal_id * size_data_per_element];
}
inline label_t *getExternalLabel(idx_t internal_id) const {
return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label];
}
std::priority_queue<std::pair<dist_t, idx_t>> searchBaseLayer(const coord_t *x, size_t ef);
void getNeighborsByHeuristic(std::priority_queue<std::pair<dist_t, idx_t>> &topResults, size_t NN);
void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue<std::pair<dist_t, idx_t>> topResults);
void addPoint(const coord_t *point, label_t label);
std::priority_queue<std::pair<dist_t, label_t>> searchKnn(const coord_t *query_data, size_t k);
dist_t fstdistfunc(const coord_t *x, const coord_t *y);
};

View File

@@ -0,0 +1,28 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
QUERY PLAN
--------------------------------------------------------------------
Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36)
Order By: (val <-> '{3,3,3}'::real[])
(2 rows)
SELECT * FROM t ORDER BY val <-> array[3,3,3];
val
---------
{1,2,3}
{1,2,4}
{1,1,1}
{0,0,0}
(4 rows)
SELECT COUNT(*) FROM t;
count
-------
5
(1 row)
DROP TABLE t;

View File

@@ -0,0 +1,13 @@
SET enable_seqscan = off;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);
explain SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT * FROM t ORDER BY val <-> array[3,3,3];
SELECT COUNT(*) FROM t;
DROP TABLE t;

View File

@@ -4,6 +4,7 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
extension_server.o \
file_cache.o \
libpagestore.o \
libpqwalproposer.o \

View File

@@ -0,0 +1,91 @@
/*-------------------------------------------------------------------------
*
* extension_server.c
* Request compute_ctl to download extension files.
*
* IDENTIFICATION
* contrib/neon/extension_server.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tcop/pquery.h"
#include "tcop/utility.h"
#include "access/xact.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "commands/defrem.h"
#include "miscadmin.h"
#include "utils/acl.h"
#include "fmgr.h"
#include "utils/guc.h"
#include "port.h"
#include "fmgr.h"
#include <curl/curl.h>
static int extension_server_port = 0;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
// curl -X POST http://localhost:8080/extension_server/postgis-3.so
static bool
neon_download_extension_file_http(const char *filename)
{
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s", extension_server_port, filename);
elog(LOG, "curl_easy_perform() url: %s", compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
elog(LOG, "curl_easy_perform() succeeded");
ret = true;
}
else
{
elog(WARNING, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
}
/* always cleanup */
curl_easy_cleanup(curl);
}
return ret;
}
void pg_init_extension_server()
{
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
// set download_extension_file_hook
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
}

View File

@@ -0,0 +1 @@

View File

@@ -35,8 +35,11 @@ _PG_init(void)
{
pg_init_libpagestore();
pg_init_walproposer();
InitControlPlaneConnector();
pg_init_extension_server();
// Important: This must happen after other parts of the extension
// are loaded, otherwise any settings to GUCs that were set before
// the extension was loaded will be removed.

View File

@@ -21,6 +21,8 @@ extern char *neon_tenant;
extern void pg_init_libpagestore(void);
extern void pg_init_walproposer(void);
extern void pg_init_extension_server(void);
/*
* Returns true if we shouldn't do REDO on that block in record indicated by
* block_id; false otherwise.

View File

@@ -254,7 +254,7 @@ nwp_register_gucs(void)
DefineCustomIntVariable(
"neon.safekeeper_reconnect_timeout",
"Timeout for reconnecting to offline wal acceptor.",
"Walproposer reconnects to offline safekeepers once in this interval.",
NULL,
&wal_acceptor_reconnect_timeout,
1000, 0, INT_MAX, /* default, min, max */
@@ -264,10 +264,10 @@ nwp_register_gucs(void)
DefineCustomIntVariable(
"neon.safekeeper_connect_timeout",
"Timeout for connection establishement and it's maintenance against safekeeper",
"Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.",
NULL,
&wal_acceptor_connection_timeout,
5000, 0, INT_MAX,
10000, 0, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MS,
NULL, NULL, NULL);
@@ -441,7 +441,7 @@ WalProposerPoll(void)
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
wal_acceptor_connection_timeout))
{
elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
ShutdownConnection(sk);
}
@@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk)
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse))
return;
elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
/* Protocol is all good, move to voting. */
sk->state = SS_VOTING;
/*
* Note: it would be better to track the counter on per safekeeper basis,
* but at worst walproposer would restart with 'term rejected', so leave as
* is for now.
*/
++n_connected;
if (n_connected <= quorum)
{

56
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
# This file is automatically @generated by Poetry and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -855,35 +855,31 @@ files = [
[[package]]
name = "cryptography"
version = "39.0.1"
version = "41.0.0"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
files = [
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
{file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
{file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
{file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
{file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
{file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
{file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
{file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
{file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
{file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
{file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
]
[package.dependencies]
@@ -892,12 +888,12 @@ cffi = ">=1.12"
[package.extras]
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
sdist = ["setuptools-rust (>=0.11.4)"]
nox = ["nox"]
pep8test = ["black", "check-sdist", "mypy", "ruff"]
sdist = ["build"]
ssh = ["bcrypt (>=3.1.5)"]
test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test-randomorder = ["pytest-randomly"]
tox = ["tox"]
[[package]]
name = "docker"

View File

@@ -93,6 +93,15 @@ With the current approach we made the following design decisions:
and column oids. Command tag capturing was added to the rust-postgres
functionality as part of this change.
### Output options
User can pass several optional headers that will affect resulting json.
1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement it's own parsing or reuse parsing libraries from e.g. node-postgres.
2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge
cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
## Using SNI-based routing on localhost
Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy:

View File

@@ -1,5 +1,5 @@
///! Various stuff for dealing with the Neon Console.
///! Later we might move some API wrappers here.
//! Various stuff for dealing with the Neon Console.
//! Later we might move some API wrappers here.
/// Payloads used in the console's APIs.
pub mod messages;

View File

@@ -1,6 +1,9 @@
use futures::pin_mut;
use futures::StreamExt;
use futures::TryFutureExt;
use hyper::body::HttpBody;
use hyper::http::HeaderName;
use hyper::http::HeaderValue;
use hyper::{Body, HeaderMap, Request};
use pq_proto::StartupMessageParams;
use serde_json::json;
@@ -9,8 +12,13 @@ use serde_json::Value;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::Row;
use tracing::error;
use tracing::info;
use tracing::instrument;
use url::Url;
use crate::proxy::invalidate_cache;
use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
use crate::{auth, config::ProxyConfig, console};
#[derive(serde::Deserialize)]
@@ -23,21 +31,28 @@ const APP_NAME: &str = "sql_over_http";
const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
//
// Convert json non-string types to strings, so that they can be passed to Postgres
// as parameters.
//
fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
json.iter()
.map(|value| {
match value {
Value::Null => serde_json::to_string(value),
Value::Bool(_) => serde_json::to_string(value),
Value::Number(_) => serde_json::to_string(value),
Value::Object(_) => serde_json::to_string(value),
// special care for nulls
Value::Null => Ok(None),
// no need to escape
Value::String(s) => Ok(s.to_string()),
// convert to text with escaping
Value::Bool(_) => serde_json::to_string(value).map(Some),
Value::Number(_) => serde_json::to_string(value).map(Some),
Value::Object(_) => serde_json::to_string(value).map(Some),
// avoid escaping here, as we pass this as a parameter
Value::String(s) => Ok(Some(s.to_string())),
// special care for arrays
Value::Array(_) => json_array_to_pg_array(value),
@@ -54,33 +69,44 @@ fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
//
// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
//
fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
match value {
// same
Value::Null => serde_json::to_string(value),
Value::Bool(_) => serde_json::to_string(value),
Value::Number(_) => serde_json::to_string(value),
Value::Object(_) => serde_json::to_string(value),
// special care for nulls
Value::Null => Ok(None),
// now needs to be escaped, as it is part of the array
Value::String(_) => serde_json::to_string(value),
// convert to text with escaping
Value::Bool(_) => serde_json::to_string(value).map(Some),
Value::Number(_) => serde_json::to_string(value).map(Some),
Value::Object(_) => serde_json::to_string(value).map(Some),
// here string needs to be escaped, as it is part of the array
Value::String(_) => serde_json::to_string(value).map(Some),
// recurse into array
Value::Array(arr) => {
let vals = arr
.iter()
.map(json_array_to_pg_array)
.map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
.collect::<Result<Vec<_>, _>>()?
.join(",");
Ok(format!("{{{}}}", vals))
Ok(Some(format!("{{{}}}", vals)))
}
}
}
struct ConnInfo {
username: String,
dbname: String,
hostname: String,
password: String,
}
fn get_conn_info(
headers: &HeaderMap,
sni_hostname: Option<String>,
) -> Result<(String, String, String, String), anyhow::Error> {
) -> Result<ConnInfo, anyhow::Error> {
let connection_string = headers
.get("Neon-Connection-String")
.ok_or(anyhow::anyhow!("missing connection string"))?
@@ -133,12 +159,12 @@ fn get_conn_info(
}
}
Ok((
username.to_owned(),
dbname.to_owned(),
hostname.to_owned(),
password.to_owned(),
))
Ok(ConnInfo {
username: username.to_owned(),
dbname: dbname.to_owned(),
hostname: hostname.to_owned(),
password: password.to_owned(),
})
}
// TODO: return different http error codes
@@ -151,13 +177,18 @@ pub async fn handle(
// Determine the destination and connection params
//
let headers = request.headers();
let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?;
let conn_info = get_conn_info(headers, sni_hostname)?;
let credential_params = StartupMessageParams::new([
("user", &username),
("database", &dbname),
("user", &conn_info.username),
("database", &conn_info.dbname),
("application_name", APP_NAME),
]);
// Determine the output options. Default behaviour is 'false'. Anything that is not
// strictly 'true' assumed to be false.
let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
//
// Wake up the destination if needed. Code here is a bit involved because
// we reuse the code from the usual proxy and we need to prepare few structures
@@ -168,21 +199,20 @@ pub async fn handle(
let creds = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names))
.map(|_| {
auth::ClientCredentials::parse(
&credential_params,
Some(&conn_info.hostname),
common_names,
)
})
.transpose()?;
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some(APP_NAME),
};
let node = creds.wake_compute(&extra).await?.expect("msg");
let conf = node.value.config;
let port = *conf.get_ports().first().expect("no port");
let host = match conf.get_hosts().first().expect("no host") {
tokio_postgres::config::Host::Tcp(host) => host,
tokio_postgres::config::Host::Unix(_) => {
return Err(anyhow::anyhow!("unix socket is not supported"));
}
};
let mut node_info = creds.wake_compute(&extra).await?.expect("msg");
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
@@ -202,28 +232,10 @@ pub async fn handle(
let QueryData { query, params } = serde_json::from_slice(&body)?;
let query_params = json_to_pg_text(params)?;
//
// Connenct to the destination
//
let (client, connection) = tokio_postgres::Config::new()
.host(host)
.port(port)
.user(&username)
.password(&password)
.dbname(&dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect(tokio_postgres::NoTls)
.await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
eprintln!("connection error: {}", e);
}
});
//
// Now execute the query and return the result
//
let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?;
let row_stream = client.query_raw_txt(query, query_params).await?;
// Manually drain the stream into a vector to leave row_stream hanging
@@ -262,6 +274,11 @@ pub async fn handle(
json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
})
})
.collect::<Vec<_>>()
@@ -272,7 +289,7 @@ pub async fn handle(
// convert rows to JSON
let rows = rows
.iter()
.map(pg_text_row_to_json)
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
// resulting JSON format is based on the format of node-postgres result
@@ -281,26 +298,106 @@ pub async fn handle(
"rowCount": command_tag_count,
"rows": rows,
"fields": fields,
"rowAsArray": array_mode,
}))
}
/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with
/// the difference that it uses `tokio_postgres` for the connection.
#[instrument(skip_all)]
async fn connect_to_compute(
node_info: &mut console::CachedNodeInfo,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
loop {
match connect_to_compute_once(node_info, conn_info).await {
Err(e) if num_retries > 0 => {
info!("compute node's state has changed; requesting a wake-up");
match creds.wake_compute(extra).await? {
// Update `node_info` and try one more time.
Some(new) => {
*node_info = new;
}
// Link auth doesn't work that way, so we just exit.
None => return Err(e),
}
}
other => return other,
}
num_retries -= 1;
info!("retrying after wake-up ({num_retries} attempts left)");
}
}
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
) -> anyhow::Result<tokio_postgres::Client> {
let mut config = (*node_info.config).clone();
let (client, connection) = config
.user(&conn_info.username)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
.max_backend_message_size(MAX_RESPONSE_SIZE)
.connect(tokio_postgres::NoTls)
.inspect_err(|e: &tokio_postgres::Error| {
error!(
"failed to connect to compute node hosts={:?} ports={:?}: {}",
node_info.config.get_hosts(),
node_info.config.get_ports(),
e
);
invalidate_cache(node_info)
})
.await?;
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
Ok(client)
}
//
// Convert postgres row with text-encoded values to JSON object
//
pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
let res = row
.columns()
.iter()
.enumerate()
.map(|(i, column)| {
let name = column.name();
let pg_value = row.as_text(i)?;
let json_value = pg_text_to_json(pg_value, column.type_())?;
Ok((name.to_string(), json_value))
})
.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
pub fn pg_text_row_to_json(
row: &Row,
raw_output: bool,
array_mode: bool,
) -> Result<Value, anyhow::Error> {
let iter = row.columns().iter().enumerate().map(|(i, column)| {
let name = column.name();
let pg_value = row.as_text(i)?;
let json_value = if raw_output {
match pg_value {
Some(v) => Value::String(v.to_string()),
None => Value::Null,
}
} else {
pg_text_to_json(pg_value, column.type_())?
};
Ok((name.to_string(), json_value))
});
Ok(Value::Object(res))
if array_mode {
// drop keys and aggregate into array
let arr = iter
.map(|r| r.map(|(_key, val)| val))
.collect::<Result<Vec<Value>, anyhow::Error>>()?;
Ok(Value::Array(arr))
} else {
let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
Ok(Value::Object(obj))
}
}
//
@@ -308,10 +405,6 @@ pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
//
pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
if let Some(val) = pg_value {
if val == "NULL" {
return Ok(Value::Null);
}
if let Kind::Array(elem_type) = pg_type.kind() {
return pg_array_parse(val, elem_type);
}
@@ -373,6 +466,27 @@ fn _pg_array_parse(
}
}
fn push_checked(
entry: &mut String,
entries: &mut Vec<Value>,
elem_type: &Type,
) -> Result<(), anyhow::Error> {
if !entry.is_empty() {
// While in usual postgres response we get nulls as None and everything else
// as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
// string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
// here while we have quotation info and convert them to None.
if entry == "NULL" {
entries.push(pg_text_to_json(None, elem_type)?);
} else {
entries.push(pg_text_to_json(Some(entry), elem_type)?);
}
entry.clear();
}
Ok(())
}
while let Some((mut i, mut c)) = pg_array_chr.next() {
let mut escaped = false;
@@ -395,9 +509,7 @@ fn _pg_array_parse(
'}' => {
level -= 1;
if level == 0 {
if !entry.is_empty() {
entries.push(pg_text_to_json(Some(&entry), elem_type)?);
}
push_checked(&mut entry, &mut entries, elem_type)?;
if nested {
return Ok((Value::Array(entries), i));
}
@@ -405,17 +517,15 @@ fn _pg_array_parse(
}
'"' if !escaped => {
if quote {
// push even if empty
// end of quoted string, so push it manually without any checks
// for emptiness or nulls
entries.push(pg_text_to_json(Some(&entry), elem_type)?);
entry = String::new();
entry.clear();
}
quote = !quote;
}
',' if !quote => {
if !entry.is_empty() {
entries.push(pg_text_to_json(Some(&entry), elem_type)?);
entry = String::new();
}
push_checked(&mut entry, &mut entries, elem_type)?;
}
_ => {
entry.push(c);
@@ -439,30 +549,35 @@ mod tests {
fn test_atomic_types_to_pg_params() {
let json = vec![Value::Bool(true), Value::Bool(false)];
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec!["true", "false"]);
assert_eq!(
pg_params,
vec![Some("true".to_owned()), Some("false".to_owned())]
);
let json = vec![Value::Number(serde_json::Number::from(42))];
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec!["42"]);
assert_eq!(pg_params, vec![Some("42".to_owned())]);
let json = vec![Value::String("foo\"".to_string())];
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec!["foo\""]);
assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
let json = vec![Value::Null];
let pg_params = json_to_pg_text(json).unwrap();
assert_eq!(pg_params, vec!["null"]);
assert_eq!(pg_params, vec![None]);
}
#[test]
fn test_json_array_to_pg_array() {
// atoms and escaping
let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]";
let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
let json: Value = serde_json::from_str(json).unwrap();
let pg_params = json_to_pg_text(vec![json]).unwrap();
assert_eq!(
pg_params,
vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"]
vec![Some(
"{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
)]
);
// nested arrays
@@ -471,7 +586,9 @@ mod tests {
let pg_params = json_to_pg_text(vec![json]).unwrap();
assert_eq!(
pg_params,
vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"]
vec![Some(
"{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
)]
);
}

View File

@@ -26,7 +26,6 @@ use tls_listener::TlsListener;
use tokio::{
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
net::TcpListener,
select,
};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
@@ -193,14 +192,9 @@ async fn ws_handler(
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
let result = select! {
_ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
Err(anyhow::anyhow!("Query timed out"))
}
response = sql_over_http::handle(config, request, sni_hostname) => {
response
}
};
let result = sql_over_http::handle(config, request, sni_hostname)
.instrument(info_span!("sql-over-http"))
.await;
let status_code = match result {
Ok(_) => StatusCode::OK,
Err(_) => StatusCode::BAD_REQUEST,

View File

@@ -22,7 +22,7 @@ use tracing::{error, info, warn};
use utils::measured_stream::MeasuredStream;
/// Number of times we should retry the `/proxy_wake_compute` http request.
const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
@@ -283,34 +283,35 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
}
/// If we couldn't connect, a cached connection info might be to blame
/// (e.g. the compute node's address might've changed at the wrong time).
/// Invalidate the cache entry (if any) to prevent subsequent errors.
#[tracing::instrument(name = "invalidate_cache", skip_all)]
pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
node_info.invalidate();
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
}
/// Try to connect to the compute node once.
#[tracing::instrument(name = "connect_once", skip_all)]
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
) -> Result<PostgresConnection, compute::ConnectionError> {
// If we couldn't connect, a cached connection info might be to blame
// (e.g. the compute node's address might've changed at the wrong time).
// Invalidate the cache entry (if any) to prevent subsequent errors.
let invalidate_cache = |_: &compute::ConnectionError| {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
node_info.invalidate();
}
let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
};
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute)
.inspect_err(invalidate_cache)
.inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
.await
}

View File

@@ -1,4 +1,4 @@
///! A group of high-level tests for connection establishing logic and auth.
//! A group of high-level tests for connection establishing logic and auth.
use super::*;
use crate::{auth, sasl, scram};
use async_trait::async_trait;

Some files were not shown because too many files have changed in this diff Show More