Compare commits

...

214 Commits

Author SHA1 Message Date
quantumish
44ec0e6f06 Document problems and pitfalls with fine-grained hashmap impl 2025-08-12 13:42:48 -07:00
quantumish
d33071a386 Fix concurrency bugs in resizing (WIP) 2025-08-12 10:34:29 -07:00
David Freifeld
add1a0ad78 Add initial work in implementing incremental resizing (WIP) 2025-07-14 09:02:56 -07:00
David Freifeld
282b90df28 Fix freelist bug, clean up interface to BucketIdx 2025-07-11 12:47:27 -07:00
David Freifeld
7f655ddb9a Heavily WIP rewrite of hashmap for concurrency + perf 2025-07-10 17:34:30 -07:00
David Freifeld
da6f4dbafe Merge branch 'communicator-rewrite' into quantumish/lfc-resizable-map 2025-07-10 17:33:56 -07:00
Heikki Linnakangas
a79fd3bda7 Move logic for picking request slot to the C code
With this refactoring, the Rust code deals with one giant array of
requests, and doesn't know that it's sliced up per backend
process. The C code is now responsible for slicing it up.

This also adds code to complete, at backend start, old IOs that were
started and left behind by a previous session. That was a little more
straightforward to do with the refactoring, which is why I tackled it
now.
2025-07-07 12:59:08 +03:00
Heikki Linnakangas
e1b58d5d69 Don't segfault if one of the unimplemented functions is called
We'll need to implement these, but let's stop the crashing for now
2025-07-07 11:33:44 +03:00
Erik Grinaker
9ae004f3bc Rename ShardMap to ShardSpec 2025-07-06 19:13:59 +02:00
Erik Grinaker
341c5f53d8 Restructure get_page retries 2025-07-06 18:35:47 +02:00
Erik Grinaker
4b06b547c1 pageserver/client_grpc: add shard map updates 2025-07-06 13:27:17 +02:00
Heikki Linnakangas
74e0d85a04 fix: Don't lose track of in-progress request if query is cancelled 2025-07-06 13:04:03 +03:00
Erik Grinaker
23ba42446b Fix accidental 1ms sleeps for GetPages 2025-07-06 11:09:58 +02:00
Heikki Linnakangas
71a83daac2 Revert crate dependencies to the versions in main branch
Some tests were failing with "Only request bodies with a known size
can be checksum validated." errors. This is a known issue with more
recent aws client versions, see
https://github.com/neondatabase/neon/issues/11363.
2025-07-05 18:03:19 +03:00
Heikki Linnakangas
1b8355a9f9 put back option lost in merge 2025-07-05 17:36:27 +03:00
Heikki Linnakangas
e14bb4be39 Merge remote-tracking branch 'origin/main' into communicator-rewrite 2025-07-05 16:59:51 +03:00
Heikki Linnakangas
f3a6c0d8ff cargo fmt 2025-07-05 16:26:24 +03:00
Heikki Linnakangas
17ec37aab2 Close gRPC getpage streams on shutdown
Some tests were failing, because pageserver didn't shut down promptly.
Tonic server graceful shutdown was a little too graceful; any open
streams linger until they're closed. Check the cancellation token
while waiting for the next request, and close the stream if
shutdown/cancellation was requested.
2025-07-05 16:26:24 +03:00
Heikki Linnakangas
d6ec1f1a1c Skip legacy LFC initialization when communicator is used
It clashes with the initialization of the LFC file
2025-07-05 16:26:24 +03:00
Erik Grinaker
6f3fb4433f Add TODO 2025-07-05 14:15:34 +02:00
Erik Grinaker
d7678df445 Reap idle pool resources 2025-07-05 13:35:28 +02:00
Erik Grinaker
03d9f0ec41 Comment tweaks 2025-07-05 11:16:40 +02:00
Erik Grinaker
56845f2da2 Add GetPageClass::is_bulk 2025-07-05 11:15:28 +02:00
Heikki Linnakangas
9a37bfdf63 Fix re-finding an entry in bucket chain 2025-07-05 00:44:46 +03:00
Heikki Linnakangas
4c916552e8 Reduce logging noise
These are very useful while debugging, but also very noisy; let's dial
it down a little.
2025-07-04 23:11:36 +03:00
Heikki Linnakangas
50fbf4ac53 Fix hash table initialization across forked processes
attach_writer()/reader() are called from each forked process. It's too
late to do initialization there; in fact, we used to overwrite the
contents of the hash table (or at least the freelist?) every time a
new process attached to it. The initialization must be done earlier,
in the HashMapInit() constructors.
2025-07-04 23:08:34 +03:00
Erik Grinaker
cb698a3951 Add dedicated client pools for bulk requests 2025-07-04 21:52:25 +02:00
Erik Grinaker
f6cc5cbd0c Split out retry handler to separate module 2025-07-04 20:20:09 +02:00
Heikki Linnakangas
00affada26 Add request ID to all communicator log lines as context information 2025-07-04 20:34:26 +03:00
Heikki Linnakangas
90d3c09c24 Minor cleanup
Tidy up and add some comments. Rename a few things for clarity.
2025-07-04 20:32:59 +03:00
Heikki Linnakangas
6c398aeae7 Fix dependency in Makefile 2025-07-04 20:24:21 +03:00
Heikki Linnakangas
1856bbbb9f Minor cleanup and commenting 2025-07-04 18:28:34 +03:00
Heikki Linnakangas
bd46dd60a0 Add a temporary timeout to handling an IO request in the communicator
It's nicer to time out in the communicator and return an error to the
backend than to PANIC the backend.
2025-07-04 16:08:22 +03:00
Heikki Linnakangas
5f2d476a58 Add request ID to io-in-progress locking table, to ease debugging
I also added INFO messages for when a backend blocks on the
io-in-progress lock. It's probably too noisy for production, but
useful now to get a picture of how much it happens.
2025-07-04 15:55:57 +03:00
Heikki Linnakangas
3231cb6138 Await the io-in-progress locking futures
Otherwise they don't do anything. Oops.
2025-07-04 15:55:57 +03:00
Heikki Linnakangas
e558e0da5c Assign request_id earlier, in the originating backend
Makes it more useful for stitching together logs etc. for a specific
request.
2025-07-04 15:55:55 +03:00
Heikki Linnakangas
70bf2e088d Request multiple block numbers in a single GetPageV request
That's how it was always intended to be used
2025-07-04 15:49:04 +03:00
Heikki Linnakangas
da3f9ee72d cargo fmt 2025-07-04 12:39:41 +03:00
Erik Grinaker
88d1127bf4 Tweak GetPageSplitter 2025-07-03 21:12:26 +02:00
David Freifeld
794bb7a9e8 Merge branch 'quantumish/comm-lfc-integration' into communicator-rewrite 2025-07-03 10:52:29 -07:00
Erik Grinaker
42e4e5a418 Add GetPage request splitting 2025-07-03 18:31:12 +02:00
Heikki Linnakangas
96a817fa2b Fix the case that storage auth token is _not_ used
I broke that in the previous commit while fixing the case of using a token.
2025-07-03 18:39:06 +03:00
Heikki Linnakangas
e7b057f2e8 Fix passing storage JWT token to the communicator process
Makes the 'test_compute_auth_to_pageserver' test pass
2025-07-03 18:14:22 +03:00
Heikki Linnakangas
956c2f4378 cargo fmt 2025-07-03 16:16:42 +03:00
Heikki Linnakangas
3293e4685e Fix cases where pageserver gets stuck waiting for LSN
The compute might make a request with an LSN that it hasn't even
flushed yet.
2025-07-03 16:14:45 +03:00
Erik Grinaker
6f8650782f Client tweaks 2025-07-03 14:54:23 +02:00
Erik Grinaker
14214eb853 Add client shard routing 2025-07-03 14:42:35 +02:00
Erik Grinaker
d4b4724921 Sanity-check Pageserver URLs 2025-07-03 14:18:14 +02:00
Erik Grinaker
9aba9550dd Instrument client methods 2025-07-03 14:11:53 +02:00
Erik Grinaker
375e8e5592 Improve retries and logging 2025-07-03 14:02:43 +02:00
Erik Grinaker
52c586f678 Restructure shard management 2025-07-03 11:51:19 +02:00
Erik Grinaker
de97b73d6e Lint fixes 2025-07-03 10:38:14 +02:00
Heikki Linnakangas
d8556616c9 Fix running Postgres in "vanilla mode", without neon storage
Some tests do that
2025-07-03 00:32:40 +03:00
Heikki Linnakangas
d8296e60e6 Fix caching of newly extended pages
This fixes read errors, e.g. in the test_compute_catalog.py test (and
probably many others).
2025-07-02 23:21:42 +03:00
Heikki Linnakangas
7263d6e2e5 Clarify error message if not_modified_lsn > request_lsn
I'm seeing this error from some python tests, which means there's a
bug on the compute side, of course, but it took me a while to figure
that out.
2025-07-02 23:21:42 +03:00
David Freifeld
b018b0ff60 Fix cargo clippy lints for neon-shmem 2025-07-02 13:01:26 -07:00
David Freifeld
86fb7b966a Update integrated_cache.rs to use new hashmap API 2025-07-02 12:18:37 -07:00
David Freifeld
0c099b0944 Merge branch 'quantumish/lfc-resizable-map' into quantumish/comm-lfc-integration 2025-07-02 12:05:24 -07:00
David Freifeld
2fe27f510d Make neon-shmem tests thread-safe and report errno in panics 2025-07-02 11:57:49 -07:00
David Freifeld
19b5618578 Switch to neon_shmem::sync lock_api and integrate into hashmap 2025-07-02 11:44:38 -07:00
Erik Grinaker
12dade35fa Comment tweaks 2025-07-02 14:47:27 +02:00
Erik Grinaker
1ec63bd6bc Misc pool improvements 2025-07-02 14:42:06 +02:00
Heikki Linnakangas
7012b4aa90 Remove --grpc options from neon_local endpoint reconfigure and start calls
They don't exist in neon_local anymore, and aren't actually used in
tests either.
2025-07-02 15:10:18 +03:00
Heikki Linnakangas
2cc28c75be Fix "ERROR: could not read size of rel ..." in many regression tests.
We were incorrectly skipping the call to communicator_new_rel_create(),
which resulted in an error during index build, when the btree build code
tried to check the size of the newly-created relation.
2025-07-02 14:10:11 +03:00
Erik Grinaker
bf01145ae4 Remove some old code 2025-07-02 11:46:54 +02:00
Erik Grinaker
8ab8fc11a3 Use new PageserverClient 2025-07-02 11:27:56 +02:00
Erik Grinaker
6f0af96a54 Add new PageserverClient 2025-07-02 10:59:40 +02:00
Heikki Linnakangas
9913d2668a print retried pageserver requests to log
Not sure how verbose we want this to be in production, but for now,
more is better.

This shows that many tests are failing with errors like these:

    PG:2025-07-01 23:02:34.311 GMT [1456523] LOG:  [COMMUNICATOR] send_process_get_rel_size_request: got error status: NotFound, message: "Read error", details: [], metadata: MetadataMap { headers: {"content-type": "application/grpc", "date": "Tue, 01 Jul 2025 23:02:34 GMT"} }, retrying

I haven't debugged why that is yet. Did the compute make a bogus request?
2025-07-02 02:04:04 +03:00
Heikki Linnakangas
2fefece77d temporary hack to make regression tests fail faster 2025-07-02 01:42:39 +03:00
Heikki Linnakangas
471191e64e Fix updating relsize cache during WAL replay
This makes some of the test_runner/regress/test_hot_standby.py tests
pass. (Others are still failing.)
2025-07-01 21:22:04 +03:00
Erik Grinaker
f6761760a2 Documentation and tweaks 2025-07-01 17:54:41 +02:00
Erik Grinaker
0bce818d5e Add stream pool 2025-07-01 17:54:41 +02:00
Erik Grinaker
48be1da6ef Add initial client pool 2025-07-01 17:54:41 +02:00
Erik Grinaker
d2efc80e40 Add initial ChannelPool 2025-07-01 17:54:41 +02:00
Erik Grinaker
958c2577f5 pageserver: tighten up page_api::Client 2025-07-01 17:54:41 +02:00
Heikki Linnakangas
175c2e11e3 Add assertions that the legacy relsize cache is not used with the new communicator
And fix a few cases where it was being called
2025-07-01 16:44:25 +03:00
Heikki Linnakangas
efdb07e7b6 Implement function to check if page is in local cache
This is needed for read replicas. There's one more TODO that needs to
be implemented before read replicas work though, in
neon_extend_rel_size()
2025-07-01 16:22:51 +03:00
Heikki Linnakangas
b0970b415c Don't call legacy lfc function when new communicator is used 2025-07-01 15:47:26 +03:00
David Freifeld
9d3e07ef2c Add initial prototype of shmem sync primitives 2025-06-30 17:07:07 -07:00
Heikki Linnakangas
7429dd711c fix the .metrics.socket filename in the ignore list 2025-06-30 23:41:09 +03:00
Heikki Linnakangas
88ac1e356b Ignore the metrics unix domain socket in tests 2025-06-30 23:39:01 +03:00
Erik Grinaker
c3cb1ab98d Merge branch 'main' into communicator-rewrite 2025-06-30 21:07:01 +02:00
Erik Grinaker
81ac4ef43a Add a generic pool prototype 2025-06-30 14:49:34 +02:00
Erik Grinaker
a5b0fc560c Fix/allow remaining clippy lints 2025-06-30 12:36:20 +02:00
Erik Grinaker
67b04f8ab3 Fix a bunch of linter warnings 2025-06-30 11:10:02 +02:00
Erik Grinaker
9d9e3cd08a Fix test_normal_work grpc param 2025-06-30 10:13:46 +02:00
Heikki Linnakangas
97a8f4ef85 Handle unexpected EOF while doing an LFC read more gracefully
There's a bug somewhere because this happens in python regression
tests. We need to hunt that down, but in any case, let's not get stuck
in an infinite loop if it happens.
2025-06-30 00:59:53 +03:00
Heikki Linnakangas
39f31957e3 Handle pageserver response with different number of pages gracefully
Some tests are hitting this case, where pageserver returns 0 page
images in the response to a GetPage request. I suspect it's because
the code doesn't handle sharding correctly? In any case, let's not
panic on it, but return an IO error to the originating backend.
2025-06-29 23:44:28 +03:00
Heikki Linnakangas
924c6a6fdf Fix handling the case that server closes the stream
- Avoid a panic by checking for an Ok(None) response from
  tonic::Streaming::message() instead of just using unwrap()
- There was a race condition if the caller sent the message but the
  receiver task concurrently received Ok(None), indicating the stream
  was closed. (I didn't see that happen in practice, but reading the
  code suggests it could.)
2025-06-29 22:53:39 +03:00
Heikki Linnakangas
7020476bf5 Run cargo fmt 2025-06-29 22:53:09 +03:00
Heikki Linnakangas
80e948db93 Remove unused mock factory
After reading the code a few times, I didn't quite understand what it
was, to be honest, or how it was going to be used. Remove it now to
reduce noise, but we can resurrect it from git history if we need it
in the future.
2025-06-29 22:52:48 +03:00
Heikki Linnakangas
bfb30d434c minor code tidy-up 2025-06-29 22:51:34 +03:00
Heikki Linnakangas
f3ba201800 Run cargo fmt 2025-06-29 21:21:07 +03:00
Heikki Linnakangas
8b7796cbfa wip 2025-06-29 21:20:48 +03:00
Heikki Linnakangas
fdc7e9c2a4 Extract repeated code to look up RequestTracker into a helper function 2025-06-29 21:20:14 +03:00
Heikki Linnakangas
a352d290eb Plumb through both libpq and grpc connection strings to the compute
Add a new 'pageserver_connection_info' field in the compute spec. It
replaces the old 'pageserver_connstring' field with a more complicated
struct that includes both libpq and grpc URLs, for each shard (or only
one of the URLs, depending on the configuration). It also includes
a flag suggesting which one to use; compute_ctl now uses it to decide
which protocol to use for the basebackup.

This is compatible with everything that's in production, because the
control plane never used the 'pageserver_connstring' field. That was
added a long time ago with the idea that it would replace the code
that digs the 'neon.pageserver_connstring' GUC from the list of
Postgres settings, but we never got around to doing that in the control
plane. Hence, it was only used with neon_local. But the plan now is to
pass the 'pageserver_connection_info' from the control plane, and once
that's fully deployed everywhere, the code to parse
'neon.pageserver_connstring' in compute_ctl can be removed.

The 'grpc' flag on an endpoint in endpoint config is now more of a
suggestion. Compute_ctl gets both URLs, so it can choose to use libpq
or grpc as it wishes. It currently always obeys the 'prefer_grpc' flag
that's part of the connection info though. Postgres however uses grpc
iff the new rust-based communicator is enabled.

TODO/plan for the control plane:

- Start to pass `pageserver_connection_info` in the spec file.
- Also keep the current `neon.pageserver_connstring` setting for now,
  for backwards compatibility with old computes

After that, the `pageserver_connection_info.prefer_grpc` flag in the
spec file can be used to control whether compute_ctl uses grpc or
libpq.  The actual compute's grpc usage will be controlled by the
`neon.enable_new_communicator` GUC. It can be set separately from
'prefer_grpc'.

Later:

- Once all old computes are gone, remove the code to pass
  `neon.pageserver_connstring`
2025-06-29 18:16:49 +03:00
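For orientation, here is a minimal sketch of the shape of the new
'pageserver_connection_info' field, inferred from this commit message and
from the compute_ctl changes in the diff further below. It is illustrative
only; the actual definitions live in compute_api::spec and may differ in
details (in particular, the concrete map type for the shards field is an
assumption here):

    // Illustrative sketch only, not the actual compute_api::spec definitions.
    use std::collections::HashMap;

    pub struct PageserverShardConnectionInfo {
        pub libpq_url: Option<String>,
        pub grpc_url: Option<String>,
    }

    pub struct PageserverConnectionInfo {
        // Keyed by shard number; each shard may carry a libpq URL, a grpc
        // URL, or both, depending on the configuration.
        pub shards: HashMap<u32, PageserverShardConnectionInfo>, // map type assumed
        // Suggests which protocol compute_ctl should use for the basebackup;
        // Postgres itself uses grpc iff the new rust-based communicator is enabled.
        pub prefer_grpc: bool,
    }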
Heikki Linnakangas
8c122a1c98 Don't call into the old LFC when using the new communicator
This fixes errors like `index "pg_class_relname_nsp_index" contains
unexpected zero page at block 2` when running the python tests

smgrzeroextend() still called into the old LFC's lfc_write() function,
even when using the new communicator, which zeroed some arbitrary
pages in the LFC file, overwriting pages managed by the new LFC
implementation in `integrated_cache.rs`.
2025-06-29 17:40:46 +03:00
David Freifeld
74330920ee Simplify API, squash bugs, and expand hashmap test suite 2025-06-27 17:11:22 -07:00
David Freifeld
c3c136ef3a Remove statistics utilities from neon_shmem crate 2025-06-27 17:10:52 -07:00
David Freifeld
78b6da270b Sketchily integrate hashmap rewrite with integrated_cache 2025-06-26 16:45:48 -07:00
David Freifeld
47664e40d4 Initial work in visualizing properties of hashmap 2025-06-26 16:00:33 -07:00
David Freifeld
b1e3161d4e Satisfy cargo clippy lints, simplify shrinking API 2025-06-26 14:32:32 -07:00
David Freifeld
4713715c59 Merge branch 'communicator-rewrite' of github.com:neondatabase/neon into communicator-rewrite 2025-06-26 10:26:41 -07:00
David Freifeld
1e74b52f7e Merge branch 'quantumish/lfc-resizable-map' into communicator-rewrite 2025-06-26 10:26:22 -07:00
Erik Grinaker
e3ecdfbecc pgxn/neon: actually use UNAME_S 2025-06-26 12:38:44 +02:00
Erik Grinaker
d08e553835 pgxn/neon: fix callback_get_request_lsn_unsafe return type 2025-06-26 12:33:59 +02:00
Erik Grinaker
7fffb5b4df pgxn/neon: fix macOS build 2025-06-26 12:33:39 +02:00
David Freifeld
1fb3639170 Properly change type of HashMapInit in .with_hasher() 2025-06-25 03:03:19 -07:00
David Freifeld
00dfaa2eb4 Add Criterion microbenchmarks for rehashing and insertions 2025-06-24 16:30:59 -07:00
David Freifeld
ae740ca1bb Document hashmap implementation, fix get_bucket_for_value
Previously, `get_bucket_for_value` incorrectly divided by the size of
`V` to get the bucket index. Now it divides by the size of `Bucket<K,V>`.
2025-06-24 16:27:17 -07:00
David Freifeld
24e6c68772 Remove prev entry tracking, refactor HashMapInit into proper builder 2025-06-24 13:34:22 -07:00
David Freifeld
93a45708ff Change finish_shrink to remap entries in shrunk space 2025-06-23 16:15:43 -07:00
Heikki Linnakangas
46b5c0be0b Remove duplicated migration script
I messed this up during the merge I guess?
2025-06-23 19:46:32 +03:00
Heikki Linnakangas
2d913ff125 fix some mismerges 2025-06-23 18:21:16 +03:00
Heikki Linnakangas
e90be06d46 silence a few compiler warnings
about unnecessary 'mut's and 'use's
2025-06-23 18:16:54 +03:00
Heikki Linnakangas
356ba67607 Merge remote-tracking branch 'origin/main' into HEAD
I also included build script changes from
https://github.com/neondatabase/neon/pull/12266, which is not yet
merged but will be soon.
2025-06-23 17:46:30 +03:00
David Freifeld
610ea22c46 Generalize map to allow arbitrary hash fns, add clear() helper method 2025-06-20 11:46:02 -07:00
Heikki Linnakangas
1847f4de54 Add missing #include.
Got a warning on macos without this
2025-06-18 17:26:20 +03:00
David Freifeld
477648b8cd Clean up hashmap implementation, add bucket tests 2025-06-17 11:23:10 -07:00
Heikki Linnakangas
e8af3a2811 remove unused struct in example code, to silence compiler warning 2025-06-17 02:09:21 +03:00
Heikki Linnakangas
b603e3dddb Silence compiler warnings in example code 2025-06-17 02:07:33 +03:00
Heikki Linnakangas
83007782fd fix compilation of example 2025-06-17 02:07:15 +03:00
David Freifeld
bb1e359872 Add testing utilities for hash map, freelist bugfixes 2025-06-16 16:02:39 -07:00
David Freifeld
ac87544e79 Implement shrinking, add basic tests for core operations 2025-06-16 13:13:38 -07:00
David Freifeld
b6b122e07b nw: add shrinking and deletion skeletons 2025-06-16 10:20:30 -07:00
Erik Grinaker
782062014e Fix test_normal_work endpoint restart 2025-06-16 10:16:27 +02:00
Erik Grinaker
d0b3629412 Tweak base backups 2025-06-13 13:47:26 -07:00
Heikki Linnakangas
16d6898e44 git add missing file 2025-06-12 02:37:59 +03:00
Erik Grinaker
f4d51c0f5c Use gRPC for test_normal_work 2025-06-09 22:51:15 +02:00
Erik Grinaker
ec17ae0658 Handle gRPC basebackups in compute_ctl 2025-06-09 22:50:57 +02:00
Erik Grinaker
9ecce60ded Plumb gRPC addr through storage-controller 2025-06-09 20:24:18 +02:00
Erik Grinaker
e74a957045 test_runner: initial gRPC protocol support 2025-06-06 16:56:33 +02:00
Erik Grinaker
396a16a3b2 test_runner: enable gRPC Pageserver 2025-06-06 14:55:29 +02:00
Elizabeth Murray
7140a50225 Minor changes to get integration tests to run for communicator. 2025-06-06 04:32:51 +02:00
Elizabeth Murray
68f18ccacf Request Tracker Prototype
Does not include splitting requests across shards.
2025-06-05 13:32:18 -07:00
Heikki Linnakangas
786888d93f Instead of a fixed TCP port for metrics, listen on a unix domain socket
That avoids clashes if you run two computes at the same time. More
secure too. We might want to have a TCP port in the long run, but this
is less trouble for now.

To see the metrics with curl you can use:

    curl --unix-socket .neon/endpoints/ep-main/pgdata/.metrics.socket http://localhost/metrics
2025-06-05 21:28:11 +03:00
Heikki Linnakangas
255537dda1 avoid hitting assertion failure in MarkPostmasterChildWalSender() 2025-06-05 20:08:32 +03:00
Erik Grinaker
8b494f6a24 Ignore communicator_bindings.h 2025-06-05 17:52:50 +02:00
Erik Grinaker
28a61741b3 Mangle gRPC connstrings to use port 51051 2025-06-05 17:46:58 +02:00
Heikki Linnakangas
10b936bf03 Use a custom Rust implementation to replace the LFC hash table
The new implementation lives in a separately allocated shared memory
area, which could be resized. Resizing it isn't actually implemented
yet, though. It would require some co-operation from the LFC code.
2025-06-05 18:31:29 +03:00
Erik Grinaker
2fb6164bf8 Misc build fixes 2025-06-05 17:22:11 +02:00
Erik Grinaker
328f28dfe5 impl Default for SlabBlockHeader 2025-06-05 17:18:28 +02:00
Erik Grinaker
95838056da Fix RelTag fields 2025-06-05 17:13:51 +02:00
Heikki Linnakangas
6145cfd1c2 Move neon-shmem facility to separate module within the crate 2025-06-05 18:13:03 +03:00
Erik Grinaker
6d451654f1 Remove generated communicator_bindings.h 2025-06-05 17:12:13 +02:00
Heikki Linnakangas
96b4de1de6 Make LFC chunk size a compile-time constant
A runtime setting is nicer, but the next commit will replace the hash
table with a different implementation that requires the value size to
be a compile-time constant.
2025-06-05 18:08:40 +03:00
Heikki Linnakangas
9fdf5fbb7e Use a separate freelist to track LFC "holes"
When the LFC is shrunk, we punch holes in the underlying file to
release the disk space to the OS. We tracked them in the same hash table
as the in-use entries, because that was convenient. However, I'm
working on being able to shrink the hash table too, and once we do
that, we'll need some other place to track the holes. Implement a
simple scheme of an in-memory array and a chain of on-disk blocks for
that.
2025-06-05 18:08:35 +03:00
Erik Grinaker
37c58522a2 Merge branch 'main' into communicator-rewrite 2025-06-05 15:08:05 +02:00
Erik Grinaker
4b6f02e47d Merge branch 'main' into communicator-rewrite 2025-06-04 10:23:29 +02:00
Erik Grinaker
8202c6172f Merge branch 'main' into communicator-rewrite 2025-06-03 16:04:31 +02:00
Erik Grinaker
69a47d789d pageserver: remove gRPC compute service prototype 2025-06-03 13:47:21 +02:00
Erik Grinaker
b36f880710 Fix Linux build failures 2025-06-03 13:37:56 +02:00
Erik Grinaker
745b750f33 Merge branch 'main' into communicator-rewrite 2025-06-03 13:29:45 +02:00
Heikki Linnakangas
f06bb2bbd8 Implement growing the hash table. Fix unit tests. 2025-05-29 15:54:55 +03:00
Heikki Linnakangas
b3c25418a6 Add metrics to track memory usage of the rust communicator 2025-05-29 02:14:01 +03:00
Heikki Linnakangas
33549bad1d use separate hash tables for relsize cache and block mappings 2025-05-28 23:57:55 +03:00
Heikki Linnakangas
009168d711 Add placeholder shmem hashmap implementation
Use that instead of the half-baked Adaptive Radix Tree
implementation. ART would probably be better in the long run, but more
complicated to implement.
2025-05-28 11:08:35 +03:00
Elizabeth Murray
7c9bd542a6 Fix compile warnings, minor cleanup. 2025-05-26 06:30:48 -07:00
Elizabeth Murray
014823b305 Add a new iteration of a new client pool with some updates. 2025-05-26 05:29:32 -07:00
Elizabeth Murray
af9379ccf6 Use a semaphore to gate access to connections. Add metrics for testing. 2025-05-26 05:28:50 -07:00
Heikki Linnakangas
bb28109ffa Merge remote-tracking branch 'origin/main' into communicator-rewrite-with-integrated-cache
There were conflicts because of the differences in the page_api
protocol that was merged to main vs what was on the branch. I adapted
the code for the protocol in main.
2025-05-26 11:52:32 +03:00
Elizabeth Murray
60a0bec1c0 Set default max consumers per connection to a high number. 2025-05-19 07:00:39 -07:00
Elizabeth Murray
31fa7a545d Remove unnecessary info include now that the info message is gone. 2025-05-19 06:52:07 -07:00
Elizabeth Murray
ac464c5f2c Return info message that was used for debugging. 2025-05-19 06:39:16 -07:00
Elizabeth Murray
0dddb1e373 Add back whitespace that was removed. 2025-05-19 06:34:52 -07:00
Elizabeth Murray
3acb263e62 Add first iteration of simulating a flakey network with a custom TCP. 2025-05-19 06:33:30 -07:00
Elizabeth Murray
1e83398cdd Correct out-of-date comment. 2025-05-14 07:31:52 -07:00
Elizabeth Murray
be8ed81532 Connection pool: update error accounting, sweep idle connections, add config options. 2025-05-14 07:31:52 -07:00
Heikki Linnakangas
12b08c4b82 Fix shutdown 2025-05-14 01:49:55 +03:00
Heikki Linnakangas
827358dd03 Handle OOMs a little more gracefully 2025-05-12 23:33:22 +03:00
Heikki Linnakangas
d367273000 minor cleanup 2025-05-12 23:11:55 +03:00
Heikki Linnakangas
e2bad5d9e9 Add debugging HTTP endpoint for dumping the cache tree 2025-05-12 22:54:03 +03:00
Heikki Linnakangas
5623e4665b bunch of fixes 2025-05-12 18:40:54 +03:00
Heikki Linnakangas
8abb4dab6d implement shrinking nodes 2025-05-12 03:57:10 +03:00
Heikki Linnakangas
731667ac37 better metrics of the art tree 2025-05-12 02:08:51 +03:00
Heikki Linnakangas
6a1374d106 Pack tree node structs more tightly, avoiding alignment padding 2025-05-12 01:01:58 +03:00
Heikki Linnakangas
f7c908f2f0 more metrics 2025-05-12 01:01:50 +03:00
Heikki Linnakangas
86671e3a0b Add a bunch of metric counters 2025-05-11 20:11:13 +03:00
Heikki Linnakangas
319cd74f73 Fix eviction 2025-05-11 19:34:50 +03:00
Heikki Linnakangas
0efefbf77c Add a few metrics, fix page eviction 2025-05-10 03:13:28 +03:00
Heikki Linnakangas
e6a4171fa1 fix concurrency issues with the LFC
- Add another locking hash table to track which cached pages are currently being
  modified, by smgrwrite() or smgrread() or by prefetch.

- Use single-value Leaf pages in the art tree. That seems simpler after all,
  and it eliminates some corner cases where a Value needed to be cloned, which
  made it tricky to use atomics or other interior mutability on the Values
2025-05-10 02:36:48 +03:00
Heikki Linnakangas
0c25ea9e31 reduce LOG noise 2025-05-09 18:27:36 +03:00
Heikki Linnakangas
6692321026 Remove dependency on io_uring, use plain std::fs ops instead
io_uring is a great idea in the long term, but for now, let's make it
easier to develop locally on macos, where io_uring is not available.
2025-05-06 17:46:21 +03:00
Heikki Linnakangas
791df28755 Linked list fix and add unit test 2025-05-06 16:46:54 +03:00
Heikki Linnakangas
d20da994f4 git add missing file 2025-05-06 15:36:48 +03:00
Heikki Linnakangas
6dbbdaae73 run 'cargo fmt' 2025-05-06 15:35:56 +03:00
Heikki Linnakangas
977bc09d2a Bunch of fixes, smarter iterator, metrics exporter 2025-05-06 15:28:50 +03:00
Heikki Linnakangas
44269fcd5e Implement simple eviction and free block tracking 2025-05-06 15:28:15 +03:00
Heikki Linnakangas
44cc648dc8 Implement iterator over keys
the implementation is not very optimized, but probably good enough for an MVP
2025-05-06 15:27:38 +03:00
Heikki Linnakangas
884e028a4a implement deletion in art tree 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
42df3e5453 debugging stats 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
fc743e284f more work on allocators 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
d02f9a2139 Collect garbage, handle OOMs 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
083118e98e Implement epoch system 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
54cd2272f1 more memory allocation stuff 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
e40193e3c8 simple block-based allocator 2025-05-06 15:27:38 +03:00
Heikki Linnakangas
ce9f7bacc1 Fix communicator client for recent changes in protocol and client code 2025-05-06 15:26:51 +03:00
Heikki Linnakangas
b7891f8fe8 Include 'neon-shard-id' header in client requests 2025-05-06 15:23:30 +03:00
Elizabeth Murray
5f2adaa9ad Remove some additional debug info messages. 2025-05-02 10:50:53 -07:00
Elizabeth Murray
3e5e396c8d Remove some debug info messages. 2025-05-02 10:24:18 -07:00
Elizabeth Murray
9d781c6fda Add a connection pool module to the grpc client. 2025-05-02 10:22:33 -07:00
Erik Grinaker
cf5d038472 service documentation 2025-05-02 15:20:12 +02:00
Erik Grinaker
d785100c02 page_api: add GetPageRequest::class 2025-05-02 10:48:32 +02:00
Erik Grinaker
2c0d930e3d page_api: add GetPageResponse::status 2025-04-30 16:48:45 +02:00
Erik Grinaker
66171a117b page_api: add GetPageRequestBatch 2025-04-30 15:31:11 +02:00
Erik Grinaker
df2806e7a0 page_api: add GetPageRequest::id 2025-04-30 15:00:16 +02:00
Erik Grinaker
07631692db page_api: protobuf comments 2025-04-30 12:36:11 +02:00
Erik Grinaker
4c77397943 Add neon-shard-id header 2025-04-30 11:18:06 +02:00
Erik Grinaker
7bb58be546 Use authorization header instead of neon-auth-token 2025-04-30 10:38:44 +02:00
Erik Grinaker
b5373de208 page_api: add get_slru_segment() 2025-04-29 17:59:27 +02:00
Erik Grinaker
b86c610f42 page_api: tweaks 2025-04-29 17:23:51 +02:00
Erik Grinaker
0f520d79ab pageserver: rename data_api to page_api 2025-04-29 15:58:52 +02:00
Heikki Linnakangas
93eb7bb6b8 include lots of changes that went missing by accident 2025-04-29 15:32:27 +03:00
Heikki Linnakangas
e58d0fece1 New communicator, with "integrated" cache accessible from all processes 2025-04-29 11:52:44 +03:00
80 changed files with 14981 additions and 1055 deletions

1
.gitignore vendored

@@ -15,6 +15,7 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
pgxn/neon/communicator/communicator_bindings.h
# Coverage
*.profraw

421
Cargo.lock generated

@@ -247,12 +247,32 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "atomic"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a89cbf775b137e9b968e67227ef7f775587cde3fd31b0d8599dbd0f598a48340"
dependencies = [
"bytemuck",
]
[[package]]
name = "atomic-take"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -687,13 +707,40 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum"
version = "0.7.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f"
dependencies = [
"async-trait",
"axum-core 0.4.5",
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"itoa",
"matchit 0.7.3",
"memchr",
"mime",
"percent-encoding",
"pin-project-lite",
"rustversion",
"serde",
"sync_wrapper 1.0.1",
"tower 0.5.2",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d6fd624c75e18b3b4c6b9caf42b1afe24437daaee904069137d8bab077be8b8"
dependencies = [
"axum-core",
"axum-core 0.5.0",
"base64 0.22.1",
"bytes",
"form_urlencoded",
@@ -701,10 +748,10 @@ dependencies = [
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"itoa",
"matchit",
"matchit 0.8.4",
"memchr",
"mime",
"percent-encoding",
@@ -724,6 +771,26 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-core"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199"
dependencies = [
"async-trait",
"bytes",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper 1.0.1",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum-core"
version = "0.5.0"
@@ -750,8 +817,8 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
dependencies = [
"axum",
"axum-core",
"axum 0.8.1",
"axum-core 0.5.0",
"bytes",
"form_urlencoded",
"futures-util",
@@ -1029,9 +1096,23 @@ checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
[[package]]
name = "bytemuck"
version = "1.16.3"
version = "1.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "102087e286b4677862ea56cf8fc58bb2cdfa8725c40ffb80fe3a008eb7f2fc83"
checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422"
dependencies = [
"bytemuck_derive",
]
[[package]]
name = "bytemuck_derive"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ecc273b49b3205b83d648f0690daa588925572cc5063745bfe547fe7ec8e1a1"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "byteorder"
@@ -1288,10 +1369,31 @@ dependencies = [
[[package]]
name = "communicator"
version = "0.1.0"
version = "0.0.0"
dependencies = [
"atomic_enum",
"axum 0.8.1",
"bytes",
"cbindgen",
"clashmap",
"http 1.1.0",
"libc",
"metrics",
"neon-shmem",
"nix 0.30.1",
"pageserver_api",
"pageserver_client_grpc",
"pageserver_page_api",
"prometheus",
"prost 0.13.5",
"thiserror 1.0.69",
"tokio",
"tokio-pipe",
"tonic 0.12.3",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"workspace_hack",
]
@@ -1321,7 +1423,7 @@ dependencies = [
"aws-sdk-kms",
"aws-sdk-s3",
"aws-smithy-types",
"axum",
"axum 0.8.1",
"axum-extra",
"base64 0.22.1",
"bytes",
@@ -1625,9 +1727,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.19"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crossterm"
@@ -2081,7 +2183,7 @@ name = "endpoint_storage"
version = "0.0.1"
dependencies = [
"anyhow",
"axum",
"axum 0.8.1",
"axum-extra",
"camino",
"camino-tempfile",
@@ -2342,6 +2444,12 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
@@ -2362,7 +2470,7 @@ dependencies = [
"futures-core",
"futures-sink",
"http-body-util",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"pin-project",
"rand 0.8.5",
@@ -2532,6 +2640,18 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "getrandom"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi 0.14.2+wasi-0.2.4",
]
[[package]]
name = "gettid"
version = "0.1.3"
@@ -2697,6 +2817,16 @@ version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
[[package]]
name = "hashbrown"
version = "0.15.4"
source = "git+https://github.com/quantumish/hashbrown.git?rev=6610e6d#6610e6d2b1f288ef7b0709a3efefbc846395dc5e"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "hashlink"
version = "0.9.1"
@@ -2921,9 +3051,9 @@ dependencies = [
[[package]]
name = "httparse"
version = "1.8.0"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904"
checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
[[package]]
name = "httpdate"
@@ -2973,9 +3103,9 @@ dependencies = [
[[package]]
name = "hyper"
version = "1.4.1"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80"
dependencies = [
"bytes",
"futures-channel",
@@ -3015,7 +3145,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c"
dependencies = [
"futures-util",
"http 1.1.0",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"rustls 0.22.4",
"rustls-pki-types",
@@ -3030,7 +3160,7 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
dependencies = [
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"pin-project-lite",
"tokio",
@@ -3039,20 +3169,21 @@ dependencies = [
[[package]]
name = "hyper-util"
version = "0.1.7"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9"
checksum = "dc2fdfdbff08affe55bb779f33b053aa1fe5dd5b54c257343c17edfa55711bdb"
dependencies = [
"bytes",
"futures-channel",
"futures-core",
"futures-util",
"http 1.1.0",
"http-body 1.0.0",
"hyper 1.4.1",
"hyper 1.6.0",
"libc",
"pin-project-lite",
"socket2",
"tokio",
"tower 0.4.13",
"tower-service",
"tracing",
]
@@ -3596,9 +3727,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
[[package]]
name = "lock_api"
version = "0.4.10"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
dependencies = [
"autocfg",
"scopeguard",
@@ -3641,6 +3772,12 @@ dependencies = [
"regex-automata 0.1.10",
]
[[package]]
name = "matchit"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94"
[[package]]
name = "matchit"
version = "0.8.4"
@@ -3748,8 +3885,8 @@ dependencies = [
"procfs",
"prometheus",
"rand 0.8.5",
"rand_distr",
"twox-hash",
"rand_distr 0.4.3",
"twox-hash 1.6.3",
]
[[package]]
@@ -3836,10 +3973,35 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
name = "neon-shmem"
version = "0.1.0"
dependencies = [
"ahash",
"atomic",
"bytemuck",
"criterion",
"foldhash",
"hashbrown 0.15.4",
"libc",
"lock_api",
"nix 0.30.1",
"rand 0.9.1",
"rand_distr 0.5.1",
"rustc-hash 2.1.1",
"seahash",
"tempfile",
"thiserror 1.0.69",
"twox-hash 2.1.1",
"workspace_hack",
"xxhash-rust",
]
[[package]]
name = "neonart"
version = "0.1.0"
dependencies = [
"crossbeam-utils",
"rand 0.9.1",
"rand_distr 0.5.1",
"spin",
"tracing",
]
[[package]]
@@ -4275,15 +4437,19 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"axum 0.8.1",
"bytes",
"camino",
"clap",
"futures",
"hdrhistogram",
"http 1.1.0",
"humantime",
"humantime-serde",
"metrics",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
"pageserver_page_api",
"rand 0.8.5",
"reqwest",
@@ -4367,6 +4533,7 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pageserver_page_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4380,6 +4547,7 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.5",
"rand 0.8.5",
"range-set-blaze",
"regex",
@@ -4416,7 +4584,7 @@ dependencies = [
"tower 0.5.2",
"tracing",
"tracing-utils",
"twox-hash",
"twox-hash 1.6.3",
"url",
"utils",
"uuid",
@@ -4483,6 +4651,38 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_client_grpc"
version = "0.1.0"
dependencies = [
"anyhow",
"arc-swap",
"async-trait",
"bytes",
"chrono",
"compute_api",
"dashmap 5.5.0",
"futures",
"http 1.1.0",
"hyper 1.6.0",
"hyper-util",
"metrics",
"pageserver_api",
"pageserver_page_api",
"priority-queue",
"rand 0.8.5",
"scopeguard",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tokio-util",
"tonic 0.13.1",
"tower 0.4.13",
"tracing",
"utils",
"uuid",
]
[[package]]
name = "pageserver_compaction"
version = "0.1.0"
@@ -4608,7 +4808,7 @@ dependencies = [
"paste",
"seq-macro",
"thrift",
"twox-hash",
"twox-hash 1.6.3",
"zstd",
"zstd-sys",
]
@@ -4654,6 +4854,15 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -5087,6 +5296,17 @@ dependencies = [
"elliptic-curve 0.13.8",
]
[[package]]
name = "priority-queue"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5676d703dda103cbb035b653a9f11448c0a7216c7926bd35fcb5865475d0c970"
dependencies = [
"autocfg",
"equivalent",
"indexmap 2.9.0",
]
[[package]]
name = "proc-macro2"
version = "1.0.94"
@@ -5287,7 +5507,7 @@ dependencies = [
"humantime",
"humantime-serde",
"hyper 0.14.30",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"indexmap 2.9.0",
"ipnet",
@@ -5311,7 +5531,7 @@ dependencies = [
"postgres_backend",
"pq_proto",
"rand 0.8.5",
"rand_distr",
"rand_distr 0.4.3",
"rcgen",
"redis",
"regex",
@@ -5415,6 +5635,12 @@ dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
[[package]]
name = "rand"
version = "0.7.3"
@@ -5439,6 +5665,16 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.3",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@@ -5459,6 +5695,16 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.3",
]
[[package]]
name = "rand_core"
version = "0.5.1"
@@ -5477,6 +5723,15 @@ dependencies = [
"getrandom 0.2.11",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom 0.3.3",
]
[[package]]
name = "rand_distr"
version = "0.4.3"
@@ -5487,6 +5742,16 @@ dependencies = [
"rand 0.8.5",
]
[[package]]
name = "rand_distr"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
dependencies = [
"num-traits",
"rand 0.9.1",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -5683,7 +5948,7 @@ dependencies = [
"http-body-util",
"http-types",
"humantime-serde",
"hyper 1.4.1",
"hyper 1.6.0",
"itertools 0.10.5",
"metrics",
"once_cell",
@@ -5723,7 +5988,7 @@ dependencies = [
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-rustls 0.26.0",
"hyper-util",
"ipnet",
@@ -5780,7 +6045,7 @@ dependencies = [
"futures",
"getrandom 0.2.11",
"http 1.1.0",
"hyper 1.4.1",
"hyper 1.6.0",
"parking_lot 0.11.2",
"reqwest",
"reqwest-middleware",
@@ -5801,7 +6066,7 @@ dependencies = [
"async-trait",
"getrandom 0.2.11",
"http 1.1.0",
"matchit",
"matchit 0.8.4",
"opentelemetry",
"reqwest",
"reqwest-middleware",
@@ -6288,6 +6553,12 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"
[[package]]
name = "seahash"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "sec1"
version = "0.3.0"
@@ -6749,12 +7020,12 @@ dependencies = [
[[package]]
name = "socket2"
version = "0.5.5"
version = "0.5.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
dependencies = [
"libc",
"windows-sys 0.48.0",
"windows-sys 0.52.0",
]
[[package]]
@@ -6762,6 +7033,9 @@ name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
dependencies = [
"lock_api",
]
[[package]]
name = "spinning_top"
@@ -6820,7 +7094,7 @@ dependencies = [
"http-body-util",
"http-utils",
"humantime",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"metrics",
"once_cell",
@@ -7429,6 +7703,16 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -7623,16 +7907,25 @@ version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
dependencies = [
"async-stream",
"async-trait",
"axum 0.7.9",
"base64 0.22.1",
"bytes",
"h2 0.4.4",
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"hyper 1.6.0",
"hyper-timeout",
"hyper-util",
"percent-encoding",
"pin-project",
"prost 0.13.5",
"socket2",
"tokio",
"tokio-stream",
"tower 0.4.13",
"tower-layer",
"tower-service",
"tracing",
@@ -7645,7 +7938,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
dependencies = [
"async-trait",
"axum",
"axum 0.8.1",
"base64 0.22.1",
"bytes",
"flate2",
@@ -7653,7 +7946,7 @@ dependencies = [
"http 1.1.0",
"http-body 1.0.0",
"http-body-util",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-timeout",
"hyper-util",
"percent-encoding",
@@ -7706,11 +7999,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
dependencies = [
"futures-core",
"futures-util",
"indexmap 1.9.3",
"pin-project",
"pin-project-lite",
"rand 0.8.5",
"slab",
"tokio",
"tokio-util",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
@@ -7981,6 +8279,15 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "twox-hash"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
dependencies = [
"rand 0.9.1",
]
[[package]]
name = "typed-json"
version = "0.1.1"
@@ -8194,7 +8501,7 @@ name = "vm_monitor"
version = "0.1.0"
dependencies = [
"anyhow",
"axum",
"axum 0.8.1",
"cgroups-rs",
"clap",
"futures",
@@ -8306,6 +8613,15 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wasite"
version = "0.1.0"
@@ -8663,6 +8979,15 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "workspace_hack"
version = "0.1.0"
@@ -8670,8 +8995,8 @@ dependencies = [
"ahash",
"anstream",
"anyhow",
"axum",
"axum-core",
"axum 0.8.1",
"axum-core 0.5.0",
"base64 0.21.7",
"base64ct",
"bytes",
@@ -8703,7 +9028,7 @@ dependencies = [
"hex",
"hmac",
"hyper 0.14.30",
"hyper 1.4.1",
"hyper 1.6.0",
"hyper-util",
"indexmap 2.9.0",
"itertools 0.12.1",
@@ -8828,6 +9153,12 @@ version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
[[package]]
name = "xxhash-rust"
version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
[[package]]
name = "yasna"
version = "0.5.2"


@@ -8,6 +8,7 @@ members = [
"pageserver/compaction",
"pageserver/ctl",
"pageserver/client",
"pageserver/client_grpc",
"pageserver/pagebench",
"pageserver/page_api",
"proxy",
@@ -34,6 +35,7 @@ members = [
"libs/pq_proto",
"libs/tenant_size_model",
"libs/metrics",
"libs/neonart",
"libs/postgres_connection",
"libs/remote_storage",
"libs/tracing-utils",
@@ -90,6 +92,7 @@ clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
comfy-table = "7.1"
const_format = "0.2"
crossbeam-utils = "0.8.21"
crc32c = "0.6"
diatomic-waker = { version = "0.2.3" }
either = "1.8"
@@ -148,6 +151,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -184,6 +188,7 @@ smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
spin = "0.9.8"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -195,7 +200,6 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -237,6 +241,9 @@ x509-cert = { version = "0.2.5" }
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
@@ -260,6 +267,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_client_grpc = { path = "./pageserver/client_grpc" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
pageserver_page_api = { path = "./pageserver/page_api" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }


@@ -1,4 +1,4 @@
use anyhow::{Context, Result};
use anyhow::{Context, Result, anyhow};
use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
@@ -6,7 +6,8 @@ use compute_api::responses::{
LfcPrewarmState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
PageserverShardConnectionInfo, PgIdent,
};
use futures::StreamExt;
use futures::future::join_all;
@@ -224,7 +225,7 @@ pub struct ParsedSpec {
pub spec: ComputeSpec,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_connstr: String,
pub pageserver_conninfo: PageserverConnectionInfo,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
/// k8s dns name and port
@@ -271,6 +272,27 @@ impl ParsedSpec {
}
}
fn extract_pageserver_conninfo_from_guc(
pageserver_connstring_guc: &str,
) -> PageserverConnectionInfo {
PageserverConnectionInfo {
shards: pageserver_connstring_guc
.split(',')
.enumerate()
.map(|(i, connstr)| {
(
i as u32,
PageserverShardConnectionInfo {
libpq_url: Some(connstr.to_string()),
grpc_url: None,
},
)
})
.collect(),
prefer_grpc: false,
}
}
impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = String;
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
@@ -280,11 +302,17 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
// For backwards-compatibility, the top-level fields in the spec file
// may be empty. In that case, we need to dig them from the GUCs in the
// cluster.settings field.
let pageserver_connstr = spec
.pageserver_connstring
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let pageserver_conninfo = match &spec.pageserver_connection_info {
Some(x) => x.clone(),
None => {
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
extract_pageserver_conninfo_from_guc(&guc)
} else {
return Err("pageserver connstr should be provided".to_string());
}
}
};
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
@@ -334,7 +362,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
let res = ParsedSpec {
spec,
pageserver_connstr,
pageserver_conninfo,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
@@ -424,7 +452,7 @@ impl ComputeNode {
let mut new_state = ComputeState::new();
if let Some(spec) = config.spec {
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow!(msg))?;
new_state.pspec = Some(pspec);
}
@@ -1025,12 +1053,11 @@ impl ComputeNode {
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let started = Instant::now();
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
let (connected, size) = if spec.pageserver_conninfo.prefer_grpc {
self.try_get_basebackup_grpc(spec, lsn)?
} else {
self.try_get_basebackup_libpq(spec, lsn)?
};
let mut state = self.state.lock().unwrap();
@@ -1045,20 +1072,21 @@ impl ComputeNode {
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
/// the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_connstr = spec
.pageserver_connstr
.split(',')
.next()
.unwrap()
.to_string();
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
let shard0 = spec
.pageserver_conninfo
.shards
.get(&0)
.expect("shard 0 connection info missing");
let shard0_url = shard0.grpc_url.clone().expect("no grpc_url for shard 0");
let shard_index = match spec.pageserver_conninfo.shards.len() as u8 {
0 | 1 => ShardIndex::unsharded(),
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
};
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
shard0_connstr,
let mut client = page_api::Client::connect(
shard0_url,
spec.tenant_id,
spec.timeline_id,
shard_index,
@@ -1093,8 +1121,13 @@ impl ComputeNode {
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
/// when the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
let shard0 = spec
.pageserver_conninfo
.shards
.get(&0)
.expect("shard 0 connection info missing");
let shard0_connstr = shard0.libpq_url.clone().expect("no libpq_url for shard 0");
let mut config = postgres::Config::from_str(&shard0_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
@@ -1180,10 +1213,7 @@ impl ComputeNode {
return result;
}
Err(ref e) if attempts < max_attempts => {
warn!(
"Failed to get basebackup: {} (attempt {}/{})",
e, attempts, max_attempts
);
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
retry_period_ms *= 1.5;
}
@@ -1392,16 +1422,8 @@ impl ComputeNode {
}
};
info!(
"getting basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr
);
self.get_basebackup(compute_state, lsn).with_context(|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr
)
})?;
self.get_basebackup(compute_state, lsn)
.with_context(|| format!("failed to get basebackup@{lsn}"))?;
// Update pg_hba.conf received with basebackup.
update_pg_hba(pgdata_path)?;
@@ -2065,7 +2087,7 @@ LIMIT 100",
self.params
.remote_ext_base_url
.as_ref()
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
.ok_or(DownloadError::BadInput(anyhow!(
"Remote extensions storage is not configured",
)))?;
@@ -2261,7 +2283,7 @@ LIMIT 100",
let remote_extensions = spec
.remote_extensions
.as_ref()
.ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;
.ok_or(anyhow!("Remote extensions are not configured"))?;
info!("parse shared_preload_libraries from spec.cluster.settings");
let mut libs_vec = Vec::new();
@@ -2340,22 +2362,22 @@ LIMIT 100",
/// The operation will time out after a specified duration.
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
let state = self.state.lock().unwrap();
let old_pageserver_connstr = state
let old_pageserver_conninfo = state
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_connstr
.pageserver_conninfo
.clone();
let mut unchanged = true;
let _ = self
.state_changed
.wait_timeout_while(state, duration, |s| {
let pageserver_connstr = &s
let pageserver_conninfo = &s
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_connstr;
unchanged = pageserver_connstr == &old_pageserver_connstr;
.pageserver_conninfo;
unchanged = pageserver_conninfo == &old_pageserver_conninfo;
unchanged
})
.unwrap();

View File

@@ -56,9 +56,51 @@ pub fn write_postgres_conf(
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
if let Some(conninfo) = &spec.pageserver_connection_info {
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
for shardno in 0..conninfo.shards.len() {
let info = conninfo.shards.get(&(shardno as u32)).ok_or_else(|| {
anyhow::anyhow!("shard {shardno} missing from pageserver_connection_info shard map")
})?;
if let Some(url) = &info.libpq_url {
if let Some(ref mut urls) = libpq_urls {
urls.push(url.clone());
}
} else {
libpq_urls = None
}
if let Some(url) = &info.grpc_url {
if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone());
}
} else {
grpc_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"neon.pageserver_connstring={}",
escape_conf_value(&libpq_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}
if let Some(grpc_urls) = grpc_urls {
writeln!(
file,
"neon.pageserver_grpc_urls={}",
escape_conf_value(&grpc_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_grpc_urls")?;
}
}
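// A hedged illustration (hypothetical hosts/ports, not part of this patch): for a
// two-shard connection-info map, the loop above collects the per-shard URLs and
// emits comma-separated GUCs roughly like:
//
//   neon.pageserver_connstring='postgres://no_user@ps-0:6400,postgres://no_user@ps-1:6400'
//   neon.pageserver_grpc_urls='grpc://no_user@ps-0:51051,grpc://no_user@ps-1:51051'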
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}

View File

@@ -4,8 +4,7 @@ use std::thread;
use std::time::{Duration, SystemTime};
use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverProtocol};
use itertools::Itertools as _;
use compute_api::spec::{ComputeMode, PageserverConnectionInfo};
use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn};
@@ -78,17 +77,16 @@ fn acquire_lsn_lease_with_retry(
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let (connstrings, auth) = {
let (conninfo, auth) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
(
spec.pageserver_connstr.clone(),
spec.pageserver_conninfo.clone(),
spec.storage_auth_token.clone(),
)
};
let result =
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
match result {
Ok(Some(res)) => {
return Ok(res);
@@ -112,17 +110,16 @@ fn acquire_lsn_lease_with_retry(
/// Tries to acquire LSN leases on all Pageserver shards.
fn try_acquire_lsn_lease(
connstrings: &str,
conninfo: PageserverConnectionInfo,
auth: Option<&str>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let connstrings = connstrings.split(',').collect_vec();
let shard_count = connstrings.len();
let shard_count = conninfo.shards.len();
let mut leases = Vec::new();
for (shard_number, &connstring) in connstrings.iter().enumerate() {
for (shard_number, shard) in conninfo.shards.into_iter() {
let tenant_shard_id = match shard_count {
0 | 1 => TenantShardId::unsharded(tenant_id),
shard_count => TenantShardId {
@@ -132,13 +129,22 @@ fn try_acquire_lsn_lease(
},
};
let lease = match PageserverProtocol::from_connstring(connstring)? {
PageserverProtocol::Libpq => {
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
PageserverProtocol::Grpc => {
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
let lease = if conninfo.prefer_grpc {
acquire_lsn_lease_grpc(
&shard.grpc_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?
} else {
acquire_lsn_lease_libpq(
&shard.libpq_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?
};
leases.push(lease);
}
@@ -192,7 +198,7 @@ fn acquire_lsn_lease_grpc(
lsn: Lsn,
) -> Result<Option<SystemTime>> {
tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::new(
let mut client = page_api::Client::connect(
connstring.to_string(),
tenant_shard_id.tenant_id,
timeline_id,

View File

@@ -16,7 +16,7 @@ use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail};
use clap::Parser;
use compute_api::requests::ComputeClaimsScope;
use compute_api::spec::{ComputeMode, PageserverProtocol};
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverShardConnectionInfo};
use control_plane::broker::StorageBroker;
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
@@ -1516,29 +1516,35 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
)?;
}
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
// Use gRPC if requested.
let pageserver = if endpoint.grpc {
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (shards, stripe_size) = if let Some(ps_id) = pageserver_id {
let conf = env.get_pageserver_conf(ps_id).unwrap();
let libpq_url = Some({
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
(PageserverProtocol::Libpq, host, port)
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
};
// If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded.
(vec![pageserver], DEFAULT_STRIPE_SIZE)
(vec![(0, pageserver)], DEFAULT_STRIPE_SIZE)
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
let pageservers = futures::future::try_join_all(
locate_result.shards.into_iter().map(|shard| async move {
let shards = futures::future::try_join_all(locate_result.shards.into_iter().map(
|shard| async move {
if let ComputeMode::Static(lsn) = endpoint.mode {
// Initialize LSN leases for static computes.
let conf = env.get_pageserver_conf(shard.node_id).unwrap();
@@ -1550,28 +1556,34 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.await?;
}
let pageserver = if endpoint.grpc {
(
PageserverProtocol::Grpc,
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
shard.listen_grpc_port.expect("no gRPC port"),
)
let libpq_host = Host::parse(&shard.listen_pg_addr)?;
let libpq_port = shard.listen_pg_port;
let libpq_url =
Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
} else {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr)?,
shard.listen_pg_port,
)
None
};
anyhow::Ok(pageserver)
}),
)
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
};
anyhow::Ok((shard.shard_id.shard_number.0 as u32, pageserver))
},
))
.await?;
let stripe_size = locate_result.shard_params.stripe_size;
(pageservers, stripe_size)
(shards, stripe_size)
};
assert!(!shards.is_empty());
let pageserver_conninfo = PageserverConnectionInfo {
shards: shards.into_iter().collect(),
prefer_grpc: endpoint.grpc,
};
assert!(!pageservers.is_empty());
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1601,7 +1613,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint_storage_addr,
safekeepers_generation,
safekeepers,
pageservers,
pageserver_conninfo,
remote_ext_base_url: remote_ext_base_url.clone(),
shard_stripe_size: stripe_size.0 as usize,
create_test_user: args.create_test_user,
@@ -1620,20 +1632,27 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
let shards = if let Some(ps_id) = args.endpoint_pageserver_id {
let conf = env.get_pageserver_conf(ps_id)?;
// Use gRPC if requested.
let pageserver = if endpoint.grpc {
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let libpq_url = Some({
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
(PageserverProtocol::Libpq, host, port)
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
vec![pageserver]
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
};
// If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded.
vec![(0, pageserver)]
} else {
let storage_controller = StorageController::from_env(env);
storage_controller
@@ -1643,28 +1662,36 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.into_iter()
.map(|shard| {
// Use gRPC if requested.
if endpoint.grpc {
(
PageserverProtocol::Grpc,
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
.expect("bad hostname"),
shard.listen_grpc_port.expect("no gRPC port"),
)
let libpq_host = Host::parse(&shard.listen_pg_addr).expect("bad hostname");
let libpq_port = shard.listen_pg_port;
let libpq_url =
Some(format!("postgres://no_user@{libpq_host}:{libpq_port}"));
let grpc_url = if let Some(grpc_host) = shard.listen_grpc_addr {
let grpc_port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{grpc_host}:{grpc_port}"))
} else {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
shard.listen_pg_port,
)
}
None
};
(
shard.shard_id.shard_number.0 as u32,
PageserverShardConnectionInfo {
libpq_url,
grpc_url,
},
)
})
.collect::<Vec<_>>()
};
let pageserver_conninfo = PageserverConnectionInfo {
shards: shards.into_iter().collect(),
prefer_grpc: endpoint.grpc,
};
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(&args.safekeepers)?;
endpoint
.reconfigure(Some(pageservers), None, safekeepers, None)
.reconfigure(Some(pageserver_conninfo), None, safekeepers, None)
.await?;
}
EndpointCmd::Stop(args) => {

View File

@@ -56,9 +56,13 @@ use compute_api::responses::{
TlsConfig,
};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
PgIdent, RemoteExtSpec, Role,
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
RemoteExtSpec, Role,
};
// re-export these, because they're used in the reconfigure() function
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
use jsonwebtoken::jwk::{
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -74,7 +78,6 @@ use sha2::{Digest, Sha256};
use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use crate::local_env::LocalEnv;
@@ -379,7 +382,7 @@ pub struct EndpointStartArgs {
pub endpoint_storage_addr: String,
pub safekeepers_generation: Option<SafekeeperGeneration>,
pub safekeepers: Vec<NodeId>,
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
pub pageserver_conninfo: PageserverConnectionInfo,
pub remote_ext_base_url: Option<String>,
pub shard_stripe_size: usize,
pub create_test_user: bool,
@@ -653,14 +656,6 @@ impl Endpoint {
}
}
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
pageservers
.iter()
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
.collect::<Vec<_>>()
.join(",")
}
/// Map safekeepers ids to the actual connection strings.
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
let mut safekeeper_connstrings = Vec::new();
@@ -706,9 +701,6 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
assert!(!pageserver_connstring.is_empty());
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
// check for file remote_extensions_spec.json
@@ -767,7 +759,7 @@ impl Endpoint {
branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode,
pageserver_connstring: Some(pageserver_connstring),
pageserver_connection_info: Some(args.pageserver_conninfo),
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: args.auth_token.clone(),
@@ -980,7 +972,7 @@ impl Endpoint {
pub async fn reconfigure(
&self,
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
pageserver_conninfo: Option<PageserverConnectionInfo>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
safekeeper_generation: Option<SafekeeperGeneration>,
@@ -996,15 +988,17 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
// If pageservers are not specified, don't change them.
if let Some(pageservers) = pageservers {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
if let Some(pageserver_conninfo) = pageserver_conninfo {
// If pageservers are provided, we need to ensure that they are not empty.
// This is a requirement for the compute_ctl configuration.
anyhow::ensure!(
!pageserver_conninfo.shards.is_empty(),
"no pageservers provided"
);
spec.pageserver_connection_info = Some(pageserver_conninfo);
}
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
// If safekeepers are not specified, don't change them.
@@ -1053,7 +1047,7 @@ impl Endpoint {
pub async fn reconfigure_pageservers(
&self,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
pageservers: PageserverConnectionInfo,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
self.reconfigure(Some(pageservers), stripe_size, None, None)

View File

@@ -105,7 +105,11 @@ pub struct ComputeSpec {
// updated to fill these fields, we can make these non optional.
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
// Pageserver information can be passed in two different ways:
// 1. Here
// 2. in cluster.settings. This is legacy, we are switching to method 1.
pub pageserver_connection_info: Option<PageserverConnectionInfo>,
// More neon ids that we expose to the compute_ctl
// and to postgres as neon extension GUCs.
@@ -214,6 +218,20 @@ pub enum ComputeFeature {
UnknownFeature,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverConnectionInfo {
pub shards: HashMap<u32, PageserverShardConnectionInfo>,
pub prefer_grpc: bool,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardConnectionInfo {
pub libpq_url: Option<String>,
pub grpc_url: Option<String>,
}
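// A hedged sketch (not part of the patch; hosts and ports are made up) of building a
// two-shard `PageserverConnectionInfo`. With serde_json this would serialize with
// string map keys, roughly:
//   {"shards":{"0":{"libpq_url":"postgres://no_user@ps-0:6400","grpc_url":"grpc://no_user@ps-0:51051"},...},"prefer_grpc":true}
#[allow(dead_code)]
fn example_pageserver_conninfo() -> PageserverConnectionInfo {
    let mut shards = HashMap::new();
    for shardno in 0..2u32 {
        shards.insert(
            shardno,
            PageserverShardConnectionInfo {
                libpq_url: Some(format!("postgres://no_user@ps-{shardno}:6400")),
                grpc_url: Some(format!("grpc://no_user@ps-{shardno}:51051")),
            },
        );
    }
    PageserverConnectionInfo {
        shards,
        prefer_grpc: true,
    }
}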
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
pub public_extensions: Option<Vec<String>>,
@@ -331,6 +349,12 @@ impl ComputeMode {
}
}
impl Display for ComputeMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.to_type_str())
}
}
/// Log level for audit logging
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
@@ -442,7 +466,7 @@ pub struct JwksSettings {
}
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
#[derive(Clone, Copy, Debug, Default)]
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum PageserverProtocol {
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
#[default]

View File

@@ -6,8 +6,29 @@ license.workspace = true
[dependencies]
thiserror.workspace = true
nix.workspace=true
nix.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
rustc-hash = { version = "2.1.1" }
rand = "0.9.1"
libc.workspace = true
lock_api = "0.4.13"
atomic = "0.6.1"
bytemuck = { version = "1.23.1", features = ["derive"] }
[dev-dependencies]
criterion = { workspace = true, features = ["html_reports"] }
rand_distr = "0.5.1"
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
ahash.workspace = true
twox-hash = { version = "2.1.1" }
seahash = "4.1.0"
hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
foldhash = "0.1.5"
[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"
[[bench]]
name = "hmap_resize"
harness = false

View File

@@ -0,0 +1,330 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use neon_shmem::hash::HashMapAccess;
use neon_shmem::hash::HashMapInit;
use neon_shmem::hash::entry::Entry;
use rand::distr::{Distribution, StandardUniform};
use rand::prelude::*;
use std::default::Default;
use std::hash::BuildHasher;
// Taken from bindings to C code
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
impl Distribution<FileCacheKey> for StandardUniform {
// questionable, but doesn't need to be good randomness
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
FileCacheKey {
_spc_id: rng.random(),
_db_id: rng.random(),
_rel_number: rng.random(),
_fork_num: rng.random(),
_block_num: rng.random(),
}
}
}
#[derive(Clone, Debug)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
impl FileCacheEntry {
fn dummy() -> Self {
Self {
_offset: 0,
_access_count: 0,
_prev: std::ptr::null_mut(),
_next: std::ptr::null_mut(),
_state: [0; 8],
}
}
}
// Utilities for applying operations.
#[derive(Clone, Debug)]
struct TestOp<K, V>(K, Option<V>);
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
op: TestOp<K, V>,
map: &mut HashMapAccess<K, V, S>,
) {
let entry = map.entry(op.0);
match op.1 {
Some(new) => match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new).unwrap();
None
}
},
None => match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
}
// Hash utilities
struct SeaRandomState {
k1: u64,
k2: u64,
k3: u64,
k4: u64,
}
impl std::hash::BuildHasher for SeaRandomState {
type Hasher = seahash::SeaHasher;
fn build_hasher(&self) -> Self::Hasher {
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
}
}
impl SeaRandomState {
fn new() -> Self {
let mut rng = rand::rng();
Self {
k1: rng.random(),
k2: rng.random(),
k3: rng.random(),
k4: rng.random(),
}
}
}
fn small_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Small maps");
group.sample_size(10);
group.bench_function("small_rehash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_xxhash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(twox_hash::xxhash64::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_ahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(ahash::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_seahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(SeaRandomState::new())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.finish();
}
fn real_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Realistic workloads");
group.sample_size(10);
group.bench_function("real_bulk_insert", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut rng = rand::rng();
b.iter_batched(
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|writer| {
for _ in 0..ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
let entry = writer.entry(key);
std::hint::black_box(match entry {
Entry::Occupied(mut e) => {
e.insert(val);
}
Entry::Vacant(e) => {
_ = e.insert(val).unwrap();
}
})
}
},
BatchSize::SmallInput,
)
});
group.bench_function("real_rehash", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("real_rehash_hashbrown", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher::default();
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(&k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(&k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
} else {
None
},
)
});
});
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
group.bench_with_input(
BenchmarkId::new("real_rehash_varied", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
},
);
group.bench_with_input(
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher::default();
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(&k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(&k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| {
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
})
} else {
None
},
)
});
},
);
}
group.finish();
}
criterion_group!(benches, small_benchs, real_benchs);
criterion_main!(benches);

libs/neon-shmem/src/hash.rs Normal file
View File

@@ -0,0 +1,622 @@
use std::cell::UnsafeCell;
use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;
use std::ptr::NonNull;
use std::sync::atomic::Ordering;
use crate::shmem::ShmemHandle;
use crate::{shmem, sync::*};
mod core;
mod bucket;
pub mod entry;
#[cfg(test)]
mod tests;
use core::{
CoreHashMap, DictShard, EntryKey, EntryTag,
FullError, MaybeUninitDictShard
};
use bucket::{Bucket, BucketIdx};
use entry::Entry;
/// Wrapper struct around multiple [`ShmemHandle`]s.
struct HashMapHandles {
keys_shmem: ShmemHandle,
idxs_shmem: ShmemHandle,
vals_shmem: ShmemHandle,
}
/// This represents a hash table that (possibly) lives in shared memory.
/// If a new process is launched with fork(), the child process inherits
/// this struct.
#[must_use]
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handles: Option<HashMapHandles>,
shared_ptr: *mut HashMapShared<'a, K, V>,
hasher: S,
num_buckets: usize,
num_shards: usize,
resize_lock: Mutex<()>,
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
///
/// XXX: We're not making use of it at the moment, but this struct could
/// hold process-local information in the future.
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handles: Option<HashMapHandles>,
shared_ptr: *mut HashMapShared<'a, K, V>,
hasher: S,
resize_lock: Mutex<()>,
}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
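// A hedged usage sketch (illustrative only; the `u64` keys/values and sizes are
// arbitrary): build a resizeable map backed by anonymous shared memory, attach a
// writer, and do a basic insert/lookup with the API defined below.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[test]
    fn insert_and_get() {
        // 1024 buckets now, growable to 2048, split across 8 dictionary shards.
        let writer = HashMapInit::<u64, u64>::new_resizeable(1024, 2048, 8).attach_writer();
        writer.insert(42, 7).expect("map full");
        assert!(writer.get(&42).is_some());
        assert!(writer.get(&43).is_none());
    }
}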
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table.
///
/// NOTE: This must be called right after creating the hash table,
/// before inserting any entries and before calling attach_writer/reader.
/// Otherwise different accessors could be using different hash functions,
/// with confusing results.
///
/// TODO(quantumish): consider splitting out into a separate builder type?
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
HashMapInit {
hasher,
shmem_handles: self.shmem_handles,
shared_ptr: self.shared_ptr,
num_buckets: self.num_buckets,
num_shards: self.num_shards,
resize_lock: self.resize_lock,
}
}
/// Loosely (over)estimate the sizes of the three memory areas (keys, bucket indices, values) needed for a hash table with `num_buckets` buckets.
pub fn estimate_sizes(num_buckets: usize, num_shards: usize) -> (usize, usize, usize) {
(
(size_of::<EntryKey<K>>() * num_buckets)
+ (size_of::<libc::pthread_rwlock_t>() * num_shards)
+ (size_of::<RwLock<DictShard<'_, K>>>() * num_shards)
+ size_of::<HashMapShared<K, V>>()
+ 1000,
(size_of::<BucketIdx>() * num_buckets) + 1000,
(size_of::<Bucket<V>>() * num_buckets) + 1000
)
}
fn carve_space<T>(ptr: &mut *mut u8, amount: usize) -> *mut T {
*ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<T>())) };
let out = ptr.cast();
*ptr = unsafe { ptr.add(size_of::<T>() * amount) };
out
}
fn new(
num_buckets: usize,
num_shards: usize,
mut keys_ptr: *mut u8,
mut idxs_ptr: *mut u8,
mut vals_ptr: *mut u8,
shmem_handles: Option<HashMapHandles>,
hasher: S,
) -> Self {
// Set up the main area: hashmap info at front, keys at back
let mutex_ptr = Self::carve_space::<libc::pthread_mutex_t>(&mut keys_ptr, 1);
let shared_ptr = Self::carve_space::<HashMapShared<K, V>>(&mut keys_ptr, 1);
let shards_ptr = Self::carve_space::<RwLock<DictShard<'_, K>>>(&mut keys_ptr, num_shards);
let locks_ptr = Self::carve_space::<libc::pthread_rwlock_t>(&mut keys_ptr, num_shards);
let keys_ptr = Self::carve_space::<EntryKey<K>>(&mut keys_ptr, num_buckets);
// Set up the area of bucket idxs and the area of buckets. Not much to do!
let idxs_ptr = Self::carve_space::<BucketIdx>(&mut idxs_ptr, num_buckets);
let vals_ptr = Self::carve_space::<Bucket<V>>(&mut vals_ptr, num_buckets);
// Initialize the shards.
let shards_uninit: &mut [MaybeUninit<RwLock<MaybeUninitDictShard<'_, K>>>] =
unsafe { std::slice::from_raw_parts_mut(shards_ptr.cast(), num_shards) };
let shard_size = num_buckets / num_shards;
for i in 0..num_shards {
let size = ((i + 1) * shard_size).min(num_buckets) - (i * shard_size);
unsafe {
shards_uninit[i].write(RwLock::from_raw(
PthreadRwLock::new(NonNull::new_unchecked(locks_ptr.add(i))),
MaybeUninitDictShard {
keys: std::slice::from_raw_parts_mut(keys_ptr.add(i * shard_size).cast(), size),
idxs: std::slice::from_raw_parts_mut(idxs_ptr.add(i * shard_size).cast(), size)
}
));
};
}
let shards: &mut [RwLock<MaybeUninitDictShard<'_, K>>] =
unsafe { std::slice::from_raw_parts_mut(shards_ptr.cast(), num_shards) };
let buckets: *const [MaybeUninit<Bucket<V>>] =
unsafe { std::slice::from_raw_parts(vals_ptr.cast(), num_buckets) };
unsafe {
let hashmap = CoreHashMap::new(&*(buckets as *const UnsafeCell<_>), shards);
std::ptr::write(shared_ptr, hashmap);
}
let resize_lock = Mutex::from_raw(
unsafe { PthreadMutex::new(NonNull::new_unchecked(mutex_ptr)) }, ()
);
Self {
num_shards,
num_buckets,
shmem_handles,
shared_ptr,
hasher,
resize_lock,
}
}
/// Attach to a hash table for writing.
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
HashMapAccess {
shmem_handles: self.shmem_handles,
shared_ptr: self.shared_ptr,
hasher: self.hasher,
resize_lock: self.resize_lock,
}
}
/// Attach to a hash table for reading. Currently identical to [`HashMapInit::attach_writer`].
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
self.attach_writer()
}
}
type HashMapShared<'a, K, V> = CoreHashMap<'a, K, V>;
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
K: Clone + Hash + Eq,
{
/// Place the hash table within a user-supplied fixed memory area.
pub fn with_fixed(
num_buckets: usize,
num_shards: usize,
area: &'a mut [MaybeUninit<u8>]
) -> Self {
let (keys_size, idxs_size, _) = Self::estimate_sizes(num_buckets, num_shards);
let ptr = area.as_mut_ptr().cast();
Self::new(
num_buckets,
num_shards,
ptr,
unsafe { ptr.add(keys_size) },
unsafe { ptr.add(keys_size).add(idxs_size) },
None,
rustc_hash::FxBuildHasher,
)
}
/// Place a new hash map in the given shared memory area
///
/// # Panics
/// Will panic on failure to resize area to expected map size.
pub fn with_shmems(
num_buckets: usize,
num_shards: usize,
keys_shmem: ShmemHandle,
idxs_shmem: ShmemHandle,
vals_shmem: ShmemHandle,
) -> Self {
let (keys_size, idxs_size, vals_size) = Self::estimate_sizes(num_buckets, num_shards);
keys_shmem.set_size(keys_size).expect("could not resize shared memory area");
idxs_shmem.set_size(idxs_size).expect("could not resize shared memory area");
vals_shmem.set_size(vals_size).expect("could not resize shared memory area");
Self::new(
num_buckets,
num_shards,
keys_shmem.data_ptr.as_ptr().cast(),
idxs_shmem.data_ptr.as_ptr().cast(),
vals_shmem.data_ptr.as_ptr().cast(),
Some(HashMapHandles { keys_shmem, idxs_shmem, vals_shmem }),
rustc_hash::FxBuildHasher,
)
}
/// Make a resizable hash map within a new shared memory area with the given name.
pub fn new_resizeable_named(
num_buckets: usize,
max_buckets: usize,
num_shards: usize,
name: &str
) -> Self {
let (keys_size, idxs_size, vals_size) = Self::estimate_sizes(num_buckets, num_shards);
let (keys_max, idxs_max, vals_max) = Self::estimate_sizes(max_buckets, num_shards);
let keys_shmem = ShmemHandle::new(&format!("{name}_keys"), keys_size, keys_max)
.expect("failed to make shared memory area");
let idxs_shmem = ShmemHandle::new(&format!("{name}_idxs"), idxs_size, idxs_max)
.expect("failed to make shared memory area");
let vals_shmem = ShmemHandle::new(&format!("{name}_vals"), vals_size, vals_max)
.expect("failed to make shared memory area");
Self::new(
num_buckets,
num_shards,
keys_shmem.data_ptr.as_ptr().cast(),
idxs_shmem.data_ptr.as_ptr().cast(),
vals_shmem.data_ptr.as_ptr().cast(),
Some(HashMapHandles { keys_shmem, idxs_shmem, vals_shmem }),
rustc_hash::FxBuildHasher,
)
}
/// Make a resizable hash map within a new anonymous shared memory area.
pub fn new_resizeable(
num_buckets: usize,
max_buckets: usize,
num_shards: usize,
) -> Self {
use std::sync::atomic::{AtomicUsize, Ordering};
static COUNTER: AtomicUsize = AtomicUsize::new(0);
let val = COUNTER.fetch_add(1, Ordering::Relaxed);
let name = format!("neon_shmem_hmap{val}");
Self::new_resizeable_named(num_buckets, max_buckets, num_shards, &name)
}
}
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
K: Clone + Hash + Eq,
{
/// Hash a key using the map's hasher.
#[inline]
fn get_hash_value(&self, key: &K) -> u64 {
self.hasher.hash_one(key)
}
/// Get a reference to the corresponding value for a key.
pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
let hash = self.get_hash_value(key);
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.get_with_hash(key, hash)
}
/// Get a reference to the entry containing a key.
pub fn entry(&self, key: K) -> Result<Entry<'a, K, V>, FullError> {
let hash = self.get_hash_value(&key);
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
map.entry_with_hash(key, hash)
}
/// Remove a key from the map. Returns the associated value if it existed.
pub fn remove(&self, key: &K) -> Option<V> {
let hash = self.get_hash_value(key);
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
match map.entry_with_hash(key.clone(), hash) {
Ok(Entry::Occupied(mut e)) => Some(e.remove()),
_ => None,
}
}
/// Insert/update a key. Returns the previous associated value if it existed.
///
/// # Errors
/// Will return [`core::FullError`] if there is no more space left in the map.
pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
let hash = self.get_hash_value(&key);
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
match map.entry_with_hash(key.clone(), hash)? {
Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
Entry::Vacant(e) => {
_ = e.insert(value);
Ok(None)
}
}
}
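/// Returns a reference to the value in bucket `pos`, or `None` if the index is out of
/// range or the bucket is not occupied.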
pub unsafe fn get_at_bucket(&self, pos: usize) -> Option<&V> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
if pos >= map.bucket_arr.len() {
return None;
}
let bucket = &map.bucket_arr[pos];
if bucket.next.load(Ordering::Relaxed).full_checked().is_some() {
Some(unsafe { bucket.val.assume_init_ref() })
} else {
None
}
}
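/// Returns an [`entry::OccupiedEntry`] for the bucket at `pos`, taking the owning
/// dictionary shard's write lock, or `None` if the index is out of range or the bucket
/// is not occupied.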
pub unsafe fn entry_at_bucket(&self, pos: usize) -> Option<entry::OccupiedEntry<'a, K, V>> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
if pos >= map.bucket_arr.len() {
return None;
}
let bucket = &map.bucket_arr[pos];
bucket.next.load(Ordering::Relaxed).full_checked().map(|entry_pos| {
let shard_size = map.get_num_buckets() / map.dict_shards.len();
let shard_index = entry_pos / shard_size;
let shard_off = entry_pos % shard_size;
entry::OccupiedEntry {
shard: map.dict_shards[shard_index].write(),
shard_pos: shard_off,
bucket_pos: pos,
bucket_arr: &map.bucket_arr,
key_pos: entry_pos,
}
})
}
/// Returns the number of buckets in the table.
pub fn get_num_buckets(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.get_num_buckets()
}
/// Returns the index of the bucket a given value corresponds to.
pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
let origin = map.bucket_arr.as_mut_ptr() as *const _;
let idx = (val_ptr as usize - origin as usize) / size_of::<Bucket<V>>();
assert!(idx < map.bucket_arr.len());
idx
}
/// Returns the number of occupied buckets in the table.
pub fn get_num_buckets_in_use(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
map.bucket_arr.buckets_in_use.load(Ordering::Relaxed)
}
/// Clears all entries in a table. Does not reset any shrinking operations.
pub fn clear(&self) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
map.clear();
}
/// Begin a rehash operation. Converts all existing entries to their to-be-rehashed
/// variants (`Occupied` becomes `Rehash`, `Tombstone` becomes `RehashTombstone`).
// TODO: missing logic to prevent further resize operations when one is already underway.
// One future feature could be to allow interruptible resizes. We wouldn't pay much of a
// space penalty if we used something like https://crates.io/crates/u4 inside EntryTag
// to allow for many tiers of older chains (we would have to track previous sizes within
// a sliding window at the front of the memory region or something)
fn begin_rehash(
&self,
shards: &mut Vec<RwLockWriteGuard<'_, DictShard<'_, K>>>,
rehash_buckets: usize
) -> bool {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
assert!(rehash_buckets <= map.get_num_buckets(), "rehashing subset of buckets");
if map.rehash_index.load(Ordering::Relaxed) >= map.rehash_end.load(Ordering::Relaxed) {
return false;
}
shards.iter_mut().for_each(|x| x.keys.iter_mut().for_each(|key| {
match key.tag {
EntryTag::Occupied => key.tag = EntryTag::Rehash,
EntryTag::Tombstone => key.tag = EntryTag::RehashTombstone,
_ => (),
}
}));
map.rehash_index.store(0, Ordering::Relaxed);
map.rehash_end.store(rehash_buckets, Ordering::Relaxed);
true
}
// Unfinished, final large-ish piece standing in the way of a prototype.
//
// Based off the hashbrown implementation but adapted to an incremental context. See below:
// https://github.com/quantumish/hashbrown/blob/6610e6d2b1f288ef7b0709a3efefbc846395dc5e/src/raw/mod.rs#L2866
fn do_rehash(&self) -> bool {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
// TODO(quantumish): refactor these out into settable quantities
const REHASH_CHUNK_SIZE: usize = 10;
let end = map.rehash_end.load(Ordering::Relaxed);
let ind = map.rehash_index.load(Ordering::Relaxed);
if ind >= end { return true }
// We have to use a mutex to prevent concurrent rehashes as they provide a pretty
// obvious chance at a deadlock: one thread wants to rehash an entry into a shard
// which is held by another thread which wants to rehash its block into the shard
// held by the first. Doesn't seem like there's an obvious way around this?
let _guard = self.resize_lock.try_lock();
if _guard.is_none() { return false }
map.rehash_index.store((ind+REHASH_CHUNK_SIZE).min(end), Ordering::Relaxed);
let shard_size = map.get_num_buckets() / map.dict_shards.len();
for i in ind..(ind+REHASH_CHUNK_SIZE).min(end) {
let (shard_index, shard_off) = (i / shard_size, i % shard_size);
let mut shard = map.dict_shards[shard_index].write();
if shard.keys[shard_off].tag != EntryTag::Rehash {
continue;
}
loop {
let hash = self.get_hash_value(unsafe {
shard.keys[shard_off].val.assume_init_ref()
});
let key = unsafe { shard.keys[shard_off].val.assume_init_ref() }.clone();
let new = map.entry(key, hash, |tag| match tag {
EntryTag::Empty => core::MapEntryType::Empty,
EntryTag::Occupied => core::MapEntryType::Occupied,
EntryTag::Tombstone => core::MapEntryType::Skip,
_ => core::MapEntryType::Tombstone,
}).unwrap();
// I believe the blocker here is that this would unfortunately require
// duplicating a lot of the logic of a write lookup again but with the caveat
// that we're already holding one of the shard locks and need to pass that
// context on. One thing I was considering at the time was using a hashmap to
// manage the lock guards and passing that around?
todo!("finish rehash implementation")
// match new.tag() {
// EntryTag::Empty | EntryTag::RehashTombstone => {
// shard.keys[shard_off].tag = EntryTag::Empty;
// unsafe {
// std::mem::swap(
// shard.keys[shard_off].val.assume_init_mut(),
// new.
// },
// EntryTag::Rehash => {
// },
// _ => unreachable!()
// }
}
}
false
}
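/// Drive an in-progress incremental rehash to completion.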
pub fn finish_rehash(&self) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
while self.do_rehash() {}
}
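/// Mark every existing entry for rehashing, i.e. kick off a full-table rehash.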
pub fn shuffle(&self) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let mut shards: Vec<_> = map.dict_shards.iter().map(|x| x.write()).collect();
self.begin_rehash(&mut shards, map.get_num_buckets());
}
fn reshard(&self, shards: &mut Vec<RwLockWriteGuard<'_, DictShard<'_, K>>>, num_buckets: usize) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let shard_size = num_buckets / map.dict_shards.len();
for i in 0..map.dict_shards.len() {
let size = ((i + 1) * shard_size).min(num_buckets) - (i * shard_size);
unsafe {
shards[i].keys = std::slice::from_raw_parts_mut(shards[i].keys.as_mut_ptr(), size);
shards[i].idxs = std::slice::from_raw_parts_mut(shards[i].idxs.as_mut_ptr(), size);
}
}
}
fn resize_shmem(&self, num_buckets: usize) -> Result<(), shmem::Error> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let shmem_handles = self
.shmem_handles
.as_ref()
.expect("grow called on a fixed-size hash table");
let (keys_size, idxs_size, vals_size) =
HashMapInit::<K, V, S>::estimate_sizes(num_buckets, map.dict_shards.len());
shmem_handles.keys_shmem.set_size(keys_size)?;
shmem_handles.idxs_shmem.set_size(idxs_size)?;
shmem_handles.vals_shmem.set_size(vals_size)?;
Ok(())
}
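/// Grow the table to `num_buckets` buckets: enlarges the shared memory areas,
/// initializes the new buckets and dictionary slots, and marks the old entries for
/// rehashing.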
pub fn grow(&self, num_buckets: usize) -> Result<(), shmem::Error> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let _resize_guard = self.resize_lock.lock();
let mut shards: Vec<_> = map.dict_shards.iter().map(|x| x.write()).collect();
let old_num_buckets = map.bucket_arr.len();
assert!(
num_buckets >= old_num_buckets,
"grow called with a smaller number of buckets"
);
if num_buckets == old_num_buckets {
return Ok(());
}
// Grow memory areas and initialize each of them.
self.resize_shmem(num_buckets)?;
unsafe {
let buckets_ptr = map.bucket_arr.as_mut_ptr();
for i in old_num_buckets..num_buckets {
let bucket = buckets_ptr.add(i);
bucket.write(Bucket::empty(
if i < num_buckets - 1 {
BucketIdx::new(i + 1)
} else {
map.bucket_arr.free_head.load(Ordering::Relaxed)
}
));
}
// TODO(quantumish) a bit questionable to use pointers here
let first_shard = &mut shards[0];
let keys_ptr = first_shard.keys.as_mut_ptr();
for i in old_num_buckets..num_buckets {
let key = keys_ptr.add(i);
key.write(EntryKey {
tag: EntryTag::Empty,
val: MaybeUninit::uninit(),
});
}
let idxs_ptr = first_shard.idxs.as_mut_ptr();
for i in old_num_buckets..num_buckets {
let idx = idxs_ptr.add(i);
idx.write(BucketIdx::INVALID);
}
}
self.reshard(&mut shards, num_buckets);
map.bucket_arr.free_head.store(
BucketIdx::new(old_num_buckets), Ordering::Relaxed
);
self.begin_rehash(&mut shards, old_num_buckets);
Ok(())
}
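/// Start shrinking the table to `num_buckets` buckets by capping bucket allocation.
/// Entries must then be evicted until the goal is met before calling
/// [`Self::finish_shrink`].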
pub fn begin_shrink(&mut self, num_buckets: usize) {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let _resize_guard = self.resize_lock.lock();
assert!(
num_buckets <= map.get_num_buckets(),
"shrink called with a larger number of buckets"
);
_ = self
.shmem_handles
.as_ref()
.expect("shrink called on a fixed-size hash table");
map.bucket_arr.alloc_limit.store(
BucketIdx::new(num_buckets), Ordering::SeqCst
);
}
// TODO(quantumish): Safety? Maybe replace this with expanded version of finish_shrink?
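/// Returns the target bucket count of an in-progress shrink, or `None` if no shrink is
/// underway.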
pub fn shrink_goal(&self) -> Option<usize> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let goal = map.bucket_arr.alloc_limit.load(Ordering::Relaxed);
goal.next_checked()
}
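/// Complete a shrink started with [`Self::begin_shrink`]: shrinks the shared memory
/// areas, reshards the dictionary, and marks the remaining entries for rehashing.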
pub fn finish_shrink(&self) -> Result<(), shmem::Error> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
let _resize_guard = self.resize_lock.lock();
let mut shards: Vec<_> = map.dict_shards.iter().map(|x| x.write()).collect();
let num_buckets = map.bucket_arr.alloc_limit
.load(Ordering::Relaxed)
.next_checked()
.expect("called finish_shrink when no shrink is in progress");
if map.get_num_buckets() == num_buckets {
return Ok(());
}
assert!(
map.bucket_arr.buckets_in_use.load(Ordering::Relaxed) <= num_buckets,
"called finish_shrink before enough entries were removed"
);
self.resize_shmem(num_buckets)?;
self.reshard(&mut shards, num_buckets);
map.bucket_arr.alloc_limit.store(BucketIdx::INVALID, Ordering::Relaxed);
self.begin_rehash(&mut shards, num_buckets);
Ok(())
}
}

View File

@@ -0,0 +1,301 @@
//! Lock-free stable array of buckets managed with a freelist.
//!
//! Since the positions of entries in the dictionary and the bucket array are not correlated,
//! we either had to separately shard both and deal with the overhead of two lock acquisitions
//! per read/write, or make the bucket array lock free. This is *generally* fine since most
//! accesses of the bucket array are done while holding the lock on the corresponding dict shard
//! and thus synchronized. This may not hold up to the removals done by the LFC, which is a known problem.
//!
//! Routines are pretty closely adapted from https://timharris.uk/papers/2001-disc.pdf
//!
//! Notable caveats:
//! - Can only store around 2^30 entries, which is actually only 10x our current workload.
//! - This is because we need two tag bits to distinguish full/empty and marked/unmarked entries.
//! - Has not been seriously tested.
//!
//! Full entries also store the index to their corresponding dictionary entry in order
//! to enable .entry_at_bucket() which is needed for the clock eviction algo in the LFC.
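// A hedged standalone sketch (not the crate's API) of the 2-bit tag scheme described
// above: the top two bits of a `u32` word carry the entry state and the low 30 bits
// carry the index, which is where the ~2^30 entry limit comes from.
#[cfg(test)]
#[test]
fn tag_bits_sketch() {
    const TAG_MASK: u32 = 0b11 << 30; // NEXT / MARK / FULL / RSVD live here
    const IDX_MASK: u32 = !TAG_MASK; // 0x3FFF_FFFF, also the INVALID sentinel
    let full_word: u32 = (0b10 << 30) | 42; // FULL tag + bucket index 42
    assert_eq!(full_word & IDX_MASK, 42);
    assert_eq!(IDX_MASK, (1u32 << 30) - 1); // roughly 1.07e9 addressable buckets
}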
use std::cell::UnsafeCell;
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicUsize, Ordering};
use atomic::Atomic;
#[derive(bytemuck::NoUninit, Clone, Copy, PartialEq, Eq)]
#[repr(transparent)]
pub(crate) struct BucketIdx(pub(super) u32);
// This should always be true as `BucketIdx` is a simple newtype.
const _: () = assert!(Atomic::<BucketIdx>::is_lock_free());
impl BucketIdx {
/// Tag for next pointers in free entries.
const NEXT_TAG: u32 = 0b00 << 30;
/// Tag for marked next pointers in free entries.
const MARK_TAG: u32 = 0b01 << 30;
/// Tag for full entries.
const FULL_TAG: u32 = 0b10 << 30;
/// Reserved. Don't use me.
const RSVD_TAG: u32 = 0b11 << 30;
/// Invalid index within the bucket array (can be mixed with any tag).
pub const INVALID: Self = Self(0x3FFFFFFF);
/// Max index within the bucket array (can be mixed with any tag).
pub const MAX: usize = Self::INVALID.0 as usize - 1;
pub(super) fn is_marked(&self) -> bool {
self.0 & Self::RSVD_TAG == Self::MARK_TAG
}
pub(super) fn as_marked(self) -> Self {
Self((self.0 & Self::INVALID.0) | Self::MARK_TAG)
}
pub(super) fn get_unmarked(self) -> Self {
Self(self.0 & Self::INVALID.0)
}
pub fn new(val: usize) -> Self {
debug_assert!(val < Self::MAX);
Self(val as u32)
}
pub fn new_full(val: usize) -> Self {
debug_assert!(val < Self::MAX);
Self(val as u32 | Self::FULL_TAG)
}
/// Try to extract a valid index if the tag is NEXT.
pub fn next_checked(&self) -> Option<usize> {
if self.0 & Self::RSVD_TAG == Self::NEXT_TAG && *self != Self::INVALID {
Some(self.0 as usize)
} else {
None
}
}
/// Try to extract an index if the tag is FULL.
pub fn full_checked(&self) -> Option<usize> {
if self.0 & Self::RSVD_TAG == Self::FULL_TAG {
Some((self.0 & Self::INVALID.0) as usize)
} else {
None
}
}
}
/// Entry within the bucket array. The value is only initialized if the `next` field is tagged FULL.
pub(crate) struct Bucket<V> {
// Only initialized if `next` field is tagged with FULL.
pub val: MaybeUninit<V>,
// Either points to next entry in freelist if empty or points
// to the corresponding entry in dictionary if full.
pub next: Atomic<BucketIdx>,
}
impl<V> Bucket<V> {
pub fn empty(next: BucketIdx) -> Self {
Self {
val: MaybeUninit::uninit(),
next: Atomic::new(next)
}
}
pub fn as_ref(&self) -> &V {
unsafe { self.val.assume_init_ref() }
}
pub fn as_mut(&mut self) -> &mut V {
unsafe { self.val.assume_init_mut() }
}
pub fn replace(&mut self, new_val: V) -> V {
unsafe { std::mem::replace(self.val.assume_init_mut(), new_val) }
}
}
pub(crate) struct BucketArray<'a, V> {
/// Buckets containing values.
pub(crate) buckets: &'a UnsafeCell<[Bucket<V>]>,
/// Head of the freelist.
pub(crate) free_head: Atomic<BucketIdx>,
/// Maximum index of a bucket allowed to be allocated.
pub(crate) alloc_limit: Atomic<BucketIdx>,
/// The number of currently occupied buckets.
pub(crate) buckets_in_use: AtomicUsize,
// Unclear what the purpose of this is.
pub(crate) _user_list_head: Atomic<BucketIdx>,
}
impl <'a, V> std::ops::Index<usize> for BucketArray<'a, V> {
type Output = Bucket<V>;
fn index(&self, index: usize) -> &Self::Output {
let buckets: &[_] = unsafe { &*(self.buckets.get() as *mut _) };
&buckets[index]
}
}
impl <'a, V> std::ops::IndexMut<usize> for BucketArray<'a, V> {
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
let buckets: &mut [_] = unsafe { &mut *(self.buckets.get() as *mut _) };
&mut buckets[index]
}
}
impl<'a, V> BucketArray<'a, V> {
pub fn new(buckets: &'a UnsafeCell<[Bucket<V>]>) -> Self {
Self {
buckets,
free_head: Atomic::new(BucketIdx(0)),
_user_list_head: Atomic::new(BucketIdx(0)),
alloc_limit: Atomic::new(BucketIdx::INVALID),
buckets_in_use: 0.into(),
}
}
pub fn as_mut_ptr(&self) -> *mut Bucket<V> {
unsafe { (&mut *self.buckets.get()).as_mut_ptr() }
}
pub fn get_mut(&self, index: usize) -> &mut Bucket<V> {
let buckets: &mut [_] = unsafe { &mut *(self.buckets.get() as *mut _) };
&mut buckets[index]
}
pub fn len(&self) -> usize {
unsafe { (&*self.buckets.get()).len() }
}
/// Deallocate a bucket, adding it to the free list.
// Adapted from List::insert in https://timharris.uk/papers/2001-disc.pdf
pub fn dealloc_bucket(&self, pos: usize) -> V {
loop {
let free = self.free_head.load(Ordering::Relaxed);
self[pos].next.store(free, Ordering::Relaxed);
if self.free_head.compare_exchange_weak(
free, BucketIdx::new(pos), Ordering::Relaxed, Ordering::Relaxed
).is_ok() {
self.buckets_in_use.fetch_sub(1, Ordering::Relaxed);
return unsafe { self[pos].val.assume_init_read() };
}
}
}
/// Find a usable bucket at the front of the free list.
// Adapted from List::search in https://timharris.uk/papers/2001-disc.pdf
#[allow(unused_assignments)]
fn find_bucket(&self) -> (BucketIdx, BucketIdx) {
let mut left_node = BucketIdx::INVALID;
let mut right_node = BucketIdx::INVALID;
let mut left_node_next = BucketIdx::INVALID;
loop {
let mut t = BucketIdx::INVALID;
let mut t_next = self.free_head.load(Ordering::Relaxed);
let alloc_limit = self.alloc_limit.load(Ordering::Relaxed).next_checked();
while t_next.is_marked() || t.next_checked()
.map_or(true, |v| alloc_limit.map_or(false, |l| v > l))
{
if !t_next.is_marked() {
left_node = t;
left_node_next = t_next;
}
t = t_next.get_unmarked();
if t == BucketIdx::INVALID { break }
t_next = self[t.0 as usize].next.load(Ordering::Relaxed);
}
right_node = t;
if left_node_next == right_node {
if right_node != BucketIdx::INVALID && self[right_node.0 as usize]
.next.load(Ordering::Relaxed).is_marked()
{
continue;
} else {
return (left_node, right_node);
}
}
let left_ref = if left_node != BucketIdx::INVALID {
&self[left_node.0 as usize].next
} else { &self.free_head };
if left_ref.compare_exchange_weak(
left_node_next, right_node, Ordering::Relaxed, Ordering::Relaxed
).is_ok() {
if right_node != BucketIdx::INVALID && self[right_node.0 as usize]
.next.load(Ordering::Relaxed).is_marked()
{
continue;
} else {
return (left_node, right_node);
}
}
}
}
/// Pop a bucket from the free list.
// Adapted from List::delete in https://timharris.uk/papers/2001-disc.pdf
#[allow(unused_assignments)]
pub(crate) fn alloc_bucket(&self, value: V, key_pos: usize) -> Option<BucketIdx> {
let mut right_node_next = BucketIdx::INVALID;
let mut left_idx = BucketIdx::INVALID;
let mut right_idx = BucketIdx::INVALID;
loop {
(left_idx, right_idx) = self.find_bucket();
if right_idx == BucketIdx::INVALID {
return None;
}
let right = &self[right_idx.0 as usize];
right_node_next = right.next.load(Ordering::Relaxed);
if !right_node_next.is_marked() {
if right.next.compare_exchange_weak(
right_node_next, right_node_next.as_marked(),
Ordering::Relaxed, Ordering::Relaxed
).is_ok() {
break;
}
}
}
let left_ref = if left_idx != BucketIdx::INVALID {
&self[left_idx.0 as usize].next
} else {
&self.free_head
};
if left_ref.compare_exchange_weak(
right_idx, right_node_next,
Ordering::Relaxed, Ordering::Relaxed
).is_err() {
todo!()
}
self.buckets_in_use.fetch_add(1, Ordering::Relaxed);
self[right_idx.0 as usize].next.store(
BucketIdx::new_full(key_pos), Ordering::Relaxed
);
self.get_mut(right_idx.0 as usize).val.write(value);
Some(right_idx)
}
pub fn clear(&mut self) {
for i in 0..self.len() {
self[i] = Bucket::empty(
if i < self.len() - 1 {
BucketIdx::new(i + 1)
} else {
BucketIdx::INVALID
}
);
}
self.free_head.store(BucketIdx(0), Ordering::Relaxed);
self.buckets_in_use.store(0, Ordering::Relaxed);
}
}

View File

@@ -0,0 +1,335 @@
//! Sharded linear probing hash table.
//! NOTE/FIXME: one major bug with this design is that the current hashmap DOES NOT TRACK
//! the previous size of the hashmap, and thus performs lookups incorrectly after a resize.
//! This should be a reasonably minor fix.
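// A hedged sketch (illustrative only) of how a global dictionary position maps onto a
// (shard, offset) pair under the even split used throughout this module, mirroring the
// `pos / shard_size` and `pos % shard_size` arithmetic in the rehash and entry paths.
#[cfg(test)]
#[test]
fn shard_addressing_sketch() {
    let (num_buckets, num_shards) = (1000usize, 8usize);
    let shard_size = num_buckets / num_shards; // 125 buckets per shard
    let pos = 300usize;
    assert_eq!((pos / shard_size, pos % shard_size), (2, 50));
}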
use std::cell::UnsafeCell;
use std::hash::Hash;
use std::mem::MaybeUninit;
use std::sync::atomic::{Ordering, AtomicUsize};
use crate::sync::*;
use crate::hash::{
entry::*,
bucket::{BucketArray, Bucket, BucketIdx}
};
/// Metadata tag for the type of an entry in the hashmap.
#[derive(PartialEq, Eq, Clone, Copy)]
pub(crate) enum EntryTag {
/// An occupied entry inserted after a resize operation.
Occupied,
/// An occupied entry inserted before a resize operation
/// a.k.a. an entry that needs to be rehashed at some point.
Rehash,
/// An entry that was once `Occupied`.
Tombstone,
/// An entry that was once `Rehash`.
RehashTombstone,
/// An empty entry.
Empty,
}
/// Searching the chains of the hashmap often requires interpreting the
/// metadata tags differently depending on the context. This enum encodes the
/// ways a metadata tag can be treated during a lookup.
pub(crate) enum MapEntryType {
/// Should be treated as if it were occupied.
Occupied,
/// Should be treated as if it were a tombstone.
Tombstone,
/// Should be treated as if it were empty.
Empty,
/// Should be ignored.
Skip
}
/// A key within the dictionary component of the hashmap.
pub(crate) struct EntryKey<K> {
// NOTE: This could be split out to save 3 bytes per entry!
// Wasn't sure it was worth the penalty of another shmem area.
pub(crate) tag: EntryTag,
pub(crate) val: MaybeUninit<K>,
}
/// A shard of the dictionary.
pub(crate) struct DictShard<'a, K> {
pub(crate) keys: &'a mut [EntryKey<K>],
pub(crate) idxs: &'a mut [BucketIdx],
}
impl<'a, K> DictShard<'a, K> {
fn len(&self) -> usize {
self.keys.len()
}
}
pub(crate) struct MaybeUninitDictShard<'a, K> {
pub(crate) keys: &'a mut [MaybeUninit<EntryKey<K>>],
pub(crate) idxs: &'a mut [MaybeUninit<BucketIdx>],
}
/// Core hash table implementation.
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
pub(crate) dict_shards: &'a mut [RwLock<DictShard<'a, K>>],
/// Stable bucket array used to store the values.
pub(crate) bucket_arr: BucketArray<'a, V>,
/// Index of the next entry to process for rehashing.
pub(crate) rehash_index: AtomicUsize,
/// Index of the end of the range to be rehashed.
pub(crate) rehash_end: AtomicUsize,
}
/// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)]
pub struct FullError();
impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
pub fn new(
buckets_cell: &'a UnsafeCell<[MaybeUninit<Bucket<V>>]>,
dict_shards: &'a mut [RwLock<MaybeUninitDictShard<'a, K>>],
) -> Self {
let buckets = unsafe { &mut *buckets_cell.get() };
// Initialize the buckets
for i in 0..buckets.len() {
buckets[i].write(Bucket::empty(
if i < buckets.len() - 1 {
BucketIdx::new(i + 1)
} else {
BucketIdx::INVALID
})
);
}
// Initialize the dictionary
for shard in dict_shards.iter_mut() {
let mut dicts = shard.write();
for e in dicts.keys.iter_mut() {
e.write(EntryKey {
tag: EntryTag::Empty,
val: MaybeUninit::uninit(),
});
}
for e in dicts.idxs.iter_mut() {
e.write(BucketIdx::INVALID);
}
}
let buckets_cell = unsafe {
&*(buckets_cell as *const _ as *const UnsafeCell<_>)
};
// TODO: use std::slice::assume_init_mut() once it stabilizes
let dict_shards = unsafe {
std::slice::from_raw_parts_mut(dict_shards.as_mut_ptr().cast(),
dict_shards.len())
};
Self {
dict_shards,
rehash_index: buckets.len().into(),
rehash_end: buckets.len().into(),
bucket_arr: BucketArray::new(buckets_cell),
}
}
/// Get the value associated with a key (if it exists) given its hash.
pub fn get_with_hash(&'a self, key: &K, hash: u64) -> Option<ValueReadGuard<'a, V>> {
let ind = self.rehash_index.load(Ordering::Relaxed);
let end = self.rehash_end.load(Ordering::Relaxed);
// First search the chains from the current context (thus treat
// to-be-rehashed entries as tombstones within a current chain).
let res = self.get(key, hash, |tag| match tag {
EntryTag::Empty => MapEntryType::Empty,
EntryTag::Occupied => MapEntryType::Occupied,
_ => MapEntryType::Tombstone,
});
if res.is_some() {
return res;
}
if ind < end {
// Search chains from the previous size of the map if a rehash is in progress.
// Ignore any entries inserted since the resize operation occurred.
self.get(key, hash, |tag| match tag {
EntryTag::Empty => MapEntryType::Empty,
EntryTag::Rehash => MapEntryType::Occupied,
_ => MapEntryType::Tombstone,
})
} else {
None
}
}
pub fn entry_with_hash(&'a mut self, key: K, hash: u64) -> Result<Entry<'a, K, V>, FullError> {
let ind = self.rehash_index.load(Ordering::Relaxed);
let end = self.rehash_end.load(Ordering::Relaxed);
let res = self.entry(key.clone(), hash, |tag| match tag {
EntryTag::Empty => MapEntryType::Empty,
EntryTag::Occupied => MapEntryType::Occupied,
// We can't treat old entries as tombstones here, as we definitely can't
// insert over them! Instead we can just skip directly over them.
EntryTag::Rehash => MapEntryType::Skip,
_ => MapEntryType::Tombstone,
});
if ind < end {
if let Ok(Entry::Occupied(_)) = res {
res
} else {
self.entry(key, hash, |tag| match tag {
EntryTag::Empty => MapEntryType::Empty,
EntryTag::Occupied => MapEntryType::Skip,
EntryTag::Rehash => MapEntryType::Occupied,
_ => MapEntryType::Tombstone
})
}
} else {
res
}
}
fn get<F>(&'a self, key: &K, hash: u64, f: F) -> Option<ValueReadGuard<'a, V>>
where F: Fn(EntryTag) -> MapEntryType
{
let num_buckets = self.get_num_buckets();
let shard_size = num_buckets / self.dict_shards.len();
let bucket_pos = hash as usize % num_buckets;
let shard_start = bucket_pos / shard_size;
for off in 0..self.dict_shards.len() {
let shard_idx = (shard_start + off) % self.dict_shards.len();
let shard = self.dict_shards[shard_idx].read();
let entry_start = if off == 0 { bucket_pos % shard_size } else { 0 };
for entry_idx in entry_start..shard.len() {
match f(shard.keys[entry_idx].tag) {
MapEntryType::Empty => return None,
MapEntryType::Tombstone | MapEntryType::Skip => continue,
MapEntryType::Occupied => {
let cand_key = unsafe { shard.keys[entry_idx].val.assume_init_ref() };
if cand_key == key {
let bucket_idx = shard.idxs[entry_idx].next_checked()
.expect("position is valid");
return Some(RwLockReadGuard::map(
shard, |_| self.bucket_arr[bucket_idx].as_ref()
));
}
},
}
}
}
None
}
pub fn entry<F>(&'a self, key: K, hash: u64, f: F) -> Result<Entry<'a, K, V>, FullError>
where F: Fn(EntryTag) -> MapEntryType
{
// We need to keep holding the locks for each shard we process, since if we don't find the
// key anywhere, we want to insert it at the earliest possible position (which may be several
// shards away). Ideally cross-shard chains are quite rare, so this shouldn't be a big deal.
//
// NB: There is a real, if unlikely, chance of deadlock here. E.g. thread 1 walks a very long
// chain that starts at shard N and wraps around the map to shard N-1, while thread 2 begins a
// lookup at shard N-1 whose chain spans a few shards. Thread 1 is then blocked on thread 2 to
// reach shard N-1, while thread 2 is blocked on thread 1 to reach shard N. This is a fringe
// case, since chains shouldn't grow very long, but it is still a problem with this somewhat
// naive sharding mechanism.
//
// We could fix this either by refusing to hold multiple locks and only inserting into the
// earliest entry within the current shard (which effectively means that after a while we
// forget about some open entries at the end of shards), or by pivoting to a more involved
// concurrency setup.
let mut shards = Vec::new();
let mut insert_pos = None;
let mut insert_shard = None;
let num_buckets = self.get_num_buckets();
let shard_size = num_buckets / self.dict_shards.len();
let mut entry_pos = hash as usize % num_buckets;
let shard_start = entry_pos / shard_size;
for off in 0..self.dict_shards.len() {
let shard_idx = (shard_start + off) % self.dict_shards.len();
let shard = self.dict_shards[shard_idx].write();
let mut inserted = false;
let entry_start = if off == 0 { entry_pos % shard_size } else { 0 };
for entry_idx in entry_start..shard.len() {
entry_pos += 1;
match f(shard.keys[entry_idx].tag) {
MapEntryType::Skip => continue,
MapEntryType::Empty => {
let ((shard, idx), shard_pos) = match (insert_shard, insert_pos) {
(Some((s, i)), Some(p)) => ((s, i), p),
(None, Some(p)) => ((shard, shard_idx), p),
(None, None) => ((shard, shard_idx), entry_idx),
_ => unreachable!()
};
return Ok(Entry::Vacant(VacantEntry {
_key: key,
shard,
shard_pos,
key_pos: (shard_size * idx) + shard_pos,
bucket_arr: &self.bucket_arr,
}))
},
MapEntryType::Tombstone => {
if insert_pos.is_none() {
insert_pos = Some(entry_idx);
inserted = true;
}
},
MapEntryType::Occupied => {
let cand_key = unsafe { shard.keys[entry_idx].val.assume_init_ref() };
if *cand_key == key {
let bucket_pos = shard.idxs[entry_idx].next_checked().unwrap();
return Ok(Entry::Occupied(OccupiedEntry {
shard,
shard_pos: entry_idx,
bucket_pos,
bucket_arr: &self.bucket_arr,
key_pos: entry_pos,
}));
}
}
}
}
if inserted {
insert_shard = Some((shard, shard_idx));
} else {
shards.push(shard);
}
}
if let (Some((shard, idx)), Some(shard_pos)) = (insert_shard, insert_pos) {
Ok(Entry::Vacant(VacantEntry {
_key: key,
shard,
shard_pos,
key_pos: (shard_size * idx) + shard_pos,
bucket_arr: &self.bucket_arr,
}))
} else {
Err(FullError{})
}
}
/// Get number of buckets in map.
pub fn get_num_buckets(&self) -> usize {
self.bucket_arr.len()
}
pub fn clear(&mut self) {
let mut shards: Vec<_> = self.dict_shards.iter().map(|x| x.write()).collect();
for shard in shards.iter_mut() {
for e in shard.keys.iter_mut() {
e.tag = EntryTag::Empty;
}
for e in shard.idxs.iter_mut() {
*e = BucketIdx::INVALID;
}
}
self.bucket_arr.clear();
}
}
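// Worked example of the probe arithmetic used by `get` and `entry` above. This is an
// illustrative sketch, not part of this commit; the bucket and shard counts are made up.
#[cfg(test)]
mod shard_math_sketch {
    #[test]
    fn hash_maps_to_shard_and_offset() {
        // Assumed layout: 1024 dictionary slots split evenly across 8 shards.
        let num_buckets = 1024usize;
        let num_shards = 8usize;
        let shard_size = num_buckets / num_shards; // 128 slots per shard
        let hash = 0xdead_beef_u64;
        let bucket_pos = hash as usize % num_buckets; // home slot within the whole dictionary
        let shard_start = bucket_pos / shard_size; // shard where the probe begins
        let entry_start = bucket_pos % shard_size; // offset inside that shard
        // The probe then walks forward, wrapping from shard to shard as `get`/`entry` do.
        assert_eq!(bucket_pos, shard_start * shard_size + entry_start);
        assert!(shard_start < num_shards);
        assert!(entry_start < shard_size);
    }
}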


@@ -0,0 +1,81 @@
//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
use crate::hash::{
core::{DictShard, EntryTag},
bucket::{BucketArray, BucketIdx}
};
use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
use std::hash::Hash;
pub enum Entry<'a, K, V> {
Occupied(OccupiedEntry<'a, K, V>),
Vacant(VacantEntry<'a, K, V>),
}
pub struct OccupiedEntry<'a, K, V> {
/// Mutable reference to the shard of the map the entry is in.
pub(crate) shard: RwLockWriteGuard<'a, DictShard<'a, K>>,
/// The position of the entry in the shard.
pub(crate) shard_pos: usize,
/// True logical position of the entry in the map.
pub(crate) key_pos: usize,
/// Mutable reference to the bucket array containing entry.
pub(crate) bucket_arr: &'a BucketArray<'a, V>,
/// The position of the bucket in the [`CoreHashMap`] bucket array.
pub(crate) bucket_pos: usize,
}
impl<K, V> OccupiedEntry<'_, K, V> {
pub fn get(&self) -> &V {
self.bucket_arr[self.bucket_pos].as_ref()
}
pub fn get_mut(&mut self) -> &mut V {
self.bucket_arr.get_mut(self.bucket_pos).as_mut()
}
/// Inserts a value into the entry, replacing (and returning) the existing value.
pub fn insert(&mut self, value: V) -> V {
self.bucket_arr.get_mut(self.bucket_pos).replace(value)
}
/// Removes the entry from the hash map, returning the value originally stored within it.
pub fn remove(&mut self) -> V {
self.shard.idxs[self.shard_pos] = BucketIdx::INVALID;
self.shard.keys[self.shard_pos].tag = EntryTag::Tombstone;
self.bucket_arr.dealloc_bucket(self.bucket_pos)
}
}
/// An abstract view into a vacant entry within the map.
pub struct VacantEntry<'a, K, V> {
/// The key to be inserted into the vacant entry.
pub(crate) _key: K,
/// Mutable reference to the shard of the map the entry is in.
pub(crate) shard: RwLockWriteGuard<'a, DictShard<'a, K>>,
/// The position of the entry in the shard.
pub(crate) shard_pos: usize,
/// True logical position of the entry in the map.
pub(crate) key_pos: usize,
/// Mutable reference to the bucket array containing entry.
pub(crate) bucket_arr: &'a BucketArray<'a, V>,
}
impl<'a, K: Clone + Hash + Eq, V> VacantEntry<'a, K, V> {
/// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
pub fn insert(mut self, value: V) -> ValueWriteGuard<'a, V> {
let pos = self.bucket_arr.alloc_bucket(value, self.key_pos)
.expect("bucket is available if entry is");
self.shard.keys[self.shard_pos].tag = EntryTag::Occupied;
self.shard.keys[self.shard_pos].val.write(self._key);
let idx = pos.next_checked().expect("position is valid");
self.shard.idxs[self.shard_pos] = pos;
RwLockWriteGuard::map(self.shard, |_| {
self.bucket_arr.get_mut(idx).as_mut()
})
}
}
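// Illustrative usage sketch, not part of this commit. It assumes the resizable-map
// constructor and accessors exercised by the test suite in this change (`HashMapInit`,
// `attach_writer`, `entry`, `get`); the sizes, key type, and shared-memory name here are
// arbitrary placeholders.
//
//     let writer = HashMapInit::<u64, u64>::new_resizeable_named(1024, 2048, 8, "entry_demo")
//         .attach_writer();
//     match writer.entry(7).unwrap() {
//         Entry::Occupied(mut e) => { e.insert(1); }         // overwrite in place
//         Entry::Vacant(e) => { let _guard = e.insert(1); }  // allocate a bucket and insert
//     }
//     assert_eq!(writer.get(&7).as_deref().copied(), Some(1));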


@@ -0,0 +1,428 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::Debug;
use std::mem::MaybeUninit;
use crate::hash::Entry;
use crate::hash::HashMapAccess;
use crate::hash::HashMapInit;
use crate::hash::core::FullError;
use rand::seq::SliceRandom;
use rand::{Rng, RngCore};
use rand_distr::Zipf;
const TEST_KEY_LEN: usize = 16;
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);
impl From<&TestKey> for u128 {
fn from(val: &TestKey) -> u128 {
u128::from_be_bytes(val.0)
}
}
impl From<u128> for TestKey {
fn from(val: u128) -> TestKey {
TestKey(val.to_be_bytes())
}
}
impl<'a> From<&'a [u8]> for TestKey {
fn from(bytes: &'a [u8]) -> TestKey {
TestKey(bytes.try_into().unwrap())
}
}
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, 100, "test_inserts")
.attach_writer();
for (idx, k) in keys.iter().enumerate() {
let res = w.entry((*k).into());
match res.unwrap() {
Entry::Occupied(mut e) => {
e.insert(idx);
}
Entry::Vacant(e) => {
_ = e.insert(idx);
}
};
}
for (idx, k) in keys.iter().enumerate() {
let x = w.get(&(*k).into());
let value = x.as_deref().copied();
assert_eq!(value, Some(idx));
}
}
#[test]
fn dense() {
// This exercises splitting a node with prefix
let keys: &[u128] = &[0, 1, 2, 3, 256];
test_inserts(keys);
// Dense keys
let mut keys: Vec<u128> = (0..10000).collect();
test_inserts(&keys);
// Do the same in random orders
for _ in 1..10 {
keys.shuffle(&mut rand::rng());
test_inserts(&keys);
}
}
#[test]
fn sparse() {
// sparse keys
let mut keys: Vec<TestKey> = Vec::new();
let mut used_keys = HashSet::new();
for _ in 0..10000 {
loop {
let key = rand::random::<u128>();
if used_keys.contains(&key) {
continue;
}
used_keys.insert(key);
keys.push(key.into());
break;
}
}
test_inserts(&keys);
}
#[derive(Clone, Debug)]
struct TestOp(TestKey, Option<usize>);
fn apply_op(
op: &TestOp,
map: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
// apply the change to the shadow tree first
let shadow_existing = if let Some(v) = op.1 {
shadow.insert(op.0, v)
} else {
shadow.remove(&op.0)
};
let entry = map.entry(op.0);
let hash_existing = match op.1 {
Some(new) => match entry.unwrap() {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new);
None
}
},
None => match entry.unwrap() {
Entry::Occupied(mut e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
assert_eq!(shadow_existing, hash_existing);
}
fn do_random_ops(
num_ops: usize,
size: u32,
insert_prob: f64,
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
rng: &mut rand::rngs::ThreadRng,
) {
for i in 0..num_ops {
let key: TestKey = ((rng.next_u32() % size) as u128).into();
let op = TestOp(
key,
if rng.random_bool(insert_prob) {
Some(i)
} else {
None
},
);
apply_op(&op, writer, shadow);
}
}
fn do_deletes(
num_ops: usize,
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
for _ in 0..num_ops {
let (k, _) = shadow.pop_first().unwrap();
writer.remove(&k);
}
}
fn do_shrink(
writer: &mut HashMapAccess<TestKey, usize>,
shadow: &mut BTreeMap<TestKey, usize>,
to: usize,
) {
assert!(writer.shrink_goal().is_none());
writer.begin_shrink(to);
assert_eq!(writer.shrink_goal(), Some(to as usize));
while writer.get_num_buckets_in_use() > to as usize {
let (k, _) = shadow.pop_first().unwrap();
let entry = writer.entry(k).unwrap();
if let Entry::Occupied(mut e) = entry {
e.remove();
}
}
let old_usage = writer.get_num_buckets_in_use();
writer.finish_shrink().unwrap();
assert!(writer.shrink_goal().is_none());
assert_eq!(writer.get_num_buckets_in_use(), old_usage);
}
#[test]
fn random_ops() {
let mut writer =
HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, 10, "test_random")
.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
let mut rng = rand::rng();
for i in 0..100000 {
let key: TestKey = (rng.sample(distribution) as u128).into();
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &mut writer, &mut shadow);
}
}
// #[test]
// fn test_shuffle() {
// let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, 10, "test_shuf")
// .attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
// writer.shuffle();
// do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
// }
// #[test]
// fn test_grow() {
// let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, 10, "test_grow")
// .attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
// let old_usage = writer.get_num_buckets_in_use();
// writer.grow(1500).unwrap();
// assert_eq!(writer.get_num_buckets_in_use(), old_usage);
// assert_eq!(writer.get_num_buckets(), 1500);
// do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
// }
#[test]
fn test_clear() {
let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, 10, "test_clear")
.attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
// writer.clear();
// assert_eq!(writer.get_num_buckets_in_use(), 0);
// assert_eq!(writer.get_num_buckets(), 1500);
// while let Some((key, _)) = shadow.pop_first() {
// assert!(writer.get(&key).is_none());
// }
// do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
// for i in 0..(1500 - writer.get_num_buckets_in_use()) {
// writer.insert((1500 + i as u128).into(), 0).unwrap();
// }
// assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
// writer.clear();
// assert!(writer.insert(5000.into(), 0).is_ok());
}
// #[test]
// fn test_idx_remove() {
// let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, 10, "test_clear")
// .attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
// for _ in 0..100 {
// let idx = (rng.next_u32() % 1500) as usize;
// if let Some(e) = writer.entry_at_bucket(idx) {
// shadow.remove(&e._key);
// e.remove();
// }
// }
// while let Some((key, val)) = shadow.pop_first() {
// assert_eq!(*writer.get(&key).unwrap(), val);
// }
// }
// #[test]
// fn test_idx_get() {
// let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
// .attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
// for _ in 0..100 {
// let idx = (rng.next_u32() % 1500) as usize;
// if let Some(pair) = writer.get_at_bucket(idx) {
// {
// let v: *const usize = &pair.1;
// assert_eq!(writer.get_bucket_for_value(v), idx);
// }
// {
// let v: *const usize = &pair.1;
// assert_eq!(writer.get_bucket_for_value(v), idx);
// }
// }
// }
// }
// #[test]
// fn test_shrink() {
// let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
// .attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
// do_shrink(&mut writer, &mut shadow, 1000);
// assert_eq!(writer.get_num_buckets(), 1000);
// do_deletes(500, &mut writer, &mut shadow);
// do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
// assert!(writer.get_num_buckets_in_use() <= 1000);
// }
// #[test]
// fn test_shrink_grow_seq() {
// let mut writer =
// HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
// .attach_writer();
// let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
// let mut rng = rand::rng();
// do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
// eprintln!("Shrinking to 750");
// do_shrink(&mut writer, &mut shadow, 750);
// do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
// eprintln!("Growing to 1500");
// writer.grow(1500).unwrap();
// do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
// eprintln!("Shrinking to 200");
// while shadow.len() > 100 {
// do_deletes(1, &mut writer, &mut shadow);
// }
// do_shrink(&mut writer, &mut shadow, 200);
// do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
// eprintln!("Growing to 10k");
// writer.grow(10000).unwrap();
// do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
// }
#[test]
fn test_bucket_ops() {
let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, 10, "test_bucket_ops")
.attach_writer();
match writer.entry(1.into()).unwrap() {
Entry::Occupied(mut e) => {
e.insert(2);
}
Entry::Vacant(e) => {
_ = e.insert(2);
},
}
assert_eq!(writer.get_num_buckets_in_use(), 1);
assert_eq!(writer.get_num_buckets(), 1000);
assert_eq!(*writer.get(&1.into()).unwrap(), 2);
let pos = match writer.entry(1.into()).unwrap() {
Entry::Occupied(e) => {
assert_eq!(e._key, 1.into());
let pos = e.bucket_pos as usize;
pos
}
Entry::Vacant(_) => {
panic!("Insert didn't affect entry");
}
};
assert_eq!(unsafe { writer.get_at_bucket(pos).unwrap() }, &2);
{
let ptr: *const usize = &*writer.get(&1.into()).unwrap();
assert_eq!(writer.get_bucket_for_value(ptr), pos);
}
writer.remove(&1.into());
assert!(writer.get(&1.into()).is_none());
}
// #[test]
// fn test_shrink_zero() {
// let mut writer =
// HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
// .attach_writer();
// writer.begin_shrink(0);
// for i in 0..1500 {
// writer.entry_at_bucket(i).map(|x| x.remove());
// }
// writer.finish_shrink().unwrap();
// assert_eq!(writer.get_num_buckets_in_use(), 0);
// let entry = writer.entry(1.into());
// if let Entry::Vacant(v) = entry {
// assert!(v.insert(2).is_err());
// } else {
// panic!("Somehow got non-vacant entry in empty map.")
// }
// writer.grow(50).unwrap();
// let entry = writer.entry(1.into());
// if let Entry::Vacant(v) = entry {
// assert!(v.insert(2).is_ok());
// } else {
// panic!("Somehow got non-vacant entry in empty map.")
// }
// assert_eq!(writer.get_num_buckets_in_use(), 1);
// }
// #[test]
// #[should_panic]
// fn test_grow_oom() {
// let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
// .attach_writer();
// writer.grow(20000).unwrap();
// }
// #[test]
// #[should_panic]
// fn test_shrink_bigger() {
// let mut writer =
// HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
// .attach_writer();
// writer.begin_shrink(2000);
// }
// #[test]
// #[should_panic]
// fn test_shrink_early_finish() {
// let writer =
// HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
// .attach_writer();
// writer.finish_shrink().unwrap();
// }
// #[test]
// #[should_panic]
// fn test_shrink_fixed_size() {
// let mut area = [MaybeUninit::uninit(); 10000];
// let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
// let mut writer = init_struct.attach_writer();
// writer.begin_shrink(1);
// }


@@ -1,418 +1,5 @@
//! Shared memory utilities for neon communicator
use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
/// specified at creation.
///
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
/// future.
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
max_size: usize,
// Pointer to the beginning of the shared memory area. The header is stored there.
shared_ptr: NonNull<SharedStruct>,
// Pointer to the beginning of the user data
pub data_ptr: NonNull<u8>,
}
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
max_size: usize,
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
current_size: AtomicUsize,
}
const RESIZE_IN_PROGRESS: usize = 1 << 63;
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
/// Error type returned by the ShmemHandle functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
pub msg: String,
pub errno: Errno,
}
impl Error {
fn new(msg: &str, errno: Errno) -> Error {
Error {
msg: msg.to_string(),
errno,
}
}
}
impl ShmemHandle {
/// Create a new shared memory area. To communicate between processes, the processes need to be
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
///
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
/// processes can continue using it, however.
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
// create the backing anonymous file.
let fd = create_backing_file(name)?;
Self::new_with_fd(fd, initial_size, max_size)
}
fn new_with_fd(
fd: OwnedFd,
initial_size: usize,
max_size: usize,
) -> Result<ShmemHandle, Error> {
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
// is a little larger than this because of the SharedStruct header. Make the upper limit
// somewhat smaller than that, because with anything close to that, you'll run out of
// memory anyway.
if max_size >= 1 << 48 {
panic!("max size {max_size} too large");
}
if initial_size > max_size {
panic!("initial size {initial_size} larger than max size {max_size}");
}
// The actual initial / max size is the one given by the caller, plus the size of
// 'SharedStruct'.
let initial_size = HEADER_SIZE + initial_size;
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
// Reserve address space for it with mmap
//
// TODO: Use MAP_HUGETLB if possible
let start_ptr = unsafe {
nix_mmap(
None,
max_size,
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
MapFlags::MAP_SHARED,
&fd,
0,
)
}
.map_err(|e| Error::new("mmap failed: {e}", e))?;
// Reserve space for the initial size
enlarge_file(fd.as_fd(), initial_size as u64)?;
// Initialize the header
let shared: NonNull<SharedStruct> = start_ptr.cast();
unsafe {
shared.write(SharedStruct {
max_size: max_size.into(),
current_size: AtomicUsize::new(initial_size),
})
};
// The user data begins after the header
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
Ok(ShmemHandle {
fd,
max_size: max_size.into(),
shared_ptr: shared,
data_ptr,
})
}
// return reference to the header
fn shared(&self) -> &SharedStruct {
unsafe { self.shared_ptr.as_ref() }
}
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
/// when creating the area.
///
/// This may only be called from one process/thread concurrently. We detect that case
/// and return an Error.
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
let new_size = new_size + HEADER_SIZE;
let shared = self.shared();
if new_size > self.max_size {
panic!(
"new size ({} is greater than max size ({})",
new_size, self.max_size
);
}
assert_eq!(self.max_size, shared.max_size);
// Lock the area by setting the bit in 'current_size'
//
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
// since this is not performance-critical, better safe than sorry .
let mut old_size = shared.current_size.load(Ordering::Acquire);
loop {
if (old_size & RESIZE_IN_PROGRESS) != 0 {
return Err(Error::new(
"concurrent resize detected",
Errno::UnknownErrno,
));
}
match shared.current_size.compare_exchange(
old_size,
new_size,
Ordering::Acquire,
Ordering::Relaxed,
) {
Ok(_) => break,
Err(x) => old_size = x,
}
}
// Ok, we got the lock.
//
// NB: If anything goes wrong, we *must* clear the bit!
let result = {
use std::cmp::Ordering::{Equal, Greater, Less};
match new_size.cmp(&old_size) {
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
}),
Equal => Ok(()),
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
}
};
// Unlock
shared.current_size.store(
if result.is_ok() { new_size } else { old_size },
Ordering::Release,
);
result
}
/// Returns the current user-visible size of the shared memory segment.
///
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
/// responsibility not to access the area beyond the current size.
pub fn current_size(&self) -> usize {
let total_current_size =
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
total_current_size - HEADER_SIZE
}
}
impl Drop for ShmemHandle {
fn drop(&mut self) {
// SAFETY: The pointer was obtained from mmap() with the given size.
// We unmap the entire region.
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
// The fd is dropped automatically by OwnedFd.
}
}
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
#[cfg(not(target_os = "macos"))]
{
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
.map_err(|e| Error::new("memfd_create failed: {e}", e))
}
#[cfg(target_os = "macos")]
{
let file = tempfile::tempfile().map_err(|e| {
Error::new(
"could not create temporary file to back shmem area: {e}",
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
)
})?;
Ok(OwnedFd::from(file))
}
}
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
// we don't get a segfault later when trying to actually use it.
#[cfg(not(target_os = "macos"))]
{
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
Error::new(
"could not grow shmem segment, posix_fallocate failed: {e}",
e,
)
})
}
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
#[cfg(target_os = "macos")]
{
nix::unistd::ftruncate(fd, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::unistd::ForkResult;
use std::ops::Range;
/// check that all bytes in given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
for i in range {
let b = unsafe { *(ptr.add(i)) };
assert_eq!(expected, b, "unexpected byte at offset {i}");
}
}
/// Write 'b' to all bytes in the given range
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
}
// simple single-process test of growing and shrinking
#[test]
fn test_shmem_resize() -> Result<(), Error> {
let max_size = 1024 * 1024;
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
assert_eq!(init_struct.current_size(), 0);
// Initial grow
let size1 = 10000;
init_struct.set_size(size1).unwrap();
assert_eq!(init_struct.current_size(), size1);
// Write some data
let data_ptr = init_struct.data_ptr.as_ptr();
write_range(data_ptr, 0xAA, 0..size1);
assert_range(data_ptr, 0xAA, 0..size1);
// Shrink
let size2 = 5000;
init_struct.set_size(size2).unwrap();
assert_eq!(init_struct.current_size(), size2);
// Grow again
let size3 = 20000;
init_struct.set_size(size3).unwrap();
assert_eq!(init_struct.current_size(), size3);
// Try to read it. The area that was shrunk and grown again should read as all zeros now
assert_range(data_ptr, 0xAA, 0..5000);
assert_range(data_ptr, 0, 5000..size1);
// Try to grow beyond max_size
//let size4 = max_size + 1;
//assert!(init_struct.set_size(size4).is_err());
// Dropping init_struct should unmap the memory
drop(init_struct);
Ok(())
}
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
/// but is stored in the shared memory area and works across processes. It's implemented by
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
struct SimpleBarrier {
num_procs: usize,
count: AtomicUsize,
}
impl SimpleBarrier {
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
unsafe {
*ptr = SimpleBarrier {
num_procs,
count: AtomicUsize::new(0),
}
}
}
pub fn wait(&self) {
let old = self.count.fetch_add(1, Ordering::Relaxed);
let generation = old / self.num_procs;
let mut current = old + 1;
while current < (generation + 1) * self.num_procs {
std::thread::sleep(std::time::Duration::from_millis(10));
current = self.count.load(Ordering::Relaxed);
}
}
}
#[test]
fn test_multi_process() {
// Initialize
let max_size = 1_000_000_000_000;
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
let ptr = init_struct.data_ptr.as_ptr();
// Store the SimpleBarrier in the first 1k of the area.
init_struct.set_size(10000).unwrap();
let barrier_ptr: *mut SimpleBarrier = unsafe {
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
.cast()
};
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
// Fork another test process. The code after this runs in both processes concurrently.
let fork_result = unsafe { nix::unistd::fork().unwrap() };
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
if fork_result.is_parent() {
write_range(ptr, 0xAA, 1000..2000);
} else {
write_range(ptr, 0xBB, 2000..3000);
}
barrier.wait();
// Verify the contents. (in both processes)
assert_range(ptr, 0xAA, 1000..2000);
assert_range(ptr, 0xBB, 2000..3000);
// Grow, from the child this time
let size = 10_000_000;
if !fork_result.is_parent() {
init_struct.set_size(size).unwrap();
}
barrier.wait();
// make some writes at the end
if fork_result.is_parent() {
write_range(ptr, 0xAA, (size - 10)..size);
} else {
write_range(ptr, 0xBB, (size - 20)..(size - 10));
}
barrier.wait();
// Verify the contents. (This runs in both processes)
assert_range(ptr, 0, (size - 1000)..(size - 20));
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
assert_range(ptr, 0xAA, (size - 10)..size);
if let ForkResult::Parent { child } = fork_result {
nix::sys::wait::waitpid(child, None).unwrap();
}
}
}
pub mod hash;
pub mod shmem;
pub mod sync;


@@ -0,0 +1,409 @@
//! Dynamically resizable contiguous chunk of shared memory
use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;
/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
/// specified at creation.
///
/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
/// future.
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
max_size: usize,
// Pointer to the beginning of the shared memory area. The header is stored there.
shared_ptr: NonNull<SharedStruct>,
// Pointer to the beginning of the user data
pub data_ptr: NonNull<u8>,
}
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
max_size: usize,
/// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag.
current_size: AtomicUsize,
}
const RESIZE_IN_PROGRESS: usize = 1 << 63;
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
/// Error type returned by the [`ShmemHandle`] functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
pub msg: String,
pub errno: Errno,
}
impl Error {
fn new(msg: &str, errno: Errno) -> Self {
Self {
msg: msg.to_string(),
errno,
}
}
}
impl ShmemHandle {
/// Create a new shared memory area. To communicate between processes, the processes need to be
/// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
///
/// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
/// processes can continue using it, however.
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
// create the backing anonymous file.
let fd = create_backing_file(name)?;
Self::new_with_fd(fd, initial_size, max_size)
}
fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
// We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
// is a little larger than this because of the SharedStruct header. Make the upper limit
// somewhat smaller than that, because with anything close to that, you'll run out of
// memory anyway.
assert!(max_size < 1 << 48, "max size {max_size} too large");
assert!(
initial_size <= max_size,
"initial size {initial_size} larger than max size {max_size}"
);
// The actual initial / max size is the one given by the caller, plus the size of
// 'SharedStruct'.
let initial_size = HEADER_SIZE + initial_size;
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
// Reserve address space for it with mmap
//
// TODO: Use MAP_HUGETLB if possible
let start_ptr = unsafe {
nix_mmap(
None,
max_size,
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
MapFlags::MAP_SHARED,
&fd,
0,
)
}
.map_err(|e| Error::new("mmap failed", e))?;
// Reserve space for the initial size
enlarge_file(fd.as_fd(), initial_size as u64)?;
// Initialize the header
let shared: NonNull<SharedStruct> = start_ptr.cast();
unsafe {
shared.write(SharedStruct {
max_size: max_size.into(),
current_size: AtomicUsize::new(initial_size),
});
}
// The user data begins after the header
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
Ok(Self {
fd,
max_size: max_size.into(),
shared_ptr: shared,
data_ptr,
})
}
// return reference to the header
fn shared(&self) -> &SharedStruct {
unsafe { self.shared_ptr.as_ref() }
}
/// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified
/// when creating the area.
///
/// This may only be called from one process/thread concurrently. We detect that case
/// and return an [`shmem::Error`](Error).
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
let new_size = new_size + HEADER_SIZE;
let shared = self.shared();
assert!(
new_size <= self.max_size,
"new size ({new_size}) is greater than max size ({})",
self.max_size
);
assert_eq!(self.max_size, shared.max_size);
// Lock the area by setting the bit in `current_size`
//
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
// and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
// since this is not performance-critical, better safe than sorry.
let mut old_size = shared.current_size.load(Ordering::Acquire);
loop {
if (old_size & RESIZE_IN_PROGRESS) != 0 {
return Err(Error::new(
"concurrent resize detected",
Errno::UnknownErrno,
));
}
match shared.current_size.compare_exchange(
old_size,
// Publish the new size with the flag bit set while the resize is in flight, so a
// concurrent caller observes RESIZE_IN_PROGRESS; the store below clears it again.
new_size | RESIZE_IN_PROGRESS,
Ordering::Acquire,
Ordering::Relaxed,
) {
Ok(_) => break,
Err(x) => old_size = x,
}
}
// Ok, we got the lock.
//
// NB: If anything goes wrong, we *must* clear the bit!
let result = {
use std::cmp::Ordering::{Equal, Greater, Less};
match new_size.cmp(&old_size) {
Less => nix_ftruncate(&self.fd, new_size as i64)
.map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
Equal => Ok(()),
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
}
};
// Unlock
shared.current_size.store(
if result.is_ok() { new_size } else { old_size },
Ordering::Release,
);
result
}
/// Returns the current user-visible size of the shared memory segment.
///
/// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
/// It is the caller's responsibility not to access the area beyond the current size.
pub fn current_size(&self) -> usize {
let total_current_size =
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
total_current_size - HEADER_SIZE
}
}
impl Drop for ShmemHandle {
fn drop(&mut self) {
// SAFETY: The pointer was obtained from mmap() with the given size.
// We unmap the entire region.
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
// The fd is dropped automatically by OwnedFd.
}
}
/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an
/// anonymous in-memory file. On macOS, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// Disable unused variables warnings because `name` is unused in the macos path.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
#[cfg(not(target_os = "macos"))]
{
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
.map_err(|e| Error::new("memfd_create failed", e))
}
#[cfg(target_os = "macos")]
{
let file = tempfile::tempfile().map_err(|e| {
Error::new(
"could not create temporary file to back shmem area",
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
)
})?;
Ok(OwnedFd::from(file))
}
}
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
// we don't get a segfault later when trying to actually use it.
#[cfg(not(target_os = "macos"))]
{
nix::fcntl::posix_fallocate(fd, 0, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
}
// As a fallback on macOS, which doesn't have posix_fallocate, use plain 'ftruncate'
#[cfg(target_os = "macos")]
{
nix::unistd::ftruncate(fd, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::unistd::ForkResult;
use std::ops::Range;
/// check that all bytes in given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
for i in range {
let b = unsafe { *(ptr.add(i)) };
assert_eq!(expected, b, "unexpected byte at offset {i}");
}
}
/// Write 'b' to all bytes in the given range
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
}
// simple single-process test of growing and shrinking
#[test]
fn test_shmem_resize() -> Result<(), Error> {
let max_size = 1024 * 1024;
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
assert_eq!(init_struct.current_size(), 0);
// Initial grow
let size1 = 10000;
init_struct.set_size(size1).unwrap();
assert_eq!(init_struct.current_size(), size1);
// Write some data
let data_ptr = init_struct.data_ptr.as_ptr();
write_range(data_ptr, 0xAA, 0..size1);
assert_range(data_ptr, 0xAA, 0..size1);
// Shrink
let size2 = 5000;
init_struct.set_size(size2).unwrap();
assert_eq!(init_struct.current_size(), size2);
// Grow again
let size3 = 20000;
init_struct.set_size(size3).unwrap();
assert_eq!(init_struct.current_size(), size3);
// Try to read it. The area that was shrunk and grown again should read as all zeros now
assert_range(data_ptr, 0xAA, 0..5000);
assert_range(data_ptr, 0, 5000..size1);
// Try to grow beyond max_size
//let size4 = max_size + 1;
//assert!(init_struct.set_size(size4).is_err());
// Dropping init_struct should unmap the memory
drop(init_struct);
Ok(())
}
/// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
/// but is stored in the shared memory area and works across processes. It's implemented by
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
struct SimpleBarrier {
num_procs: usize,
count: AtomicUsize,
}
impl SimpleBarrier {
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
unsafe {
*ptr = SimpleBarrier {
num_procs,
count: AtomicUsize::new(0),
}
}
}
pub fn wait(&self) {
let old = self.count.fetch_add(1, Ordering::Relaxed);
let generation = old / self.num_procs;
let mut current = old + 1;
while current < (generation + 1) * self.num_procs {
std::thread::sleep(std::time::Duration::from_millis(10));
current = self.count.load(Ordering::Relaxed);
}
}
}
#[test]
fn test_multi_process() {
// Initialize
let max_size = 1_000_000_000_000;
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
let ptr = init_struct.data_ptr.as_ptr();
// Store the SimpleBarrier in the first 1k of the area.
init_struct.set_size(10000).unwrap();
let barrier_ptr: *mut SimpleBarrier = unsafe {
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
.cast()
};
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
// Fork another test process. The code after this runs in both processes concurrently.
let fork_result = unsafe { nix::unistd::fork().unwrap() };
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
if fork_result.is_parent() {
write_range(ptr, 0xAA, 1000..2000);
} else {
write_range(ptr, 0xBB, 2000..3000);
}
barrier.wait();
// Verify the contents. (in both processes)
assert_range(ptr, 0xAA, 1000..2000);
assert_range(ptr, 0xBB, 2000..3000);
// Grow, from the child this time
let size = 10_000_000;
if !fork_result.is_parent() {
init_struct.set_size(size).unwrap();
}
barrier.wait();
// make some writes at the end
if fork_result.is_parent() {
write_range(ptr, 0xAA, (size - 10)..size);
} else {
write_range(ptr, 0xBB, (size - 20)..(size - 10));
}
barrier.wait();
// Verify the contents. (This runs in both processes)
assert_range(ptr, 0, (size - 1000)..(size - 20));
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
assert_range(ptr, 0xAA, (size - 10)..size);
if let ForkResult::Parent { child } = fork_result {
nix::sys::wait::waitpid(child, None).unwrap();
}
}
}
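// Illustrative sketch, not part of this commit: the high-bit lock protocol that
// `set_size`/`current_size` rely on, reduced to a bare AtomicUsize. It shows why readers
// must mask RESIZE_IN_PROGRESS off and why the resizer has to clear it on every exit path.
#[cfg(test)]
mod resize_flag_sketch {
    use std::sync::atomic::{AtomicUsize, Ordering};

    const RESIZE_IN_PROGRESS: usize = 1 << 63;

    #[test]
    fn lock_bit_roundtrip() {
        let current_size = AtomicUsize::new(4096);
        // A resizer "locks" the field by CAS-ing in the new size with the flag bit set;
        // a concurrent resizer would observe the flag and bail out with an error.
        let old = current_size.load(Ordering::Acquire);
        assert_eq!(old & RESIZE_IN_PROGRESS, 0, "no resize in progress yet");
        current_size
            .compare_exchange(
                old,
                8192 | RESIZE_IN_PROGRESS,
                Ordering::Acquire,
                Ordering::Relaxed,
            )
            .unwrap();
        // Readers mask the flag off, so they always see a plain size value.
        assert_eq!(current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS, 8192);
        // Unlock: store the final size with the flag cleared (on failure, the old size).
        current_size.store(8192, Ordering::Release);
        assert_eq!(current_size.load(Ordering::Relaxed), 8192);
    }
}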

libs/neon-shmem/src/sync.rs (new file, +169 lines)

@@ -0,0 +1,169 @@
//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
use std::mem::MaybeUninit;
use std::ptr::NonNull;
use nix::errno::Errno;
pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
pub type Mutex<T> = lock_api::Mutex<PthreadMutex, T>;
pub(crate) type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
/// Wrapper around a pointer to a [`libc::pthread_rwlock_t`].
///
/// `PthreadRwLock(None)` is an invalid state for this type. It only exists because the
/// [`lock_api::RawRwLock`] trait has a mandatory `INIT` const member to allow for static
/// initialization of the lock. Unfortunately, pthread seemingly does not support any way
/// to statically initialize a `pthread_rwlock_t` with `PTHREAD_PROCESS_SHARED` set. However,
/// `lock_api` allows manual construction and seemingly doesn't use `INIT` itself so for
/// now it's set to this invalid value to satisfy the trait constraints.
pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
impl PthreadRwLock {
pub fn new(lock: NonNull<libc::pthread_rwlock_t>) -> Self {
unsafe {
let mut attrs = MaybeUninit::uninit();
// Ignoring return value here - only possible error is OOM.
libc::pthread_rwlockattr_init(attrs.as_mut_ptr());
libc::pthread_rwlockattr_setpshared(
attrs.as_mut_ptr(),
libc::PTHREAD_PROCESS_SHARED
);
// TODO(quantumish): worth making this function fallible?
libc::pthread_rwlock_init(lock.as_ptr(), attrs.as_mut_ptr());
// Safety: POSIX specifies that "any function affecting the attributes
// object (including destruction) shall not affect any previously
// initialized read-write locks".
libc::pthread_rwlockattr_destroy(attrs.as_mut_ptr());
Self(Some(lock))
}
}
fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
self.0.unwrap_or_else(
|| panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
)
}
fn unlock(&self) {
unsafe {
let res = libc::pthread_rwlock_unlock(self.inner().as_ptr());
assert!(res == 0, "unlock failed with {}", Errno::from_raw(res));
}
}
}
unsafe impl lock_api::RawRwLock for PthreadRwLock {
type GuardMarker = lock_api::GuardSend;
/// *DO NOT USE THIS.* See [`PthreadRwLock`] for the full explanation.
const INIT: Self = Self(None);
fn lock_shared(&self) {
unsafe {
let res = libc::pthread_rwlock_rdlock(self.inner().as_ptr());
assert!(res == 0, "rdlock failed with {}", Errno::from_raw(res));
}
}
fn try_lock_shared(&self) -> bool {
unsafe {
let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr());
match res {
0 => true,
// EBUSY: a writer holds or is queued on the lock; EAGAIN: reader limit reached.
libc::EBUSY | libc::EAGAIN => false,
o => panic!("try_rdlock failed with {}", Errno::from_raw(o)),
}
}
}
fn lock_exclusive(&self) {
unsafe {
let res = libc::pthread_rwlock_wrlock(self.inner().as_ptr());
assert!(res == 0, "wrlock failed with {}", Errno::from_raw(res));
}
}
fn try_lock_exclusive(&self) -> bool {
unsafe {
let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr());
match res {
0 => true,
// EBUSY: the lock is already held for reading or writing.
libc::EBUSY => false,
o => panic!("try_wrlock failed with {}", Errno::from_raw(o)),
}
}
}
unsafe fn unlock_exclusive(&self) {
self.unlock();
}
unsafe fn unlock_shared(&self) {
self.unlock();
}
}
pub struct PthreadMutex(Option<NonNull<libc::pthread_mutex_t>>);
impl PthreadMutex {
pub fn new(lock: NonNull<libc::pthread_mutex_t>) -> Self {
unsafe {
let mut attrs = MaybeUninit::uninit();
// Ignoring return value here - only possible error is OOM.
libc::pthread_mutexattr_init(attrs.as_mut_ptr());
libc::pthread_mutexattr_setpshared(
attrs.as_mut_ptr(),
libc::PTHREAD_PROCESS_SHARED
);
libc::pthread_mutex_init(lock.as_ptr(), attrs.as_mut_ptr());
// Safety: POSIX specifies that destroying a mutex attributes object does not
// affect mutexes that were previously initialized with it.
libc::pthread_mutexattr_destroy(attrs.as_mut_ptr());
Self(Some(lock))
}
}
fn inner(&self) -> NonNull<libc::pthread_mutex_t> {
self.0.unwrap_or_else(
|| panic!("PthreadMutex constructed badly - something likely used RawMutex::INIT")
)
}
}
unsafe impl lock_api::RawMutex for PthreadMutex {
type GuardMarker = lock_api::GuardSend;
/// *DO NOT USE THIS.* See [`PthreadRwLock`] for the full explanation.
const INIT: Self = Self(None);
fn lock(&self) {
unsafe {
let res = libc::pthread_mutex_lock(self.inner().as_ptr());
assert!(res == 0, "lock failed with {}", Errno::from_raw(res));
}
}
fn try_lock(&self) -> bool {
unsafe {
let res = libc::pthread_mutex_trylock(self.inner().as_ptr());
match res {
0 => true,
// EBUSY: the mutex is already locked.
libc::EBUSY => false,
o => panic!("pthread_mutex_trylock failed with {}", Errno::from_raw(o)),
}
}
}
unsafe fn unlock(&self) {
unsafe {
let res = libc::pthread_mutex_unlock(self.inner().as_ptr());
assert!(res == 0, "unlock failed with {}", Errno::from_raw(res));
}
}
}
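// Illustrative sketch, not part of this commit: driving `PthreadRwLock` through the
// `lock_api::RawRwLock` methods implemented above. Heap memory stands in for a shared
// mapping here, which is enough to show the expected call sequence.
#[cfg(test)]
mod pthread_lock_sketch {
    use super::*;
    use lock_api::RawRwLock as _;

    #[test]
    fn raw_rwlock_roundtrip() {
        // Storage for the raw pthread object; in real use this lives in shared memory.
        let storage = Box::leak(Box::new(MaybeUninit::<libc::pthread_rwlock_t>::zeroed()));
        let raw = PthreadRwLock::new(NonNull::new(storage.as_mut_ptr()).unwrap());
        raw.lock_shared();
        // While a reader holds the lock, an exclusive try-lock reports EBUSY, i.e. `false`.
        assert!(!raw.try_lock_exclusive());
        unsafe { raw.unlock_shared() };
        assert!(raw.try_lock_exclusive());
        unsafe { raw.unlock_exclusive() };
    }
}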

libs/neonart/Cargo.toml (new file, +14 lines)

@@ -0,0 +1,14 @@
[package]
name = "neonart"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
crossbeam-utils.workspace = true
spin.workspace = true
tracing.workspace = true
[dev-dependencies]
rand = "0.9.1"
rand_distr = "0.5.1"


@@ -0,0 +1,599 @@
mod lock_and_version;
pub(crate) mod node_ptr;
mod node_ref;
use std::vec::Vec;
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
use crate::algorithm::node_ptr::MAX_PREFIX_LEN;
use crate::algorithm::node_ref::{NewNodeRef, NodeRef, ReadLockedNodeRef, WriteLockedNodeRef};
use crate::allocator::OutOfMemoryError;
use crate::TreeWriteGuard;
use crate::UpdateAction;
use crate::allocator::ArtAllocator;
use crate::epoch::EpochPin;
use crate::{Key, Value};
pub(crate) type RootPtr<V> = node_ptr::NodePtr<V>;
#[derive(Debug)]
pub enum ArtError {
ConcurrentUpdate, // need to retry
OutOfMemory,
}
impl From<ConcurrentUpdateError> for ArtError {
fn from(_: ConcurrentUpdateError) -> ArtError {
ArtError::ConcurrentUpdate
}
}
impl From<OutOfMemoryError> for ArtError {
fn from(_: OutOfMemoryError) -> ArtError {
ArtError::OutOfMemory
}
}
pub fn new_root<V: Value>(
allocator: &impl ArtAllocator<V>,
) -> Result<RootPtr<V>, OutOfMemoryError> {
node_ptr::new_root(allocator)
}
pub(crate) fn search<'e, K: Key, V: Value>(
key: &K,
root: RootPtr<V>,
epoch_pin: &'e EpochPin,
) -> Option<&'e V> {
loop {
let root_ref = NodeRef::from_root_ptr(root);
if let Ok(result) = lookup_recurse(key.as_bytes(), root_ref, None, epoch_pin) {
break result;
}
// retry
}
}
pub(crate) fn iter_next<'e, V: Value>(
key: &[u8],
root: RootPtr<V>,
epoch_pin: &'e EpochPin,
) -> Option<(Vec<u8>, &'e V)> {
loop {
let mut path = Vec::new();
let root_ref = NodeRef::from_root_ptr(root);
match next_recurse(key, &mut path, root_ref, epoch_pin) {
Ok(Some(v)) => {
assert_eq!(path.len(), key.len());
break Some((path, v));
}
Ok(None) => break None,
Err(ConcurrentUpdateError()) => {
// retry
continue;
}
}
}
}
pub(crate) fn update_fn<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>, F>(
key: &K,
value_fn: F,
root: RootPtr<V>,
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
) -> Result<(), OutOfMemoryError>
where
F: FnOnce(Option<&V>) -> UpdateAction<V>,
{
let value_fn_cell = std::cell::Cell::new(Some(value_fn));
loop {
let root_ref = NodeRef::from_root_ptr(root);
let this_value_fn = |arg: Option<&V>| value_fn_cell.take().unwrap()(arg);
let key_bytes = key.as_bytes();
match update_recurse(
key_bytes,
this_value_fn,
root_ref,
None,
None,
guard,
0,
key_bytes,
) {
Ok(()) => break Ok(()),
Err(ArtError::ConcurrentUpdate) => {
continue; // retry
}
Err(ArtError::OutOfMemory) => break Err(OutOfMemoryError()),
}
}
}
// Error means you must retry.
//
// This corresponds to the 'lookupOpt' function in the paper
#[allow(clippy::only_used_in_recursion)]
fn lookup_recurse<'e, V: Value>(
key: &[u8],
node: NodeRef<'e, V>,
parent: Option<ReadLockedNodeRef<V>>,
epoch_pin: &'e EpochPin,
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
let rnode = node.read_lock_or_restart()?;
if let Some(parent) = parent {
parent.read_unlock_or_restart()?;
}
// check if the prefix matches, may increment level
let prefix_len = if let Some(prefix_len) = rnode.prefix_matches(key) {
prefix_len
} else {
rnode.read_unlock_or_restart()?;
return Ok(None);
};
if rnode.is_leaf() {
assert_eq!(key.len(), prefix_len);
let vptr = rnode.get_leaf_value_ptr()?;
// safety: It's OK to return a ref of the pointer because we checked the version
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
// as long as the epoch is pinned.
let v = unsafe { vptr.as_ref().unwrap() };
return Ok(Some(v));
}
let key = &key[prefix_len..];
// find child (or leaf value)
let next_node = rnode.find_child_or_restart(key[0])?;
match next_node {
None => Ok(None), // key not found
Some(child) => lookup_recurse(&key[1..], child, Some(rnode), epoch_pin),
}
}
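// The read path above is optimistic lock coupling: a reader records a node's version, reads,
// then re-validates, and any mismatch bubbles up as ConcurrentUpdateError so the caller's
// loop retries. The sketch below is illustrative and not part of this commit; it reduces
// that pattern to a single versioned cell built on std atomics.
#[cfg(test)]
mod optimistic_read_sketch {
    use std::sync::atomic::{AtomicU64, Ordering};

    struct VersionedCell {
        // Simplified convention: the version is odd while a writer is mid-update.
        version: AtomicU64,
        value: AtomicU64,
    }

    impl VersionedCell {
        fn optimistic_read(&self) -> Result<u64, ()> {
            let v1 = self.version.load(Ordering::Acquire);
            if v1 % 2 == 1 {
                return Err(()); // writer active: caller must retry
            }
            let value = self.value.load(Ordering::Acquire);
            let v2 = self.version.load(Ordering::Acquire);
            if v1 == v2 { Ok(value) } else { Err(()) } // version moved underneath us: retry
        }
    }

    #[test]
    fn read_retries_until_stable() {
        let cell = VersionedCell {
            version: AtomicU64::new(2),
            value: AtomicU64::new(7),
        };
        // Mirrors the `loop { ... }` retry wrappers in `search` and `update_fn`.
        let value = loop {
            match cell.optimistic_read() {
                Ok(v) => break v,
                Err(()) => continue,
            }
        };
        assert_eq!(value, 7);
    }
}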
#[allow(clippy::only_used_in_recursion)]
fn next_recurse<'e, V: Value>(
min_key: &[u8],
path: &mut Vec<u8>,
node: NodeRef<'e, V>,
epoch_pin: &'e EpochPin,
) -> Result<Option<&'e V>, ConcurrentUpdateError> {
let rnode = node.read_lock_or_restart()?;
let prefix = rnode.get_prefix();
if !prefix.is_empty() {
path.extend_from_slice(prefix);
}
use std::cmp::Ordering;
let comparison = path.as_slice().cmp(&min_key[0..path.len()]);
if comparison == Ordering::Less {
rnode.read_unlock_or_restart()?;
return Ok(None);
}
if rnode.is_leaf() {
assert_eq!(path.len(), min_key.len());
let vptr = rnode.get_leaf_value_ptr()?;
// safety: It's OK to return a ref of the pointer because we checked the version
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
// as long as the epoch is pinned.
let v = unsafe { vptr.as_ref().unwrap() };
return Ok(Some(v));
}
let mut min_key_byte = match comparison {
Ordering::Less => unreachable!(), // checked this above already
Ordering::Equal => min_key[path.len()],
Ordering::Greater => 0,
};
loop {
match rnode.find_next_child_or_restart(min_key_byte)? {
None => {
return Ok(None);
}
Some((key_byte, child_ref)) => {
let path_len = path.len();
path.push(key_byte);
let result = next_recurse(min_key, path, child_ref, epoch_pin)?;
if result.is_some() {
return Ok(result);
}
if key_byte == u8::MAX {
return Ok(None);
}
path.truncate(path_len);
min_key_byte = key_byte + 1;
}
}
}
}
// This corresponds to the 'insertOpt' function in the paper
#[allow(clippy::only_used_in_recursion)]
#[allow(clippy::too_many_arguments)]
pub(crate) fn update_recurse<'e, K: Key, V: Value, A: ArtAllocator<V>, F>(
key: &[u8],
value_fn: F,
node: NodeRef<'e, V>,
rparent: Option<(ReadLockedNodeRef<V>, u8)>,
rgrandparent: Option<(ReadLockedNodeRef<V>, u8)>,
guard: &'_ mut TreeWriteGuard<'e, K, V, A>,
level: usize,
orig_key: &[u8],
) -> Result<(), ArtError>
where
F: FnOnce(Option<&V>) -> UpdateAction<V>,
{
let rnode = node.read_lock_or_restart()?;
let prefix_match_len = rnode.prefix_matches(key);
if prefix_match_len.is_none() {
let (rparent, parent_key) = rparent.expect("direct children of the root have no prefix");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
match value_fn(None) {
UpdateAction::Nothing => {}
UpdateAction::Insert(new_value) => {
insert_split_prefix(key, new_value, &mut wnode, &mut wparent, parent_key, guard)?;
}
UpdateAction::Remove => {
panic!("unexpected Remove action on insertion");
}
}
wnode.write_unlock();
wparent.write_unlock();
return Ok(());
}
let prefix_match_len = prefix_match_len.unwrap();
let key = &key[prefix_match_len..];
let level = level + prefix_match_len;
if rnode.is_leaf() {
assert_eq!(key.len(), 0);
let (rparent, parent_key) = rparent.expect("root cannot be leaf");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
// safety: Now that we have acquired the write lock, we have exclusive access to the
// value. XXX: There might be concurrent reads though?
let value_mut = wnode.get_leaf_value_mut();
match value_fn(Some(value_mut)) {
UpdateAction::Nothing => {
wparent.write_unlock();
wnode.write_unlock();
}
UpdateAction::Insert(_) => panic!("cannot insert over existing value"),
UpdateAction::Remove => {
guard.remember_obsolete_node(wnode.as_ptr());
wparent.delete_child(parent_key);
wnode.write_unlock_obsolete();
if let Some(rgrandparent) = rgrandparent {
// FIXME: Ignore concurrency error. It doesn't lead to
// corruption, but it means we might leak something. Until
// another update cleans it up.
let _ = cleanup_parent(wparent, rgrandparent, guard);
}
}
}
return Ok(());
}
let next_node = rnode.find_child_or_restart(key[0])?;
if next_node.is_none() {
if rnode.is_full() {
let (rparent, parent_key) = rparent.expect("root node cannot become full");
let mut wparent = rparent.upgrade_to_write_lock_or_restart()?;
let wnode = rnode.upgrade_to_write_lock_or_restart()?;
match value_fn(None) {
UpdateAction::Nothing => {
wnode.write_unlock();
wparent.write_unlock();
}
UpdateAction::Insert(new_value) => {
insert_and_grow(key, new_value, wnode, &mut wparent, parent_key, guard)?;
wparent.write_unlock();
}
UpdateAction::Remove => {
panic!("unexpected Remove action on insertion");
}
};
} else {
let mut wnode = rnode.upgrade_to_write_lock_or_restart()?;
if let Some((rparent, _)) = rparent {
rparent.read_unlock_or_restart()?;
}
match value_fn(None) {
UpdateAction::Nothing => {}
UpdateAction::Insert(new_value) => {
insert_to_node(&mut wnode, key, new_value, guard)?;
}
UpdateAction::Remove => {
panic!("unexpected Remove action on insertion");
}
};
wnode.write_unlock();
}
Ok(())
} else {
let next_child = next_node.unwrap(); // checked above it's not None
if let Some((ref rparent, _)) = rparent {
rparent.check_or_restart()?;
}
// recurse to next level
update_recurse(
&key[1..],
value_fn,
next_child,
Some((rnode, key[0])),
rparent,
guard,
level + 1,
orig_key,
)
}
}
#[derive(Clone)]
enum PathElement {
Prefix(Vec<u8>),
KeyByte(u8),
}
impl std::fmt::Debug for PathElement {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
PathElement::Prefix(prefix) => write!(fmt, "{prefix:?}"),
PathElement::KeyByte(key_byte) => write!(fmt, "{key_byte}"),
}
}
}
pub(crate) fn dump_tree<V: Value + std::fmt::Debug>(
root: RootPtr<V>,
epoch_pin: &'_ EpochPin,
dst: &mut dyn std::io::Write,
) {
let root_ref = NodeRef::from_root_ptr(root);
let _ = dump_recurse(&[], root_ref, epoch_pin, 0, dst);
}
// TODO: return an Err if writeln!() returns error, instead of unwrapping
#[allow(clippy::only_used_in_recursion)]
fn dump_recurse<'e, V: Value + std::fmt::Debug>(
path: &[PathElement],
node: NodeRef<'e, V>,
epoch_pin: &'e EpochPin,
level: usize,
dst: &mut dyn std::io::Write,
) -> Result<(), ConcurrentUpdateError> {
let indent = str::repeat(" ", level);
let rnode = node.read_lock_or_restart()?;
let mut path = Vec::from(path);
let prefix = rnode.get_prefix();
if !prefix.is_empty() {
path.push(PathElement::Prefix(Vec::from(prefix)));
}
if rnode.is_leaf() {
let vptr = rnode.get_leaf_value_ptr()?;
// safety: It's OK to return a ref of the pointer because we checked the version
// and the lifetime of 'epoch_pin' enforces that the reference is only accessible
// as long as the epoch is pinned.
let val = unsafe { vptr.as_ref().unwrap() };
writeln!(dst, "{indent} {path:?}: {val:?}").unwrap();
return Ok(());
}
for key_byte in 0..=u8::MAX {
match rnode.find_child_or_restart(key_byte)? {
None => continue,
Some(child_ref) => {
let rchild = child_ref.read_lock_or_restart()?;
writeln!(
dst,
"{} {:?}, {}: prefix {:?}",
indent,
&path,
key_byte,
rchild.get_prefix()
)
.unwrap();
let mut child_path = path.clone();
child_path.push(PathElement::KeyByte(key_byte));
dump_recurse(&child_path, child_ref, epoch_pin, level + 1, dst)?;
}
}
}
Ok(())
}
///```text
/// [fooba]r -> value
///
/// [foo]b -> [a]r -> value
/// e -> [ls]e -> value
///```
fn insert_split_prefix<K: Key, V: Value, A: ArtAllocator<V>>(
key: &[u8],
value: V,
node: &mut WriteLockedNodeRef<V>,
parent: &mut WriteLockedNodeRef<V>,
parent_key: u8,
guard: &'_ TreeWriteGuard<K, V, A>,
) -> Result<(), OutOfMemoryError> {
let old_node = node;
let old_prefix = old_node.get_prefix();
let common_prefix_len = common_prefix(key, old_prefix);
// Allocate a node for the new value.
let new_value_node = allocate_node_for_value(
&key[common_prefix_len + 1..],
value,
guard.tree_writer.allocator,
)?;
// Allocate a new internal node with the common prefix
// FIXME: deallocate 'new_value_node' on OOM
let mut prefix_node =
node_ref::new_internal(&key[..common_prefix_len], guard.tree_writer.allocator)?;
// Add the old node and the new nodes to the new internal node
prefix_node.insert_old_child(old_prefix[common_prefix_len], old_node);
prefix_node.insert_new_child(key[common_prefix_len], new_value_node);
// Modify the prefix of the old child in place
old_node.truncate_prefix(old_prefix.len() - common_prefix_len - 1);
// replace the pointer in the parent
parent.replace_child(parent_key, prefix_node.into_ptr());
Ok(())
}
fn insert_to_node<K: Key, V: Value, A: ArtAllocator<V>>(
wnode: &mut WriteLockedNodeRef<V>,
key: &[u8],
value: V,
guard: &'_ TreeWriteGuard<K, V, A>,
) -> Result<(), OutOfMemoryError> {
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
wnode.insert_child(key[0], value_child.into_ptr());
Ok(())
}
// On entry: 'parent' and 'node' are locked
fn insert_and_grow<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
key: &[u8],
value: V,
wnode: WriteLockedNodeRef<V>,
parent: &mut WriteLockedNodeRef<V>,
parent_key_byte: u8,
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
) -> Result<(), ArtError> {
let mut bigger_node = wnode.grow(guard.tree_writer.allocator)?;
// FIXME: deallocate 'bigger_node' on OOM
let value_child = allocate_node_for_value(&key[1..], value, guard.tree_writer.allocator)?;
bigger_node.insert_new_child(key[0], value_child);
// Replace the pointer in the parent
parent.replace_child(parent_key_byte, bigger_node.into_ptr());
guard.remember_obsolete_node(wnode.as_ptr());
wnode.write_unlock_obsolete();
Ok(())
}
fn cleanup_parent<'e, 'g, K: Key, V: Value, A: ArtAllocator<V>>(
wparent: WriteLockedNodeRef<V>,
rgrandparent: (ReadLockedNodeRef<V>, u8),
guard: &'g mut TreeWriteGuard<'e, K, V, A>,
) -> Result<(), ArtError> {
let (rgrandparent, grandparent_key_byte) = rgrandparent;
// If the parent becomes completely empty after the deletion, remove the parent from the
// grandparent. (This case is possible because we reserve only 8 bytes for the prefix.)
// TODO: not implemented.
// If the parent has only one child, replace the parent with the remaining child. (This is not
// possible if the child's prefix field cannot absorb the parent's)
if wparent.num_children() == 1 {
// Try to lock the remaining child. This can fail if the child is updated
// concurrently.
let (key_byte, remaining_child) = wparent.find_remaining_child();
let mut wremaining_child = remaining_child.write_lock_or_restart()?;
if 1 + wremaining_child.get_prefix().len() + wparent.get_prefix().len() <= MAX_PREFIX_LEN {
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
// Ok, we have locked the leaf, the parent, the grandparent, and the parent's only
// remaining leaf. Proceed with the updates.
// Update the prefix on the remaining leaf
wremaining_child.prepend_prefix(wparent.get_prefix(), key_byte);
// Replace the pointer in the grandparent to point directly to the remaining leaf
wgrandparent.replace_child(grandparent_key_byte, wremaining_child.as_ptr());
// Mark the parent as deleted.
guard.remember_obsolete_node(wparent.as_ptr());
wparent.write_unlock_obsolete();
return Ok(());
}
}
// If the parent's children would fit on a smaller node type after the deletion, replace it with
// a smaller node.
if wparent.can_shrink() {
let mut wgrandparent = rgrandparent.upgrade_to_write_lock_or_restart()?;
let smaller_node = wparent.shrink(guard.tree_writer.allocator)?;
// Replace the pointer in the grandparent
wgrandparent.replace_child(grandparent_key_byte, smaller_node.into_ptr());
guard.remember_obsolete_node(wparent.as_ptr());
wparent.write_unlock_obsolete();
return Ok(());
}
// nothing to do
wparent.write_unlock();
Ok(())
}
// Allocate a new leaf node to hold 'value'. If the key is long, we
// may need to allocate new internal nodes to hold it too
fn allocate_node_for_value<'a, V: Value, A: ArtAllocator<V>>(
key: &[u8],
value: V,
allocator: &'a A,
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError> {
let mut prefix_off = key.len().saturating_sub(MAX_PREFIX_LEN);
let leaf_node = node_ref::new_leaf(&key[prefix_off..key.len()], value, allocator)?;
let mut node = leaf_node;
while prefix_off > 0 {
// Need another internal node
let remain_prefix = &key[0..prefix_off];
prefix_off = remain_prefix.len().saturating_sub(MAX_PREFIX_LEN + 1);
let mut internal_node = node_ref::new_internal(
&remain_prefix[prefix_off..remain_prefix.len() - 1],
allocator,
)?;
internal_node.insert_new_child(*remain_prefix.last().unwrap(), node);
node = internal_node;
}
Ok(node)
}
fn common_prefix(a: &[u8], b: &[u8]) -> usize {
for i in 0..MAX_PREFIX_LEN {
if a[i] != b[i] {
return i;
}
}
panic!("prefixes are equal");
}


@@ -0,0 +1,117 @@
//! Each node in the tree contains one atomic word that stores three things:
//!
//! Bit 0: set if the node is "obsolete". An obsolete node has been removed from the tree,
//! but might still be accessed by concurrent readers until the epoch expires.
//! Bit 1: set if the node is currently write-locked. Used as a spinlock.
//! Bits 2-63: Version number, incremented every time the node is modified.
//!
//! AtomicLockAndVersion represents that.
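//!
//! A worked example of the encoding: a word value of 0 is an unlocked, non-obsolete node at
//! version 0. Write-locking it sets bit 1 (value 2). write_unlock() then adds 2, giving 4,
//! i.e. unlocked again at version 1. write_unlock_obsolete() adds 3 instead, which clears the
//! lock bit, bumps the version, and also sets the obsolete bit.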
use std::sync::atomic::{AtomicU64, Ordering};
pub(crate) struct ConcurrentUpdateError();
pub(crate) struct AtomicLockAndVersion {
inner: AtomicU64,
}
impl AtomicLockAndVersion {
pub(crate) fn new() -> AtomicLockAndVersion {
AtomicLockAndVersion {
inner: AtomicU64::new(0),
}
}
}
impl AtomicLockAndVersion {
pub(crate) fn read_lock_or_restart(&self) -> Result<u64, ConcurrentUpdateError> {
let version = self.await_node_unlocked();
if is_obsolete(version) {
return Err(ConcurrentUpdateError());
}
Ok(version)
}
pub(crate) fn check_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
self.read_unlock_or_restart(version)
}
pub(crate) fn read_unlock_or_restart(&self, version: u64) -> Result<(), ConcurrentUpdateError> {
if self.inner.load(Ordering::Acquire) != version {
return Err(ConcurrentUpdateError());
}
Ok(())
}
pub(crate) fn upgrade_to_write_lock_or_restart(
&self,
version: u64,
) -> Result<(), ConcurrentUpdateError> {
if self
.inner
.compare_exchange(
version,
set_locked_bit(version),
Ordering::Acquire,
Ordering::Relaxed,
)
.is_err()
{
return Err(ConcurrentUpdateError());
}
Ok(())
}
pub(crate) fn write_lock_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
let old = self.inner.load(Ordering::Relaxed);
if is_obsolete(old) || is_locked(old) {
return Err(ConcurrentUpdateError());
}
if self
.inner
.compare_exchange(
old,
set_locked_bit(old),
Ordering::Acquire,
Ordering::Relaxed,
)
.is_err()
{
return Err(ConcurrentUpdateError());
}
Ok(())
}
pub(crate) fn write_unlock(&self) {
// reset locked bit and overflow into version
self.inner.fetch_add(2, Ordering::Release);
}
pub(crate) fn write_unlock_obsolete(&self) {
// set obsolete, reset locked, overflow into version
self.inner.fetch_add(3, Ordering::Release);
}
// Helper functions
fn await_node_unlocked(&self) -> u64 {
let mut version = self.inner.load(Ordering::Acquire);
while is_locked(version) {
// spinlock
std::thread::yield_now();
version = self.inner.load(Ordering::Acquire)
}
version
}
}
fn set_locked_bit(version: u64) -> u64 {
version + 2
}
fn is_obsolete(version: u64) -> bool {
(version & 1) == 1
}
fn is_locked(version: u64) -> bool {
(version & 2) == 2
}

File diff suppressed because it is too large


@@ -0,0 +1,349 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use super::node_ptr;
use super::node_ptr::NodePtr;
use crate::EpochPin;
use crate::Value;
use crate::algorithm::lock_and_version::AtomicLockAndVersion;
use crate::algorithm::lock_and_version::ConcurrentUpdateError;
use crate::allocator::ArtAllocator;
use crate::allocator::OutOfMemoryError;
pub struct NodeRef<'e, V> {
ptr: NodePtr<V>,
phantom: PhantomData<&'e EpochPin<'e>>,
}
impl<'e, V> Debug for NodeRef<'e, V> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "{:?}", self.ptr)
}
}
impl<'e, V: Value> NodeRef<'e, V> {
pub(crate) fn from_root_ptr(root_ptr: NodePtr<V>) -> NodeRef<'e, V> {
NodeRef {
ptr: root_ptr,
phantom: PhantomData,
}
}
pub(crate) fn read_lock_or_restart(
&self,
) -> Result<ReadLockedNodeRef<'e, V>, ConcurrentUpdateError> {
let version = self.lockword().read_lock_or_restart()?;
Ok(ReadLockedNodeRef {
ptr: self.ptr,
version,
phantom: self.phantom,
})
}
pub(crate) fn write_lock_or_restart(
&self,
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
self.lockword().write_lock_or_restart()?;
Ok(WriteLockedNodeRef {
ptr: self.ptr,
phantom: self.phantom,
})
}
fn lockword(&self) -> &AtomicLockAndVersion {
self.ptr.lockword()
}
}
/// A reference to a node that has been optimistically read-locked. The functions re-check
/// the version after each read.
pub struct ReadLockedNodeRef<'e, V> {
ptr: NodePtr<V>,
version: u64,
phantom: PhantomData<&'e EpochPin<'e>>,
}
impl<'e, V: Value> ReadLockedNodeRef<'e, V> {
pub(crate) fn is_leaf(&self) -> bool {
self.ptr.is_leaf()
}
pub(crate) fn is_full(&self) -> bool {
self.ptr.is_full()
}
pub(crate) fn get_prefix(&self) -> &[u8] {
self.ptr.get_prefix()
}
/// Note: because we're only holding a read lock, the prefix can change concurrently.
/// You must be prepared to restart, if read_unlock() returns error later.
///
/// Returns the length of the prefix, or None if it's not a match
pub(crate) fn prefix_matches(&self, key: &[u8]) -> Option<usize> {
self.ptr.prefix_matches(key)
}
pub(crate) fn find_child_or_restart(
&self,
key_byte: u8,
) -> Result<Option<NodeRef<'e, V>>, ConcurrentUpdateError> {
let child_or_value = self.ptr.find_child(key_byte);
self.ptr.lockword().check_or_restart(self.version)?;
match child_or_value {
None => Ok(None),
Some(child_ptr) => Ok(Some(NodeRef {
ptr: child_ptr,
phantom: self.phantom,
})),
}
}
pub(crate) fn find_next_child_or_restart(
&self,
min_key_byte: u8,
) -> Result<Option<(u8, NodeRef<'e, V>)>, ConcurrentUpdateError> {
let child_or_value = self.ptr.find_next_child(min_key_byte);
self.ptr.lockword().check_or_restart(self.version)?;
match child_or_value {
None => Ok(None),
Some((k, child_ptr)) => Ok(Some((
k,
NodeRef {
ptr: child_ptr,
phantom: self.phantom,
},
))),
}
}
pub(crate) fn get_leaf_value_ptr(&self) -> Result<*const V, ConcurrentUpdateError> {
let result = self.ptr.get_leaf_value();
self.ptr.lockword().check_or_restart(self.version)?;
// Extend the lifetime.
let result = std::ptr::from_ref(result);
Ok(result)
}
pub(crate) fn upgrade_to_write_lock_or_restart(
self,
) -> Result<WriteLockedNodeRef<'e, V>, ConcurrentUpdateError> {
self.ptr
.lockword()
.upgrade_to_write_lock_or_restart(self.version)?;
Ok(WriteLockedNodeRef {
ptr: self.ptr,
phantom: self.phantom,
})
}
pub(crate) fn read_unlock_or_restart(self) -> Result<(), ConcurrentUpdateError> {
self.ptr.lockword().check_or_restart(self.version)?;
Ok(())
}
pub(crate) fn check_or_restart(&self) -> Result<(), ConcurrentUpdateError> {
self.ptr.lockword().check_or_restart(self.version)?;
Ok(())
}
}
/// A reference to a node that has been write-locked. The holder has exclusive write access to
/// the node; the lock is released when the guard is dropped, or explicitly via write_unlock().
pub struct WriteLockedNodeRef<'e, V> {
ptr: NodePtr<V>,
phantom: PhantomData<&'e EpochPin<'e>>,
}
impl<'e, V: Value> WriteLockedNodeRef<'e, V> {
pub(crate) fn can_shrink(&self) -> bool {
self.ptr.can_shrink()
}
pub(crate) fn num_children(&self) -> usize {
self.ptr.num_children()
}
pub(crate) fn write_unlock(mut self) {
self.ptr.lockword().write_unlock();
self.ptr = NodePtr::null();
}
pub(crate) fn write_unlock_obsolete(mut self) {
self.ptr.lockword().write_unlock_obsolete();
self.ptr = NodePtr::null();
}
pub(crate) fn get_prefix(&self) -> &[u8] {
self.ptr.get_prefix()
}
pub(crate) fn truncate_prefix(&mut self, new_prefix_len: usize) {
self.ptr.truncate_prefix(new_prefix_len)
}
pub(crate) fn prepend_prefix(&mut self, prefix: &[u8], prefix_byte: u8) {
self.ptr.prepend_prefix(prefix, prefix_byte)
}
pub(crate) fn insert_child(&mut self, key_byte: u8, child: NodePtr<V>) {
self.ptr.insert_child(key_byte, child)
}
pub(crate) fn get_leaf_value_mut(&mut self) -> &mut V {
self.ptr.get_leaf_value_mut()
}
pub(crate) fn grow<'a, A>(
&self,
allocator: &'a A,
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
where
A: ArtAllocator<V>,
{
let new_node = self.ptr.grow(allocator)?;
Ok(NewNodeRef {
ptr: new_node,
allocator,
extra_nodes: Vec::new(),
})
}
pub(crate) fn shrink<'a, A>(
&self,
allocator: &'a A,
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
where
A: ArtAllocator<V>,
{
let new_node = self.ptr.shrink(allocator)?;
Ok(NewNodeRef {
ptr: new_node,
allocator,
extra_nodes: Vec::new(),
})
}
pub(crate) fn as_ptr(&self) -> NodePtr<V> {
self.ptr
}
pub(crate) fn replace_child(&mut self, key_byte: u8, replacement: NodePtr<V>) {
self.ptr.replace_child(key_byte, replacement);
}
pub(crate) fn delete_child(&mut self, key_byte: u8) {
self.ptr.delete_child(key_byte);
}
pub(crate) fn find_remaining_child(&self) -> (u8, NodeRef<'e, V>) {
assert_eq!(self.num_children(), 1);
let child_or_value = self.ptr.find_next_child(0);
match child_or_value {
None => panic!("could not find only child in node"),
Some((k, child_ptr)) => (
k,
NodeRef {
ptr: child_ptr,
phantom: self.phantom,
},
),
}
}
}
impl<'e, V> Drop for WriteLockedNodeRef<'e, V> {
fn drop(&mut self) {
if !self.ptr.is_null() {
self.ptr.lockword().write_unlock();
}
}
}
pub(crate) struct NewNodeRef<'a, V, A>
where
V: Value,
A: ArtAllocator<V>,
{
ptr: NodePtr<V>,
allocator: &'a A,
extra_nodes: Vec<NodePtr<V>>,
}
impl<'a, V, A> NewNodeRef<'a, V, A>
where
V: Value,
A: ArtAllocator<V>,
{
pub(crate) fn insert_old_child(&mut self, key_byte: u8, child: &WriteLockedNodeRef<V>) {
self.ptr.insert_child(key_byte, child.as_ptr())
}
pub(crate) fn into_ptr(mut self) -> NodePtr<V> {
let ptr = self.ptr;
self.ptr = NodePtr::null();
ptr
}
pub(crate) fn insert_new_child(&mut self, key_byte: u8, child: NewNodeRef<'a, V, A>) {
let child_ptr = child.into_ptr();
self.ptr.insert_child(key_byte, child_ptr);
self.extra_nodes.push(child_ptr);
}
}
impl<'a, V, A> Drop for NewNodeRef<'a, V, A>
where
V: Value,
A: ArtAllocator<V>,
{
/// This drop implementation deallocates the newly allocated node, if into_ptr() was not called.
fn drop(&mut self) {
if !self.ptr.is_null() {
self.ptr.deallocate(self.allocator);
for p in self.extra_nodes.iter() {
p.deallocate(self.allocator);
}
}
}
}
pub(crate) fn new_internal<'a, V, A>(
prefix: &[u8],
allocator: &'a A,
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
where
V: Value,
A: ArtAllocator<V>,
{
Ok(NewNodeRef {
ptr: node_ptr::new_internal(prefix, allocator)?,
allocator,
extra_nodes: Vec::new(),
})
}
pub(crate) fn new_leaf<'a, V, A>(
prefix: &[u8],
value: V,
allocator: &'a A,
) -> Result<NewNodeRef<'a, V, A>, OutOfMemoryError>
where
V: Value,
A: ArtAllocator<V>,
{
Ok(NewNodeRef {
ptr: node_ptr::new_leaf(prefix, value, allocator)?,
allocator,
extra_nodes: Vec::new(),
})
}


@@ -0,0 +1,156 @@
pub mod block;
mod multislab;
mod slab;
pub mod r#static;
use std::alloc::Layout;
use std::marker::PhantomData;
use std::mem::MaybeUninit;
use std::sync::atomic::Ordering;
use crate::allocator::multislab::MultiSlabAllocator;
use crate::allocator::r#static::alloc_from_slice;
use spin;
use crate::Tree;
pub use crate::algorithm::node_ptr::{
NodeInternal4, NodeInternal16, NodeInternal48, NodeInternal256, NodeLeaf,
};
#[derive(Debug)]
pub struct OutOfMemoryError();
pub trait ArtAllocator<V: crate::Value> {
fn alloc_tree(&self) -> *mut Tree<V>;
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V>;
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V>;
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V>;
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V>;
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V>;
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>);
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>);
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>);
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>);
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>);
}
pub struct ArtMultiSlabAllocator<'t, V>
where
V: crate::Value,
{
tree_area: spin::Mutex<Option<&'t mut MaybeUninit<Tree<V>>>>,
pub(crate) inner: MultiSlabAllocator<'t, 5>,
phantom_val: PhantomData<V>,
}
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
const LAYOUTS: [Layout; 5] = [
Layout::new::<NodeInternal4<V>>(),
Layout::new::<NodeInternal16<V>>(),
Layout::new::<NodeInternal48<V>>(),
Layout::new::<NodeInternal256<V>>(),
Layout::new::<NodeLeaf<V>>(),
];
pub fn new(area: &'t mut [MaybeUninit<u8>]) -> &'t mut ArtMultiSlabAllocator<'t, V> {
let (allocator_area, remain) = alloc_from_slice::<ArtMultiSlabAllocator<V>>(area);
let (tree_area, remain) = alloc_from_slice::<Tree<V>>(remain);
allocator_area.write(ArtMultiSlabAllocator {
tree_area: spin::Mutex::new(Some(tree_area)),
inner: MultiSlabAllocator::new(remain, &Self::LAYOUTS),
phantom_val: PhantomData,
})
}
}
impl<'t, V: crate::Value> ArtAllocator<V> for ArtMultiSlabAllocator<'t, V> {
fn alloc_tree(&self) -> *mut Tree<V> {
let mut t = self.tree_area.lock();
if let Some(tree_area) = t.take() {
return tree_area.as_mut_ptr().cast();
}
panic!("cannot allocate more than one tree");
}
fn alloc_node_internal4(&self) -> *mut NodeInternal4<V> {
self.inner.alloc_slab(0).cast()
}
fn alloc_node_internal16(&self) -> *mut NodeInternal16<V> {
self.inner.alloc_slab(1).cast()
}
fn alloc_node_internal48(&self) -> *mut NodeInternal48<V> {
self.inner.alloc_slab(2).cast()
}
fn alloc_node_internal256(&self) -> *mut NodeInternal256<V> {
self.inner.alloc_slab(3).cast()
}
fn alloc_node_leaf(&self) -> *mut NodeLeaf<V> {
self.inner.alloc_slab(4).cast()
}
fn dealloc_node_internal4(&self, ptr: *mut NodeInternal4<V>) {
self.inner.dealloc_slab(0, ptr.cast())
}
fn dealloc_node_internal16(&self, ptr: *mut NodeInternal16<V>) {
self.inner.dealloc_slab(1, ptr.cast())
}
fn dealloc_node_internal48(&self, ptr: *mut NodeInternal48<V>) {
self.inner.dealloc_slab(2, ptr.cast())
}
fn dealloc_node_internal256(&self, ptr: *mut NodeInternal256<V>) {
self.inner.dealloc_slab(3, ptr.cast())
}
fn dealloc_node_leaf(&self, ptr: *mut NodeLeaf<V>) {
self.inner.dealloc_slab(4, ptr.cast())
}
}
impl<'t, V: crate::Value> ArtMultiSlabAllocator<'t, V> {
pub(crate) fn get_statistics(&self) -> ArtMultiSlabStats {
ArtMultiSlabStats {
num_internal4: self.inner.slab_descs[0]
.num_allocated
.load(Ordering::Relaxed),
num_internal16: self.inner.slab_descs[1]
.num_allocated
.load(Ordering::Relaxed),
num_internal48: self.inner.slab_descs[2]
.num_allocated
.load(Ordering::Relaxed),
num_internal256: self.inner.slab_descs[3]
.num_allocated
.load(Ordering::Relaxed),
num_leaf: self.inner.slab_descs[4]
.num_allocated
.load(Ordering::Relaxed),
num_blocks_internal4: self.inner.slab_descs[0].num_blocks.load(Ordering::Relaxed),
num_blocks_internal16: self.inner.slab_descs[1].num_blocks.load(Ordering::Relaxed),
num_blocks_internal48: self.inner.slab_descs[2].num_blocks.load(Ordering::Relaxed),
num_blocks_internal256: self.inner.slab_descs[3].num_blocks.load(Ordering::Relaxed),
num_blocks_leaf: self.inner.slab_descs[4].num_blocks.load(Ordering::Relaxed),
}
}
}
#[derive(Clone, Debug)]
pub struct ArtMultiSlabStats {
pub num_internal4: u64,
pub num_internal16: u64,
pub num_internal48: u64,
pub num_internal256: u64,
pub num_leaf: u64,
pub num_blocks_internal4: u64,
pub num_blocks_internal16: u64,
pub num_blocks_internal48: u64,
pub num_blocks_internal256: u64,
pub num_blocks_leaf: u64,
}


@@ -0,0 +1,191 @@
//! Simple allocator of fixed-size blocks
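//!
//! The allocator carves a caller-provided memory area into BLOCK_SIZE-byte blocks. Blocks that
//! have never been handed out are tracked with a simple bump counter ('num_initialized');
//! released blocks are remembered in a free list whose bookkeeping lives inside the freed
//! blocks themselves (see FreeListBlock).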
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicU64, Ordering};
use spin;
pub const BLOCK_SIZE: usize = 16 * 1024;
const INVALID_BLOCK: u64 = u64::MAX;
pub(crate) struct BlockAllocator<'t> {
blocks_ptr: &'t [MaybeUninit<u8>],
num_blocks: u64,
num_initialized: AtomicU64,
freelist_head: spin::Mutex<u64>,
}
struct FreeListBlock {
inner: spin::Mutex<FreeListBlockInner>,
}
struct FreeListBlockInner {
next: u64,
num_free_blocks: u64,
free_blocks: [u64; 100], // FIXME: fill the rest of the block
}
impl<'t> BlockAllocator<'t> {
pub(crate) fn new(area: &'t mut [MaybeUninit<u8>]) -> Self {
// Use all the space for the blocks
let padding = area.as_ptr().align_offset(BLOCK_SIZE);
let remain = &mut area[padding..];
let num_blocks = (remain.len() / BLOCK_SIZE) as u64;
BlockAllocator {
blocks_ptr: remain,
num_blocks,
num_initialized: AtomicU64::new(0),
freelist_head: spin::Mutex::new(INVALID_BLOCK),
}
}
/// safety: you must hold a lock on the pointer to this block, otherwise it might get
/// reused for another kind of block
fn read_freelist_block(&self, blkno: u64) -> &FreeListBlock {
let ptr: *const FreeListBlock = self.get_block_ptr(blkno).cast();
unsafe { ptr.as_ref().unwrap() }
}
fn get_block_ptr(&self, blkno: u64) -> *mut u8 {
assert!(blkno < self.num_blocks);
unsafe {
self.blocks_ptr
.as_ptr()
.byte_offset(blkno as isize * BLOCK_SIZE as isize)
}
.cast_mut()
.cast()
}
#[allow(clippy::mut_from_ref)]
pub(crate) fn alloc_block(&self) -> &mut [MaybeUninit<u8>] {
// FIXME: handle OOM
let blkno = self.alloc_block_internal();
if blkno == INVALID_BLOCK {
panic!("out of memory");
}
let ptr: *mut MaybeUninit<u8> = self.get_block_ptr(blkno).cast();
unsafe { std::slice::from_raw_parts_mut(ptr, BLOCK_SIZE) }
}
fn alloc_block_internal(&self) -> u64 {
// check the free list.
{
let mut freelist_head = self.freelist_head.lock();
if *freelist_head != INVALID_BLOCK {
let freelist_block = self.read_freelist_block(*freelist_head);
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
let mut g = freelist_block.inner.lock();
if g.num_free_blocks > 0 {
g.num_free_blocks -= 1;
let result = g.free_blocks[g.num_free_blocks as usize];
return result;
} else {
// consume the freelist block itself
let result = *freelist_head;
*freelist_head = g.next;
// This freelist block is now unlinked and can be repurposed
drop(g);
return result;
}
}
}
// If there are some blocks left that we've never used, pick next such block
let mut next_uninitialized = self.num_initialized.load(Ordering::Relaxed);
while next_uninitialized < self.num_blocks {
match self.num_initialized.compare_exchange(
next_uninitialized,
next_uninitialized + 1,
Ordering::Relaxed,
Ordering::Relaxed,
) {
Ok(_) => {
return next_uninitialized;
}
Err(old) => {
next_uninitialized = old;
continue;
}
}
}
// out of blocks
INVALID_BLOCK
}
// TODO: this is currently unused. The slab allocator never releases blocks
#[allow(dead_code)]
pub(crate) fn release_block(&self, block_ptr: *mut u8) {
let blockno = unsafe { block_ptr.byte_offset_from(self.blocks_ptr) / BLOCK_SIZE as isize };
self.release_block_internal(blockno as u64);
}
fn release_block_internal(&self, blockno: u64) {
let mut freelist_head = self.freelist_head.lock();
if *freelist_head != INVALID_BLOCK {
let freelist_block = self.read_freelist_block(*freelist_head);
// acquire lock on the freelist block before releasing the lock on the parent (i.e. lock coupling)
let mut g = freelist_block.inner.lock();
let num_free_blocks = g.num_free_blocks;
if num_free_blocks < g.free_blocks.len() as u64 {
g.free_blocks[num_free_blocks as usize] = blockno;
g.num_free_blocks += 1;
return;
}
}
// Convert the block into a new freelist block
let block_ptr: *mut FreeListBlock = self.get_block_ptr(blockno).cast();
let init = FreeListBlock {
inner: spin::Mutex::new(FreeListBlockInner {
next: *freelist_head,
num_free_blocks: 0,
free_blocks: [INVALID_BLOCK; 100],
}),
};
unsafe { (*block_ptr) = init };
*freelist_head = blockno;
}
// for debugging
pub(crate) fn get_statistics(&self) -> BlockAllocatorStats {
let mut num_free_blocks = 0;
let mut _prev_lock = None;
let head_lock = self.freelist_head.lock();
let mut next_blk = *head_lock;
let mut _head_lock = Some(head_lock);
while next_blk != INVALID_BLOCK {
let freelist_block = self.read_freelist_block(next_blk);
let lock = freelist_block.inner.lock();
num_free_blocks += lock.num_free_blocks;
next_blk = lock.next;
_prev_lock = Some(lock); // hold the lock until we've read the next block
_head_lock = None;
}
BlockAllocatorStats {
num_blocks: self.num_blocks,
num_initialized: self.num_initialized.load(Ordering::Relaxed),
num_free_blocks,
}
}
}
#[derive(Clone, Debug)]
pub struct BlockAllocatorStats {
pub num_blocks: u64,
pub num_initialized: u64,
pub num_free_blocks: u64,
}


@@ -0,0 +1,33 @@
use std::alloc::Layout;
use std::mem::MaybeUninit;
use crate::allocator::block::BlockAllocator;
use crate::allocator::slab::SlabDesc;
pub struct MultiSlabAllocator<'t, const N: usize> {
pub(crate) block_allocator: BlockAllocator<'t>,
pub(crate) slab_descs: [SlabDesc; N],
}
impl<'t, const N: usize> MultiSlabAllocator<'t, N> {
pub(crate) fn new(
area: &'t mut [MaybeUninit<u8>],
layouts: &[Layout; N],
) -> MultiSlabAllocator<'t, N> {
let block_allocator = BlockAllocator::new(area);
MultiSlabAllocator {
block_allocator,
slab_descs: std::array::from_fn(|i| SlabDesc::new(&layouts[i])),
}
}
pub(crate) fn alloc_slab(&self, slab_idx: usize) -> *mut u8 {
self.slab_descs[slab_idx].alloc_chunk(&self.block_allocator)
}
pub(crate) fn dealloc_slab(&self, slab_idx: usize, ptr: *mut u8) {
self.slab_descs[slab_idx].dealloc_chunk(ptr, &self.block_allocator)
}
}


@@ -0,0 +1,433 @@
//! A slab allocator that carves out fixed-size chunks from larger blocks.
//!
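//! Each SlabDesc serves chunks of a single Layout. Blocks come from the BlockAllocator; each
//! one starts with a SlabBlockHeader, followed by as many fixed-size chunks as fit in the
//! block. Free chunks within a block form an intrusive singly-linked list, and the SlabDesc
//! keeps doubly-linked lists of full and non-full blocks so that allocation can grab a chunk
//! from the first non-full block.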
use std::alloc::Layout;
use std::mem::MaybeUninit;
use std::ops::Deref;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use spin;
use super::alloc_from_slice;
use super::block::BlockAllocator;
use crate::allocator::block::BLOCK_SIZE;
pub(crate) struct SlabDesc {
pub(crate) layout: Layout,
block_lists: spin::RwLock<BlockLists>,
pub(crate) num_blocks: AtomicU64,
pub(crate) num_allocated: AtomicU64,
}
// FIXME: Not sure if SlabDesc is really Sync or Send. It probably is when it's empty, but
// 'block_lists' contains pointers when it's not empty. In the current use as part of the
// art tree, SlabDescs are only moved during initialization.
unsafe impl Sync for SlabDesc {}
unsafe impl Send for SlabDesc {}
#[derive(Default, Debug)]
struct BlockLists {
full_blocks: BlockList,
nonfull_blocks: BlockList,
}
impl BlockLists {
// Unlink a node. It must be in either one of the two lists.
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
let list = unsafe {
if (*elem).next.is_null() {
if self.full_blocks.tail == elem {
Some(&mut self.full_blocks)
} else {
Some(&mut self.nonfull_blocks)
}
} else if (*elem).prev.is_null() {
if self.full_blocks.head == elem {
Some(&mut self.full_blocks)
} else {
Some(&mut self.nonfull_blocks)
}
} else {
None
}
};
unsafe { unlink_slab_block(list, elem) };
}
}
unsafe fn unlink_slab_block(mut list: Option<&mut BlockList>, elem: *mut SlabBlockHeader) {
unsafe {
if (*elem).next.is_null() {
assert_eq!(list.as_ref().unwrap().tail, elem);
list.as_mut().unwrap().tail = (*elem).prev;
} else {
assert_eq!((*(*elem).next).prev, elem);
(*(*elem).next).prev = (*elem).prev;
}
if (*elem).prev.is_null() {
assert_eq!(list.as_ref().unwrap().head, elem);
list.as_mut().unwrap().head = (*elem).next;
} else {
assert_eq!((*(*elem).prev).next, elem);
(*(*elem).prev).next = (*elem).next;
}
}
}
#[derive(Debug)]
struct BlockList {
head: *mut SlabBlockHeader,
tail: *mut SlabBlockHeader,
}
impl Default for BlockList {
fn default() -> Self {
BlockList {
head: std::ptr::null_mut(),
tail: std::ptr::null_mut(),
}
}
}
impl BlockList {
unsafe fn push_head(&mut self, elem: *mut SlabBlockHeader) {
unsafe {
if self.is_empty() {
self.tail = elem;
(*elem).next = std::ptr::null_mut();
} else {
(*elem).next = self.head;
(*self.head).prev = elem;
}
(*elem).prev = std::ptr::null_mut();
self.head = elem;
}
}
fn is_empty(&self) -> bool {
self.head.is_null()
}
unsafe fn unlink(&mut self, elem: *mut SlabBlockHeader) {
unsafe { unlink_slab_block(Some(self), elem) }
}
#[cfg(test)]
fn dump(&self) {
let mut next = self.head;
while !next.is_null() {
let n = unsafe { next.as_ref() }.unwrap();
eprintln!(
" blk {:?} (free {}/{})",
next,
n.num_free_chunks.load(Ordering::Relaxed),
n.num_chunks
);
next = n.next;
}
}
}
impl SlabDesc {
pub(crate) fn new(layout: &Layout) -> SlabDesc {
SlabDesc {
layout: *layout,
block_lists: spin::RwLock::new(BlockLists::default()),
num_allocated: AtomicU64::new(0),
num_blocks: AtomicU64::new(0),
}
}
}
#[derive(Debug)]
struct SlabBlockHeader {
free_chunks_head: spin::Mutex<*mut FreeChunk>,
num_free_chunks: AtomicU32,
num_chunks: u32, // this is really a constant for a given Layout
// these fields are protected by the lock on the BlockLists
prev: *mut SlabBlockHeader,
next: *mut SlabBlockHeader,
}
struct FreeChunk {
next: *mut FreeChunk,
}
enum ReadOrWriteGuard<'a, T> {
Read(spin::RwLockReadGuard<'a, T>),
Write(spin::RwLockWriteGuard<'a, T>),
}
impl<'a, T> Deref for ReadOrWriteGuard<'a, T> {
type Target = T;
fn deref(&self) -> &<Self as Deref>::Target {
match self {
ReadOrWriteGuard::Read(g) => g.deref(),
ReadOrWriteGuard::Write(g) => g.deref(),
}
}
}
impl SlabDesc {
pub fn alloc_chunk(&self, block_allocator: &BlockAllocator) -> *mut u8 {
// Are there any free chunks?
let mut acquire_write = false;
'outer: loop {
let mut block_lists_guard = if acquire_write {
ReadOrWriteGuard::Write(self.block_lists.write())
} else {
ReadOrWriteGuard::Read(self.block_lists.read())
};
'inner: loop {
let block_ptr = block_lists_guard.nonfull_blocks.head;
if block_ptr.is_null() {
break 'outer;
}
unsafe {
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
if !(*free_chunks_head).is_null() {
let result = *free_chunks_head;
(*free_chunks_head) = (*result).next;
let _old = (*block_ptr).num_free_chunks.fetch_sub(1, Ordering::Relaxed);
self.num_allocated.fetch_add(1, Ordering::Relaxed);
return result.cast();
}
}
// The block at the head of the list was full. Grab write lock and retry
match block_lists_guard {
ReadOrWriteGuard::Read(_) => {
acquire_write = true;
continue 'outer;
}
ReadOrWriteGuard::Write(ref mut g) => {
// move the node to the list of full blocks
unsafe {
g.nonfull_blocks.unlink(block_ptr);
g.full_blocks.push_head(block_ptr);
};
continue 'inner;
}
}
}
}
// no free chunks. Allocate a new block (and the chunk from that)
let (new_block, new_chunk) = self.alloc_block_and_chunk(block_allocator);
self.num_blocks.fetch_add(1, Ordering::Relaxed);
// Add the block to the list in the SlabDesc
unsafe {
let mut block_lists_guard = self.block_lists.write();
block_lists_guard.nonfull_blocks.push_head(new_block);
}
self.num_allocated.fetch_add(1, Ordering::Relaxed);
new_chunk
}
pub fn dealloc_chunk(&self, chunk_ptr: *mut u8, _block_allocator: &BlockAllocator) {
// Find the block it belongs to. You can find the block from the address. (And knowing the
// layout, you could calculate the chunk number too.)
let block_ptr: *mut SlabBlockHeader = {
let block_addr = (chunk_ptr.addr() / BLOCK_SIZE) * BLOCK_SIZE;
chunk_ptr.with_addr(block_addr).cast()
};
let chunk_ptr: *mut FreeChunk = chunk_ptr.cast();
// Mark the chunk as free in 'freechunks' list
let num_chunks;
let num_free_chunks;
unsafe {
let mut free_chunks_head = (*block_ptr).free_chunks_head.lock();
(*chunk_ptr).next = *free_chunks_head;
*free_chunks_head = chunk_ptr;
num_free_chunks = (*block_ptr).num_free_chunks.fetch_add(1, Ordering::Relaxed) + 1;
num_chunks = (*block_ptr).num_chunks;
}
if num_free_chunks == 1 {
// If the block was full previously, add it to the nonfull blocks list. Note that
// we're not holding the lock anymore, so it can immediately become full again.
// That's harmless, it will be moved back to the full list again when a call
// to alloc_chunk() sees it.
let mut block_lists = self.block_lists.write();
unsafe {
block_lists.unlink(block_ptr);
block_lists.nonfull_blocks.push_head(block_ptr);
};
} else if num_free_chunks == num_chunks {
// If the block became completely empty, move it to the free list
// TODO
// FIXME: we're still holding the spinlock. It's not exactly safe to return it to
// the free blocks list, is it? Defer it as garbage to wait out concurrent updates?
//block_allocator.release_block()
}
// update stats
self.num_allocated.fetch_sub(1, Ordering::Relaxed);
}
fn alloc_block_and_chunk(
&self,
block_allocator: &BlockAllocator,
) -> (*mut SlabBlockHeader, *mut u8) {
// fixme: handle OOM
let block_slice: &mut [MaybeUninit<u8>] = block_allocator.alloc_block();
let (block_header, remain) = alloc_from_slice::<SlabBlockHeader>(block_slice);
let padding = remain.as_ptr().align_offset(self.layout.align());
let num_chunks = (remain.len() - padding) / self.layout.size();
let first_chunk_ptr: *mut FreeChunk = remain[padding..].as_mut_ptr().cast();
unsafe {
let mut chunk_ptr = first_chunk_ptr;
for _ in 0..num_chunks - 1 {
let next_chunk_ptr = chunk_ptr.byte_add(self.layout.size());
(*chunk_ptr).next = next_chunk_ptr;
chunk_ptr = next_chunk_ptr;
}
(*chunk_ptr).next = std::ptr::null_mut();
let result_chunk = first_chunk_ptr;
let block_header = block_header.write(SlabBlockHeader {
free_chunks_head: spin::Mutex::new((*first_chunk_ptr).next),
prev: std::ptr::null_mut(),
next: std::ptr::null_mut(),
num_chunks: num_chunks as u32,
num_free_chunks: AtomicU32::new(num_chunks as u32 - 1),
});
(block_header, result_chunk.cast())
}
}
#[cfg(test)]
fn dump(&self) {
eprintln!(
"slab dump ({} blocks, {} allocated chunks)",
self.num_blocks.load(Ordering::Relaxed),
self.num_allocated.load(Ordering::Relaxed)
);
let lists = self.block_lists.read();
eprintln!("nonfull blocks:");
lists.nonfull_blocks.dump();
eprintln!("full blocks:");
lists.full_blocks.dump();
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::Rng;
use rand_distr::Zipf;
struct TestObject {
val: usize,
_dummy: [u8; BLOCK_SIZE / 4],
}
struct TestObjectSlab<'a>(SlabDesc, BlockAllocator<'a>);
impl<'a> TestObjectSlab<'a> {
fn new(block_allocator: BlockAllocator) -> TestObjectSlab {
TestObjectSlab(SlabDesc::new(&Layout::new::<TestObject>()), block_allocator)
}
fn alloc(&self, val: usize) -> *mut TestObject {
let obj: *mut TestObject = self.0.alloc_chunk(&self.1).cast();
unsafe { (*obj).val = val };
obj
}
fn dealloc(&self, obj: *mut TestObject) {
self.0.dealloc_chunk(obj.cast(), &self.1)
}
}
#[test]
fn test_slab_alloc() {
const MEM_SIZE: usize = 100000000;
let mut area = Box::new_uninit_slice(MEM_SIZE);
let block_allocator = BlockAllocator::new(&mut area);
let slab = TestObjectSlab::new(block_allocator);
let mut all: Vec<*mut TestObject> = Vec::new();
for i in 0..11 {
all.push(slab.alloc(i));
}
#[allow(clippy::needless_range_loop)]
for i in 0..11 {
assert!(unsafe { (*all[i]).val == i });
}
let distribution = Zipf::new(10.0, 1.1).unwrap();
let mut rng = rand::rng();
for _ in 0..100000 {
slab.0.dump();
let idx = rng.sample(distribution) as usize;
let ptr: *mut TestObject = all[idx];
if !ptr.is_null() {
assert_eq!(unsafe { (*ptr).val }, idx);
slab.dealloc(ptr);
all[idx] = std::ptr::null_mut();
} else {
all[idx] = slab.alloc(idx);
}
}
}
fn new_test_blk(i: u32) -> *mut SlabBlockHeader {
Box::into_raw(Box::new(SlabBlockHeader {
free_chunks_head: spin::Mutex::new(std::ptr::null_mut()),
num_free_chunks: AtomicU32::new(0),
num_chunks: i,
prev: std::ptr::null_mut(),
next: std::ptr::null_mut(),
}))
}
#[test]
fn test_block_linked_list() {
// note: these are leaked, but that's OK for tests
let a = new_test_blk(0);
let b = new_test_blk(1);
let mut list = BlockList::default();
assert!(list.is_empty());
unsafe {
list.push_head(a);
assert!(!list.is_empty());
list.unlink(a);
}
assert!(list.is_empty());
unsafe {
list.push_head(b);
list.push_head(a);
assert_eq!(list.head, a);
assert_eq!((*a).next, b);
assert_eq!((*b).prev, a);
assert_eq!(list.tail, b);
list.unlink(a);
list.unlink(b);
assert!(list.is_empty());
}
}
}


@@ -0,0 +1,44 @@
use std::mem::MaybeUninit;
pub fn alloc_from_slice<T>(
area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size());
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { result_ptr.as_mut().unwrap() };
(result, remain)
}
pub fn alloc_array_from_slice<T>(
area: &mut [MaybeUninit<u8>],
len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() * len > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size() * len);
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
(result, remain)
}

libs/neonart/src/epoch.rs Normal file

@@ -0,0 +1,142 @@
//! This is similar to the crossbeam_epoch crate, but works in shared memory
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use crossbeam_utils::CachePadded;
const NUM_SLOTS: usize = 1000;
/// This is the struct that is stored in shmem
///
/// bit 0: is it pinned or not?
/// rest of the bits are the epoch counter.
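///
/// For example, a slot value of 5 (binary 101) means the slot is currently pinned, at epoch 4.
/// The global epoch only ever advances in steps of 2, so epoch values stay even and bit 0 is
/// free to use as the pin flag.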
pub struct EpochShared {
global_epoch: AtomicU64,
participants: [CachePadded<AtomicU64>; NUM_SLOTS],
broadcast_lock: spin::Mutex<()>,
}
impl EpochShared {
pub fn new() -> EpochShared {
EpochShared {
global_epoch: AtomicU64::new(2),
participants: [const { CachePadded::new(AtomicU64::new(2)) }; NUM_SLOTS],
broadcast_lock: spin::Mutex::new(()),
}
}
pub fn register(&self) -> LocalHandle {
LocalHandle {
global: self,
last_slot: AtomicUsize::new(0), // todo: choose more intelligently
}
}
fn release_pin(&self, slot: usize, _epoch: u64) {
let global_epoch = self.global_epoch.load(Ordering::Relaxed);
self.participants[slot].store(global_epoch, Ordering::Relaxed);
}
fn pin_internal(&self, slot_hint: usize) -> (usize, u64) {
// pick a slot
let mut slot = slot_hint;
let epoch = loop {
let old = self.participants[slot].fetch_or(1, Ordering::Relaxed);
if old & 1 == 0 {
// Got this slot
break old;
}
// the slot was busy by another thread / process. try a different slot
slot += 1;
if slot == NUM_SLOTS {
slot = 0;
}
continue;
};
(slot, epoch)
}
pub(crate) fn advance(&self) -> u64 {
// Advance the global epoch
let old_epoch = self.global_epoch.fetch_add(2, Ordering::Relaxed);
// Anyone that release their pin after this will update their slot.
old_epoch + 2
}
pub(crate) fn broadcast(&self) {
let Some(_guard) = self.broadcast_lock.try_lock() else {
return;
};
let epoch = self.global_epoch.load(Ordering::Relaxed);
let old_epoch = epoch.wrapping_sub(2);
// Update all free slots.
for i in 0..NUM_SLOTS {
// TODO: check result, as a sanity check. It should either be the old epoch, or pinned
let _ = self.participants[i].compare_exchange(
old_epoch,
epoch,
Ordering::Relaxed,
Ordering::Relaxed,
);
}
// FIXME: memory fence here, since we used Relaxed?
}
pub(crate) fn get_oldest(&self) -> u64 {
// Read all slots.
let now = self.global_epoch.load(Ordering::Relaxed);
let mut oldest = now;
for i in 0..NUM_SLOTS {
let this_epoch = self.participants[i].load(Ordering::Relaxed);
let delta = now.wrapping_sub(this_epoch);
if delta > u64::MAX / 2 {
// this is very recent
} else if delta > now.wrapping_sub(oldest) {
oldest = this_epoch;
}
}
oldest
}
pub(crate) fn get_current(&self) -> u64 {
self.global_epoch.load(Ordering::Relaxed)
}
}
pub(crate) struct EpochPin<'e> {
slot: usize,
pub(crate) epoch: u64,
handle: &'e LocalHandle<'e>,
}
impl<'e> Drop for EpochPin<'e> {
fn drop(&mut self) {
self.handle.global.release_pin(self.slot, self.epoch);
}
}
pub struct LocalHandle<'g> {
global: &'g EpochShared,
last_slot: AtomicUsize,
}
impl<'g> LocalHandle<'g> {
pub fn pin(&self) -> EpochPin {
let (slot, epoch) = self
.global
.pin_internal(self.last_slot.load(Ordering::Relaxed));
self.last_slot.store(slot, Ordering::Relaxed);
EpochPin {
handle: self,
epoch,
slot,
}
}
}

libs/neonart/src/lib.rs Normal file

@@ -0,0 +1,583 @@
//! Adaptive Radix Tree (ART) implementation, with Optimistic Lock Coupling.
//!
//! The data structure is described in these two papers:
//!
//! [1] Leis, V. & Kemper, Alfons & Neumann, Thomas. (2013).
//! The adaptive radix tree: ARTful indexing for main-memory databases.
//! Proceedings - International Conference on Data Engineering. 38-49. 10.1109/ICDE.2013.6544812.
//! https://db.in.tum.de/~leis/papers/ART.pdf
//!
//! [2] Leis, Viktor & Scheibner, Florian & Kemper, Alfons & Neumann, Thomas. (2016).
//! The ART of practical synchronization.
//! 1-8. 10.1145/2933349.2933352.
//! https://db.in.tum.de/~leis/papers/artsync.pdf
//!
//! [1] describes the base data structure, and [2] describes the Optimistic Lock Coupling that we
//! use.
//!
//! The papers mention a few different variants. We have made the following choices in this
//! implementation:
//!
//! - All keys have the same length
//!
//! - Single-value leaves.
//!
//! - For collapsing inner nodes, we use the Pessimistic approach, where each inner node stores a
//! variable length "prefix", which stores the keys of all the one-way nodes which have been
//! removed. However, similar to the "hybrid" approach described in the paper, each node only has
//! space for a constant-size prefix of 8 bytes. If a node would have a longer prefix, then we
//! create one-way nodes to store them. (There was no particular reason for this choice;
//! the "hybrid" approach described in the paper might be better.)
//!
//! - For concurrency, we use Optimistic Lock Coupling. The paper [2] also describes another method,
//! ROWEX, which generally performs better when there is contention, but that is not important
//! for us, and Optimistic Lock Coupling is simpler to implement.
//!
//! ## Requirements
//!
//! This data structure is currently used for the integrated LFC, relsize and last-written LSN cache
//! in the compute communicator, part of the 'neon' Postgres extension. We have some unique
//! requirements, which is why we had to write our own. Namely:
//!
//! - The data structure has to live in a fixed-sized shared memory segment. That rules out any
//! built-in Rust collections and most crates. (Except possibly with the 'allocator_api' Rust
//! feature, which is still nightly-only and experimental as of this writing.)
//!
//! - The data structure is accessed from multiple processes. Only one process updates the data
//! structure, but other processes perform reads. That rules out using built-in Rust locking
//! primitives like Mutex and RwLock, and most crates too.
//!
//! - Within the one process with write-access, multiple threads can perform updates concurrently.
//! That rules out using PostgreSQL LWLocks for the locking.
//!
//! The implementation is generic, and doesn't depend on any PostgreSQL specifics, but it has been
//! written with that usage and the above constraints in mind. Some noteworthy assumptions:
//!
//! - Contention is assumed to be rare. In the integrated cache in PostgreSQL, there's higher level
//! locking in the PostgreSQL buffer manager, which ensures that two backends should not try to
//! read / write the same page at the same time. (Prefetching can conflict with actual reads,
//! however.)
//!
//! - The keys in the integrated cache are 17 bytes long.
//!
//! ## Usage
//!
//! Because this is designed to be used as a Postgres shared memory data structure, initialization
//! happens in three stages:
//!
//! 0. A fixed area of shared memory is allocated at postmaster startup.
//!
//! 1. TreeInitStruct::new() is called to initialize it, still in Postmaster process, before any
//! other process or thread is running. It returns a TreeInitStruct, which is inherited by all
//! the processes through fork().
//!
//! 2. One process may have write-access to the struct, by calling
//! [TreeInitStruct::attach_writer]. (That process is the communicator process.)
//!
//! 3. Other processes get read-access to the struct, by calling [TreeInitStruct::attach_reader]
//!
//! "Write access" means that you can insert / update / delete values in the tree.
//!
//! NOTE: The Values stored in the tree are sometimes moved, when a leaf node fills up and a new
//! larger node needs to be allocated. The versioning and epoch-based allocator ensure that the data
//! structure stays consistent, but if the Value has interior mutability, like atomic fields,
//! updates to such fields might be lost if the leaf node is concurrently moved! If that becomes a
//! problem, the version check could be passed up to the caller, so that the caller could detect the
//! lost updates and retry the operation.
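//!
//! A minimal sketch of that flow (the MyKey/MyValue types, the shared memory area, and the
//! 'is_writer_process' check are made up for illustration; error handling is omitted):
//!
//! ```ignore
//! use std::mem::MaybeUninit;
//!
//! use neonart::{ArtMultiSlabAllocator, Key, TreeInitStruct, Value};
//!
//! // Hypothetical fixed-length key and value types.
//! #[derive(Debug)]
//! struct MyKey([u8; 17]);
//! impl Key for MyKey {
//! const KEY_LEN: usize = 17;
//! fn as_bytes(&self) -> &[u8] { &self.0 }
//! }
//!
//! #[derive(Debug)]
//! struct MyValue(u64);
//! impl Value for MyValue {}
//!
//! // Stages 0 and 1: in the postmaster, before any other process or thread is running.
//! let area: &'static mut [MaybeUninit<u8>] = /* fixed shared memory segment */;
//! let allocator = ArtMultiSlabAllocator::<MyValue>::new(area);
//! let init = TreeInitStruct::<MyKey, MyValue, _>::new(allocator);
//!
//! // Stages 2 and 3: each process inherits its own copy of 'init' through fork().
//! if is_writer_process {
//! let writer = init.attach_writer();
//! // writer.start_write() returns a TreeWriteGuard, with insert(), remove(), etc.
//! } else {
//! let reader = init.attach_reader();
//! // reader.start_read() returns a TreeReadGuard, with get().
//! }
//! ```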
//!
//! ## Implementation
//!
//! node_ptr: Provides low-level implementations of the four different node types (eight actually,
//! since there is an Internal and Leaf variant of each)
//!
//! lock_and_version.rs: Provides an abstraction for the combined lock and version counter on each
//! node.
//!
//! node_ref.rs: The code in node_ptr.rs deals with raw pointers. node_ref.rs provides more type-safe
//! abstractions on top.
//!
//! algorithm.rs: Contains the functions to implement lookups and updates in the tree
//!
//! allocator.rs: Provides a facility to allocate memory for the tree nodes. (We must provide our
//! own abstraction for that because we need the data structure to live in a pre-allocated shared
//! memory segment).
//!
//! epoch.rs: The data structure requires that when a node is removed from the tree, it is not
//! immediately deallocated, but stays around for as long as concurrent readers might still have
//! pointers to them. This is enforced by an epoch system. This is similar to
//! e.g. crossbeam_epoch, but we couldn't use that either because it has to work across processes
//! communicating over the shared memory segment.
//!
//! ## See also
//!
//! There are some existing Rust ART implementations out there, but none of them filled all
//! the requirements:
//!
//! - https://github.com/XiangpengHao/congee
//! - https://github.com/declanvk/blart
//!
//! ## TODO
//!
//! - Removing values is only partially implemented (see the TODOs in cleanup_parent)
mod algorithm;
pub mod allocator;
mod epoch;
use algorithm::RootPtr;
use algorithm::node_ptr::NodePtr;
use std::collections::VecDeque;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ptr::NonNull;
use std::sync::atomic::{AtomicBool, Ordering};
use crate::epoch::EpochPin;
#[cfg(test)]
mod tests;
use allocator::ArtAllocator;
pub use allocator::ArtMultiSlabAllocator;
pub use allocator::OutOfMemoryError;
/// Fixed-length key type.
///
pub trait Key: Debug {
const KEY_LEN: usize;
fn as_bytes(&self) -> &[u8];
}
/// Values stored in the tree
///
/// Values need to be Cloneable, because when a node "grows", the value is copied to a new node and
/// the old sticks around until all readers that might see the old value are gone.
// fixme obsolete, no longer needs Clone
pub trait Value {}
const MAX_GARBAGE: usize = 1024;
/// The root of the tree, plus other tree-wide data. This is stored in the shared memory.
pub struct Tree<V: Value> {
/// For simplicity, so that we never need to grow or shrink the root, the root node is always an
/// Internal256 node. Also, it never has a prefix (that's actually a bit wasteful, incurring one
/// indirection to every lookup)
root: RootPtr<V>,
writer_attached: AtomicBool,
epoch: epoch::EpochShared,
}
unsafe impl<V: Value + Sync> Sync for Tree<V> {}
unsafe impl<V: Value + Send> Send for Tree<V> {}
struct GarbageQueue<V>(VecDeque<(NodePtr<V>, u64)>);
unsafe impl<V: Value + Sync> Sync for GarbageQueue<V> {}
unsafe impl<V: Value + Send> Send for GarbageQueue<V> {}
impl<V> GarbageQueue<V> {
fn new() -> GarbageQueue<V> {
GarbageQueue(VecDeque::with_capacity(MAX_GARBAGE))
}
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>, epoch: u64) {
self.0.push_front((ptr, epoch));
}
fn next_obsolete(&mut self, cutoff_epoch: u64) -> Option<NodePtr<V>> {
if let Some(back) = self.0.back() {
if back.1 < cutoff_epoch {
return Some(self.0.pop_back().unwrap().0);
}
}
None
}
}
/// Struct created at postmaster startup
pub struct TreeInitStruct<'t, K: Key, V: Value, A: ArtAllocator<V>> {
tree: &'t Tree<V>,
allocator: &'t A,
phantom_key: PhantomData<K>,
}
/// The worker process has a reference to this. The write operations are only safe
/// from the worker process
pub struct TreeWriteAccess<'t, K: Key, V: Value, A: ArtAllocator<V>>
where
K: Key,
V: Value,
{
tree: &'t Tree<V>,
pub allocator: &'t A,
epoch_handle: epoch::LocalHandle<'t>,
phantom_key: PhantomData<K>,
/// Obsolete nodes that cannot be recycled until their epoch expires.
garbage: spin::Mutex<GarbageQueue<V>>,
}
/// The backends have a reference to this. It cannot be used to modify the tree
pub struct TreeReadAccess<'t, K: Key, V: Value>
where
K: Key,
V: Value,
{
tree: &'t Tree<V>,
epoch_handle: epoch::LocalHandle<'t>,
phantom_key: PhantomData<K>,
}
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeInitStruct<'t, K, V, A> {
pub fn new(allocator: &'t A) -> TreeInitStruct<'t, K, V, A> {
let tree_ptr = allocator.alloc_tree();
let tree_ptr = NonNull::new(tree_ptr).expect("out of memory");
let init = Tree {
root: algorithm::new_root(allocator).expect("out of memory"),
writer_attached: AtomicBool::new(false),
epoch: epoch::EpochShared::new(),
};
unsafe { tree_ptr.write(init) };
TreeInitStruct {
tree: unsafe { tree_ptr.as_ref() },
allocator,
phantom_key: PhantomData,
}
}
pub fn attach_writer(self) -> TreeWriteAccess<'t, K, V, A> {
let previously_attached = self.tree.writer_attached.swap(true, Ordering::Relaxed);
if previously_attached {
panic!("writer already attached");
}
TreeWriteAccess {
tree: self.tree,
allocator: self.allocator,
phantom_key: PhantomData,
epoch_handle: self.tree.epoch.register(),
garbage: spin::Mutex::new(GarbageQueue::new()),
}
}
pub fn attach_reader(self) -> TreeReadAccess<'t, K, V> {
TreeReadAccess {
tree: self.tree,
phantom_key: PhantomData,
epoch_handle: self.tree.epoch.register(),
}
}
}
impl<'t, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteAccess<'t, K, V, A> {
pub fn start_write<'g>(&'t self) -> TreeWriteGuard<'g, K, V, A>
where
't: 'g,
{
TreeWriteGuard {
tree_writer: self,
epoch_pin: self.epoch_handle.pin(),
phantom_key: PhantomData,
created_garbage: false,
}
}
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
TreeReadGuard {
tree: self.tree,
epoch_pin: self.epoch_handle.pin(),
phantom_key: PhantomData,
}
}
}
impl<'t, K: Key, V: Value> TreeReadAccess<'t, K, V> {
pub fn start_read(&'t self) -> TreeReadGuard<'t, K, V> {
TreeReadGuard {
tree: self.tree,
epoch_pin: self.epoch_handle.pin(),
phantom_key: PhantomData,
}
}
}
pub struct TreeReadGuard<'e, K, V>
where
K: Key,
V: Value,
{
tree: &'e Tree<V>,
epoch_pin: EpochPin<'e>,
phantom_key: PhantomData<K>,
}
impl<'e, K: Key, V: Value> TreeReadGuard<'e, K, V> {
pub fn get(&'e self, key: &K) -> Option<&'e V> {
algorithm::search(key, self.tree.root, &self.epoch_pin)
}
}
pub struct TreeWriteGuard<'e, K, V, A>
where
K: Key,
V: Value,
A: ArtAllocator<V>,
{
tree_writer: &'e TreeWriteAccess<'e, K, V, A>,
epoch_pin: EpochPin<'e>,
phantom_key: PhantomData<K>,
created_garbage: bool,
}
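/// Action returned by the closure passed to `update_with_fn`: leave the entry unchanged, store
/// a value under the key, or remove the entry.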
pub enum UpdateAction<V> {
Nothing,
Insert(V),
Remove,
}
impl<'e, K: Key, V: Value, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
/// Get a value
pub fn get(&'e mut self, key: &K) -> Option<&'e V> {
algorithm::search(key, self.tree_writer.tree.root, &self.epoch_pin)
}
/// Insert a value
pub fn insert(self, key: &K, value: V) -> Result<bool, OutOfMemoryError> {
let mut success = None;
self.update_with_fn(key, |existing| {
if existing.is_some() {
success = Some(false);
UpdateAction::Nothing
} else {
success = Some(true);
UpdateAction::Insert(value)
}
})?;
Ok(success.expect("value_fn not called"))
}
/// Remove value. Returns true if it existed
pub fn remove(self, key: &K) -> bool {
let mut result = false;
// FIXME: It's not clear if OOM is expected while removing. It seems
// not nice, but shrinking a node can OOM. Then again, we could opt
// to not shrink a node if we cannot allocate, to live a little longer.
self.update_with_fn(key, |existing| match existing {
Some(_) => {
result = true;
UpdateAction::Remove
}
None => UpdateAction::Nothing,
})
.expect("out of memory while removing");
result
}
/// Try to remove value and return the old value.
pub fn remove_and_return(self, key: &K) -> Option<V>
where
V: Clone,
{
let mut old = None;
self.update_with_fn(key, |existing| {
old = existing.cloned();
UpdateAction::Remove
})
.expect("out of memory while removing");
old
}
/// Update a key using the given function. All the other modifying operations are based on this.
///
/// The function is passed a reference to the existing value, if any, and returns an
/// `UpdateAction` describing what to do: `Nothing` leaves the tree unchanged, `Insert(v)` stores
/// the value under the key, and `Remove` removes the entry (a no-op if there was no existing
/// value).
pub fn update_with_fn<F>(mut self, key: &K, value_fn: F) -> Result<(), OutOfMemoryError>
where
F: FnOnce(Option<&V>) -> UpdateAction<V>,
{
algorithm::update_fn(key, value_fn, self.tree_writer.tree.root, &mut self)?;
if self.created_garbage {
let _ = self.collect_garbage();
}
Ok(())
}
fn remember_obsolete_node(&mut self, ptr: NodePtr<V>) {
self.tree_writer
.garbage
.lock()
.remember_obsolete_node(ptr, self.epoch_pin.epoch);
self.created_garbage = true;
}
// returns number of nodes recycled
fn collect_garbage(&self) -> usize {
self.tree_writer.tree.epoch.advance();
self.tree_writer.tree.epoch.broadcast();
let cutoff_epoch = self.tree_writer.tree.epoch.get_oldest();
let mut result = 0;
let mut garbage_queue = self.tree_writer.garbage.lock();
while let Some(ptr) = garbage_queue.next_obsolete(cutoff_epoch) {
ptr.deallocate(self.tree_writer.allocator);
result += 1;
}
result
}
}
pub struct TreeIterator<K>
where
K: Key + for<'a> From<&'a [u8]>,
{
done: bool,
pub next_key: Vec<u8>,
max_key: Option<Vec<u8>>,
phantom_key: PhantomData<K>,
}
impl<K> TreeIterator<K>
where
K: Key + for<'a> From<&'a [u8]>,
{
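/// Creates an iterator over the entire key space. Iteration starts at `next_key` (all zeros by
/// default) and wraps around to the beginning of the key space when the end is reached.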
pub fn new_wrapping() -> TreeIterator<K> {
TreeIterator {
done: false,
next_key: vec![0; K::KEY_LEN],
max_key: None,
phantom_key: PhantomData,
}
}
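/// Creates an iterator over the half-open key range `[range.start, range.end)`.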
pub fn new(range: &std::ops::Range<K>) -> TreeIterator<K> {
let result = TreeIterator {
done: false,
next_key: Vec::from(range.start.as_bytes()),
max_key: Some(Vec::from(range.end.as_bytes())),
phantom_key: PhantomData,
};
assert_eq!(result.next_key.len(), K::KEY_LEN);
assert_eq!(result.max_key.as_ref().unwrap().len(), K::KEY_LEN);
result
}
pub fn next<'g, V>(&mut self, read_guard: &'g TreeReadGuard<'g, K, V>) -> Option<(K, &'g V)>
where
V: Value,
{
if self.done {
return None;
}
let mut wrapped_around = false;
loop {
assert_eq!(self.next_key.len(), K::KEY_LEN);
if let Some((k, v)) =
algorithm::iter_next(&self.next_key, read_guard.tree.root, &read_guard.epoch_pin)
{
assert_eq!(k.len(), K::KEY_LEN);
assert_eq!(self.next_key.len(), K::KEY_LEN);
// Check if we reached the end of the range
if let Some(max_key) = &self.max_key {
if k.as_slice() >= max_key.as_slice() {
self.done = true;
break None;
}
}
// increment the key
self.next_key = k.clone();
increment_key(self.next_key.as_mut_slice());
let k = k.as_slice().into();
break Some((k, v));
} else {
if self.max_key.is_some() {
self.done = true;
} else {
// Start from beginning
if !wrapped_around {
for i in 0..K::KEY_LEN {
self.next_key[i] = 0;
}
wrapped_around = true;
continue;
} else {
// The tree is completely empty
// FIXME: perhaps we should remember the starting point instead.
// Currently this will scan some ranges twice.
break None;
}
}
break None;
}
}
}
}
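/// Increments the key by one, treating it as a big-endian unsigned integer (for example,
/// `[0x00, 0xff]` becomes `[0x01, 0x00]`). Returns true if the key wrapped around to all zeros.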
fn increment_key(key: &mut [u8]) -> bool {
for i in (0..key.len()).rev() {
let (byte, overflow) = key[i].overflowing_add(1);
key[i] = byte;
if !overflow {
return false;
}
}
true
}
// Debugging functions
impl<'e, K: Key, V: Value + Debug, A: ArtAllocator<V>> TreeWriteGuard<'e, K, V, A> {
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
algorithm::dump_tree(self.tree_writer.tree.root, &self.epoch_pin, dst)
}
}
impl<'e, K: Key, V: Value + Debug> TreeReadGuard<'e, K, V> {
pub fn dump(&mut self, dst: &mut dyn std::io::Write) {
algorithm::dump_tree(self.tree.root, &self.epoch_pin, dst)
}
}
impl<'e, K: Key, V: Value> TreeWriteAccess<'e, K, V, ArtMultiSlabAllocator<'e, V>> {
pub fn get_statistics(&self) -> ArtTreeStatistics {
ArtTreeStatistics {
blocks: self.allocator.inner.block_allocator.get_statistics(),
slabs: self.allocator.get_statistics(),
epoch: self.tree.epoch.get_current(),
oldest_epoch: self.tree.epoch.get_oldest(),
num_garbage: self.garbage.lock().0.len() as u64,
}
}
}
#[derive(Clone, Debug)]
pub struct ArtTreeStatistics {
pub blocks: allocator::block::BlockAllocatorStats,
pub slabs: allocator::ArtMultiSlabStats,
pub epoch: u64,
pub oldest_epoch: u64,
pub num_garbage: u64,
}
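A minimal end-to-end sketch of the API above (illustrative only; it mirrors the flow in tests.rs
below). The key type, the 10 MB area size, and the key/value literals are placeholders; in the
communicator the backing area would be the shared memory segment rather than a heap allocation.

#[derive(Debug)]
struct MyKey([u8; 8]);

impl Key for MyKey {
    const KEY_LEN: usize = 8;
    fn as_bytes(&self) -> &[u8] {
        &self.0
    }
}

impl Value for u64 {}

fn usage_sketch() -> Result<(), OutOfMemoryError> {
    // Carve the allocator and the tree out of one contiguous memory area.
    let mut area = Box::new_uninit_slice(10_000_000);
    let allocator = ArtMultiSlabAllocator::new(&mut area);
    let init = TreeInitStruct::<MyKey, u64, _>::new(allocator);

    // Exactly one process attaches as the writer; all others would attach as readers.
    let writer = init.attach_writer();
    let w = writer.start_write();
    w.insert(&MyKey(*b"page0001"), 42)?;

    // Reads go through a read guard, which pins the current epoch.
    let r = writer.start_read();
    assert_eq!(r.get(&MyKey(*b"page0001")), Some(&42));
    Ok(())
}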

libs/neonart/src/tests.rs

@@ -0,0 +1,236 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::{Debug, Formatter};
use std::sync::atomic::{AtomicUsize, Ordering};
use crate::ArtAllocator;
use crate::ArtMultiSlabAllocator;
use crate::TreeInitStruct;
use crate::TreeIterator;
use crate::TreeWriteAccess;
use crate::UpdateAction;
use crate::{Key, Value};
use rand::Rng;
use rand::seq::SliceRandom;
use rand_distr::Zipf;
const TEST_KEY_LEN: usize = 16;
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);
impl TestKey {
const MIN: TestKey = TestKey([0; TEST_KEY_LEN]);
const MAX: TestKey = TestKey([u8::MAX; TEST_KEY_LEN]);
}
impl Key for TestKey {
const KEY_LEN: usize = TEST_KEY_LEN;
fn as_bytes(&self) -> &[u8] {
&self.0
}
}
impl From<&TestKey> for u128 {
fn from(val: &TestKey) -> u128 {
u128::from_be_bytes(val.0)
}
}
impl From<u128> for TestKey {
fn from(val: u128) -> TestKey {
TestKey(val.to_be_bytes())
}
}
impl<'a> From<&'a [u8]> for TestKey {
fn from(bytes: &'a [u8]) -> TestKey {
TestKey(bytes.try_into().unwrap())
}
}
impl Value for usize {}
fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
const MEM_SIZE: usize = 10000000;
let mut area = Box::new_uninit_slice(MEM_SIZE);
let allocator = ArtMultiSlabAllocator::new(&mut area);
let init_struct = TreeInitStruct::<TestKey, usize, _>::new(allocator);
let tree_writer = init_struct.attach_writer();
for (idx, k) in keys.iter().enumerate() {
let w = tree_writer.start_write();
let res = w.insert(&(*k).into(), idx);
assert!(res.is_ok());
}
for (idx, k) in keys.iter().enumerate() {
let r = tree_writer.start_read();
let value = r.get(&(*k).into());
assert_eq!(value, Some(idx).as_ref());
}
eprintln!("stats: {:?}", tree_writer.get_statistics());
}
#[test]
fn dense() {
// This exercises splitting a node with prefix
let keys: &[u128] = &[0, 1, 2, 3, 256];
test_inserts(keys);
// Dense keys
let mut keys: Vec<u128> = (0..10000).collect();
test_inserts(&keys);
// Do the same in random orders
for _ in 1..10 {
keys.shuffle(&mut rand::rng());
test_inserts(&keys);
}
}
#[test]
fn sparse() {
// sparse keys
let mut keys: Vec<TestKey> = Vec::new();
let mut used_keys = HashSet::new();
for _ in 0..10000 {
loop {
let key = rand::random::<u128>();
if used_keys.contains(&key) {
continue;
}
used_keys.insert(key);
keys.push(key.into());
break;
}
}
test_inserts(&keys);
}
struct TestValue(AtomicUsize);
impl TestValue {
fn new(val: usize) -> TestValue {
TestValue(AtomicUsize::new(val))
}
fn load(&self) -> usize {
self.0.load(Ordering::Relaxed)
}
}
impl Value for TestValue {}
impl Clone for TestValue {
fn clone(&self) -> TestValue {
TestValue::new(self.load())
}
}
impl Debug for TestValue {
fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
write!(fmt, "{:?}", self.load())
}
}
#[derive(Clone, Debug)]
struct TestOp(TestKey, Option<usize>);
fn apply_op<A: ArtAllocator<TestValue>>(
op: &TestOp,
tree: &TreeWriteAccess<TestKey, TestValue, A>,
shadow: &mut BTreeMap<TestKey, usize>,
) {
eprintln!("applying op: {op:?}");
// apply the change to the shadow tree first
let shadow_existing = if let Some(v) = op.1 {
shadow.insert(op.0, v)
} else {
shadow.remove(&op.0)
};
// apply to Art tree
let w = tree.start_write();
w.update_with_fn(&op.0, |existing| {
assert_eq!(existing.map(TestValue::load), shadow_existing);
match (existing, op.1) {
(None, None) => UpdateAction::Nothing,
(None, Some(new_val)) => UpdateAction::Insert(TestValue::new(new_val)),
(Some(_old_val), None) => UpdateAction::Remove,
(Some(old_val), Some(new_val)) => {
old_val.0.store(new_val, Ordering::Relaxed);
UpdateAction::Nothing
}
}
})
.expect("out of memory");
}
fn test_iter<A: ArtAllocator<TestValue>>(
tree: &TreeWriteAccess<TestKey, TestValue, A>,
shadow: &BTreeMap<TestKey, usize>,
) {
let mut shadow_iter = shadow.iter();
let mut iter = TreeIterator::new(&(TestKey::MIN..TestKey::MAX));
loop {
let shadow_item = shadow_iter.next().map(|(k, v)| (*k, *v));
let r = tree.start_read();
let item = iter.next(&r);
if shadow_item != item.map(|(k, v)| (k, v.load())) {
eprintln!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
tree.start_read().dump(&mut std::io::stderr());
eprintln!("SHADOW:");
for si in shadow {
eprintln!("key: {:?}, val: {}", si.0, si.1);
}
panic!("FAIL: iterator returned {item:?}, expected {shadow_item:?}");
}
if item.is_none() {
break;
}
}
}
#[test]
fn random_ops() {
const MEM_SIZE: usize = 10000000;
let mut area = Box::new_uninit_slice(MEM_SIZE);
let allocator = ArtMultiSlabAllocator::new(&mut area);
let init_struct = TreeInitStruct::<TestKey, TestValue, _>::new(allocator);
let tree_writer = init_struct.attach_writer();
let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
let mut rng = rand::rng();
for i in 0..100000 {
let mut key: TestKey = (rng.sample(distribution) as u128).into();
if rng.random_bool(0.10) {
key = TestKey::from(u128::from(&key) | 0xffffffff);
}
let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
apply_op(&op, &tree_writer, &mut shadow);
if i % 1000 == 0 {
eprintln!("{i} ops processed");
eprintln!("stats: {:?}", tree_writer.get_statistics());
test_iter(&tree_writer, &shadow);
}
}
}

@@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 {
///
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
/// and will be handled at higher levels when shards are split.
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
pub fn key_to_shard_number(
count: ShardCount,
stripe_size: ShardStripeSize,
key: &Key,
) -> ShardNumber {
// Fast path for un-sharded tenants or broadcast keys
if count < ShardCount(2) || key_is_shard0(key) {
return ShardNumber(0);
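The rest of the function body is truncated in this hunk. Conceptually, the mapping stripes a
relation's blocks across shards: consecutive blocks are grouped into stripes of `stripe_size`
blocks, and stripes are spread over the `count` shards by hashing. A simplified, hypothetical
sketch of that idea (not the actual implementation; `rel_hash`, `blkno`, and the `.0` accessor on
`ShardStripeSize` are assumptions here):

fn striped_shard_sketch(
    count: ShardCount,
    stripe_size: ShardStripeSize,
    rel_hash: u32, // placeholder: a hash of the relation identity
    blkno: u32,    // placeholder: the block number within the relation
) -> ShardNumber {
    // Group blocks into stripes, then distribute stripes over the shards.
    // (The fast path above has already handled unsharded tenants and shard-0 keys.)
    let stripe = blkno / stripe_size.0;
    ShardNumber((hash_combine(rel_hash, stripe) % count.0 as u32) as u8)
}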

@@ -49,6 +49,12 @@ pub struct TenantShardId {
pub shard_count: ShardCount,
}
impl std::fmt::Display for ShardCount {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
pub const MIN: Self = Self(0);

@@ -54,6 +54,7 @@ pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TODO refactor that
pageserver_compaction.workspace = true
pageserver_page_api.workspace = true
peekable.workspace = true
pem.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -66,6 +67,7 @@ postgres-types.workspace = true
posthog_client_lite.workspace = true
pprof.workspace = true
pq_proto.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true

@@ -0,0 +1,44 @@
[package]
name = "pageserver_client_grpc"
version = "0.1.0"
edition = "2024"
[features]
testing = ["pageserver_api/testing"]
[dependencies]
anyhow.workspace = true
arc-swap.workspace = true
bytes.workspace = true
futures.workspace = true
http.workspace = true
thiserror.workspace = true
tonic.workspace = true
tracing.workspace = true
tokio = { version = "1.43.1", features = [
"full",
"macros",
"net",
"io-util",
"rt",
"rt-multi-thread",
] }
uuid = { version = "1", features = ["v4"] }
tower = { version = "0.4", features = ["timeout", "util"] }
rand = "0.8"
tokio-util = { version = "0.7", features = ["compat"] }
hyper-util = "0.1.9"
hyper = "1.6.0"
metrics.workspace = true
priority-queue = "2.3.1"
scopeguard.workspace = true
async-trait = { version = "0.1" }
tokio-stream = "0.1"
dashmap = "5"
chrono = { version = "0.4", features = ["serde"] }
compute_api.workspace = true
pageserver_page_api.workspace = true
pageserver_api.workspace = true
utils.workspace = true

@@ -0,0 +1,471 @@
use std::collections::HashMap;
use std::num::NonZero;
use std::sync::Arc;
use anyhow::anyhow;
use arc_swap::ArcSwap;
use futures::stream::FuturesUnordered;
use futures::{FutureExt as _, StreamExt as _};
use tracing::instrument;
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
use crate::retry::Retry;
use crate::split::GetPageSplitter;
use compute_api::spec::PageserverProtocol;
use pageserver_api::shard::ShardStripeSize;
use pageserver_page_api as page_api;
use utils::id::{TenantId, TimelineId};
use utils::shard::{ShardCount, ShardIndex, ShardNumber};
/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
/// when full.
///
/// TODO: tune all of these constants, and consider making them configurable.
/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
/// with only streams.
const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
/// Max number of concurrent unary request clients per shard.
const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
/// Max number of pipelined requests per stream.
const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
/// are more throughput-oriented, we have a smaller limit but higher queue depth.
const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
/// get a larger queue depth.
const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
/// basic `page_api::Client` gRPC client, and supports:
///
/// * Sharded tenants across multiple Pageservers.
/// * Pooling of connections, clients, and streams for efficient resource use.
/// * Concurrent use by many callers.
/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
/// * Automatic retries.
/// * Observability.
///
/// TODO: this client does not support base backups or LSN leases, as these are only used by
/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
pub struct PageserverClient {
/// The tenant ID.
tenant_id: TenantId,
/// The timeline ID.
timeline_id: TimelineId,
/// The JWT auth token for this tenant, if any.
auth_token: Option<String>,
/// The shards for this tenant.
shards: ArcSwap<Shards>,
/// The retry configuration.
retry: Retry,
}
impl PageserverClient {
/// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
/// in the shard spec, which must be complete and must use gRPC URLs.
pub fn new(
tenant_id: TenantId,
timeline_id: TimelineId,
shard_spec: ShardSpec,
auth_token: Option<String>,
) -> anyhow::Result<Self> {
let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?;
Ok(Self {
tenant_id,
timeline_id,
auth_token,
shards: ArcSwap::new(Arc::new(shards)),
retry: Retry,
})
}
/// Updates the shards from the given shard spec. In-flight requests will complete using the
/// existing shards, but may retry with the new shards if they fail.
///
/// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
/// properly spun down and dropped afterwards.
pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
let shards = Shards::new(
self.tenant_id,
self.timeline_id,
shard_spec,
self.auth_token.clone(),
)?;
self.shards.store(Arc::new(shards));
Ok(())
}
/// Returns whether a relation exists.
#[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
pub async fn check_rel_exists(
&self,
req: page_api::CheckRelExistsRequest,
) -> tonic::Result<page_api::CheckRelExistsResponse> {
self.retry
.with(async || {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.check_rel_exists(req).await
})
.await
}
/// Returns the total size of a database, as # of bytes.
#[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
pub async fn get_db_size(
&self,
req: page_api::GetDbSizeRequest,
) -> tonic::Result<page_api::GetDbSizeResponse> {
self.retry
.with(async || {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.get_db_size(req).await
})
.await
}
/// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
/// splits requests that straddle shard boundaries, and assembles the responses.
///
/// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
/// errors. All responses will have `GetPageStatusCode::Ok`.
#[instrument(skip_all, fields(
req_id = %req.request_id,
class = %req.request_class,
rel = %req.rel,
blkno = %req.block_numbers[0],
blks = %req.block_numbers.len(),
lsn = %req.read_lsn,
))]
pub async fn get_page(
&self,
req: page_api::GetPageRequest,
) -> tonic::Result<page_api::GetPageResponse> {
// Make sure we have at least one page.
if req.block_numbers.is_empty() {
return Err(tonic::Status::invalid_argument("no block number"));
}
// The shards may change while we're fetching pages. We execute the request using a stable
// view of the shards (especially important for requests that span shards), but retry the
// top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
// retries and re-splits in some cases where requests span shards, but these are expected to
// be rare.
//
// TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
// once we figure out how to handle these.
self.retry
.with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await)
.await
}
/// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
/// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
async fn get_page_with_shards(
req: page_api::GetPageRequest,
shards: &Shards,
) -> tonic::Result<page_api::GetPageResponse> {
// Fast path: request is for a single shard.
if let Some(shard_id) =
GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size)
{
return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
}
// Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
// reassemble the responses.
let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size);
let mut shard_requests = FuturesUnordered::new();
for (shard_id, shard_req) in splitter.drain_requests() {
let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
.map(move |result| result.map(|resp| (shard_id, resp)));
shard_requests.push(future);
}
while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
splitter.add_response(shard_id, shard_response);
}
splitter.assemble_response()
}
/// Fetches pages on the given shard. Does not retry internally.
async fn get_page_with_shard(
req: page_api::GetPageRequest,
shard: &Shard,
) -> tonic::Result<page_api::GetPageResponse> {
let expected = req.block_numbers.len();
let stream = shard.stream(req.request_class.is_bulk()).await;
let resp = stream.send(req).await?;
// Convert per-request errors into a tonic::Status.
if resp.status_code != page_api::GetPageStatusCode::Ok {
return Err(tonic::Status::new(
resp.status_code.into(),
resp.reason.unwrap_or_else(|| String::from("unknown error")),
));
}
// Check that we received the expected number of pages.
let actual = resp.page_images.len();
if expected != actual {
return Err(tonic::Status::data_loss(format!(
"expected {expected} pages, got {actual}",
)));
}
Ok(resp)
}
/// Returns the size of a relation, as # of blocks.
#[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
pub async fn get_rel_size(
&self,
req: page_api::GetRelSizeRequest,
) -> tonic::Result<page_api::GetRelSizeResponse> {
self.retry
.with(async || {
// Relation metadata is only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.get_rel_size(req).await
})
.await
}
/// Fetches an SLRU segment.
#[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))]
pub async fn get_slru_segment(
&self,
req: page_api::GetSlruSegmentRequest,
) -> tonic::Result<page_api::GetSlruSegmentResponse> {
self.retry
.with(async || {
// SLRU segments are only available on shard 0.
let mut client = self.shards.load_full().get_zero().client().await?;
client.get_slru_segment(req).await
})
.await
}
}
/// Shard specification for a PageserverClient.
pub struct ShardSpec {
/// Maps shard indices to gRPC URLs.
///
/// INVARIANT: every shard 0..count is present, and shard 0 is always present.
/// INVARIANT: every URL is valid and uses grpc:// scheme.
urls: HashMap<ShardIndex, String>,
/// The shard count.
///
/// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
count: ShardCount,
/// The stripe size for these shards.
stripe_size: ShardStripeSize,
}
impl ShardSpec {
/// Creates a new shard spec with the given URLs and stripe size. All shards must be given.
/// The stripe size may be omitted for unsharded tenants.
pub fn new(
urls: HashMap<ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<Self> {
// Compute the shard count.
let count = match urls.len() {
0 => return Err(anyhow!("no shards provided")),
1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
n => ShardCount::new(n as u8),
};
// Determine the stripe size. It doesn't matter for unsharded tenants.
if stripe_size.is_none() && !count.is_unsharded() {
return Err(anyhow!("stripe size must be given for sharded tenants"));
}
let stripe_size = stripe_size.unwrap_or_default();
// Validate the shard spec.
for (shard_id, url) in &urls {
// The shard index must match the computed shard count, even for unsharded tenants.
if shard_id.shard_count != count {
return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
}
// The shard index's number and count must be consistent.
if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 {
return Err(anyhow!("invalid shard index {shard_id}"));
}
// The above conditions guarantee that we have all shards 0..count: len() matches count,
// shard number < count, and numbers are unique (via hashmap).
// Validate the URL.
if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc {
return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
}
}
Ok(Self {
urls,
count,
stripe_size,
})
}
}
/// Tracks the tenant's shards.
struct Shards {
/// Shards by shard index.
///
/// INVARIANT: every shard 0..count is present.
/// INVARIANT: shard 0 is always present.
by_index: HashMap<ShardIndex, Shard>,
/// The shard count.
///
/// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
count: ShardCount,
/// The stripe size. Only used for sharded tenants.
stripe_size: ShardStripeSize,
}
impl Shards {
/// Creates a new set of shards based on a shard spec.
fn new(
tenant_id: TenantId,
timeline_id: TimelineId,
shard_spec: ShardSpec,
auth_token: Option<String>,
) -> anyhow::Result<Self> {
// NB: the shard spec has already been validated when constructed.
let mut shards = HashMap::with_capacity(shard_spec.urls.len());
for (shard_id, url) in shard_spec.urls {
shards.insert(
shard_id,
Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?,
);
}
Ok(Self {
by_index: shards,
count: shard_spec.count,
stripe_size: shard_spec.stripe_size,
})
}
/// Looks up the given shard.
#[allow(clippy::result_large_err)] // TODO: check perf impact
fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
self.by_index
.get(&shard_id)
.ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
}
/// Returns shard 0.
fn get_zero(&self) -> &Shard {
self.get(ShardIndex::new(ShardNumber(0), self.count))
.expect("always present")
}
}
/// A single shard. Uses dedicated resource pools with the following structure:
///
/// * Channel pool: unbounded.
/// * Unary client pool: MAX_UNARY_CLIENTS.
/// * Stream client pool: unbounded.
/// * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
/// * Bulk channel pool: unbounded.
/// * Bulk client pool: unbounded.
/// * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
struct Shard {
/// Unary gRPC client pool.
client_pool: Arc<ClientPool>,
/// GetPage stream pool.
stream_pool: Arc<StreamPool>,
/// GetPage stream pool for bulk requests, e.g. prefetches.
bulk_stream_pool: Arc<StreamPool>,
}
impl Shard {
/// Creates a new shard. It has its own dedicated resource pools.
fn new(
url: String,
tenant_id: TenantId,
timeline_id: TimelineId,
shard_id: ShardIndex,
auth_token: Option<String>,
) -> anyhow::Result<Self> {
// Common channel pool for unary and stream requests. Bounded by client/stream pools.
let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
// Client pool for unary requests.
let client_pool = ClientPool::new(
channel_pool.clone(),
tenant_id,
timeline_id,
shard_id,
auth_token.clone(),
Some(MAX_UNARY_CLIENTS),
);
// GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
// but shares a channel pool with it (as it's unbounded).
let stream_pool = StreamPool::new(
ClientPool::new(
channel_pool.clone(),
tenant_id,
timeline_id,
shard_id,
auth_token.clone(),
None, // unbounded, limited by stream pool
),
Some(MAX_STREAMS),
MAX_STREAM_QUEUE_DEPTH,
);
// Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
// to avoid head-of-line blocking of latency-sensitive requests.
let bulk_stream_pool = StreamPool::new(
ClientPool::new(
ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
tenant_id,
timeline_id,
shard_id,
auth_token,
None, // unbounded, limited by stream pool
),
Some(MAX_BULK_STREAMS),
MAX_BULK_STREAM_QUEUE_DEPTH,
);
Ok(Self {
client_pool,
stream_pool,
bulk_stream_pool,
})
}
/// Returns a pooled client for this shard.
async fn client(&self) -> tonic::Result<ClientGuard> {
self.client_pool
.get()
.await
.map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
}
/// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
/// pool (e.g. for prefetches).
async fn stream(&self, bulk: bool) -> StreamGuard {
match bulk {
false => self.stream_pool.get().await,
true => self.bulk_stream_pool.get().await,
}
}
}
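A minimal construction sketch for this client (illustrative only). It builds a single-shard
`ShardSpec` and issues one GetPage request; the IDs, the gRPC URL, and the prebuilt
`GetPageRequest` are assumed to be supplied by the caller.

async fn get_page_sketch(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    url: String, // assumed to be a grpc:// connection string
    req: page_api::GetPageRequest,
) -> anyhow::Result<page_api::GetPageResponse> {
    // Unsharded tenant: a single shard under ShardIndex::unsharded(), no stripe size needed.
    let mut urls = HashMap::new();
    urls.insert(ShardIndex::unsharded(), url);
    let spec = ShardSpec::new(urls, None)?;

    // No auth token in this sketch.
    let client = PageserverClient::new(tenant_id, timeline_id, spec, None)?;

    // Requests that straddle shard boundaries are split, dispatched, and retried internally.
    Ok(client.get_page(req).await?)
}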

@@ -0,0 +1,6 @@
mod client;
mod pool;
mod retry;
mod split;
pub use client::{PageserverClient, ShardSpec};

@@ -0,0 +1,755 @@
//! This module provides various Pageserver gRPC client resource pools.
//!
//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency
//! of creating dedicated TCP connections and server tasks for every Postgres backend.
//!
//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
//! resource -- they are different enough that a generic pool isn't suitable.
//!
//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
//! can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
//! per-channel client limit. Channels may be closed when they are no longer used by any clients.
//!
//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
//! channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
//! single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
//! from the pool after some time, to free up the channel.
//!
//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
//! ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
//! returns a guard that can be used to send a single request, to properly enforce queue depth and
//! route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
//! possibly pipelining multiple requests from multiple callers on the same stream (up to some
//! queue depth). Idle streams may be removed from the pool after a while to free up the client.
//!
//! Each channel corresponds to one TCP connection. Each client unary request and each stream
//! corresponds to one HTTP/2 stream and server task.
//!
//! TODO: error handling (including custom error types).
//! TODO: observability.
use std::collections::{BTreeMap, HashMap};
use std::num::NonZero;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex, Weak};
use std::time::{Duration, Instant};
use futures::StreamExt as _;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
use tokio_util::sync::CancellationToken;
use tonic::transport::{Channel, Endpoint};
use tracing::{error, warn};
use pageserver_page_api as page_api;
use utils::id::{TenantId, TimelineId};
use utils::shard::ShardIndex;
/// Reap channels/clients/streams that have been idle for this long.
///
/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
/// channels, and/or stream pool clients.
const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
false => Duration::from_secs(180),
true => Duration::from_secs(1), // exercise reaping in tests
};
/// Reap idle resources with this interval.
const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
false => Duration::from_secs(10),
true => Duration::from_secs(1), // exercise reaping in tests
};
/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
/// The pool does not limit the number of channels, and instead relies on `ClientPool` or
/// `StreamPool` to limit the number of concurrent clients.
///
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
///
/// TODO: consider prewarming a set of channels, to avoid initial connection latency.
/// TODO: consider adding a circuit breaker for errors and fail fast.
pub struct ChannelPool {
/// Pageserver endpoint to connect to.
endpoint: Endpoint,
/// Max number of clients per channel. Beyond this, a new channel will be created.
max_clients_per_channel: NonZero<usize>,
/// Open channels.
channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
/// Reaps idle channels.
idle_reaper: Reaper,
/// Channel ID generator.
next_channel_id: AtomicUsize,
}
type ChannelID = usize;
struct ChannelEntry {
/// The gRPC channel (i.e. TCP connection). Shared by multiple clients.
channel: Channel,
/// Number of clients using this channel.
clients: usize,
/// The channel has been idle (no clients) since this time. None if channel is in use.
/// INVARIANT: Some if clients == 0, otherwise None.
idle_since: Option<Instant>,
}
impl ChannelPool {
/// Creates a new channel pool for the given Pageserver endpoint.
pub fn new<E>(endpoint: E, max_clients_per_channel: NonZero<usize>) -> anyhow::Result<Arc<Self>>
where
E: TryInto<Endpoint> + Send + Sync + 'static,
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
{
let pool = Arc::new(Self {
endpoint: endpoint.try_into()?,
max_clients_per_channel,
channels: Mutex::default(),
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
next_channel_id: AtomicUsize::default(),
});
pool.idle_reaper.spawn(&pool);
Ok(pool)
}
/// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
///
/// This never blocks (except for mutex acquisition). The channel is connected lazily on first
/// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established
/// automatically on failure (TODO: verify).
///
/// Callers should not clone the returned channel, and must hold onto the returned guard as long
/// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf
/// client requires an owned `Channel` and we don't have access to the channel's internal
/// refcount.
///
/// This is not performance-sensitive. It is only called when creating a new client, and clients
/// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n)
/// performance is therefore okay.
pub fn get(self: &Arc<Self>) -> ChannelGuard {
let mut channels = self.channels.lock().unwrap();
// Try to find an existing channel with available capacity. We check entries in BTreeMap
// order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
// with lower-ordered channel IDs first. This will cluster clients in lower-ordered
// channels, and free up higher-ordered channels such that they can be reaped.
for (&id, entry) in channels.iter_mut() {
assert!(
entry.clients <= self.max_clients_per_channel.get(),
"channel overflow"
);
assert_eq!(
entry.idle_since.is_some(),
entry.clients == 0,
"incorrect channel idle state"
);
if entry.clients < self.max_clients_per_channel.get() {
entry.clients += 1;
entry.idle_since = None;
return ChannelGuard {
pool: Arc::downgrade(self),
id,
channel: Some(entry.channel.clone()),
};
}
}
// Create a new channel. We connect lazily on first use, such that we don't block here and
// other clients can join onto the same channel while it's connecting.
let channel = self.endpoint.connect_lazy();
let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed);
let entry = ChannelEntry {
channel: channel.clone(),
clients: 1, // account for the guard below
idle_since: None,
};
channels.insert(id, entry);
ChannelGuard {
pool: Arc::downgrade(self),
id,
channel: Some(channel),
}
}
}
impl Reapable for ChannelPool {
/// Reaps channels that have been idle since before the cutoff.
fn reap_idle(&self, cutoff: Instant) {
self.channels.lock().unwrap().retain(|_, entry| {
let Some(idle_since) = entry.idle_since else {
assert_ne!(entry.clients, 0, "empty channel not marked idle");
return true;
};
assert_eq!(entry.clients, 0, "idle channel has clients");
idle_since >= cutoff
})
}
}
/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
/// since the gRPC client requires an owned `Channel`.
pub struct ChannelGuard {
pool: Weak<ChannelPool>,
id: ChannelID,
channel: Option<Channel>,
}
impl ChannelGuard {
/// Returns the inner owned channel. Panics if called more than once. The caller must hold onto
/// the guard as long as the channel is in use, and should not clone it.
pub fn take(&mut self) -> Channel {
self.channel.take().expect("channel already taken")
}
}
/// Returns the channel to the pool.
impl Drop for ChannelGuard {
fn drop(&mut self) {
let Some(pool) = self.pool.upgrade() else {
return; // pool was dropped
};
let mut channels = pool.channels.lock().unwrap();
let entry = channels.get_mut(&self.id).expect("unknown channel");
assert!(entry.idle_since.is_none(), "active channel marked idle");
assert!(entry.clients > 0, "channel underflow");
entry.clients -= 1;
if entry.clients == 0 {
entry.idle_since = Some(Instant::now()); // mark channel as idle
}
}
}
/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
/// `ChannelPool`. A client is only given out to a single caller at a time. The pool limits the total
/// number of concurrent clients to `max_clients` via semaphore.
///
/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
pub struct ClientPool {
/// Tenant ID.
tenant_id: TenantId,
/// Timeline ID.
timeline_id: TimelineId,
/// Shard ID.
shard_id: ShardIndex,
/// Authentication token, if any.
auth_token: Option<String>,
/// Channel pool to acquire channels from.
channel_pool: Arc<ChannelPool>,
/// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
limiter: Option<Arc<Semaphore>>,
/// Idle pooled clients. Acquired clients are removed from here and returned on drop.
///
/// The first client in the map will be acquired next. The map is sorted by client ID, which in
/// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
/// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
/// clients are reaped.
idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
/// Reaps idle clients.
idle_reaper: Reaper,
/// Unique client ID generator.
next_client_id: AtomicUsize,
}
type ClientID = (ChannelID, usize);
struct ClientEntry {
/// The pooled gRPC client.
client: page_api::Client,
/// The channel guard for the channel used by the client.
channel_guard: ChannelGuard,
/// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
/// definition, so this is the time when it was added back to the pool.
idle_since: Instant,
}
impl ClientPool {
/// Creates a new client pool for the given tenant shard. Channels are acquired from the given
/// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to
/// `max_clients` concurrent clients, or unbounded if None.
pub fn new(
channel_pool: Arc<ChannelPool>,
tenant_id: TenantId,
timeline_id: TimelineId,
shard_id: ShardIndex,
auth_token: Option<String>,
max_clients: Option<NonZero<usize>>,
) -> Arc<Self> {
let pool = Arc::new(Self {
tenant_id,
timeline_id,
shard_id,
auth_token,
channel_pool,
idle: Mutex::default(),
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
next_client_id: AtomicUsize::default(),
});
pool.idle_reaper.spawn(&pool);
pool
}
/// Gets a client from the pool, or creates a new one if necessary. Connections are established
/// lazily and do not block, but this call can block if the pool is at `max_clients`. The client
/// is returned to the pool when the guard is dropped.
///
/// This is moderately performance-sensitive. It is called for every unary request, but these
/// establish a new gRPC stream per request so they're already expensive. GetPage requests use
/// the `StreamPool` instead.
pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
// Acquire a permit if the pool is bounded.
let mut permit = None;
if let Some(limiter) = self.limiter.clone() {
permit = Some(limiter.acquire_owned().await.expect("never closed"));
}
// Fast path: acquire an idle client from the pool.
if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
return Ok(ClientGuard {
pool: Arc::downgrade(self),
id,
client: Some(entry.client),
channel_guard: Some(entry.channel_guard),
permit,
});
}
// Slow path: construct a new client.
let mut channel_guard = self.channel_pool.get();
let client = page_api::Client::new(
channel_guard.take(),
self.tenant_id,
self.timeline_id,
self.shard_id,
self.auth_token.clone(),
None,
)?;
Ok(ClientGuard {
pool: Arc::downgrade(self),
id: (
channel_guard.id,
self.next_client_id.fetch_add(1, Ordering::Relaxed),
),
client: Some(client),
channel_guard: Some(channel_guard),
permit,
})
}
}
impl Reapable for ClientPool {
/// Reaps clients that have been idle since before the cutoff.
fn reap_idle(&self, cutoff: Instant) {
self.idle
.lock()
.unwrap()
.retain(|_, entry| entry.idle_since >= cutoff)
}
}
/// A client acquired from the pool. The inner client can be accessed via Deref. The client is
/// returned to the pool when dropped.
pub struct ClientGuard {
pool: Weak<ClientPool>,
id: ClientID,
client: Option<page_api::Client>, // Some until dropped
channel_guard: Option<ChannelGuard>, // Some until dropped
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
}
impl Deref for ClientGuard {
type Target = page_api::Client;
fn deref(&self) -> &Self::Target {
self.client.as_ref().expect("not dropped")
}
}
impl DerefMut for ClientGuard {
fn deref_mut(&mut self) -> &mut Self::Target {
self.client.as_mut().expect("not dropped")
}
}
/// Returns the client to the pool.
impl Drop for ClientGuard {
fn drop(&mut self) {
let Some(pool) = self.pool.upgrade() else {
return; // pool was dropped
};
let entry = ClientEntry {
client: self.client.take().expect("dropped once"),
channel_guard: self.channel_guard.take().expect("dropped once"),
idle_since: Instant::now(),
};
pool.idle.lock().unwrap().insert(self.id, entry);
_ = self.permit; // returned on drop, referenced for visibility
}
}
/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
/// acquires a client from the inner `ClientPool` for the stream's lifetime.
///
/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
/// a single request and await the response. Internally, requests are multiplexed across streams and
/// channels. This allows proper queue depth enforcement and response routing.
///
/// TODO: consider making this generic over request and response types; not currently needed.
pub struct StreamPool {
/// The client pool to acquire clients from. Must be unbounded.
client_pool: Arc<ClientPool>,
/// All pooled streams.
///
/// Incoming requests will be sent over an existing stream with available capacity. If all
/// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
/// stream has an associated Tokio task that processes requests and responses.
streams: Mutex<HashMap<StreamID, StreamEntry>>,
/// The max number of concurrent streams, or None if unbounded.
max_streams: Option<NonZero<usize>>,
/// The max number of concurrent requests per stream.
max_queue_depth: NonZero<usize>,
/// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
/// None if the pool is unbounded.
limiter: Option<Arc<Semaphore>>,
/// Reaps idle streams.
idle_reaper: Reaper,
/// Stream ID generator.
next_stream_id: AtomicUsize,
}
type StreamID = usize;
type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
struct StreamEntry {
/// Sends caller requests to the stream task. The stream task exits when this is dropped.
sender: RequestSender,
/// Number of in-flight requests on this stream.
queue_depth: usize,
/// The time when this stream went idle (queue_depth == 0).
/// INVARIANT: Some if queue_depth == 0, otherwise None.
idle_since: Option<Instant>,
}
impl StreamPool {
/// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
/// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
///
/// The client pool must be unbounded. The stream pool will enforce its own limits, and because
/// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
/// The stream pool should generally have its own dedicated client pool (but it can share a
/// channel pool with others since these are always unbounded).
pub fn new(
client_pool: Arc<ClientPool>,
max_streams: Option<NonZero<usize>>,
max_queue_depth: NonZero<usize>,
) -> Arc<Self> {
assert!(client_pool.limiter.is_none(), "bounded client pool");
let pool = Arc::new(Self {
client_pool,
streams: Mutex::default(),
limiter: max_streams.map(|max_streams| {
Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
}),
max_streams,
max_queue_depth,
idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
next_stream_id: AtomicUsize::default(),
});
pool.idle_reaper.spawn(&pool);
pool
}
/// Acquires an available stream from the pool, or spins up a new stream async if all streams
/// are full. Returns a guard that can be used to send a single request on the stream and await
/// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
/// (i.e. `max_streams * max_queue_depth` requests in flight).
///
/// This is very performance-sensitive, as it is on the GetPage hot path.
///
/// TODO: this must do something more sophisticated for performance. We want:
///
/// * Cheap, concurrent access in the common case where we can use a pooled stream.
/// * Quick acquisition of pooled streams with available capacity.
/// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
/// * Prefer filling up existing streams' queue depth before spinning up new streams.
/// * Don't hold a lock while spinning up new streams.
/// * Allow concurrent clients to join onto streams while they're spun up.
/// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
///
/// For now, we just do something simple but inefficient (linear scan under mutex).
pub async fn get(self: &Arc<Self>) -> StreamGuard {
// Acquire a permit if the pool is bounded.
let mut permit = None;
if let Some(limiter) = self.limiter.clone() {
permit = Some(limiter.acquire_owned().await.expect("never closed"));
}
let mut streams = self.streams.lock().unwrap();
// Look for a pooled stream with available capacity.
for (&id, entry) in streams.iter_mut() {
assert!(
entry.queue_depth <= self.max_queue_depth.get(),
"stream queue overflow"
);
assert_eq!(
entry.idle_since.is_some(),
entry.queue_depth == 0,
"incorrect stream idle state"
);
if entry.queue_depth < self.max_queue_depth.get() {
entry.queue_depth += 1;
entry.idle_since = None;
return StreamGuard {
pool: Arc::downgrade(self),
id,
sender: entry.sender.clone(),
permit,
};
}
}
// No available stream, spin up a new one. We install the stream entry in the pool first and
// return the guard, while spinning up the stream task async. This allows other callers to
// join onto this stream and also create additional streams concurrently if this fills up.
let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
let entry = StreamEntry {
sender: req_tx.clone(),
queue_depth: 1, // reserve quota for this caller
idle_since: None,
};
streams.insert(id, entry);
if let Some(max_streams) = self.max_streams {
assert!(streams.len() <= max_streams.get(), "stream overflow");
};
let client_pool = self.client_pool.clone();
let pool = Arc::downgrade(self);
tokio::spawn(async move {
if let Err(err) = Self::run_stream(client_pool, req_rx).await {
error!("stream failed: {err}");
}
// Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
if let Some(pool) = pool.upgrade() {
let entry = pool.streams.lock().unwrap().remove(&id);
assert!(entry.is_some(), "unknown stream ID: {id}");
}
});
StreamGuard {
pool: Arc::downgrade(self),
id,
sender: req_tx,
permit,
}
}
/// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
/// bidirectional GetPage stream, then forwards requests and responses between callers and the
/// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
/// atomic with pool stream acquisition.
///
/// The task exits when the request channel is closed, or on a stream error. The caller is
/// responsible for removing the stream from the pool on exit.
async fn run_stream(
client_pool: Arc<ClientPool>,
mut caller_rx: RequestReceiver,
) -> anyhow::Result<()> {
// Acquire a client from the pool and create a stream.
let mut client = client_pool.get().await?;
let (req_tx, req_rx) = mpsc::channel(1);
let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
let mut resp_stream = client.get_pages(req_stream).await?;
// Track caller response channels by request ID. If the task returns early, these response
// channels will be dropped and the waiting callers will receive an error.
let mut callers = HashMap::new();
// Process requests and responses.
loop {
tokio::select! {
// Receive requests from callers and send them to the stream.
req = caller_rx.recv() => {
// Shut down if request channel is closed.
let Some((req, resp_tx)) = req else {
return Ok(());
};
// Store the response channel by request ID.
if callers.contains_key(&req.request_id) {
// Error on request ID duplicates. Ignore callers that went away.
_ = resp_tx.send(Err(tonic::Status::invalid_argument(
format!("duplicate request ID: {}", req.request_id),
)));
continue;
}
callers.insert(req.request_id, resp_tx);
// Send the request on the stream. Bail out if the send fails.
req_tx.send(req).await.map_err(|_| {
tonic::Status::unavailable("stream closed")
})?;
}
// Receive responses from the stream and send them to callers.
resp = resp_stream.next() => {
// Shut down if the stream is closed, and bail out on stream errors.
let Some(resp) = resp.transpose()? else {
return Ok(())
};
// Send the response to the caller. Ignore errors if the caller went away.
let Some(resp_tx) = callers.remove(&resp.request_id) else {
warn!("received response for unknown request ID: {}", resp.request_id);
continue;
};
_ = resp_tx.send(Ok(resp));
}
}
}
}
}
impl Reapable for StreamPool {
/// Reaps streams that have been idle since before the cutoff.
fn reap_idle(&self, cutoff: Instant) {
self.streams.lock().unwrap().retain(|_, entry| {
let Some(idle_since) = entry.idle_since else {
assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
return true;
};
assert_eq!(entry.queue_depth, 0, "idle stream has requests");
idle_since >= cutoff
});
}
}
/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
/// depth. Queue depth is already reserved and will be returned on drop.
pub struct StreamGuard {
pool: Weak<StreamPool>,
id: StreamID,
sender: RequestSender,
permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
}
impl StreamGuard {
/// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
/// valid for a single request (to enforce queue depth). This also drops the guard on return and
/// returns the queue depth quota to the pool.
///
/// The `GetPageRequest::request_id` must be unique across in-flight requests.
///
/// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
/// to avoid tearing down the stream for per-request errors. Callers must check this.
pub async fn send(
self,
req: page_api::GetPageRequest,
) -> tonic::Result<page_api::GetPageResponse> {
let (resp_tx, resp_rx) = oneshot::channel();
self.sender
.send((req, resp_tx))
.await
.map_err(|_| tonic::Status::unavailable("stream closed"))?;
resp_rx
.await
.map_err(|_| tonic::Status::unavailable("stream closed"))?
}
}
impl Drop for StreamGuard {
fn drop(&mut self) {
let Some(pool) = self.pool.upgrade() else {
return; // pool was dropped
};
// Release the queue depth reservation on drop. This can prematurely decrement it if dropped
// before the response is received, but that's okay.
let mut streams = pool.streams.lock().unwrap();
let entry = streams.get_mut(&self.id).expect("unknown stream");
assert!(entry.idle_since.is_none(), "active stream marked idle");
assert!(entry.queue_depth > 0, "stream queue underflow");
entry.queue_depth -= 1;
if entry.queue_depth == 0 {
entry.idle_since = Some(Instant::now()); // mark stream as idle
}
_ = self.permit; // returned on drop, referenced for visibility
}
}
/// Periodically reaps idle resources from a pool.
struct Reaper {
/// The task check interval.
interval: Duration,
/// The threshold for reaping idle resources.
threshold: Duration,
/// Cancels the reaper task. Cancelled when the reaper is dropped.
cancel: CancellationToken,
}
impl Reaper {
/// Creates a new reaper.
pub fn new(threshold: Duration, interval: Duration) -> Self {
Self {
cancel: CancellationToken::new(),
threshold,
interval,
}
}
/// Spawns a task to periodically reap idle resources from the given task pool. The task is
/// cancelled when the reaper is dropped.
pub fn spawn(&self, pool: &Arc<impl Reapable>) {
// NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
let pool = Arc::downgrade(pool);
let cancel = self.cancel.clone();
let (interval, threshold) = (self.interval, self.threshold);
tokio::spawn(async move {
loop {
tokio::select! {
_ = tokio::time::sleep(interval) => {
let Some(pool) = pool.upgrade() else {
return; // pool was dropped
};
pool.reap_idle(Instant::now() - threshold);
}
_ = cancel.cancelled() => return,
}
}
});
}
}
impl Drop for Reaper {
fn drop(&mut self) {
self.cancel.cancel(); // cancel reaper task
}
}
/// A reapable resource pool.
trait Reapable: Send + Sync + 'static {
/// Reaps resources that have been idle since before the given cutoff.
fn reap_idle(&self, cutoff: Instant);
}

View File

@@ -0,0 +1,152 @@
use std::time::Duration;
use tokio::time::Instant;
use tracing::{error, info, warn};
use utils::backoff::exponential_backoff_duration;
/// A retry handler for Pageserver gRPC requests.
///
/// This is used instead of backoff::retry for better control and observability.
#[derive(Clone, Copy)]
pub struct Retry;
impl Retry {
/// The per-request timeout.
// TODO: tune these, and/or make them configurable. Should we retry forever?
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
/// The total timeout across all attempts.
const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
/// The initial backoff duration.
const BASE_BACKOFF: Duration = Duration::from_millis(10);
/// The maximum backoff duration.
const MAX_BACKOFF: Duration = Duration::from_secs(10);
/// If true, log successful requests. For debugging.
const LOG_SUCCESS: bool = false;
/// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
/// using the current tracing span for context.
///
/// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
/// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
where
F: FnMut() -> O,
O: Future<Output = tonic::Result<T>>,
{
let started = Instant::now();
let deadline = started + Self::TOTAL_TIMEOUT;
let mut last_error = None;
let mut retries = 0;
loop {
// Set up a future to wait for the backoff (if any) and run the request with a timeout.
let backoff_and_try = async {
// NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
// https://github.com/tokio-rs/tokio/issues/6866
if let Some(backoff) = Self::backoff_duration(retries) {
tokio::time::sleep(backoff).await;
}
let request_started = Instant::now();
tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
.await
.map_err(|_| {
tonic::Status::deadline_exceeded(format!(
"request timed out after {:.3}s",
request_started.elapsed().as_secs_f64()
))
})?
};
// Wait for the backoff and request, or bail out if the total timeout is exceeded.
let result = tokio::select! {
result = backoff_and_try => result,
_ = tokio::time::sleep_until(deadline) => {
let last_error = last_error.unwrap_or_else(|| {
tonic::Status::deadline_exceeded(format!(
"request timed out after {:.3}s",
started.elapsed().as_secs_f64()
))
});
error!(
"giving up after {:.3}s and {retries} retries, last error {:?}: {}",
started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
);
return Err(last_error);
}
};
match result {
// Success, return the result.
Ok(result) => {
if retries > 0 || Self::LOG_SUCCESS {
info!(
"request succeeded after {retries} retries in {:.3}s",
started.elapsed().as_secs_f64(),
);
}
return Ok(result);
}
// Error, retry or bail out.
Err(status) => {
let (code, message) = (status.code(), status.message());
let attempt = retries + 1;
if !Self::should_retry(code) {
// NB: include the attempt here too. This isn't necessarily the first
// attempt, because the error may change between attempts.
error!(
"request failed with {code:?}: {message}, not retrying (attempt {attempt})"
);
return Err(status);
}
warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
retries += 1;
last_error = Some(status);
}
}
}
}
/// Returns the backoff duration for the given retry attempt, or None for no backoff.
fn backoff_duration(retry: usize) -> Option<Duration> {
let backoff = exponential_backoff_duration(
retry as u32,
Self::BASE_BACKOFF.as_secs_f64(),
Self::MAX_BACKOFF.as_secs_f64(),
);
(!backoff.is_zero()).then_some(backoff)
}
/// Returns true if the given status code should be retried.
fn should_retry(code: tonic::Code) -> bool {
match code {
tonic::Code::Ok => panic!("unexpected Ok status code"),
// These codes are transient, so retry them.
tonic::Code::Aborted => true,
tonic::Code::Cancelled => true,
tonic::Code::DeadlineExceeded => true, // maybe transient slowness
tonic::Code::Internal => true, // maybe transient failure?
tonic::Code::ResourceExhausted => true,
tonic::Code::Unavailable => true,
// The following codes will likely continue to fail, so don't retry.
tonic::Code::AlreadyExists => false,
tonic::Code::DataLoss => false,
tonic::Code::FailedPrecondition => false,
tonic::Code::InvalidArgument => false,
tonic::Code::NotFound => false,
tonic::Code::OutOfRange => false,
tonic::Code::PermissionDenied => false,
tonic::Code::Unauthenticated => false,
tonic::Code::Unimplemented => false,
tonic::Code::Unknown => false,
}
}
}
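// Illustrative sketch: the closure passed to `Retry::with` is re-invoked on every attempt,
// so it must produce a fresh future each time. `fetch_one` is a hypothetical stand-in for a
// real gRPC call returning tonic::Result.
#[allow(dead_code)]
async fn fetch_one(input: u64) -> tonic::Result<u64> {
    Ok(input)
}
#[allow(dead_code)]
async fn fetch_with_retries() -> tonic::Result<u64> {
    Retry.with(|| fetch_one(42)).await
}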

View File

@@ -0,0 +1,168 @@
use std::collections::HashMap;
use bytes::Bytes;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
use pageserver_page_api as page_api;
use utils::shard::{ShardCount, ShardIndex};
/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
/// TODO: add tests for this.
pub struct GetPageSplitter {
/// The original request ID. Used for all shard requests.
request_id: page_api::RequestID,
/// Split requests by shard index.
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
/// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
/// the response pages in the same order as the original request.
block_shards: Vec<ShardIndex>,
/// Page responses by shard index. Will be assembled into a single response.
responses: HashMap<ShardIndex, Vec<Bytes>>,
}
impl GetPageSplitter {
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
/// The caller must ensure that the request has at least one block number, or this will panic.
pub fn is_single_shard(
req: &page_api::GetPageRequest,
count: ShardCount,
stripe_size: ShardStripeSize,
) -> Option<ShardIndex> {
// Fast path: unsharded tenant.
if count.is_unsharded() {
return Some(ShardIndex::unsharded());
}
// Find the base shard index for the first page, and compare with the rest.
let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
let shard_number = key_to_shard_number(count, stripe_size, &key);
req.block_numbers
.iter()
.skip(1) // computed above
.all(|&blkno| {
let key = rel_block_to_key(req.rel, blkno);
key_to_shard_number(count, stripe_size, &key) == shard_number
})
.then_some(ShardIndex::new(shard_number, count))
}
/// Splits the given request.
pub fn split(
req: page_api::GetPageRequest,
count: ShardCount,
stripe_size: ShardStripeSize,
) -> Self {
// The caller should make sure we don't split requests unnecessarily.
debug_assert!(
Self::is_single_shard(&req, count, stripe_size).is_none(),
"unnecessary request split"
);
// Split the requests by shard index.
let mut requests = HashMap::with_capacity(2); // common case
let mut block_shards = Vec::with_capacity(req.block_numbers.len());
for blkno in req.block_numbers {
let key = rel_block_to_key(req.rel, blkno);
let shard_number = key_to_shard_number(count, stripe_size, &key);
let shard_id = ShardIndex::new(shard_number, count);
let shard_req = requests
.entry(shard_id)
.or_insert_with(|| page_api::GetPageRequest {
request_id: req.request_id,
request_class: req.request_class,
rel: req.rel,
read_lsn: req.read_lsn,
block_numbers: Vec::new(),
});
shard_req.block_numbers.push(blkno);
block_shards.push(shard_id);
}
Self {
request_id: req.request_id,
responses: HashMap::with_capacity(requests.len()),
requests,
block_shards,
}
}
/// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
pub fn drain_requests(
&mut self,
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
self.requests.drain()
}
/// Adds a response from the given shard. The response must match the request ID and have an OK
/// status code. A response must not already exist for the given shard ID.
pub fn add_response(&mut self, shard_id: ShardIndex, response: page_api::GetPageResponse) {
// NB: this is called below a `Retry::with()`, so unrecoverable errors should not use a
// retryable status code (e.g. `Internal`).
// The caller should already have converted status codes into tonic::Status.
assert_eq!(
response.status_code,
page_api::GetPageStatusCode::Ok,
"non-OK response"
);
// The stream pool ensures the response matches the request ID.
assert_eq!(response.request_id, self.request_id, "response ID mismatch");
// Add the response data to the map.
let old = self.responses.insert(shard_id, response.page_images);
// We only dispatch one request per shard.
assert!(old.is_none(), "duplicate response for shard {shard_id}");
}
/// Assembles the shard responses into a single response. Responses must be present for all
/// relevant shards, and the total number of pages must match the original request.
#[allow(clippy::result_large_err)]
pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
// NB: this is called below a `Retry::with()`, so unrecoverable errors should not use a
// retryable status code (e.g. `Internal`).
let mut response = page_api::GetPageResponse {
request_id: self.request_id,
status_code: page_api::GetPageStatusCode::Ok,
reason: None,
page_images: Vec::with_capacity(self.block_shards.len()),
};
// Set up per-shard page iterators we can pull from.
let mut shard_responses = HashMap::with_capacity(self.responses.len());
for (shard_id, responses) in self.responses {
shard_responses.insert(shard_id, responses.into_iter());
}
// Reassemble the responses in the same order as the original request.
for shard_id in &self.block_shards {
let page = shard_responses
.get_mut(shard_id)
.ok_or_else(|| {
tonic::Status::data_loss(format!("missing response for shard {shard_id}"))
})?
.next()
.ok_or_else(|| {
tonic::Status::data_loss(format!("missing page from shard {shard_id}"))
})?;
response.page_images.push(page);
}
// Make sure there are no additional pages.
for (shard_id, mut pages) in shard_responses {
if pages.next().is_some() {
return Err(tonic::Status::out_of_range(format!(
"extra pages returned from shard {shard_id}"
)));
}
}
Ok(response)
}
}
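// Illustrative sketch of the intended call pattern (`send_to_shard` is a placeholder for
// the real per-shard dispatch through the client's stream pool): try the single-shard fast
// path first, and only split when the request straddles shards.
#[allow(dead_code)]
async fn send_to_shard(
    _shard_id: ShardIndex,
    _req: page_api::GetPageRequest,
) -> tonic::Result<page_api::GetPageResponse> {
    unimplemented!("placeholder: dispatch the request to the given shard")
}
#[allow(dead_code)]
async fn get_page_sharded(
    req: page_api::GetPageRequest,
    count: ShardCount,
    stripe_size: ShardStripeSize,
) -> tonic::Result<page_api::GetPageResponse> {
    if let Some(shard_id) = GetPageSplitter::is_single_shard(&req, count, stripe_size) {
        return send_to_shard(shard_id, req).await; // common case: no splitting needed
    }
    let mut splitter = GetPageSplitter::split(req, count, stripe_size);
    // Collect first, since `drain_requests` holds a mutable borrow of the splitter.
    let requests: Vec<_> = splitter.drain_requests().collect();
    for (shard_id, shard_req) in requests {
        // The real client maps non-OK status codes to tonic::Status before this point.
        let resp = send_to_shard(shard_id, shard_req).await?;
        splitter.add_response(shard_id, resp);
    }
    splitter.assemble_response()
}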

View File

@@ -1,23 +1,153 @@
use anyhow::Result;
use anyhow::anyhow;
use futures::{Stream, StreamExt as _, TryStreamExt as _};
use tokio::io::AsyncRead;
use tokio_util::io::StreamReader;
use tonic::codec::CompressionEncoding;
use tonic::metadata::AsciiMetadataValue;
use tonic::metadata::errors::InvalidMetadataValue;
use tonic::transport::Channel;
use tonic::{Request, Streaming};
use tonic::service::Interceptor;
use tonic::service::interceptor::InterceptedService;
use tonic::transport::{Channel, Endpoint};
use utils::id::TenantId;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use utils::shard::ShardIndex;
use crate::model;
use crate::model::*;
use crate::proto;
///
/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
/// headers are required at the pageserver.
///
/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain
/// types from `model` rather than generated Protobuf types.
pub struct Client {
inner: proto::PageServiceClient<InterceptedService<Channel, AuthInterceptor>>,
}
impl Client {
/// Connects to the given gRPC endpoint.
pub async fn connect<E>(
endpoint: E,
tenant_id: TenantId,
timeline_id: TimelineId,
shard_id: ShardIndex,
auth_token: Option<String>,
compression: Option<CompressionEncoding>,
) -> anyhow::Result<Self>
where
E: TryInto<Endpoint> + Send + Sync + 'static,
<E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
{
let endpoint: Endpoint = endpoint
.try_into()
.map_err(|err| anyhow!("invalid endpoint: {err}"))?;
let channel = endpoint.connect().await?;
Self::new(
channel,
tenant_id,
timeline_id,
shard_id,
auth_token,
compression,
)
}
/// Creates a new client using the given gRPC channel.
pub fn new(
channel: Channel,
tenant_id: TenantId,
timeline_id: TimelineId,
shard_id: ShardIndex,
auth_token: Option<String>,
compression: Option<CompressionEncoding>,
) -> anyhow::Result<Self> {
let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?;
let mut inner = proto::PageServiceClient::with_interceptor(channel, auth);
if let Some(compression) = compression {
// TODO: benchmark this (including network latency).
inner = inner
.accept_compressed(compression)
.send_compressed(compression);
}
Ok(Self { inner })
}
/// Returns whether a relation exists.
pub async fn check_rel_exists(
&mut self,
req: CheckRelExistsRequest,
) -> tonic::Result<CheckRelExistsResponse> {
let req = proto::CheckRelExistsRequest::from(req);
let resp = self.inner.check_rel_exists(req).await?.into_inner();
Ok(resp.into())
}
/// Fetches a base backup.
pub async fn get_base_backup(
&mut self,
req: GetBaseBackupRequest,
) -> tonic::Result<impl AsyncRead + use<>> {
let req = proto::GetBaseBackupRequest::from(req);
let chunks = self.inner.get_base_backup(req).await?.into_inner();
Ok(StreamReader::new(
chunks
.map_ok(|resp| resp.chunk)
.map_err(std::io::Error::other),
))
}
/// Returns the total size of a database, as # of bytes.
pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result<GetDbSizeResponse> {
let req = proto::GetDbSizeRequest::from(req);
let resp = self.inner.get_db_size(req).await?.into_inner();
Ok(resp.into())
}
/// Fetches pages.
///
/// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are
/// typically returned in the response's status_code instead of as tonic::Status errors, to avoid
/// tearing down the entire stream.
pub async fn get_pages(
&mut self,
reqs: impl Stream<Item = GetPageRequest> + Send + 'static,
) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
let reqs = reqs.map(proto::GetPageRequest::from);
let resps = self.inner.get_pages(reqs).await?.into_inner();
Ok(resps.map_ok(GetPageResponse::from))
}
/// Returns the size of a relation, as # of blocks.
pub async fn get_rel_size(
&mut self,
req: GetRelSizeRequest,
) -> tonic::Result<GetRelSizeResponse> {
let req = proto::GetRelSizeRequest::from(req);
let resp = self.inner.get_rel_size(req).await?.into_inner();
Ok(resp.into())
}
/// Fetches an SLRU segment.
pub async fn get_slru_segment(
&mut self,
req: GetSlruSegmentRequest,
) -> tonic::Result<GetSlruSegmentResponse> {
let req = proto::GetSlruSegmentRequest::from(req);
let resp = self.inner.get_slru_segment(req).await?.into_inner();
Ok(resp.try_into()?)
}
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
///
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
/// acquired because the LSN has already been garbage collected.
pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result<LeaseLsnResponse> {
let req = proto::LeaseLsnRequest::from(req);
let resp = self.inner.lease_lsn(req).await?.into_inner();
Ok(resp.try_into()?)
}
}
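// Illustrative sketch: connect to a single tenant shard and issue a unary request. The
// endpoint, IDs, and request are taken as parameters since constructing them is out of
// scope here; errors are mapped into anyhow for brevity.
#[allow(dead_code)]
async fn example_check_rel_exists(
    endpoint: String,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    shard_id: ShardIndex,
    req: CheckRelExistsRequest,
) -> anyhow::Result<CheckRelExistsResponse> {
    let mut client =
        Client::connect(endpoint, tenant_id, timeline_id, shard_id, None, None).await?;
    let resp = client.check_rel_exists(req).await?;
    Ok(resp)
}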
/// Adds authentication metadata to gRPC requests.
#[derive(Clone)]
struct AuthInterceptor {
tenant_id: AsciiMetadataValue,
@@ -30,174 +160,29 @@ impl AuthInterceptor {
fn new(
tenant_id: TenantId,
timeline_id: TimelineId,
auth_token: Option<String>,
shard_id: ShardIndex,
) -> Result<Self, InvalidMetadataValue> {
let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
let auth_header: Option<AsciiMetadataValue> = match auth_token {
Some(token) => Some(format!("Bearer {token}").try_into()?),
None => None,
};
auth_token: Option<String>,
) -> anyhow::Result<Self> {
Ok(Self {
tenant_id: tenant_ascii,
shard_id: shard_ascii,
timeline_id: timeline_ascii,
auth_header,
tenant_id: tenant_id.to_string().try_into()?,
timeline_id: timeline_id.to_string().try_into()?,
shard_id: shard_id.to_string().try_into()?,
auth_header: auth_token
.map(|token| format!("Bearer {token}").try_into())
.transpose()?,
})
}
}
impl tonic::service::Interceptor for AuthInterceptor {
fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
req.metadata_mut()
.insert("neon-tenant-id", self.tenant_id.clone());
req.metadata_mut()
.insert("neon-shard-id", self.shard_id.clone());
req.metadata_mut()
.insert("neon-timeline-id", self.timeline_id.clone());
if let Some(auth_header) = &self.auth_header {
req.metadata_mut()
.insert("authorization", auth_header.clone());
impl Interceptor for AuthInterceptor {
fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result<tonic::Request<()>> {
let metadata = req.metadata_mut();
metadata.insert("neon-tenant-id", self.tenant_id.clone());
metadata.insert("neon-timeline-id", self.timeline_id.clone());
metadata.insert("neon-shard-id", self.shard_id.clone());
if let Some(ref auth_header) = self.auth_header {
metadata.insert("authorization", auth_header.clone());
}
Ok(req)
}
}
#[derive(Clone)]
pub struct Client {
client: proto::PageServiceClient<
tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
>,
}
impl Client {
pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
into_endpoint: T,
tenant_id: TenantId,
timeline_id: TimelineId,
shard_id: ShardIndex,
auth_header: Option<String>,
compression: Option<tonic::codec::CompressionEncoding>,
) -> anyhow::Result<Self> {
let endpoint: tonic::transport::Endpoint = into_endpoint
.try_into()
.map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
let channel = endpoint.connect().await?;
let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
.map_err(|e| anyhow::anyhow!(e.to_string()))?;
let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
if let Some(compression) = compression {
// TODO: benchmark this (including network latency).
client = client
.accept_compressed(compression)
.send_compressed(compression);
}
Ok(Self { client })
}
/// Returns whether a relation exists.
pub async fn check_rel_exists(
&mut self,
req: model::CheckRelExistsRequest,
) -> Result<model::CheckRelExistsResponse, tonic::Status> {
let proto_req = proto::CheckRelExistsRequest::from(req);
let response = self.client.check_rel_exists(proto_req).await?;
let proto_resp = response.into_inner();
Ok(proto_resp.into())
}
/// Fetches a base backup.
pub async fn get_base_backup(
&mut self,
req: model::GetBaseBackupRequest,
) -> Result<impl AsyncRead + use<>, tonic::Status> {
let req = proto::GetBaseBackupRequest::from(req);
let chunks = self.client.get_base_backup(req).await?.into_inner();
let reader = StreamReader::new(
chunks
.map_ok(|resp| resp.chunk)
.map_err(std::io::Error::other),
);
Ok(reader)
}
/// Returns the total size of a database, as # of bytes.
pub async fn get_db_size(
&mut self,
req: model::GetDbSizeRequest,
) -> Result<u64, tonic::Status> {
let proto_req = proto::GetDbSizeRequest::from(req);
let response = self.client.get_db_size(proto_req).await?;
Ok(response.into_inner().into())
}
/// Fetches pages.
///
/// This is implemented as a bidirectional streaming RPC for performance.
/// Per-request errors are often returned as status_code instead of errors,
/// to avoid tearing down the entire stream via tonic::Status.
pub async fn get_pages<ReqSt>(
&mut self,
inbound: ReqSt,
) -> Result<
impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
tonic::Status,
>
where
ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
{
let outbound_proto = inbound.map(|domain_req| domain_req.into());
let req_new = Request::new(outbound_proto);
let response_stream: Streaming<proto::GetPageResponse> =
self.client.get_pages(req_new).await?.into_inner();
let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
Ok(domain_stream)
}
/// Returns the size of a relation, as # of blocks.
pub async fn get_rel_size(
&mut self,
req: model::GetRelSizeRequest,
) -> Result<model::GetRelSizeResponse, tonic::Status> {
let proto_req = proto::GetRelSizeRequest::from(req);
let response = self.client.get_rel_size(proto_req).await?;
let proto_resp = response.into_inner();
Ok(proto_resp.into())
}
/// Fetches an SLRU segment.
pub async fn get_slru_segment(
&mut self,
req: model::GetSlruSegmentRequest,
) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
let proto_req = proto::GetSlruSegmentRequest::from(req);
let response = self.client.get_slru_segment(proto_req).await?;
Ok(response.into_inner().try_into()?)
}
/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
/// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
///
/// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
/// acquired because the LSN has already been garbage collected.
pub async fn lease_lsn(
&mut self,
req: model::LeaseLsnRequest,
) -> Result<model::LeaseLsnResponse, tonic::Status> {
let req = proto::LeaseLsnRequest::from(req);
Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
}
}

View File

@@ -33,6 +33,8 @@ pub enum ProtocolError {
Invalid(&'static str, String),
#[error("required field '{0}' is missing")]
Missing(&'static str),
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
InvalidLsns(Lsn, Lsn),
}
impl ProtocolError {
@@ -85,9 +87,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
}
if pb.not_modified_since_lsn > pb.request_lsn {
return Err(ProtocolError::invalid(
"not_modified_since_lsn",
pb.not_modified_since_lsn,
return Err(ProtocolError::InvalidLsns(
Lsn(pb.not_modified_since_lsn),
Lsn(pb.request_lsn),
));
}
Ok(Self {
@@ -384,7 +386,7 @@ impl From<GetPageRequest> for proto::GetPageRequest {
pub type RequestID = u64;
/// A GetPage request class.
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy, Debug, strum_macros::Display)]
pub enum GetPageClass {
/// Unknown class. For backwards compatibility: used when an older client version sends a class
/// that a newer server version has removed.
@@ -397,6 +399,19 @@ pub enum GetPageClass {
Background,
}
impl GetPageClass {
/// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
/// latency-sensitive).
pub fn is_bulk(&self) -> bool {
match self {
Self::Unknown => false,
Self::Normal => false,
Self::Prefetch => true,
Self::Background => true,
}
}
}
impl From<proto::GetPageClass> for GetPageClass {
fn from(pb: proto::GetPageClass) -> Self {
match pb {
@@ -602,6 +617,21 @@ impl TryFrom<tonic::Code> for GetPageStatusCode {
}
}
impl From<GetPageStatusCode> for tonic::Code {
fn from(status_code: GetPageStatusCode) -> Self {
use tonic::Code;
match status_code {
GetPageStatusCode::Unknown => Code::Unknown,
GetPageStatusCode::Ok => Code::Ok,
GetPageStatusCode::NotFound => Code::NotFound,
GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
GetPageStatusCode::InternalError => Code::Internal,
GetPageStatusCode::SlowDown => Code::ResourceExhausted,
}
}
}
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
// shards will error.
#[derive(Clone, Copy, Debug)]

View File

@@ -24,10 +24,14 @@ tracing.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
axum.workspace = true
http.workspace = true
metrics.workspace = true
tonic.workspace = true
url.workspace = true
pageserver_client.workspace = true
pageserver_client_grpc.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
utils = { path = "../../libs/utils/" }

View File

@@ -326,7 +326,7 @@ impl GrpcClient {
ttid: TenantTimelineId,
compression: bool,
) -> anyhow::Result<Self> {
let inner = page_api::Client::new(
let inner = page_api::Client::connect(
connstring.to_string(),
ttid.tenant_id,
ttid.timeline_id,

View File

@@ -32,6 +32,10 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "false")]
grpc_stream: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -72,6 +76,9 @@ pub(crate) struct Args {
#[clap(long)]
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
#[clap(long)]
only_relnode: Option<u32>,
/// Queue depth generated in each client.
#[clap(long, default_value = "1")]
queue_depth: NonZeroUsize,
@@ -86,10 +93,31 @@ pub(crate) struct Args {
#[clap(long, default_value = "1")]
batch_size: NonZeroUsize,
#[clap(long)]
only_relnode: Option<u32>,
targets: Option<Vec<TenantTimelineId>>,
#[clap(long, default_value = "100")]
pool_max_consumers: NonZeroUsize,
#[clap(long, default_value = "5")]
pool_error_threshold: NonZeroUsize,
#[clap(long, default_value = "5000")]
pool_connect_timeout: NonZeroUsize,
#[clap(long, default_value = "1000")]
pool_connect_backoff: NonZeroUsize,
#[clap(long, default_value = "60000")]
pool_max_idle_duration: NonZeroUsize,
#[clap(long, default_value = "0")]
max_delay_ms: usize,
#[clap(long, default_value = "0")]
percent_drops: usize,
#[clap(long, default_value = "0")]
percent_hangs: usize,
}
/// State shared by all clients
@@ -146,7 +174,6 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
@@ -311,6 +338,7 @@ async fn main_impl(
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let ss = shared_state.clone();
let cancel = cancel.clone();
@@ -625,7 +653,7 @@ impl GrpcClient {
ttid: TenantTimelineId,
compression: bool,
) -> anyhow::Result<Self> {
let mut client = page_api::Client::new(
let mut client = page_api::Client::connect(
connstring.to_string(),
ttid.tenant_id,
ttid.timeline_id,

View File

@@ -3170,6 +3170,7 @@ where
pub struct GrpcPageServiceHandler {
tenant_manager: Arc<TenantManager>,
ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
}
@@ -3222,6 +3223,7 @@ impl GrpcPageServiceHandler {
let page_service_handler = GrpcPageServiceHandler {
tenant_manager,
ctx,
cancel: cancel.clone(),
gate_guard: gate.enter().expect("gate was just created"),
get_vectored_concurrent_io,
};
@@ -3353,6 +3355,8 @@ impl GrpcPageServiceHandler {
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
/// split them up in the client or server.
///
/// TODO: verify that the given keys belong to this shard.
#[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
async fn get_page(
ctx: &RequestContext,
@@ -3670,6 +3674,7 @@ impl proto::PageService for GrpcPageServiceHandler {
// Spawn a task to handle the GetPageRequest stream.
let span = Span::current();
let ctx = self.ctx.attached_child();
let cancel = self.cancel.clone();
let mut reqs = req.into_inner();
let resps = async_stream::try_stream! {
@@ -3677,7 +3682,20 @@ impl proto::PageService for GrpcPageServiceHandler {
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
.await?
.downgrade();
while let Some(req) = reqs.message().await? {
loop {
let req = tokio::select! {
req = reqs.message() => req,
_ = cancel.cancelled() => {
tracing::info!("closing getpages stream due to shutdown");
break;
},
};
let req = if let Some(req) = req? {
req
} else {
break;
};
let req_id = req.request_id;
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
.instrument(span.clone()) // propagate request span

View File

@@ -5,6 +5,7 @@ MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
extension_server.o \
file_cache.o \
hll.o \
@@ -29,6 +30,11 @@ PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S), Darwin)
SHLIB_LINK += -framework Security -framework CoreFoundation -framework SystemConfiguration
endif
EXTENSION = neon
DATA = \
neon--1.0.sql \
@@ -57,7 +63,7 @@ WALPROP_OBJS = \
# libcommunicator.a is built by cargo from the Rust sources under communicator/
# subdirectory. `cargo build` also generates communicator_bindings.h.
neon.o: communicator/communicator_bindings.h
communicator_new.o: communicator/communicator_bindings.h
$(NEON_CARGO_ARTIFACT_TARGET_DIR)/libcommunicator.a communicator/communicator_bindings.h &:
(cd $(srcdir)/communicator && cargo build $(CARGO_BUILD_FLAGS) $(CARGO_PROFILE))

pgxn/neon/communicator/Cargo.lock (generated)
View File

@@ -0,0 +1,372 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "addr2line"
version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
dependencies = [
"gimli",
]
[[package]]
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "backtrace"
version = "0.3.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
dependencies = [
"addr2line",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
"windows-targets",
]
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bytes"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"tonic",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures-core"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "gimli"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "http"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http-body"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http",
]
[[package]]
name = "http-body-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
dependencies = [
"bytes",
"futures-core",
"http",
"http-body",
"pin-project-lite",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.171"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
dependencies = [
"adler2",
]
[[package]]
name = "object"
version = "0.36.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
dependencies = [
"memchr",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "pin-project"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
name = "proc-macro2"
version = "1.0.94"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "syn"
version = "2.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tokio"
version = "1.44.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
dependencies = [
"backtrace",
"pin-project-lite",
]
[[package]]
name = "tokio-stream"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
]
[[package]]
name = "tonic"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
dependencies = [
"base64",
"bytes",
"http",
"http-body",
"http-body-util",
"percent-encoding",
"pin-project",
"tokio-stream",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-service"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
dependencies = [
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
dependencies = [
"once_cell",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View File

@@ -1,19 +1,42 @@
[package]
name = "communicator"
version = "0.1.0"
license.workspace = true
edition.workspace = true
[lib]
crate-type = ["staticlib"]
[features]
# 'testing' feature is currently unused in the communicator, but we accept it for convenience of
# calling build scripts, so that you can pass the same feature to all packages.
testing = []
[lib]
crate-type = ["staticlib"]
[dependencies]
axum.workspace = true
bytes.workspace = true
clashmap.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
prometheus.workspace = true
prost.workspace = true
tonic = { version = "0.12.0", default-features = false, features=["codegen", "prost", "transport"] }
tokio = { version = "1.43.1", features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
thiserror.workspace = true
tracing.workspace = true
tracing-subscriber.workspace = true
metrics.workspace = true
uring-common = { workspace = true, features = ["bytes"] }
pageserver_client_grpc.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
neon-shmem.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
[build-dependencies]

View File

@@ -1,8 +1,123 @@
This package will evolve into a "compute-pageserver communicator"
process and machinery. For now, it's just a dummy that doesn't do
anything interesting, but it allows us to test the compilation and
linking of Rust code into the Postgres extensions.
# Communicator
This package provides the so-called "compute-pageserver communicator",
or just "communicator" in short. It runs in a PostgreSQL server, as
part of the neon extension, and handles the communication with the
pageservers. On the PostgreSQL side, the glue code in pgxn/neon/ uses
the communicator to implement the PostgreSQL Storage Manager (SMGR)
interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit/s network interface without becoming a bottleneck
## Source code view
pgxn/neon/communicator_new.c
Contains the glue that interacts with PostgreSQL code and the Rust
communicator code.
pgxn/neon/communicator/src/backend_interface.rs
The entry point for calls from each backend.
pgxn/neon/communicator/src/init.rs
Initialization at server startup
pgxn/neon/communicator/src/worker_process/
Worker process main loop and glue code
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests, so the communicator process has
multiple threads running. That's unusual for Postgres processes, and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
To submit an IO request, first pick one of your backend's free slots
and write the details of the IO request into the slot. Then update
the 'state' field of the slot to Submitted. That informs the worker
process that it can start processing the request. Once the state has
been set to Submitted, the backend *must not* access the slot anymore,
until the worker process sets its state to 'Completed'. In other
words, each slot is owned by either the backend or the worker process
at all times, and the 'state' field indicates who has ownership at the
moment.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. After you have changed the slot's state to Submitted, write
the index of the request slot to the pipe. This wakes up the worker
process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back in the request slot. A GetPage request can also
contain a pointer to a buffer in the shared buffer cache. In that case,
the worker process writes the resulting page contents directly to the
buffer, and only a result code in the request slot. It then updates
the 'state' field to Completed, which passes ownership back to
the originating backend. Finally, it signals the process Latch of the
originating backend, waking it up.
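In Rust terms, one round trip on the backend side looks roughly like the
following sketch (non-authoritative; the real caller is the C glue in
pgxn/neon/communicator_new.c, the `bcomm_*` entry points are defined in
backend_interface.rs, and latch handling is elided):

    fn submit_and_wait(
        bs: &mut CommunicatorBackendStruct,
        slot_idx: i32,
        request: &NeonIORequest,
    ) -> NeonIOResult {
        let mut result = NeonIOResult::Empty;
        // -1 means the request was satisfied immediately (e.g. from the cache).
        if bcomm_start_io_request(bs, slot_idx, request, &mut result) == -1 {
            return result;
        }
        // Otherwise wait for the communicator process to complete the slot.
        // The real code sleeps on the process latch between polls.
        loop {
            if bcomm_poll_request_completion(bs, slot_idx as u32, &mut result) == 0 {
                return result;
            }
            // WaitLatch(...) in the C code; omitted here.
        }
    }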
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. PostgreSQL's AIO uses
a mechanism very similar to the one described in the previous section
for the communication between AIO worker processes and backends. With
our communicator, the AIO worker processes are not
used, but we use the same PgAioHandle request slots as in upstream.
For Neon-specific IO requests like GetDbSize, a neon request slot is
used. But for the actual IO requests, the request slot merely contains
a pointer to the PgAioHandle slot. The worker process updates the
status of that slot, calls the IO callbacks upon completion etc., just like
the upstream AIO worker processes do.
## Sequence diagram
              neon
  PostgreSQL  extension  backend_interface.rs  worker_process.rs  processor  tonic
      |           .               .                    .              .        .
      | smgr_read()               .                    .              .        .
      +---------> +               .                    .              .        .
      .           | rcommunicator_get_page_at_lsn      .              .        .
      .           +-------------> +                    .              .        .
      .           .               | write request      .              .        .
      .           .               | to slot            .              .        .
      .           .               | submit_request()   .              .        .
      .           .               +------------------> +              .        .
      .           .               .                    | db_size_request       .
      .           .               .                    +------------> +        .
      .           .               .                    .              .       TODO
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -0,0 +1,207 @@
//! This module implements a request/response "slot" for submitting requests from backends
//! to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks.
use std::cell::UnsafeCell;
use std::sync::atomic::fence;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests.
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// The slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process. But to help with
/// debugging, there are a few more states. When the backend starts to fill in
/// the request details in the slot, it first sets the state from Idle to
/// Filling, and when it's done with that, from Filling to Submitted. In the
/// Filling state, the slot is still owned by the backend. Similarly, when the
/// communicator process starts to process a request, it sets it to Processing
/// state first, but the slot is still owned by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. We only store the 'owner_procno'
/// which can be used for waking up the backend on completion, but the wakeups are
/// performed elsewhere.
pub struct NeonIOHandle {
/// similar to PgAioHandleState
state: AtomicNeonIOHandleState,
/// The owning process's ProcNumber. The worker process uses this to set the process's
/// latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and the index of
/// this slot in the overall 'neon_request_slots' array)
owner_procno: AtomicI32,
/// SAFETY: This is modified by fill_request(), after it has established ownership
/// of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There can be
/// only one RequestProcessingGuard outstanding for a slot at a time, because
/// it is returned by start_processing_request() which checks the state, so
/// RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIOHandle {}
unsafe impl Sync for NeonIOHandle {}
impl Default for NeonIOHandle {
fn default() -> NeonIOHandle {
NeonIOHandle {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIOHandleState {
Idle,
/// backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
unsafe {
*self.0.result.get() = result;
};
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIOHandleState::Completed, Ordering::Release);
assert!(old_state == NeonIOHandleState::Processing);
}
}
impl NeonIOHandle {
pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and start filling it.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
// and try to use a slot that's already in use, we could fill the slot and
// switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIOHandleState::Idle,
NeonIOHandleState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// This fence synchronizes-with store/swap in `communicator_process_main_loop`.
fence(Ordering::Acquire);
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
self.state
.store(NeonIOHandleState::Submitted, Ordering::Release);
}
pub fn get_state(&self) -> NeonIOHandleState {
self.state.load(Ordering::Relaxed)
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// FIXME: ordering?
let state = self.state.load(Ordering::Relaxed);
if state == NeonIOHandleState::Completed {
// This fence synchronizes-with store/swap in `communicator_process_main_loop`.
fence(Ordering::Acquire);
let result = unsafe { *self.result.get() };
self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
/// Read the IO request from the slot indicated in the wakeup
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// XXX: using compare_exchange for this is not strictly necessary, as long as
// the communicator process has _some_ means of tracking which requests it's
// already processing. That could be a flag somewhere in communicator's private
// memory, for example.
if let Err(s) = self.state.compare_exchange(
NeonIOHandleState::Submitted,
NeonIOHandleState::Processing,
Ordering::Relaxed,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
fence(Ordering::Acquire);
Some(RequestProcessingGuard(self))
}
}
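// Illustrative sketch (in reality the two halves run in different processes and the
// "communicator side" lives in the worker process main loop): the slot state machine for
// a single request, exercised end to end.
#[cfg(test)]
mod slot_lifecycle_example {
    use super::*;
    #[test]
    fn roundtrip() {
        let slot = NeonIOHandle::default();
        // Backend side: fill the Idle slot and mark it Submitted.
        slot.fill_request(&NeonIORequest::Empty, 0);
        // Communicator side: claim the Submitted slot, read the request, complete it.
        let guard = slot.start_processing_request().expect("slot was submitted");
        let _request = guard.get_request();
        guard.completed(NeonIOResult::Empty);
        // Backend side: the result is now available and the slot returns to Idle.
        assert!(slot.try_get_result().is_some());
        assert_eq!(slot.get_state(), NeonIOHandleState::Idle);
    }
}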

View File

@@ -0,0 +1,234 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIOHandle;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::{CCachedGetPageVResult, COid};
use crate::neon_request::{NeonIORequest, NeonIOResult};
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
neon_request_slots: &'t [NeonIOHandle],
submission_pipe_write_fd: OwnedFd,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
if my_proc_number < 0 {
panic!(
"cannot attach to communicator shared memory with procnumber {}",
my_proc_number,
);
}
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
/// The requested slot must be free, or this panics.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request(
bs: &'_ mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
// Tell the communicator about it
bs.submit_request(slot_idx);
slot_idx
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request(
bs: &mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let mut read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
// Tell the communicator about it
bs.submit_request(slot_idx);
slot_idx
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
/// Check if a request has completed. Returns:
///
/// 'false' if the slot is Idle. The backend process has ownership.
/// 'true' if the slot is busy, and should be polled for result.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_get_request_slot_status(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
) -> bool {
use crate::backend_comms::NeonIOHandleState;
match bs.neon_request_slots[request_slot_idx as usize].get_state() {
NeonIOHandleState::Idle => false,
NeonIOHandleState::Filling => {
// 'false' would be the right result here. However, this
// is a very transient state. The C code should never
// leave a slot in this state, so if it sees that,
// something's gone wrong and it's not clear what to do
// with it.
panic!("unexpected Filling state in request slot {}", request_slot_idx);
},
NeonIOHandleState::Submitted => true,
NeonIOHandleState::Processing => true,
NeonIOHandleState::Completed => true,
}
}
// LFC functions
/// Finish a local file cache read.
///
/// Returns 'true' if the read was valid. If it was concurrently invalidated,
/// the caller must retry.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
/// Check if the local file cache contains the given block
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_cache_contains(
bs: &mut CommunicatorBackendStruct,
spc_oid: COid,
db_oid: COid,
rel_number: u32,
fork_number: u8,
block_number: u32,
) -> bool {
bs.integrated_cache.cache_contains_page(
&pageserver_page_api::RelTag {
spcnode: spc_oid,
dbnode: db_oid,
relnode: rel_number,
forknum: fork_number,
},
block_number,
)
}
impl<'t> CommunicatorBackendStruct<'t> {
/// Send a wakeup to the communicator process
fn submit_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_slot_idx.to_ne_bytes();
let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
/// Note: there's no guarantee on when the communicator might pick it up. You should ring
/// the doorbell. But it might pick it up immediately.
///
/// The slot must be free, or this panics.
pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
let my_proc_number = self.my_proc_number;
self.neon_request_slots[request_slot_idx as usize].fill_request(request, my_proc_number);
}
}
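// A rough sketch of the receiving end of the submission pipe in the communicator worker
// process, assuming it wraps 'submission_pipe_read_fd' in a tokio_pipe::PipeRead (as the
// worker code does). Each submission is a 4-byte slot index written with to_ne_bytes() above.
#[allow(dead_code)]
async fn example_drain_submissions(mut pipe: tokio_pipe::PipeRead) -> std::io::Result<()> {
    use tokio::io::AsyncReadExt;
    loop {
        let mut buf = [0u8; 4];
        pipe.read_exact(&mut buf).await?;
        let slot_idx = i32::from_ne_bytes(buf);
        // ... look up neon_request_slots[slot_idx] and start processing the request ...
        let _ = slot_idx;
    }
}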

View File

@@ -0,0 +1,160 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! of which blocks in the cache file are in use and which are free. The
//! "high-level" part of tracking which block in the cache file corresponds to
//! which relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it?), but the backends use direct C library calls for that.
use std::fs::File;
use std::os::unix::fs::FileExt;
use std::path::Path;
use std::sync::Arc;
use std::sync::Mutex;
use crate::BLCKSZ;
use tokio::task::spawn_blocking;
pub type CacheBlock = u64;
pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
pub struct FileCache {
file: Arc<File>,
free_list: Mutex<FreeList>,
// metrics
max_blocks_gauge: metrics::IntGauge,
num_free_blocks_gauge: metrics::IntGauge,
}
// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
// Idea: when free_blocks fills up with more than 1024 entries, write them all to
// one block on disk.
struct FreeList {
next_free_block: CacheBlock,
max_blocks: u64,
free_blocks: Vec<CacheBlock>,
}
impl FileCache {
pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
if initial_size < 100 {
tracing::warn!(
"min size for file cache is 100 blocks, {} requested",
initial_size
);
initial_size = 100;
}
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
let max_blocks_gauge = metrics::IntGauge::new(
"file_cache_max_blocks",
"Local File Cache size in 8KiB blocks",
)
.unwrap();
let num_free_blocks_gauge = metrics::IntGauge::new(
"file_cache_num_free_blocks",
"Number of free 8KiB blocks in Local File Cache",
)
.unwrap();
tracing::info!("initialized file cache with {} blocks", initial_size);
Ok(FileCache {
file: Arc::new(file),
free_list: Mutex::new(FreeList {
next_free_block: 0,
max_blocks: initial_size,
free_blocks: Vec::new(),
}),
max_blocks_gauge,
num_free_blocks_gauge,
})
}
// File cache management
pub async fn read_block(
&self,
cache_block: CacheBlock,
mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub fn alloc_block(&self) -> Option<CacheBlock> {
let mut free_list = self.free_list.lock().unwrap();
if let Some(x) = free_list.free_blocks.pop() {
return Some(x);
}
if free_list.next_free_block < free_list.max_blocks {
let result = free_list.next_free_block;
free_list.next_free_block += 1;
return Some(result);
}
None
}
pub fn dealloc_block(&self, cache_block: CacheBlock) {
let mut free_list = self.free_list.lock().unwrap();
free_list.free_blocks.push(cache_block);
}
}
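// A rough usage sketch tying the pieces above together. In production the buffer must point
// into shared memory (see the ShmemBuf safety comment in 'neon_request'); the heap page here
// is only a stand-in for illustration.
#[allow(dead_code)]
async fn example_file_cache_round_trip(cache: &FileCache) -> Result<(), std::io::Error> {
    use crate::neon_request::ShmemBuf;
    let mut page = vec![0u8; BLCKSZ];
    let buf = ShmemBuf { ptr: page.as_mut_ptr() };
    // Allocate a slot in the cache file; None means the cache is full and the caller
    // needs to evict something first (see 'integrated_cache').
    let Some(blk) = cache.alloc_block() else {
        return Ok(());
    };
    cache.write_block(blk, buf).await?;
    cache.read_block(blk, buf).await?;
    cache.dealloc_block(blk);
    Ok(())
}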
impl metrics::core::Collector for FileCache {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.max_blocks_gauge.desc());
descs.append(&mut self.num_free_blocks_gauge.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
// Update the gauges with fresh values first
{
let free_list = self.free_list.lock().unwrap();
self.max_blocks_gauge.set(free_list.max_blocks as i64);
let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+ (free_list.max_blocks as i64 - free_list.next_free_block as i64);
self.num_free_blocks_gauge.set(total_free_blocks);
}
let mut values = Vec::new();
values.append(&mut self.max_blocks_gauge.collect());
values.append(&mut self.num_free_blocks_gauge.collect());
values
}
}

View File

@@ -0,0 +1,109 @@
//! Global allocator, for tracking memory usage of the Rust parts
//!
//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully. It
//! rolls backs the transaction and gives the user an "ERROR: out of memory" error. Rust code
//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
//! of memory for use by the Rust code, so that the allocations never fail.
//!
//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
//! memory usage of all the Rust allocations in total.
//!
//! TODO:
//!
//! - Currently we just export the metrics. Actual allocations are still just passed through to
//! the system allocator.
//! - Take padding etc. overhead into account
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use metrics::IntGauge;
struct MyAllocator {
allocations: AtomicU64,
deallocations: AtomicU64,
allocated: AtomicUsize,
high: AtomicUsize,
}
unsafe impl GlobalAlloc for MyAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
self.allocations.fetch_add(1, Ordering::Relaxed);
        // fetch_add returns the previous total, so add this allocation's size to get
        // the post-allocation total before updating the high watermark.
        let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
        allocated += layout.size();
        self.high.fetch_max(allocated, Ordering::Relaxed);
unsafe { System.alloc(layout) }
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
self.deallocations.fetch_add(1, Ordering::Relaxed);
self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
unsafe { System.dealloc(ptr, layout) }
}
}
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator {
allocations: AtomicU64::new(0),
deallocations: AtomicU64::new(0),
allocated: AtomicUsize::new(0),
high: AtomicUsize::new(0),
};
pub struct MyAllocatorCollector {
allocations: IntGauge,
deallocations: IntGauge,
allocated: IntGauge,
high: IntGauge,
}
impl MyAllocatorCollector {
pub fn new() -> MyAllocatorCollector {
MyAllocatorCollector {
allocations: IntGauge::new("allocations_total", "Number of allocations in Rust code")
.unwrap(),
deallocations: IntGauge::new(
"deallocations_total",
"Number of deallocations in Rust code",
)
.unwrap(),
allocated: IntGauge::new("allocated_total", "Bytes currently allocated").unwrap(),
high: IntGauge::new("allocated_high", "High watermark of allocated bytes").unwrap(),
}
}
}
impl metrics::core::Collector for MyAllocatorCollector {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.allocations.desc());
descs.append(&mut self.deallocations.desc());
descs.append(&mut self.allocated.desc());
descs.append(&mut self.high.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
let mut values = Vec::new();
// update the gauges
self.allocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
        self.deallocations
            .set(GLOBAL.deallocations.load(Ordering::Relaxed) as i64);
self.allocated
.set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
self.high.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
values.append(&mut self.allocations.collect());
values.append(&mut self.deallocations.collect());
values.append(&mut self.allocated.collect());
values.append(&mut self.high.collect());
values
}
}

View File

@@ -0,0 +1,171 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use std::mem::MaybeUninit;
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIOHandle;
use crate::integrated_cache::IntegratedCacheInitStruct;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
pub submission_pipe_read_fd: OwnedFd,
pub submission_pipe_write_fd: OwnedFd,
// Shared memory data structures
pub num_neon_request_slots: u32,
pub neon_request_slots: &'static [NeonIOHandle],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field(
"num_neon_request_slots",
&self.num_neon_request_slots,
)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
let mut size = 0;
size += mem::size_of::<NeonIOHandle>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size();
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
num_neon_request_slots: u32,
shmem_area_ptr: *mut MaybeUninit<u8>,
shmem_area_len: u64,
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> &'static mut CommunicatorInitStruct {
let shmem_area: &'static mut [MaybeUninit<u8>] =
unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
let (neon_request_slots, remaining_area) =
alloc_array_from_slice::<NeonIOHandle>(shmem_area, num_neon_request_slots as usize);
for slot in neon_request_slots.iter_mut() {
slot.write(NeonIOHandle::default());
}
// 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
// as of this writing.)
let neon_request_slots = unsafe {
std::mem::transmute::<&mut [MaybeUninit<NeonIOHandle>], &mut [NeonIOHandle]>(
neon_request_slots,
)
};
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
remaining_area,
initial_file_cache_size,
max_file_cache_size,
);
let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
use std::os::fd::FromRawFd;
(
OwnedFd::from_raw_fd(submission_pipe_read_fd),
OwnedFd::from_raw_fd(submission_pipe_write_fd),
)
};
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
// fixme: currently unused
#[allow(dead_code)]
pub fn alloc_from_slice<T>(
area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size());
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { result_ptr.as_mut().unwrap() };
(result, remain)
}
pub fn alloc_array_from_slice<T>(
area: &mut [MaybeUninit<u8>],
len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() * len > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size() * len);
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
(result, remain)
}
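// alloc_from_slice() above is currently unused, but its intended usage mirrors the array
// variant used in rcommunicator_shmem_init(): carve a typed slot out of the raw area and
// initialize it in place. A minimal sketch:
#[allow(dead_code)]
fn example_alloc_counter(area: &mut [MaybeUninit<u8>]) -> &mut u64 {
    let (slot, _remaining) = alloc_from_slice::<u64>(area);
    // MaybeUninit::write() initializes the slot and hands back a regular reference.
    slot.write(0)
}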

View File

@@ -0,0 +1,794 @@
//! Integrated communicator cache
//!
//! It tracks:
//! - Relation sizes and existence
//! - Last-written LSN
//! - Block cache (also known as LFC)
//!
//! TODO: limit the size
//! TODO: concurrency
//!
//! Note: This deals with "relations" which is really just one "relation fork" in Postgres
//! terms. RelFileLocator + ForkNumber is the key.
//
// TODO: Thoughts on eviction:
//
// There are two things we need to track, and evict if we run out of space:
// - blocks in the file cache's file. If the file grows too large, need to evict something.
// Also if the cache is resized
//
// - entries in the cache map. If we run out of memory in the shmem area, need to evict
// something
//
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use utils::lsn::{AtomicLsn, Lsn};
use crate::file_cache::INVALID_CACHE_BLOCK;
use crate::file_cache::{CacheBlock, FileCache};
use pageserver_page_api::RelTag;
use metrics::{IntCounter, IntGauge};
use neon_shmem::hash::{HashMapInit, entry::Entry};
use neon_shmem::shmem::ShmemHandle;
// in # of entries
const RELSIZE_CACHE_SIZE: u32 = 64 * 1024;
/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
pub struct IntegratedCacheInitStruct<'t> {
relsize_cache_handle: HashMapInit<'t, RelKey, RelEntry>,
block_map_handle: HashMapInit<'t, BlockKey, BlockEntry>,
}
/// Represents write-access to the integrated cache. This is used by the communicator process.
pub struct IntegratedCacheWriteAccess<'t> {
relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
global_lw_lsn: AtomicU64,
pub(crate) file_cache: Option<FileCache>,
// Fields for eviction
clock_hand: std::sync::Mutex<usize>,
// Metrics
page_evictions_counter: IntCounter,
clock_iterations_counter: IntCounter,
// metrics from the hash map
block_map_num_buckets: IntGauge,
block_map_num_buckets_in_use: IntGauge,
relsize_cache_num_buckets: IntGauge,
relsize_cache_num_buckets_in_use: IntGauge,
}
/// Represents read-only access to the integrated cache. Backend processes have this.
pub struct IntegratedCacheReadAccess<'t> {
relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
}
impl<'t> IntegratedCacheInitStruct<'t> {
/// Return the desired size in bytes of the fixed-size shared memory area to reserve for the
/// integrated cache.
pub fn shmem_size() -> usize {
// The relsize cache is fixed-size. The block map is allocated in a separate resizable
// area.
HashMapInit::<RelKey, RelEntry>::estimate_size(RELSIZE_CACHE_SIZE)
}
/// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
/// will be inherited by all processes through fork.
pub fn shmem_init(
shmem_area: &'t mut [MaybeUninit<u8>],
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> IntegratedCacheInitStruct<'t> {
// Initialize the relsize cache in the fixed-size area
let relsize_cache_handle =
neon_shmem::hash::HashMapInit::with_fixed(RELSIZE_CACHE_SIZE, shmem_area);
let max_bytes =
HashMapInit::<BlockKey, BlockEntry>::estimate_size(max_file_cache_size as u32);
// Initialize the block map in a separate resizable shared memory area
let shmem_handle = ShmemHandle::new("block mapping", 0, max_bytes).unwrap();
let block_map_handle =
neon_shmem::hash::HashMapInit::with_shmem(initial_file_cache_size as u32, shmem_handle);
IntegratedCacheInitStruct {
relsize_cache_handle,
block_map_handle,
}
}
/// Initialize access to the integrated cache for the communicator worker process
pub fn worker_process_init(
self,
lsn: Lsn,
file_cache: Option<FileCache>,
) -> IntegratedCacheWriteAccess<'t> {
let IntegratedCacheInitStruct {
relsize_cache_handle,
block_map_handle,
} = self;
IntegratedCacheWriteAccess {
relsize_cache: relsize_cache_handle.attach_writer(),
block_map: block_map_handle.attach_writer(),
global_lw_lsn: AtomicU64::new(lsn.0),
file_cache,
clock_hand: std::sync::Mutex::new(0),
page_evictions_counter: metrics::IntCounter::new(
"integrated_cache_evictions",
"Page evictions from the Local File Cache",
)
.unwrap(),
clock_iterations_counter: metrics::IntCounter::new(
"clock_iterations",
"Number of times the clock hand has moved",
)
.unwrap(),
block_map_num_buckets: metrics::IntGauge::new(
"block_map_num_buckets",
"Allocated size of the block cache hash map",
)
.unwrap(),
block_map_num_buckets_in_use: metrics::IntGauge::new(
"block_map_num_buckets_in_use",
"Number of buckets in use in the block cache hash map",
)
.unwrap(),
relsize_cache_num_buckets: metrics::IntGauge::new(
"relsize_cache_num_buckets",
"Allocated size of the relsize cache hash map",
)
.unwrap(),
relsize_cache_num_buckets_in_use: metrics::IntGauge::new(
"relsize_cache_num_buckets_in_use",
"Number of buckets in use in the relsize cache hash map",
)
.unwrap(),
}
}
/// Initialize access to the integrated cache for a backend process
pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
let IntegratedCacheInitStruct {
relsize_cache_handle,
block_map_handle,
} = self;
IntegratedCacheReadAccess {
relsize_cache: relsize_cache_handle.attach_reader(),
block_map: block_map_handle.attach_reader(),
}
}
}
/// Value stored in the cache mapping hash table.
struct BlockEntry {
lw_lsn: AtomicLsn,
cache_block: AtomicU64,
pinned: AtomicU64,
// 'referenced' bit for the clock algorithm
referenced: AtomicBool,
}
/// Value stored in the relsize cache hash table.
struct RelEntry {
/// cached size of the relation
/// u32::MAX means 'not known' (that's InvalidBlockNumber in Postgres)
nblocks: AtomicU32,
}
impl std::fmt::Debug for RelEntry {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("Rel")
.field("nblocks", &self.nblocks.load(Ordering::Relaxed))
.finish()
}
}
impl std::fmt::Debug for BlockEntry {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("Block")
.field("lw_lsn", &self.lw_lsn.load())
.field("cache_block", &self.cache_block.load(Ordering::Relaxed))
.field("pinned", &self.pinned.load(Ordering::Relaxed))
.field("referenced", &self.referenced.load(Ordering::Relaxed))
.finish()
}
}
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
struct RelKey(RelTag);
impl From<&RelTag> for RelKey {
fn from(val: &RelTag) -> RelKey {
RelKey(*val)
}
}
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
struct BlockKey {
rel: RelTag,
block_number: u32,
}
impl From<(&RelTag, u32)> for BlockKey {
fn from(val: (&RelTag, u32)) -> BlockKey {
BlockKey {
rel: *val.0,
block_number: val.1,
}
}
}
/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
/// requested information, exists in the cache.
pub enum CacheResult<V> {
/// The requested page or other information existed in the cache.
Found(V),
/// The cache doesn't contain the page (or other requested information, like the relation size). The
/// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
/// read the page.
NotFound(Lsn),
}
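// A sketch of how the communicator side is expected to consume CacheResult: answer from the
// cache on Found, otherwise send a request to the pageserver using the returned
// 'not_modified_since' LSN and remember the answer. 'fetch_from_pageserver' is a hypothetical
// stand-in for the real gRPC request.
#[allow(dead_code)]
fn example_handle_rel_size<'t>(
    cache: &'t IntegratedCacheWriteAccess<'t>,
    rel: &RelTag,
    fetch_from_pageserver: impl Fn(Lsn) -> u32,
) -> u32 {
    match cache.get_rel_size(rel) {
        CacheResult::Found(nblocks) => nblocks,
        CacheResult::NotFound(not_modified_since) => {
            let nblocks = fetch_from_pageserver(not_modified_since);
            cache.remember_rel_size(rel, nblocks);
            nblocks
        }
    }
}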
impl<'t> IntegratedCacheWriteAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
if let Some(nblocks) = get_rel_size(&self.relsize_cache, rel) {
CacheResult::Found(nblocks)
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
CacheResult::NotFound(lsn)
}
}
pub async fn get_page(
&'t self,
rel: &RelTag,
block_number: u32,
dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<CacheResult<()>, std::io::Error> {
let x = if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number)))
{
block_entry.referenced.store(true, Ordering::Relaxed);
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
// pin it and release lock
block_entry.pinned.fetch_add(1, Ordering::Relaxed);
(cache_block, DeferredUnpin(block_entry.pinned.as_ptr()))
} else {
return Ok(CacheResult::NotFound(block_entry.lw_lsn.load()));
}
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
return Ok(CacheResult::NotFound(lsn));
};
let (cache_block, _deferred_pin) = x;
self.file_cache
.as_ref()
.unwrap()
.read_block(cache_block, dst)
.await?;
// unpin the entry (by implicitly dropping deferred_pin)
Ok(CacheResult::Found(()))
}
pub async fn page_is_cached(
&'t self,
rel: &RelTag,
block_number: u32,
) -> Result<CacheResult<()>, std::io::Error> {
if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number))) {
// This is used for prefetch requests. Treat the probe as an 'access', to keep it
// in cache.
block_entry.referenced.store(true, Ordering::Relaxed);
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
Ok(CacheResult::Found(()))
} else {
Ok(CacheResult::NotFound(block_entry.lw_lsn.load()))
}
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
Ok(CacheResult::NotFound(lsn))
}
}
/// Does the relation exist? CacheResult::NotFound means that the cache doesn't contain that
/// information, i.e. we don't know if the relation exists or not.
pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
// we don't currently cache negative entries, so if the relation is in the cache, it exists
if let Some(_rel_entry) = self.relsize_cache.get(&RelKey::from(rel)) {
CacheResult::Found(true)
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
CacheResult::NotFound(lsn)
}
}
pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
// TODO: it would be nice to cache database sizes too. Getting the database size
// is not a very common operation, but when you do it, it's often interactive, with
// e.g. psql \l+ command, so the user will feel the latency.
// fixme: is this right lsn?
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
CacheResult::NotFound(lsn)
}
pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) {
match self.relsize_cache.entry(RelKey::from(rel)) {
Entry::Vacant(e) => {
tracing::info!("inserting rel entry for {rel:?}, {nblocks} blocks");
// FIXME: what to do if we run out of memory? Evict other relation entries?
_ = e
.insert(RelEntry {
nblocks: AtomicU32::new(nblocks),
})
.expect("out of memory");
}
Entry::Occupied(e) => {
tracing::info!("updating rel entry for {rel:?}, {nblocks} blocks");
e.get().nblocks.store(nblocks, Ordering::Relaxed);
}
};
}
/// Remember the given page contents in the cache.
pub async fn remember_page(
&'t self,
rel: &RelTag,
block_number: u32,
src: impl uring_common::buf::IoBuf + Send + Sync,
lw_lsn: Lsn,
is_write: bool,
) {
let key = BlockKey::from((rel, block_number));
// FIXME: make this work when file cache is disabled. Or make it mandatory
let file_cache = self.file_cache.as_ref().unwrap();
if is_write {
            // There should be no concurrent IOs. If a backend tries to read the page
            // at the same time, it may see a torn page. That's the same as with
            // regular POSIX filesystem read() and write().
// First check if we have a block in cache already
let mut old_cache_block = None;
let mut found_existing = false;
// NOTE(quantumish): honoring original semantics here (used to be update_with_fn)
// but I don't see any reason why this has to take a write lock.
if let Entry::Occupied(e) = self.block_map.entry(key.clone()) {
let block_entry = e.get();
found_existing = true;
// Prevent this entry from being evicted
let pin_count = block_entry.pinned.fetch_add(1, Ordering::Relaxed);
if pin_count > 0 {
// this is unexpected, because the caller has obtained the io-in-progress lock,
// so no one else should try to modify the page at the same time.
// XXX: and I think a read should not be happening either, because the postgres
// buffer is held locked. TODO: check these conditions and tidy this up a little. Seems fragile to just panic.
panic!("block entry was unexpectedly pinned");
}
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
old_cache_block = if cache_block != INVALID_CACHE_BLOCK {
Some(cache_block)
} else {
None
};
}
// Allocate a new block if required
let cache_block = old_cache_block.unwrap_or_else(|| {
loop {
if let Some(x) = file_cache.alloc_block() {
break x;
}
if let Some(x) = self.try_evict_one_cache_block() {
break x;
}
}
});
// Write the page to the cache file
file_cache
.write_block(cache_block, src)
.await
.expect("error writing to cache");
// FIXME: handle errors gracefully.
// FIXME: unpin the block entry on error
// Update the block entry
let entry = self.block_map.entry(key);
assert_eq!(found_existing, matches!(entry, Entry::Occupied(_)));
match entry {
Entry::Occupied(e) => {
let block_entry = e.get();
// Update the cache block
let old_blk = block_entry.cache_block.compare_exchange(
INVALID_CACHE_BLOCK,
cache_block,
Ordering::Relaxed,
Ordering::Relaxed,
);
assert!(old_blk == Ok(INVALID_CACHE_BLOCK) || old_blk == Err(cache_block));
block_entry.lw_lsn.store(lw_lsn);
block_entry.referenced.store(true, Ordering::Relaxed);
let pin_count = block_entry.pinned.fetch_sub(1, Ordering::Relaxed);
assert!(pin_count > 0);
}
Entry::Vacant(e) => {
// FIXME: what to do if we run out of memory? Evict other relation entries? Remove
// block entries first?
_ = e
.insert(BlockEntry {
lw_lsn: AtomicLsn::new(lw_lsn.0),
cache_block: AtomicU64::new(cache_block),
pinned: AtomicU64::new(0),
referenced: AtomicBool::new(true),
})
.expect("out of memory");
}
}
} else {
// !is_write
//
// We can assume that it doesn't already exist, because the
// caller is assumed to have already checked it, and holds
// the io-in-progress lock. (The BlockEntry might exist, but no cache block)
// Allocate a new block first
let cache_block = {
loop {
if let Some(x) = file_cache.alloc_block() {
break x;
}
if let Some(x) = self.try_evict_one_cache_block() {
break x;
}
}
};
// Write the page to the cache file
file_cache
.write_block(cache_block, src)
.await
.expect("error writing to cache");
// FIXME: handle errors gracefully.
match self.block_map.entry(key) {
Entry::Occupied(e) => {
let block_entry = e.get();
// FIXME: could there be concurrent readers?
assert!(block_entry.pinned.load(Ordering::Relaxed) == 0);
let old_cache_block =
block_entry.cache_block.swap(cache_block, Ordering::Relaxed);
if old_cache_block != INVALID_CACHE_BLOCK {
panic!(
"remember_page called in !is_write mode, but page is already cached at blk {old_cache_block}"
);
}
}
Entry::Vacant(e) => {
// FIXME: what to do if we run out of memory? Evict other relation entries? Remove
// block entries first?
_ = e
.insert(BlockEntry {
lw_lsn: AtomicLsn::new(lw_lsn.0),
cache_block: AtomicU64::new(cache_block),
pinned: AtomicU64::new(0),
referenced: AtomicBool::new(true),
})
.expect("out of memory");
}
}
}
}
/// Forget information about given relation in the cache. (For DROP TABLE and such)
pub fn forget_rel(&'t self, rel: &RelTag) {
tracing::info!("forgetting rel entry for {rel:?}");
self.relsize_cache.remove(&RelKey::from(rel));
// also forget all cached blocks for the relation
// FIXME
/*
let mut iter = MapIterator::new(&key_range_for_rel_blocks(rel));
let r = self.cache_tree.start_read();
while let Some((k, _v)) = iter.next(&r) {
let w = self.cache_tree.start_write();
let mut evicted_cache_block = None;
let res = w.update_with_fn(&k, |e| {
if let Some(e) = e {
let block_entry = if let MapEntry::Block(e) = e {
e
} else {
panic!("unexpected map entry type for block key");
};
let cache_block = block_entry
.cache_block
.swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
evicted_cache_block = Some(cache_block);
}
UpdateAction::Remove
} else {
UpdateAction::Nothing
}
});
// FIXME: It's pretty surprising to run out of memory while removing. But
// maybe it can happen because of trying to shrink a node?
res.expect("out of memory");
if let Some(evicted_cache_block) = evicted_cache_block {
self.file_cache
.as_ref()
.unwrap()
.dealloc_block(evicted_cache_block);
}
}
*/
}
// Maintenance routines
    /// Evict one block from the file cache. This is used when the file cache fills up.
    /// Returns the evicted block. It's not put on the free list, so it's available for the
    /// caller to use immediately.
pub fn try_evict_one_cache_block(&self) -> Option<CacheBlock> {
let mut clock_hand = self.clock_hand.lock().unwrap();
for _ in 0..100 {
self.clock_iterations_counter.inc();
(*clock_hand) += 1;
let mut evict_this = false;
let num_buckets = self.block_map.get_num_buckets();
match self
.block_map
.get_at_bucket((*clock_hand) % num_buckets)
.as_deref()
{
None => {
// This bucket was unused
}
Some((_, blk_entry)) => {
if !blk_entry.referenced.swap(false, Ordering::Relaxed) {
// Evict this. Maybe.
evict_this = true;
}
}
};
if evict_this {
// grab the write lock
let mut evicted_cache_block = None;
if let Some(e) = self.block_map.entry_at_bucket(*clock_hand % num_buckets) {
let old = e.get();
// note: all the accesses to 'pinned' currently happen
// within update_with_fn(), or while holding ValueReadGuard, which protects from concurrent
// updates. Otherwise, another thread could set the 'pinned'
// flag just after we have checked it here.
if old.pinned.load(Ordering::Relaxed) == 0 {
let _ = self
.global_lw_lsn
.fetch_max(old.lw_lsn.load().0, Ordering::Relaxed);
let cache_block =
old.cache_block.swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
evicted_cache_block = Some(cache_block);
}
e.remove();
}
}
if evicted_cache_block.is_some() {
self.page_evictions_counter.inc();
return evicted_cache_block;
}
}
}
// Give up if we didn't find anything
None
}
/// Resize the local file cache.
pub fn resize_file_cache(&self, num_blocks: u32) {
let old_num_blocks = self.block_map.get_num_buckets() as u32;
if old_num_blocks < num_blocks {
if let Err(err) = self.block_map.grow(num_blocks) {
tracing::warn!(
"could not grow file cache to {} blocks (old size {}): {}",
num_blocks,
old_num_blocks,
err
);
}
} else {
// TODO: Shrinking not implemented yet
}
}
pub fn dump_map(&self, _dst: &mut dyn std::io::Write) {
//FIXME self.cache_map.start_read().dump(dst);
}
}
impl metrics::core::Collector for IntegratedCacheWriteAccess<'_> {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.page_evictions_counter.desc());
descs.append(&mut self.clock_iterations_counter.desc());
descs.append(&mut self.block_map_num_buckets.desc());
descs.append(&mut self.block_map_num_buckets_in_use.desc());
descs.append(&mut self.relsize_cache_num_buckets.desc());
descs.append(&mut self.relsize_cache_num_buckets_in_use.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
// Update gauges
self.block_map_num_buckets
.set(self.block_map.get_num_buckets() as i64);
self.block_map_num_buckets_in_use
.set(self.block_map.get_num_buckets_in_use() as i64);
self.relsize_cache_num_buckets
.set(self.relsize_cache.get_num_buckets() as i64);
self.relsize_cache_num_buckets_in_use
.set(self.relsize_cache.get_num_buckets_in_use() as i64);
let mut values = Vec::new();
values.append(&mut self.page_evictions_counter.collect());
values.append(&mut self.clock_iterations_counter.collect());
values.append(&mut self.block_map_num_buckets.collect());
values.append(&mut self.block_map_num_buckets_in_use.collect());
values.append(&mut self.relsize_cache_num_buckets.collect());
values.append(&mut self.relsize_cache_num_buckets_in_use.collect());
values
}
}
/// Read relation size from the cache.
///
/// This is in a separate function so that it can be shared by
/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
fn get_rel_size(
r: &neon_shmem::hash::HashMapAccess<RelKey, RelEntry>,
rel: &RelTag,
) -> Option<u32> {
if let Some(rel_entry) = r.get(&RelKey::from(rel)) {
let nblocks = rel_entry.nblocks.load(Ordering::Relaxed);
if nblocks != u32::MAX {
Some(nblocks)
} else {
None
}
} else {
None
}
}
/// Accessor for other backends
///
/// This allows backends to read pages from the cache directly, on their own, without making a
/// request to the communicator process.
impl<'t> IntegratedCacheReadAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
get_rel_size(&self.relsize_cache, rel)
}
pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
BackendCacheReadOp {
read_guards: Vec::new(),
map_access: self,
}
}
/// Check if the given page is present in the cache
pub fn cache_contains_page(&'t self, rel: &RelTag, block_number: u32) -> bool {
self.block_map
.get(&BlockKey::from((rel, block_number)))
.is_some()
}
}
pub struct BackendCacheReadOp<'t> {
read_guards: Vec<DeferredUnpin>,
map_access: &'t IntegratedCacheReadAccess<'t>,
}
impl<'e> BackendCacheReadOp<'e> {
/// Initiate a read of the page from the cache.
///
    /// This returns the "cache block number", i.e. the block number within the cache file, where
    /// the page's contents are stored. To get the page contents, the caller needs to read that block
    /// from the cache file. The entry is pinned while you perform the read; the pin is held by this
    /// BackendCacheReadOp. It's possible that while you are performing the read, the cache block is
    /// invalidated. After you have completed the read, call BackendCacheReadOp::finish() to check
    /// whether the read was in fact valid. If it was concurrently invalidated, you need to retry.
pub fn get_page(&mut self, rel: &RelTag, block_number: u32) -> Option<u64> {
if let Some(block_entry) = self
.map_access
.block_map
.get(&BlockKey::from((rel, block_number)))
{
block_entry.referenced.store(true, Ordering::Relaxed);
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
block_entry.pinned.fetch_add(1, Ordering::Relaxed);
self.read_guards
.push(DeferredUnpin(block_entry.pinned.as_ptr()));
Some(cache_block)
} else {
None
}
} else {
None
}
}
pub fn finish(self) -> bool {
// TODO: currently, we hold a pin on the in-memory map, so concurrent invalidations are not
// possible. But if we switch to optimistic locking, this would return 'false' if the
// optimistic locking failed and you need to retry.
true
}
}
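// The backend-side read path that this pin mechanism enables, as a sketch. The actual block
// read is done from C with a plain pread(); 'read_block_from_cache_file' is a hypothetical
// stand-in for it.
#[allow(dead_code)]
fn example_backend_read<'t>(
    cache: &'t IntegratedCacheReadAccess<'t>,
    rel: &RelTag,
    block_number: u32,
    read_block_from_cache_file: impl Fn(u64),
) -> bool {
    let mut op = cache.start_read_op();
    let Some(cache_block) = op.get_page(rel, block_number) else {
        // Not in the cache; the caller falls back to a request to the communicator.
        return false;
    };
    // The pin held inside 'op' keeps the cache block from being evicted during the read.
    read_block_from_cache_file(cache_block);
    // finish() reports whether the read was still valid; if not, the caller must retry.
    op.finish()
}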
/// A hack to decrement an AtomicU64 on drop. This is used to decrement the pin count
/// of a BlockEntry. The safety depends on the fact that the BlockEntry is not evicted
/// or moved while it's pinned.
struct DeferredUnpin(*mut u64);
unsafe impl Sync for DeferredUnpin {}
unsafe impl Send for DeferredUnpin {}
impl Drop for DeferredUnpin {
fn drop(&mut self) {
// unpin it
unsafe {
let pin_ref = AtomicU64::from_ptr(self.0);
pin_ref.fetch_sub(1, Ordering::Relaxed);
}
}
}

View File

@@ -1,6 +1,27 @@
/// dummy function, just to test linking Rust functions into the C
/// extension
#[unsafe(no_mangle)]
pub extern "C" fn communicator_dummy(arg: u32) -> u32 {
arg + 1
}
//!
//! Three main parts:
//! - async tokio communicator core, which receives requests and processes them.
//! - Main loop and request queues, which route requests from backends to the core
//! - the per-backend glue code, which submits requests
//!
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinks
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
mod global_allocator;
// FIXME get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -0,0 +1,377 @@
pub type CLsn = u64;
pub type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
use pageserver_page_api as page_api;
#[allow(clippy::large_enum_variant)]
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_page_api.
RelExists(CRelExistsRequest),
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
RelExists(bool),
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
impl NeonIORequest {
pub fn request_id(&self) -> u64 {
use NeonIORequest::*;
match self {
Empty => 0,
RelExists(req) => req.request_id,
RelSize(req) => req.request_id,
GetPageV(req) => req.request_id,
PrefetchV(req) => req.request_id,
DbSize(req) => req.request_id,
WritePage(req) => req.request_id,
RelExtend(req) => req.request_id,
RelZeroExtend(req) => req.request_id,
RelCreate(req) => req.request_id,
RelTruncate(req) => req.request_id,
RelUnlink(req) => req.request_id,
}
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// These fields define where the result is written. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExistsRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub request_id: u64,
pub db_oid: COid,
pub request_lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// These fields define page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
}
impl CRelExistsRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}

View File

@@ -0,0 +1,28 @@
//! C callbacks to PostgreSQL facilities that the neon extension needs
//! to provide. These are implemented in `neon/pgxn/communicator_new.c`.
//! The function signatures better match!
//!
//! These are called from the communicator threads! Careful what you do, most
//! Postgres functions are not safe to call in that context.
use utils::lsn::Lsn;
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> u64;
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}

View File

@@ -0,0 +1,95 @@
//! Lock table to ensure that only one IO request is in flight for a given
//! block (or relation or database metadata) at a time
use std::cmp::Eq;
use std::hash::Hash;
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};
use clashmap::ClashMap;
use clashmap::Entry;
use pageserver_page_api::RelTag;
#[derive(Clone, Eq, Hash, PartialEq)]
pub enum RequestInProgressKey {
Db(u32),
Rel(RelTag),
Block(RelTag, u32),
}
type RequestId = u64;
pub type RequestInProgressTable = MutexHashMap<RequestInProgressKey, RequestId>;
// A more primitive, generic locking helper used to implement the lock table above:
pub struct MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
{
lock_table: ClashMap<K, (V, Arc<Mutex<()>>)>,
}
pub struct MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
pub key: K,
map: &'a MutexHashMap<K, V>,
mutex: Arc<Mutex<()>>,
_guard: OwnedMutexGuard<()>,
}
impl<'a, K, V> Drop for MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
fn drop(&mut self) {
let (_old_key, old_val) = self.map.lock_table.remove(&self.key).unwrap();
assert!(Arc::ptr_eq(&old_val.1, &self.mutex));
// the guard will be dropped as we return
}
}
impl<K, V> MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
V: std::fmt::Display + Copy,
{
pub fn new() -> MutexHashMap<K, V> {
MutexHashMap {
lock_table: ClashMap::new(),
}
}
pub async fn lock<'a>(&'a self, key: K, val: V) -> MutexHashMapGuard<'a, K, V> {
let my_mutex = Arc::new(Mutex::new(()));
let my_guard = Arc::clone(&my_mutex).lock_owned().await;
loop {
let (request_id, lock) = match self.lock_table.entry(key.clone()) {
Entry::Occupied(e) => {
let e = e.get();
(e.0, Arc::clone(&e.1))
}
Entry::Vacant(e) => {
e.insert((val, Arc::clone(&my_mutex)));
break;
}
};
tracing::info!("waiting for conflicting IO {request_id} to complete");
let _ = lock.lock().await;
tracing::info!("conflicting IO {request_id} completed");
}
MutexHashMapGuard {
key,
map: self,
mutex: my_mutex,
_guard: my_guard,
}
}
}
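// A sketch of how the worker is expected to use this table: take the per-key lock before
// starting an IO for a block, and rely on the guard's Drop to remove the entry and wake any
// waiters once the IO completes.
#[allow(dead_code)]
async fn example_with_io_lock(
    table: &RequestInProgressTable,
    rel: RelTag,
    block_number: u32,
    request_id: u64,
) {
    // Blocks until any conflicting in-flight IO for the same block has finished.
    let _guard = table
        .lock(RequestInProgressKey::Block(rel, block_number), request_id)
        .await;
    // ... perform the read or write for this block here ...
    // Dropping _guard removes the entry and unblocks waiters.
}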

View File

@@ -0,0 +1,231 @@
//! Glue code to hook up Rust logging with the `tracing` crate to the PostgreSQL log
//!
//! In the Rust threads, the log messages are written to a mpsc Channel, and the Postgres
//! process latch is raised. That wakes up the loop in the main thread. It reads the
//! message from the channel and ereport()s it. This ensures that only one thread, the main
//! thread, calls the PostgreSQL logging routines at any time.
use std::sync::mpsc::sync_channel;
use std::sync::mpsc::{Receiver, SyncSender};
use std::sync::mpsc::{TryRecvError, TrySendError};
use tracing::info;
use tracing::{Event, Level, Metadata, Subscriber};
use tracing_subscriber::filter::LevelFilter;
use tracing_subscriber::fmt::FmtContext;
use tracing_subscriber::fmt::FormatEvent;
use tracing_subscriber::fmt::FormatFields;
use tracing_subscriber::fmt::FormattedFields;
use tracing_subscriber::fmt::MakeWriter;
use tracing_subscriber::fmt::format::Writer;
use tracing_subscriber::registry::LookupSpan;
use crate::worker_process::callbacks::callback_set_my_latch;
pub struct LoggingState {
receiver: Receiver<FormattedEventWithMeta>,
}
/// Called once, at worker process startup. The returned LoggingState is passed back
/// in the subsequent calls to `pump_logging`. It is opaque to the C code.
#[unsafe(no_mangle)]
pub extern "C" fn configure_logging() -> Box<LoggingState> {
let (sender, receiver) = sync_channel(1000);
let maker = Maker { channel: sender };
use tracing_subscriber::prelude::*;
let r = tracing_subscriber::registry();
let r = r.with(
tracing_subscriber::fmt::layer()
.with_ansi(false)
.event_format(SimpleFormatter::new())
.with_writer(maker)
// TODO: derive this from log_min_messages?
.with_filter(LevelFilter::from_level(Level::INFO)),
);
r.init();
info!("communicator process logging started");
let state = LoggingState { receiver };
Box::new(state)
}
/// Read one message from the logging queue. This is essentially a wrapper around the channel
/// Receiver, with a C-friendly signature.
///
/// The message is copied into *errbuf, which is a caller-supplied buffer of size `errbuf_len`.
/// If the message doesn't fit in the buffer, it is truncated. It is always NULL-terminated.
///
/// The error level is returned *elevel_p. It's one of the PostgreSQL error levels, see elog.h
#[unsafe(no_mangle)]
pub extern "C" fn pump_logging(
state: &mut LoggingState,
errbuf: *mut u8,
errbuf_len: u32,
elevel_p: &mut i32,
) -> i32 {
let msg = match state.receiver.try_recv() {
Err(TryRecvError::Empty) => return 0,
Err(TryRecvError::Disconnected) => return -1,
Ok(msg) => msg,
};
let src: &[u8] = &msg.message;
let dst = errbuf;
let len = std::cmp::min(src.len(), errbuf_len as usize - 1);
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dst, len);
*(errbuf.add(len)) = b'\0'; // NULL terminator
}
// XXX: these levels are copied from PostgreSQL's elog.h. Introduce another enum
// to hide these?
*elevel_p = match msg.level {
Level::TRACE => 10, // DEBUG5
Level::DEBUG => 14, // DEBUG1
Level::INFO => 17, // INFO
Level::WARN => 19, // WARNING
Level::ERROR => 21, // ERROR
};
1
}
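// A sketch of the drain loop that the main thread is expected to run whenever its latch is
// set (the real loop lives in the C worker code and calls ereport(); println! stands in for
// that here).
#[allow(dead_code)]
fn example_drain_log_queue(state: &mut LoggingState) {
    let mut buf = [0u8; 1024];
    let mut elevel: i32 = 0;
    loop {
        match pump_logging(state, buf.as_mut_ptr(), buf.len() as u32, &mut elevel) {
            0 => break,  // queue empty; go back to waiting on the latch
            -1 => break, // all senders are gone; logging has shut down
            _ => {
                let len = buf.iter().position(|&b| b == 0).unwrap_or(buf.len());
                println!("[{elevel}] {}", String::from_utf8_lossy(&buf[..len]));
            }
        }
    }
}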
//---- The following functions can be called from any thread ----
#[derive(Clone)]
struct FormattedEventWithMeta {
message: Vec<u8>,
level: tracing::Level,
}
impl Default for FormattedEventWithMeta {
fn default() -> Self {
FormattedEventWithMeta {
message: Vec::new(),
level: tracing::Level::DEBUG,
}
}
}
struct EventBuilder<'a> {
event: FormattedEventWithMeta,
maker: &'a Maker,
}
impl std::io::Write for EventBuilder<'_> {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.event.message.write(buf)
}
fn flush(&mut self) -> std::io::Result<()> {
self.maker.send_event(self.event.clone());
Ok(())
}
}
impl Drop for EventBuilder<'_> {
fn drop(&mut self) {
let maker = self.maker;
let event = std::mem::take(&mut self.event);
maker.send_event(event);
}
}
struct Maker {
channel: SyncSender<FormattedEventWithMeta>,
}
impl<'a> MakeWriter<'a> for Maker {
type Writer = EventBuilder<'a>;
fn make_writer(&'a self) -> Self::Writer {
panic!("not expected to be called when make_writer_for is implemented");
}
fn make_writer_for(&'a self, meta: &Metadata<'_>) -> Self::Writer {
EventBuilder {
event: FormattedEventWithMeta {
message: Vec::new(),
level: *meta.level(),
},
maker: self,
}
}
}
impl Maker {
fn send_event(&self, e: FormattedEventWithMeta) {
match self.channel.try_send(e) {
Ok(()) => {
// notify the main thread
callback_set_my_latch();
}
Err(TrySendError::Disconnected(_)) => {}
Err(TrySendError::Full(_)) => {
// TODO: record that some messages were lost
}
}
}
}
/// Simple formatter implementation for tracing_subscriber, which prints the log
/// spans and message part like the default formatter, but no timestamp or error
/// level. The error level is captured separately by `FormattedEventWithMeta',
/// and when the error is printed by the main thread, with PostgreSQL ereport(),
/// it gets a timestamp at that point. (The timestamp printed will therefore lag
/// behind the timestamp on the event here, if the main thread doesn't process
/// the log message promptly)
struct SimpleFormatter;
impl<S, N> FormatEvent<S, N> for SimpleFormatter
where
S: Subscriber + for<'a> LookupSpan<'a>,
N: for<'a> FormatFields<'a> + 'static,
{
fn format_event(
&self,
ctx: &FmtContext<'_, S, N>,
mut writer: Writer<'_>,
event: &Event<'_>,
) -> std::fmt::Result {
// Format all the spans in the event's span context.
if let Some(scope) = ctx.event_scope() {
for span in scope.from_root() {
write!(writer, "{}", span.name())?;
// `FormattedFields` is a formatted representation of the span's
// fields, which is stored in its extensions by the `fmt` layer's
// `new_span` method. The fields will have been formatted
// by the same field formatter that's provided to the event
// formatter in the `FmtContext`.
let ext = span.extensions();
let fields = &ext
.get::<FormattedFields<N>>()
.expect("will never be `None`");
// Skip formatting the fields if the span had no fields.
if !fields.is_empty() {
write!(writer, "{{{fields}}}")?;
}
write!(writer, ": ")?;
}
}
// Write fields on the event
ctx.field_format().format_fields(writer.by_ref(), event)?;
writeln!(writer)
}
}
impl SimpleFormatter {
fn new() -> Self {
SimpleFormatter {}
}
}

View File

@@ -0,0 +1,731 @@
use std::collections::HashMap;
use std::os::fd::AsRawFd;
use std::os::fd::OwnedFd;
use std::path::PathBuf;
use std::str::FromStr as _;
use crate::backend_comms::NeonIOHandle;
use crate::file_cache::FileCache;
use crate::global_allocator::MyAllocatorCollector;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
use pageserver_client_grpc::{PageserverClient, ShardSpec};
use pageserver_page_api as page_api;
use metrics::{IntCounter, IntCounterVec};
use tokio::io::AsyncReadExt;
use tokio_pipe::PipeRead;
use uring_common::buf::IoBuf;
use utils::id::{TenantId, TimelineId};
use super::callbacks::{get_request_lsn, notify_proc};
use tracing::{error, info, info_span, trace};
use utils::lsn::Lsn;
pub struct CommunicatorWorkerProcessStruct<'a> {
neon_request_slots: &'a [NeonIOHandle],
client: PageserverClient,
pub(crate) cache: IntegratedCacheWriteAccess<'a>,
submission_pipe_read_fd: OwnedFd,
in_progress_table: RequestInProgressTable,
// Metrics
request_counters: IntCounterVec,
request_rel_exists_counter: IntCounter,
request_rel_size_counter: IntCounter,
request_get_pagev_counter: IntCounter,
request_prefetchv_counter: IntCounter,
request_db_size_counter: IntCounter,
request_write_page_counter: IntCounter,
request_rel_extend_counter: IntCounter,
request_rel_zero_extend_counter: IntCounter,
request_rel_create_counter: IntCounter,
request_rel_truncate_counter: IntCounter,
request_rel_unlink_counter: IntCounter,
getpage_cache_misses_counter: IntCounter,
getpage_cache_hits_counter: IntCounter,
request_nblocks_counters: IntCounterVec,
request_get_pagev_nblocks_counter: IntCounter,
request_prefetchv_nblocks_counter: IntCounter,
request_rel_zero_extend_nblocks_counter: IntCounter,
allocator_metrics: MyAllocatorCollector,
}
pub(super) async fn init(
cis: Box<CommunicatorInitStruct>,
tenant_id: String,
timeline_id: String,
auth_token: Option<String>,
shard_map: HashMap<utils::shard::ShardIndex, String>,
initial_file_cache_size: u64,
file_cache_path: Option<PathBuf>,
) -> CommunicatorWorkerProcessStruct<'static> {
info!("Test log message");
let last_lsn = get_request_lsn();
let file_cache = if let Some(path) = file_cache_path {
Some(FileCache::new(&path, initial_file_cache_size).expect("could not create cache file"))
} else {
// FIXME: temporarily for testing, use LFC even if disabled
Some(
FileCache::new(&PathBuf::from("new_filecache"), 1000)
.expect("could not create cache file"),
)
};
// Initialize subsystems
let cache = cis
.integrated_cache_init_struct
.worker_process_init(last_lsn, file_cache);
// TODO: plumb through the stripe size.
let tenant_id = TenantId::from_str(&tenant_id).expect("invalid tenant ID");
let timeline_id = TimelineId::from_str(&timeline_id).expect("invalid timeline ID");
let shard_spec = ShardSpec::new(shard_map, None).expect("invalid shard spec");
let client = PageserverClient::new(tenant_id, timeline_id, shard_spec, auth_token)
.expect("could not create client");
let request_counters = IntCounterVec::new(
metrics::core::Opts::new(
"backend_requests_total",
"Number of requests from backends.",
),
&["request_kind"],
)
.unwrap();
let request_rel_exists_counter = request_counters.with_label_values(&["rel_exists"]);
let request_rel_size_counter = request_counters.with_label_values(&["rel_size"]);
let request_get_pagev_counter = request_counters.with_label_values(&["get_pagev"]);
let request_prefetchv_counter = request_counters.with_label_values(&["prefetchv"]);
let request_db_size_counter = request_counters.with_label_values(&["db_size"]);
let request_write_page_counter = request_counters.with_label_values(&["write_page"]);
let request_rel_extend_counter = request_counters.with_label_values(&["rel_extend"]);
let request_rel_zero_extend_counter = request_counters.with_label_values(&["rel_zero_extend"]);
let request_rel_create_counter = request_counters.with_label_values(&["rel_create"]);
let request_rel_truncate_counter = request_counters.with_label_values(&["rel_truncate"]);
let request_rel_unlink_counter = request_counters.with_label_values(&["rel_unlink"]);
let getpage_cache_misses_counter = IntCounter::new(
"getpage_cache_misses",
"Number of file cache misses in get_pagev requests.",
)
.unwrap();
let getpage_cache_hits_counter = IntCounter::new(
"getpage_cache_hits",
"Number of file cache hits in get_pagev requests.",
)
.unwrap();
// For the requests that affect multiple blocks, have separate counters for the # of blocks affected
let request_nblocks_counters = IntCounterVec::new(
metrics::core::Opts::new(
"request_nblocks_total",
"Number of blocks in backend requests.",
),
&["request_kind"],
)
.unwrap();
let request_get_pagev_nblocks_counter =
request_nblocks_counters.with_label_values(&["get_pagev"]);
let request_prefetchv_nblocks_counter =
request_nblocks_counters.with_label_values(&["prefetchv"]);
let request_rel_zero_extend_nblocks_counter =
request_nblocks_counters.with_label_values(&["rel_zero_extend"]);
CommunicatorWorkerProcessStruct {
neon_request_slots: cis.neon_request_slots,
client,
cache,
submission_pipe_read_fd: cis.submission_pipe_read_fd,
in_progress_table: RequestInProgressTable::new(),
// metrics
request_counters,
request_rel_exists_counter,
request_rel_size_counter,
request_get_pagev_counter,
request_prefetchv_counter,
request_db_size_counter,
request_write_page_counter,
request_rel_extend_counter,
request_rel_zero_extend_counter,
request_rel_create_counter,
request_rel_truncate_counter,
request_rel_unlink_counter,
getpage_cache_misses_counter,
getpage_cache_hits_counter,
request_nblocks_counters,
request_get_pagev_nblocks_counter,
request_prefetchv_nblocks_counter,
request_rel_zero_extend_nblocks_counter,
allocator_metrics: MyAllocatorCollector::new(),
}
}
impl<'t> CommunicatorWorkerProcessStruct<'t> {
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(&'static self) {
let mut idxbuf: [u8; 4] = [0; 4];
let mut submission_pipe_read =
PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd");
loop {
// Wait for a backend to ring the doorbell
match submission_pipe_read.read(&mut idxbuf).await {
Ok(4) => {}
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
Err(e) => panic!("error reading from communicator pipe: {e}"),
}
let slot_idx = u32::from_ne_bytes(idxbuf) as usize;
// Read the IO request from the slot indicated in the wakeup
let Some(slot) = self.neon_request_slots[slot_idx].start_processing_request() else {
// This currently should not happen. But if we had multiple threads picking up
// requests without waiting for the notifications, it could.
panic!("no request in slot");
};
// Ok, we have ownership of this request now. We must process it now, there's no going
// back.
//
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async move {
use tracing::Instrument;
let request_id = slot.get_request().request_id();
let owner_procno = slot.get_owner_procno();
let span = info_span!(
"processing",
request_id = request_id,
slot_idx = slot_idx,
procno = owner_procno,
);
async {
// FIXME: as a temporary hack, abort the request if we don't get a response
// promptly.
//
// Lots of regression tests are getting stuck and failing at the moment,
// this makes them fail a little faster, which makes it faster to iterate.
// This needs to be removed once more regression tests are passing.
// See also similar hack in the backend code, in wait_request_completion()
let result = tokio::time::timeout(
tokio::time::Duration::from_secs(30),
self.handle_request(slot.get_request()),
)
.await
.unwrap_or_else(|_elapsed| {
info!("request {request_id} timed out");
NeonIOResult::Error(libc::ETIMEDOUT)
});
trace!("request {request_id} at slot {slot_idx} completed");
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
slot.completed(result);
// Notify the backend about the completion. (Note that the backend might see
// the completed status even before this; this is just a wakeup)
notify_proc(owner_procno);
}
.instrument(span)
.await
});
}
}
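    // Illustration only, not part of the real interface: the sender side of the
    // doorbell read in run() above lives in the backend C code. After filling
    // request slot `idx`, it writes the 4-byte slot index in native byte order
    // as a single write to the submission pipe, which is what the
    // u32::from_ne_bytes() call above expects. A hypothetical Rust equivalent:
    #[allow(dead_code)]
    fn ring_doorbell(pipe: &mut impl std::io::Write, idx: u32) -> std::io::Result<()> {
        pipe.write_all(&idx.to_ne_bytes())
    }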
/// Compute the 'request_lsn' to use for a pageserver request
fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
let mut request_lsn = get_request_lsn();
// Is it possible that the last-written LSN is ahead of the last flush LSN? Generally not: we
// shouldn't evict a page from the buffer cache before all its modifications have been
// safely flushed. That's the "WAL before data" rule. However, such a case does exist during
// index building: _bt_blwritepage logs the full page without flushing WAL before smgrextend
// (files are fsynced before the build ends).
//
// XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
// block waiting for the WAL to arrive, until we flush it and it propagates through the
// safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed,
// the pageserver would get stuck waiting forever. To avoid that, all the write-
// functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to
// perform the flush relatively soon.
//
// It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into
// Postgres code to do that from here. That's why we rely on communicator_new.c to do the
// calls "pre-emptively".
//
// FIXME: Because of the above, it can still happen that not_modified_since is ahead of
// the flush LSN, if the WAL writer hasn't done the flush yet. It would be nice to know
// if there are other cases like that which we have missed, but unfortunately we cannot turn
// this into an assertion because of that legitimate case.
//
// See also the old logic in neon_get_request_lsns() C function
if not_modified_since_lsn > request_lsn {
tracing::info!(
"not_modified_since_lsn {} is ahead of last flushed LSN {}",
not_modified_since_lsn,
request_lsn
);
request_lsn = not_modified_since_lsn;
}
page_api::ReadLsn {
request_lsn,
not_modified_since_lsn: Some(not_modified_since_lsn),
}
}
/// Handle one IO request
async fn handle_request(&'static self, req: &'_ NeonIORequest) -> NeonIOResult {
match req {
NeonIORequest::Empty => {
error!("unexpected Empty IO request");
NeonIOResult::Error(0)
}
NeonIORequest::RelExists(req) => {
self.request_rel_exists_counter.inc();
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Rel(rel), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_rel_exists(&rel) {
CacheResult::Found(exists) => return NeonIOResult::RelExists(exists),
CacheResult::NotFound(lsn) => lsn,
};
match self
.client
.check_rel_exists(page_api::CheckRelExistsRequest {
read_lsn: self.request_lsns(not_modified_since),
rel,
})
.await
{
Ok(exists) => NeonIOResult::RelExists(exists),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(0)
}
}
}
NeonIORequest::RelSize(req) => {
self.request_rel_size_counter.inc();
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Rel(rel), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_rel_size(&rel) {
CacheResult::Found(nblocks) => {
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
return NeonIOResult::RelSize(nblocks);
}
CacheResult::NotFound(lsn) => lsn,
};
let read_lsn = self.request_lsns(not_modified_since);
match self
.client
.get_rel_size(page_api::GetRelSizeRequest { read_lsn, rel })
.await
{
Ok(nblocks) => {
// update the cache
tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks);
self.cache.remember_rel_size(&rel, nblocks);
NeonIOResult::RelSize(nblocks)
}
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(0)
}
}
}
NeonIORequest::GetPageV(req) => {
self.request_get_pagev_counter.inc();
self.request_get_pagev_nblocks_counter
.inc_by(req.nblocks as u64);
match self.handle_get_pagev_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
}
}
NeonIORequest::PrefetchV(req) => {
self.request_prefetchv_counter.inc();
self.request_prefetchv_nblocks_counter
.inc_by(req.nblocks as u64);
let req = *req;
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
NeonIOResult::PrefetchVLaunched
}
NeonIORequest::DbSize(req) => {
self.request_db_size_counter.inc();
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Db(req.db_oid), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
CacheResult::Found(db_size) => {
// The database size was found in the cache
return NeonIOResult::DbSize(db_size);
}
CacheResult::NotFound(lsn) => lsn,
};
match self
.client
.get_db_size(page_api::GetDbSizeRequest {
read_lsn: self.request_lsns(not_modified_since),
db_oid: req.db_oid,
})
.await
{
Ok(db_size) => NeonIOResult::DbSize(db_size),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(0)
}
}
}
// Write requests
NeonIORequest::WritePage(req) => {
self.request_write_page_counter.inc();
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page, but also store the page
// image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
NeonIOResult::WriteOK
}
NeonIORequest::RelExtend(req) => {
self.request_rel_extend_counter.inc();
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page and the relation size,
// but also store the page image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
self.cache
.remember_rel_size(&req.reltag(), req.block_number + 1);
NeonIOResult::WriteOK
}
NeonIORequest::RelZeroExtend(req) => {
self.request_rel_zero_extend_counter.inc();
self.request_rel_zero_extend_nblocks_counter
.inc_by(req.nblocks as u64);
// TODO: need to grab an io-in-progress lock for this? I guess not
// TODO: I think we should put the empty pages into the cache, or at least
// update the last-written LSN.
self.cache
.remember_rel_size(&req.reltag(), req.block_number + req.nblocks);
NeonIOResult::WriteOK
}
NeonIORequest::RelCreate(req) => {
self.request_rel_create_counter.inc();
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.remember_rel_size(&req.reltag(), 0);
NeonIOResult::WriteOK
}
NeonIORequest::RelTruncate(req) => {
self.request_rel_truncate_counter.inc();
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.remember_rel_size(&req.reltag(), req.nblocks);
NeonIOResult::WriteOK
}
NeonIORequest::RelUnlink(req) => {
self.request_rel_unlink_counter.inc();
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.forget_rel(&req.reltag());
NeonIOResult::WriteOK
}
}
}
/// Subroutine to handle a GetPageV request, since it's a little more complicated than
/// others.
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
//
// Note: Because the backends perform a direct lookup in the cache before sending
// the request to the communicator process, we expect the pages to almost never
// be already in cache. It could happen if:
// 1. two backends try to read the same page at the same time, but that should never
// happen because there's higher level locking in the Postgres buffer manager, or
// 2. a prefetch request finished at the same time as a backend requested the
// page. That's much more likely.
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let dest = req.dest[i as usize];
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
Ok(CacheResult::Found(_)) => {
// get_page already copied the block content to the destination
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since, dest, in_progress_guard));
}
self.getpage_cache_misses_counter
.inc_by(cache_misses.len() as u64);
self.getpage_cache_hits_counter
.inc_by(req.nblocks as u64 - cache_misses.len() as u64);
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _dest, _guard)| *lsn)
.max()
.unwrap();
// Construct a pageserver request for the cache misses
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _dest, _guard)| *blkno)
.collect();
let read_lsn = self.request_lsns(not_modified_since);
info!(
"sending getpage request for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id,
request_class: page_api::GetPageClass::Normal,
read_lsn,
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
// Write the received page images directly to the shared memory location
// that the backend requested.
if resp.page_images.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.page_images.len(),
block_numbers.len(),
);
return Err(-1);
}
for (page_image, (blkno, _lsn, dest, _guard)) in
resp.page_images.into_iter().zip(cache_misses)
{
let src: &[u8] = page_image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total());
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
// Also store it in the LFC while we have it
self.cache
.remember_page(
&rel,
blkno,
page_image,
read_lsn.not_modified_since_lsn.unwrap(),
false,
)
.await;
}
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
Ok(())
}
/// Subroutine to handle a PrefetchV request, since it's a little more complicated than
/// others.
///
/// This is very similar to a GetPageV request, but the results are only stored in the cache.
async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
Ok(CacheResult::Found(_)) => {
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since, in_progress_guard));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _guard)| *lsn)
.max()
.unwrap();
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _guard)| *blkno)
.collect();
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
// in-flight requests
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id,
request_class: page_api::GetPageClass::Prefetch,
read_lsn: self.request_lsns(not_modified_since),
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
trace!(
"prefetch completed, remembering blocks {:?} in rel {:?} in LFC",
block_numbers, rel
);
if resp.page_images.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.page_images.len(),
block_numbers.len(),
);
return Err(-1);
}
for (page_image, (blkno, _lsn, _guard)) in
resp.page_images.into_iter().zip(cache_misses)
{
self.cache
.remember_page(&rel, blkno, page_image, not_modified_since, false)
.await;
}
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
Ok(())
}
}
impl<'t> metrics::core::Collector for CommunicatorWorkerProcessStruct<'t> {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.request_counters.desc());
descs.append(&mut self.getpage_cache_misses_counter.desc());
descs.append(&mut self.getpage_cache_hits_counter.desc());
descs.append(&mut self.request_nblocks_counters.desc());
if let Some(file_cache) = &self.cache.file_cache {
descs.append(&mut file_cache.desc());
}
descs.append(&mut self.cache.desc());
descs.append(&mut self.allocator_metrics.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
let mut values = Vec::new();
values.append(&mut self.request_counters.collect());
values.append(&mut self.getpage_cache_misses_counter.collect());
values.append(&mut self.getpage_cache_hits_counter.collect());
values.append(&mut self.request_nblocks_counters.collect());
if let Some(file_cache) = &self.cache.file_cache {
values.append(&mut file_cache.collect());
}
values.append(&mut self.cache.collect());
values.append(&mut self.allocator_metrics.collect());
values
}
}


@@ -0,0 +1,82 @@
//! Export information about Postgres, the communicator process, the file cache,
//! etc. as Prometheus metrics.
use axum::Router;
use axum::body::Body;
use axum::extract::State;
use axum::response::Response;
use http::StatusCode;
use http::header::CONTENT_TYPE;
use metrics::proto::MetricFamily;
use metrics::{Encoder, TextEncoder};
use std::path::PathBuf;
use tokio::net::UnixListener;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
impl<'a> CommunicatorWorkerProcessStruct<'a> {
pub(crate) async fn launch_exporter_task(&'static self) {
use axum::routing::get;
let app = Router::new()
.route("/metrics", get(get_metrics))
.route("/dump_cache_map", get(dump_cache_map))
.with_state(self);
// Listen on a unix domain socket in the data directory. That should make the path unique.
let path = PathBuf::from(".metrics.socket");
let listener = UnixListener::bind(path.clone()).unwrap();
tokio::spawn(async {
tracing::info!("metrics listener spawned");
axum::serve(listener, app).await.unwrap()
});
}
}
async fn dump_cache_map(
State(state): State<&CommunicatorWorkerProcessStruct<'static>>,
) -> Response {
let mut buf: Vec<u8> = Vec::new();
state.cache.dump_map(&mut buf);
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(buf))
.unwrap()
}
/// Expose Prometheus metrics.
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct<'static>>) -> Response {
use metrics::core::Collector;
let metrics = state.collect();
// When we call TextEncoder::encode() below, it will immediately return an
// error if a metric family has no metrics, so we need to preemptively
// filter out metric families with no metrics.
let metrics = metrics
.into_iter()
.filter(|m| !m.get_metric().is_empty())
.collect::<Vec<MetricFamily>>();
let encoder = TextEncoder::new();
let mut buffer = vec![];
if let Err(e) = encoder.encode(&metrics, &mut buffer) {
Response::builder()
.status(StatusCode::INTERNAL_SERVER_ERROR)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(e.to_string()))
.unwrap()
} else {
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, encoder.format_type())
.body(Body::from(buffer))
.unwrap()
}
}
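Because the exporter binds a unix domain socket rather than a TCP port, scraping it needs a socket-aware client. A minimal smoke-test sketch, assuming the `.metrics.socket` path used above; the scrape_metrics name is made up, and HTTP/1.0 is used so the server closes the connection and read_to_string() terminates.

use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::UnixStream;

// Returns the raw HTTP response, headers included; good enough for a quick check.
async fn scrape_metrics() -> std::io::Result<String> {
    let mut stream = UnixStream::connect(".metrics.socket").await?;
    stream
        .write_all(b"GET /metrics HTTP/1.0\r\nHost: localhost\r\n\r\n")
        .await?;
    let mut response = String::new();
    stream.read_to_string(&mut response).await?;
    Ok(response)
}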


@@ -0,0 +1,14 @@
//! This code runs in the communicator worker process. It provides
//! the glue code to:
//!
//! - launch the 'processor',
//! - receive IO requests from backends and pass them to the processor,
//! - write results back to backends.
mod callbacks;
mod logging;
mod main_loop;
mod metrics_exporter;
mod worker_interface;
mod in_progress_ios;


@@ -0,0 +1,121 @@
//! Functions called from the C code in the worker process
use std::collections::HashMap;
use std::ffi::{CStr, c_char};
use std::path::PathBuf;
use tracing::error;
use crate::init::CommunicatorInitStruct;
use crate::worker_process::main_loop;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL
/// background worker process. The shared memory segment used to
/// communicate with the backends has been allocated and initialized
/// earlier, at postmaster startup, in rcommunicator_shmem_init().
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch(
cis: Box<CommunicatorInitStruct>,
tenant_id: *const c_char,
timeline_id: *const c_char,
auth_token: *const c_char,
shard_map: *mut *mut c_char,
nshards: u32,
file_cache_path: *const c_char,
initial_file_cache_size: u64,
) -> &'static CommunicatorWorkerProcessStruct<'static> {
// Convert the arguments into more convenient Rust types
let tenant_id = unsafe { CStr::from_ptr(tenant_id) }.to_str().unwrap();
let timeline_id = unsafe { CStr::from_ptr(timeline_id) }.to_str().unwrap();
let auth_token = if auth_token.is_null() {
None
} else {
Some(
unsafe { CStr::from_ptr(auth_token) }
.to_str()
.unwrap()
.to_string(),
)
};
let file_cache_path = {
if file_cache_path.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
Some(PathBuf::from(c_str.to_str().unwrap()))
}
};
let shard_map = parse_shard_map(nshards, shard_map);
// start main loop
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("communicator thread")
.build()
.unwrap();
let worker_struct = runtime.block_on(main_loop::init(
cis,
tenant_id.to_string(),
timeline_id.to_string(),
auth_token,
shard_map,
initial_file_cache_size,
file_cache_path,
));
let worker_struct = Box::leak(Box::new(worker_struct));
let main_loop_handle = runtime.spawn(worker_struct.run());
runtime.spawn(async {
let err = main_loop_handle.await.unwrap_err();
error!("error: {err:?}");
});
runtime.block_on(worker_struct.launch_exporter_task());
// keep the runtime running after we exit this function
Box::leak(Box::new(runtime));
worker_struct
}
/// Convert the "shard map" from an array of C strings, indexed by shard number, into a Rust HashMap
fn parse_shard_map(
nshards: u32,
shard_map: *mut *mut c_char,
) -> HashMap<utils::shard::ShardIndex, String> {
use utils::shard::*;
assert!(nshards <= u8::MAX as u32);
let mut result: HashMap<ShardIndex, String> = HashMap::new();
let mut p = shard_map;
for i in 0..nshards {
let c_str = unsafe { CStr::from_ptr(*p) };
p = unsafe { p.add(1) };
let s = c_str.to_str().unwrap();
let k = if nshards > 1 {
ShardIndex::new(ShardNumber(i as u8), ShardCount(nshards as u8))
} else {
ShardIndex::unsharded()
};
result.insert(k, s.into());
}
result
}
/// Inform the Rust code about a configuration change
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_config_reload(
proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
file_cache_size: u64,
) {
proc_handle.cache.resize_file_cache(file_cache_size as u32);
}
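As a reference for the shard-map keying above, a hypothetical unit test (not part of the actual crate; the gRPC URLs are invented) showing what parse_shard_map produces in the unsharded and sharded branches:

#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;
    use utils::shard::{ShardCount, ShardIndex, ShardNumber};

    #[test]
    fn parse_shard_map_keys() {
        let urls = [
            CString::new("grpc://pageserver-0:51051").unwrap(),
            CString::new("grpc://pageserver-1:51051").unwrap(),
        ];
        let mut ptrs: Vec<*mut c_char> =
            urls.iter().map(|u| u.as_ptr() as *mut c_char).collect();

        // With nshards == 1, the single entry is keyed as "unsharded".
        let single = parse_shard_map(1, ptrs.as_mut_ptr());
        assert!(single.contains_key(&ShardIndex::unsharded()));

        // With nshards > 1, entry i is keyed by (ShardNumber(i), ShardCount(nshards)).
        let sharded = parse_shard_map(2, ptrs.as_mut_ptr());
        assert_eq!(
            sharded.get(&ShardIndex::new(ShardNumber(1), ShardCount(2))),
            Some(&"grpc://pageserver-1:51051".to_string())
        );
    }
}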

pgxn/neon/communicator_new.c (new file, 1190 lines): diff suppressed because it is too large


@@ -0,0 +1,56 @@
/*-------------------------------------------------------------------------
*
* communicator_new.h
*    Declarations for the new communicator implementation
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_NEW_H
#define COMMUNICATOR_NEW_H
#include "neon_pgversioncompat.h"
#include "storage/buf_internals.h"
#include "pagestore_client.h"
/* initialization at postmaster startup */
extern void pg_init_communicator_new(void);
extern void communicator_new_shmem_request(void);
extern void communicator_new_shmem_startup(void);
/* initialization at backend startup */
extern void communicator_new_init(void);
/* Read requests */
extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
extern int64 communicator_new_dbsize(Oid dbNode);
extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber base_blockno,
void **buffers, BlockNumber nblocks);
extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno,
BlockNumber nblocks);
extern bool communicator_new_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno);
extern int communicator_new_read_slru_segment(SlruKind kind, int64 segno,
void *buffer);
/* Write requests, to keep the caches up-to-date */
extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks,
XLogRecPtr lsn);
extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum);
extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum);
#endif /* COMMUNICATOR_NEW_H */


@@ -183,13 +183,13 @@ typedef struct FileCacheControl
static HTAB *lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
int lfc_max_size;
int lfc_size_limit;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static char *lfc_path;
char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static bool lfc_do_prewarm;
@@ -230,6 +230,8 @@ lfc_switch_off(void)
{
int fd;
Assert(!neon_enable_new_communicator);
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
@@ -295,6 +297,8 @@ lfc_maybe_disabled(void)
static bool
lfc_ensure_opened(void)
{
Assert(!neon_enable_new_communicator);
if (lfc_generation != lfc_ctl->generation)
{
lfc_close_file();
@@ -320,6 +324,8 @@ lfc_shmem_startup(void)
bool found;
static HASHCTL info;
Assert(!neon_enable_new_communicator);
if (prev_shmem_startup_hook)
{
prev_shmem_startup_hook();
@@ -618,6 +624,9 @@ lfc_init(void)
if (lfc_max_size == 0)
return;
if (neon_enable_new_communicator)
return;
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = lfc_shmem_startup;
#if PG_VERSION_NUM>=150000
@@ -693,6 +702,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
dsm_segment *seg;
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
Assert(!neon_enable_new_communicator);
if (!lfc_ensure_opened())
return;
@@ -847,6 +857,8 @@ lfc_prewarm_main(Datum main_arg)
PrewarmWorkerState* ws;
uint32 worker_id = DatumGetInt32(main_arg);
Assert(!neon_enable_new_communicator);
AmPrewarmWorker = true;
pqsignal(SIGTERM, die);
@@ -947,6 +959,8 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
FileCacheEntry *entry;
uint32 hash;
Assert(!neon_enable_new_communicator);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -992,6 +1006,8 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
bool found = false;
uint32 hash;
Assert(!neon_enable_new_communicator);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -1027,6 +1043,8 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint32 hash;
int i = 0;
Assert(!neon_enable_new_communicator);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return 0;
@@ -1134,6 +1152,8 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
int blocks_read = 0;
int buf_offset = 0;
Assert(!neon_enable_new_communicator);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return -1;
@@ -1500,6 +1520,8 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
Assert(!neon_enable_new_communicator);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -1645,6 +1667,8 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint32 entry_offset;
int buf_offset = 0;
Assert(!neon_enable_new_communicator);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -2140,6 +2164,9 @@ PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
Datum
approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
{
if (neon_enable_new_communicator)
elog(ERROR, "TODO: not implemented");
if (lfc_size_limit != 0)
{
int32 dc;
@@ -2157,6 +2184,9 @@ PG_FUNCTION_INFO_V1(approximate_working_set_size);
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
if (neon_enable_new_communicator)
elog(ERROR, "TODO: not implemented");
if (lfc_size_limit != 0)
{
int32 dc;
@@ -2177,7 +2207,13 @@ Datum
get_local_cache_state(PG_FUNCTION_ARGS)
{
size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
FileCacheState* fcs = lfc_get_state(max_entries);
FileCacheState* fcs;
if (neon_enable_new_communicator)
elog(ERROR, "TODO: not implemented");
fcs = lfc_get_state(max_entries);
if (fcs != NULL)
PG_RETURN_BYTEA_P((bytea*)fcs);
else
@@ -2191,8 +2227,12 @@ prewarm_local_cache(PG_FUNCTION_ARGS)
{
bytea* state = PG_GETARG_BYTEA_PP(0);
uint32 n_workers = PG_GETARG_INT32(1);
FileCacheState* fcs = (FileCacheState*)state;
FileCacheState* fcs;
if (neon_enable_new_communicator)
elog(ERROR, "TODO: not implemented");
fcs = (FileCacheState*)state;
lfc_prewarm(fcs, n_workers);
PG_RETURN_NULL();
@@ -2212,6 +2252,9 @@ get_prewarm_info(PG_FUNCTION_ARGS)
uint32 total_pages;
size_t n_workers;
if (neon_enable_new_communicator)
elog(ERROR, "TODO: not implemented");
if (lfc_size_limit == 0)
PG_RETURN_NULL();


@@ -26,6 +26,9 @@ typedef struct FileCacheState
/* GUCs */
extern bool lfc_store_prefetch_result;
extern int lfc_max_size;
extern int lfc_size_limit;
extern char *lfc_path;
/* functions for local file cache */
extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);


@@ -69,7 +69,8 @@ char *neon_project_id;
char *neon_branch_id;
char *neon_endpoint_id;
int32 max_cluster_size;
char *page_server_connstring;
char *pageserver_connstring;
char *pageserver_grpc_urls;
char *neon_auth_token;
int readahead_buffer_size = 128;
@@ -177,6 +178,8 @@ static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void pageserver_disconnect_shard(shardno_t shard_no);
static void AssignShardMap(const char *newval);
static bool
PagestoreShmemIsValid(void)
{
@@ -239,6 +242,7 @@ ParseShardMap(const char *connstr, ShardMap *result)
return true;
}
/* GUC hooks for neon.pageserver_connstring */
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
@@ -249,6 +253,45 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
/*
* 'neon.pageserver_connstring' is ignored if the new communicator is used.
* In that case, the shard map is loaded from 'neon.pageserver_grpc_urls'
* instead.
*/
if (neon_enable_new_communicator)
return;
AssignShardMap(newval);
}
/* GUC hooks for neon.pageserver_grpc_urls */
static bool
CheckPageserverGrpcUrls(char **newval, void **extra, GucSource source)
{
char *p = *newval;
return ParseShardMap(p, NULL);
}
static void
AssignPageserverGrpcUrls(const char *newval, void *extra)
{
/*
* 'neon.pageserver_grpc_urls' is ignored if the new communicator is not
* used. In that case, the shard map is loaded from 'neon.pageserver_connstring'
* instead.
*/
if (!neon_enable_new_communicator)
return;
AssignShardMap(newval);
}
static void
AssignShardMap(const char *newval)
{
ShardMap shard_map;
@@ -262,7 +305,7 @@ AssignPageserverConnstring(const char *newval, void *extra)
{
/*
* shouldn't happen, because we already checked the value in
* CheckPageserverConnstring
* CheckPageserverConnstring/CheckPageserverGrpcUrls
*/
elog(ERROR, "could not parse shard map");
}
@@ -281,6 +324,54 @@ AssignPageserverConnstring(const char *newval, void *extra)
}
}
/* Return a copy of the whole shard map from shared memory */
void
get_shard_map(char ***connstrs_p, shardno_t *num_shards_p)
{
uint64 begin_update_counter;
uint64 end_update_counter;
ShardMap *shard_map = &pagestore_shared->shard_map;
shardno_t num_shards;
char *buf;
char **connstrs;
buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE);
connstrs = palloc(sizeof(char *) * MAX_SHARDS);
/*
* Postmaster can update the shared memory values concurrently, in which
* case we would copy a garbled mix of the old and new values. We will
* detect it because the counters won't match, and retry. But it's
* important that we don't do anything within the retry-loop that would
* depend on the string having valid contents.
*/
do
{
char *p;
begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
num_shards = shard_map->num_shards;
p = buf;
for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++)
{
strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE);
connstrs[i] = p;
p += MAX_PAGESERVER_CONNSTRING_SIZE;
}
pg_memory_barrier();
}
while (begin_update_counter != end_update_counter
|| begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
|| end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
*connstrs_p = connstrs;
*num_shards_p = num_shards;
}
/*
* Get the current number of shards, and/or the connection string for a
* particular shard from the shard map in shared memory.
@@ -1304,7 +1395,8 @@ PagestoreShmemInit(void)
pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
AssignPageserverConnstring(page_server_connstring, NULL);
AssignPageserverConnstring(pageserver_connstring, NULL);
AssignPageserverGrpcUrls(pageserver_grpc_urls, NULL);
}
NeonPerfCountersShmemInit();
@@ -1357,12 +1449,21 @@ pg_init_libpagestore(void)
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
&page_server_connstring,
&pageserver_connstring,
"",
PGC_SIGHUP,
0, /* no flags required */
CheckPageserverConnstring, AssignPageserverConnstring, NULL);
DefineCustomStringVariable("neon.pageserver_grpc_urls",
"list of gRPC URLs for the page servers",
NULL,
&pageserver_grpc_urls,
"",
PGC_SIGHUP,
0, /* no flags required */
CheckPageserverGrpcUrls, AssignPageserverGrpcUrls, NULL);
DefineCustomStringVariable("neon.timeline_id",
"Neon timeline_id the server is running on",
NULL,
@@ -1520,7 +1621,7 @@ pg_init_libpagestore(void)
if (neon_auth_token)
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
if (page_server_connstring && page_server_connstring[0])
if (pageserver_connstring[0] || pageserver_grpc_urls[0])
{
neon_log(PageStoreTrace, "set neon_smgr hook");
smgr_hook = smgr_neon;


@@ -21,6 +21,7 @@
#include "replication/logicallauncher.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "funcapi.h"
#include "access/htup_details.h"
@@ -30,6 +31,7 @@
#include "utils/guc_tables.h"
#include "communicator.h"
#include "communicator_new.h"
#include "extension_server.h"
#include "file_cache.h"
#include "neon.h"
@@ -50,6 +52,7 @@ PG_MODULE_MAGIC;
void _PG_init(void);
bool neon_enable_new_communicator;
static int running_xacts_overflow_policy;
static bool monitor_query_exec_time = false;
@@ -59,11 +62,14 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void neon_ExecutorEnd(QueryDesc *queryDesc);
#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
static void neon_shmem_startup_hook(void);
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
#endif
static void neon_shmem_request(void);
static void neon_shmem_startup_hook(void);
#if PG_MAJORVERSION_NUM >= 17
uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
uint32 WAIT_EVENT_NEON_LFC_READ;
@@ -450,13 +456,26 @@ _PG_init(void)
*/
#if PG_VERSION_NUM >= 160000
load_file("$libdir/neon_rmgr", false);
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = neon_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = neon_shmem_request;
#else
neon_shmem_request();
#endif
/* dummy call to a Rust function in the communicator library, to check that it works */
(void) communicator_dummy(123);
DefineCustomBoolVariable(
"neon.enable_new_communicator",
"Enables new communicator implementation",
NULL,
&neon_enable_new_communicator,
true,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
pg_init_libpagestore();
lfc_init();
@@ -464,6 +483,9 @@ _PG_init(void)
init_lwlsncache();
pg_init_communicator();
if (neon_enable_new_communicator)
pg_init_communicator_new();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitUnstableExtensionsSupport();
@@ -607,7 +629,17 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
PG_RETURN_UINT64(BackpressureThrottlingTime());
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_shmem_request(void)
{
#if PG_VERSION_NUM>=150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
communicator_new_shmem_request();
}
static void
neon_shmem_startup_hook(void)
{
@@ -627,8 +659,9 @@ neon_shmem_startup_hook(void)
WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
#endif
communicator_new_shmem_startup();
}
#endif
/*
* ExecutorStart hook: start up tracking if needed


@@ -13,6 +13,7 @@
#include "utils/wait_event.h"
/* GUCs */
extern bool neon_enable_new_communicator;
extern char *neon_auth_token;
extern char *neon_timeline;
extern char *neon_tenant;


@@ -9,6 +9,10 @@
#include "fmgr.h"
#include "storage/buf_internals.h"
#if PG_MAJORVERSION_NUM < 16
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
#if PG_MAJORVERSION_NUM < 17
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
#else
@@ -160,6 +164,10 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
#endif
#if PG_MAJORVERSION_NUM < 17
#define MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
#endif
#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
extern TimeLineID GetWALInsertionTimeLine(void);


@@ -236,7 +236,8 @@ extern void prefetch_on_ps_disconnect(void);
extern page_server_api *page_server;
extern char *page_server_connstring;
extern char *pageserver_connstring;
extern char *pageserver_grpc_urls;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern char *neon_timeline;
@@ -244,6 +245,7 @@ extern char *neon_tenant;
extern int32 max_cluster_size;
extern int neon_protocol_version;
extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
extern shardno_t get_shard_number(BufferTag* tag);
extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);


@@ -62,6 +62,7 @@
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
@@ -87,7 +88,7 @@ static char *hexdump_page(char *page);
NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
)
const int SmgrTrace = DEBUG5;
const int SmgrTrace = DEBUG1;
/* unlogged relation build states */
typedef enum
@@ -744,11 +745,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
{
return true;
}
/*
* \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server
* will error out if you check that, because the whole dbdir for
@@ -772,10 +768,20 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
if (neon_enable_new_communicator)
return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
{
return true;
}
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
}
/*
@@ -816,33 +822,46 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forkNum);
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
* relation. Currently, we don't call SetLastWrittenLSN() when a new
* relation is created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
* the creation WAL record has been received by the page server.
*/
if (isRedo)
if (neon_enable_new_communicator)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
if (isRedo)
{
if (!communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum))
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
}
else
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
{
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
* relation. Currently, we don't call SetLastWrittenLSN() when a new
* relation is created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
* the creation WAL record has been received by the page server.
*/
if (isRedo)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
}
if (debug_compare_local)
{
@@ -878,9 +897,15 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
* unlink, it won't do any harm if the file doesn't exist.
*/
mdunlink(rinfo, forkNum, isRedo);
if (!NRelFileInfoBackendIsTemp(rinfo))
{
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
if (neon_enable_new_communicator)
{
communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum);
}
else
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
}
}
@@ -960,7 +985,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
neon_wallog_page(reln, forkNum, blkno, buffer, false);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
@@ -968,35 +992,51 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
if (neon_enable_new_communicator)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
// FIXME: this can pass lsn == invalid. Is that ok?
communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* to the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
}
else
{
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* to the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
int nblocks, bool skipFsync)
{
const PGIOAlignedBlock buffer = {0};
BlockNumber blocknum = start_block;
int remblocks = nblocks;
XLogRecPtr lsn = 0;
@@ -1079,11 +1119,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
for (int i = 0; i < count; i++)
if (!neon_enable_new_communicator)
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
for (int i = 0; i < count; i++)
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
}
}
blocknum += count;
@@ -1092,8 +1135,15 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
Assert(lsn != 0);
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
if (neon_enable_new_communicator)
{
communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
}
else
{
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
}
#endif
@@ -1153,11 +1203,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_enable_new_communicator)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
return false;
}
tag.spcOid = reln->smgr_rlocator.locator.spcOid;
tag.dbOid = reln->smgr_rlocator.locator.dbOid;
tag.relNumber = reln->smgr_rlocator.locator.relNumber;
tag.forkNum = forknum;
while (nblocks > 0)
{
int iterblocks = Min(nblocks, PG_IOV_MAX);
@@ -1179,7 +1235,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum += iterblocks;
}
communicator_prefetch_pump_state();
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state();
return false;
}
@@ -1192,8 +1249,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
static bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
BufferTag tag;
switch (reln->smgr_relpersistence)
{
case 0: /* probably shouldn't happen, but ignore it */
@@ -1208,17 +1263,25 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
if (neon_enable_new_communicator)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
}
else
{
BufferTag tag;
tag.forkNum = forknum;
tag.blockNum = blocknum;
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
tag.forkNum = forknum;
tag.blockNum = blocknum;
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
}
return false;
}
@@ -1262,7 +1325,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
communicator_prefetch_pump_state();
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1279,7 +1343,14 @@ void
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer)
{
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
if (neon_enable_new_communicator)
{
// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
}
else
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
}
static void
@@ -1405,47 +1476,55 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_enable_new_communicator)
{
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
(void *) &buffer, 1);
}
else
{
/* Try to read PS results if they are available */
communicator_prefetch_pump_state();

neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);

present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
{
/* Prefetch hit */
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
{
return;
}
}

/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
MyNeonCounters->file_cache_hits_total++;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
{
return;
}
}

neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);

/*
* Try to receive prefetch results once again just to make sure we don't
* leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
}
if (debug_compare_local)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
@@ -1508,59 +1587,67 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
nblocks, PG_IOV_MAX);
/* Try to read PS results if they are available */
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state();

memset(read_pages, 0, sizeof(read_pages));

if (neon_enable_new_communicator)
{
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
buffers, nblocks);
}
else
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);

prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);

if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
{
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
{
memset(read_pages, 0, sizeof(read_pages));
}

/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;

if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
{
/* Read all blocks from LFC, so we're done */
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
{
memset(read_pages, 0, sizeof(read_pages));
}

communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);

/*
* Try to receive prefetch results once again just to make sure we don't
* leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
}
if (debug_compare_local)
{
@@ -1661,9 +1748,16 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
if (neon_enable_new_communicator)
{
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
communicator_prefetch_pump_state();
}
if (debug_compare_local)
{
@@ -1724,9 +1818,21 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
if (neon_enable_new_communicator)
{
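/* The new communicator takes one page at a time, each stamped with its own page LSN. */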
for (int i = 0; i < nblocks; i++)
{
XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
}
}
else
{
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state();
}
if (debug_compare_local)
{
@@ -1767,19 +1873,26 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_enable_new_communicator)
{
n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
}
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}

neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);

n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);

update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
}
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -1800,10 +1913,17 @@ neon_dbsize(Oid dbNode)
neon_request_lsns request_lsns;
NRelFileInfo dummy_node = {0};
if (neon_enable_new_communicator)
{
db_size = communicator_new_dbsize(dbNode);
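/* The new communicator computes the database size without an explicit request LSN. */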
}
else
{
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
db_size = communicator_dbsize(dbNode, &request_lsns);
}
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -1817,8 +1937,6 @@ neon_dbsize(Oid dbNode)
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
switch (reln->smgr_relpersistence)
{
case 0:
@@ -1842,34 +1960,43 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_enable_new_communicator)
{
communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks);
}
else
{
XLogRecPtr lsn;

set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);

/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);

/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);

/*
* Truncate may affect several chunks of relations. So we should either
* update last written LSN for all of them, or update LSN for "dummy"
* metadata block. Second approach seems more efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
}
if (debug_compare_local)
{
@@ -1912,7 +2039,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
if (!neon_enable_new_communicator)
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -2112,8 +2240,12 @@ neon_end_unlogged_build(SMgrRelation reln)
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
// FIXME: also do this with the new communicator
if (!neon_enable_new_communicator)
{
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
}
mdclose(reln, forknum);
if (!debug_compare_local)
@@ -2181,7 +2313,10 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsns.not_modified_since = not_modified_since;
request_lsns.effective_request_lsn = request_lsn;
if (neon_enable_new_communicator)
n_blocks = communicator_new_read_slru_segment(kind, segno, buffer);
else
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
return n_blocks;
}
@@ -2218,7 +2353,8 @@ AtEOXact_neon(XactEvent event, void *arg)
}
break;
}
if (!neon_enable_new_communicator)
communicator_reconfigure_timeout_if_needed();
}
static const struct f_smgr neon_smgr =
@@ -2276,7 +2412,10 @@ smgr_init_neon(void)
smgr_init_standard();
neon_init();
if (neon_enable_new_communicator)
communicator_new_init();
else
communicator_init();
}
@@ -2288,6 +2427,20 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
/* This is only used in WAL replay */
Assert(RecoveryInProgress());
if (neon_enable_new_communicator)
{
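/* The new communicator tracks relation sizes itself: query the current size and zero-extend through it if needed. */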
relsize = communicator_new_rel_nblocks(rinfo, forknum);
if (blkno >= relsize)
communicator_new_rel_zeroextend(rinfo, forknum, relsize, (blkno - relsize) + 1, end_recptr);
/*
* FIXME: does this need to update the last-written LSN too, like the
* old implementation?
*/
return;
}
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
@@ -2453,7 +2606,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
* We should perform this check after assigning LwLSN to prevent
* prefetching of some older version of the page by some other backend.
*/
if (neon_enable_new_communicator)
no_redo_needed = communicator_new_cache_contains(rinfo, forknum, blkno);
else
no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
}
LWLockRelease(partitionLock);


@@ -10,6 +10,7 @@
*/
#include "postgres.h"
#include "neon.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
@@ -99,6 +100,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
{
bool found = false;
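/* The legacy relsize cache is only used with the old communicator. */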
Assert(!neon_enable_new_communicator);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -130,6 +133,8 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
void
set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
Assert(!neon_enable_new_communicator);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -178,6 +183,8 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
void
update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
Assert(!neon_enable_new_communicator);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -212,6 +219,8 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
void
forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
{
Assert(!neon_enable_new_communicator);
if (relsize_hash_size > 0)
{
RelTag tag;


@@ -5,8 +5,9 @@ use std::sync::Arc;
use std::time::Duration;
use anyhow::Context;
use compute_api::spec::PageserverProtocol;
use control_plane::endpoint::{
ComputeControlPlane, EndpointStatus, PageserverConnectionInfo, PageserverShardConnectionInfo,
};
use control_plane::local_env::LocalEnv;
use futures::StreamExt;
use hyper::StatusCode;
@@ -506,27 +507,40 @@ impl ApiMethod for ComputeHookTenant {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");
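// Build per-shard connection info, with a libpq URL and, when configured, a gRPC URL.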
let mut shard_conninfos = HashMap::new();
for shard in shards.iter() {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
.expect("Unknown pageserver");
let libpq_url = Some({
let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &ps_conf.listen_grpc_addr {
let (host, port) =
parse_host_port(grpc_addr).expect("invalid gRPC address");
let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
libpq_url,
grpc_url,
};
shard_conninfos.insert(shard.shard_number.0 as u32, pageserver);
}
let pageserver_conninfo = PageserverConnectionInfo {
shards: shard_conninfos,
prefer_grpc: endpoint.grpc,
};
endpoint
.reconfigure_pageservers(pageserver_conninfo, *stripe_size)
.await
.map_err(NotifyError::NeonLocal)?;
}


@@ -4309,7 +4309,18 @@ class Endpoint(PgProtocol, LogUtils):
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines += ["max_replication_write_lag=15MB"]
# If gRPC is enabled, use the new communicator too.
#
# NB: the communicator defaults to enabled, so explicitly force it off otherwise.
#
# XXX: By also checking for None, we enable the new communicator for all tests
# by default.
if grpc or grpc is None:
config_lines += ["neon.enable_new_communicator=on"]
else:
config_lines += ["neon.enable_new_communicator=off"]
# Delete file cache if it exists (and we're recreating the endpoint)
if USE_LFC:
@@ -5367,6 +5378,7 @@ SKIP_FILES = frozenset(
"postmaster.pid",
"pg_control",
"pg_dynshmem",
".metrics.socket",
)
)


@@ -17,7 +17,7 @@ def check_tenant(
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines, grpc=True)
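# grpc=True also enables the new communicator via the Endpoint fixture above.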
# we rely upon autocommit after each statement
res_1 = endpoint.safe_psql_many(
queries=[