mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-31 09:10:38 +00:00
Compare commits
338 Commits
proxy-fix-
...
release-51
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7b860b837c | ||
|
|
41fc96e20f | ||
|
|
fb2b1ce57b | ||
|
|
464717451b | ||
|
|
5cec5cb3cf | ||
|
|
0694ee9531 | ||
|
|
9752ad8489 | ||
|
|
ad6f538aef | ||
|
|
1aa159acca | ||
|
|
60f30000ef | ||
|
|
bc1efa827f | ||
|
|
67522ce83d | ||
|
|
7d32af5ad5 | ||
|
|
59b6cce418 | ||
|
|
bf187aa13f | ||
|
|
22c26d610b | ||
|
|
516f793ab4 | ||
|
|
6443dbef90 | ||
|
|
23416cc358 | ||
|
|
46098ea0ea | ||
|
|
49bc734e02 | ||
|
|
76c44dc140 | ||
|
|
58ef78cf41 | ||
|
|
678ed39de2 | ||
|
|
3d8830ac35 | ||
|
|
38767ace68 | ||
|
|
9fe0193e51 | ||
|
|
8075f0965a | ||
|
|
44f42627dd | ||
|
|
3bd6551b36 | ||
|
|
69338e53e3 | ||
|
|
5309711691 | ||
|
|
8a53d576e6 | ||
|
|
b0aff04157 | ||
|
|
0554bee022 | ||
|
|
83855a907c | ||
|
|
1b41db8bdd | ||
|
|
bac06ea1ac | ||
|
|
7ae8364b0b | ||
|
|
1f7d54f987 | ||
|
|
580e136b2e | ||
|
|
09699d4bd8 | ||
|
|
89cf714890 | ||
|
|
621ea2ec44 | ||
|
|
74d09b78c7 | ||
|
|
0cf0731d8b | ||
|
|
98723844ee | ||
|
|
73a8c97ac8 | ||
|
|
17a3c9036e | ||
|
|
8c5b310090 | ||
|
|
8224580f3e | ||
|
|
2b0f3549f7 | ||
|
|
b4972d07d4 | ||
|
|
26ae7b0b3e | ||
|
|
c6ed86d3d0 | ||
|
|
f0a9017008 | ||
|
|
f8483cc4a3 | ||
|
|
cc5d6c66b3 | ||
|
|
d894d2b450 | ||
|
|
b09d686335 | ||
|
|
74d24582cf | ||
|
|
4834d22d2d | ||
|
|
86e8c43ddf | ||
|
|
bb7949ba00 | ||
|
|
1df0f69664 | ||
|
|
970066a914 | ||
|
|
1ebd3897c0 | ||
|
|
6460beffcd | ||
|
|
6f7f8958db | ||
|
|
936a00e077 | ||
|
|
96a4e8de66 | ||
|
|
01180666b0 | ||
|
|
6c94269c32 | ||
|
|
edc691647d | ||
|
|
855d7b4781 | ||
|
|
c49c9707ce | ||
|
|
2227540a0d | ||
|
|
f1347f2417 | ||
|
|
30b295b017 | ||
|
|
1cef395266 | ||
|
|
78d160f76d | ||
|
|
b9238059d6 | ||
|
|
d0cb4b88c8 | ||
|
|
1ec3e39d4e | ||
|
|
a1a74eef2c | ||
|
|
90e689adda | ||
|
|
f0b2d4b053 | ||
|
|
299d9474c9 | ||
|
|
7234208b36 | ||
|
|
93450f11f5 | ||
|
|
2f0f9edf33 | ||
|
|
d424f2b7c8 | ||
|
|
21315e80bc | ||
|
|
483b66d383 | ||
|
|
aa72a22661 | ||
|
|
5c0264b591 | ||
|
|
9f13277729 | ||
|
|
54aa319805 | ||
|
|
4a227484bf | ||
|
|
2f83f85291 | ||
|
|
d6cfcb0d93 | ||
|
|
392843ad2a | ||
|
|
bd4dae8f4a | ||
|
|
b05fe53cfd | ||
|
|
c13a2f0df1 | ||
|
|
39be366fc5 | ||
|
|
6eda0a3158 | ||
|
|
306c7a1813 | ||
|
|
80be423a58 | ||
|
|
5dcfef82f2 | ||
|
|
e67b8f69c0 | ||
|
|
e546872ab4 | ||
|
|
322ea1cf7c | ||
|
|
3633742de9 | ||
|
|
079d3a37ba | ||
|
|
a46e77b476 | ||
|
|
a92702b01e | ||
|
|
8ff3253f20 | ||
|
|
04b82c92a7 | ||
|
|
e5bf423e68 | ||
|
|
60af392e45 | ||
|
|
661fc41e71 | ||
|
|
702c488f32 | ||
|
|
45c5122754 | ||
|
|
558394f710 | ||
|
|
73b0898608 | ||
|
|
e65be4c2dc | ||
|
|
40087b8164 | ||
|
|
c762b59483 | ||
|
|
5d71601ca9 | ||
|
|
a113c3e433 | ||
|
|
e81fc598f4 | ||
|
|
48b845fa76 | ||
|
|
27096858dc | ||
|
|
4430d0ae7d | ||
|
|
6e183aa0de | ||
|
|
fd6d0b7635 | ||
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
5
.github/workflows/build_and_test.yml
vendored
5
.github/workflows/build_and_test.yml
vendored
@@ -461,6 +461,7 @@ jobs:
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
test_selection: regress
|
||||
@@ -474,7 +475,7 @@ jobs:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_GET_VECTORED_IMPL: vectored
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
@@ -554,7 +555,7 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
|
||||
8
Cargo.lock
generated
8
Cargo.lock
generated
@@ -282,8 +282,10 @@ dependencies = [
|
||||
"control_plane",
|
||||
"diesel",
|
||||
"diesel_migrations",
|
||||
"fail",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
@@ -1344,6 +1346,7 @@ dependencies = [
|
||||
"futures",
|
||||
"git-version",
|
||||
"hex",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"nix 0.27.1",
|
||||
"once_cell",
|
||||
@@ -3527,6 +3530,7 @@ dependencies = [
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"procfs",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
@@ -5884,7 +5888,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6421,7 +6425,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
|
||||
2
Makefile
2
Makefile
@@ -51,7 +51,7 @@ CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Set PQ_LIB_DIR to make sure `attachment_service` get linked with bundled libpq (through diesel)
|
||||
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
|
||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||
|
||||
#
|
||||
|
||||
@@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop
|
||||
|
||||
## Running tests
|
||||
|
||||
### Rust unit tests
|
||||
|
||||
We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
|
||||
Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
|
||||
You can install `cargo-nextest` with `cargo install cargo-nextest`.
|
||||
|
||||
### Integration tests
|
||||
|
||||
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
|
||||
|
||||
```sh
|
||||
|
||||
@@ -2,6 +2,8 @@ disallowed-methods = [
|
||||
"tokio::task::block_in_place",
|
||||
# Allow this for now, to deny it later once we stop using Handle::block_on completely
|
||||
# "tokio::runtime::Handle::block_on",
|
||||
# use tokio_epoll_uring_ext instead
|
||||
"tokio_epoll_uring::thread_local_system",
|
||||
]
|
||||
|
||||
disallowed-macros = [
|
||||
|
||||
@@ -17,6 +17,7 @@ use chrono::{DateTime, Utc};
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use nix::unistd::Pid;
|
||||
use postgres::error::SqlState;
|
||||
use postgres::{Client, NoTls};
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
@@ -722,8 +723,12 @@ impl ComputeNode {
|
||||
// Stop it when it's ready
|
||||
info!("waiting for postgres");
|
||||
wait_for_postgres(&mut pg, Path::new(pgdata))?;
|
||||
pg.kill()?;
|
||||
info!("sent kill signal");
|
||||
// SIGQUIT orders postgres to exit immediately. We don't want to SIGKILL
|
||||
// it to avoid orphaned processes prowling around while datadir is
|
||||
// wiped.
|
||||
let pm_pid = Pid::from_raw(pg.id() as i32);
|
||||
kill(pm_pid, Signal::SIGQUIT)?;
|
||||
info!("sent SIGQUIT signal");
|
||||
pg.wait()?;
|
||||
info!("done prewarming");
|
||||
|
||||
|
||||
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
RoleAction::Create => {
|
||||
// This branch only runs when roles are created through the console, so it is
|
||||
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
|
||||
// from neon_superuser.
|
||||
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("running role create query: '{}'", &query);
|
||||
@@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
|
||||
// which may happen in two cases:
|
||||
// - extension was just installed
|
||||
// - extension was already installed and is up to date
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade");
|
||||
let query = "ALTER EXTENSION neon UPDATE";
|
||||
info!("update neon extension version with query: {}", query);
|
||||
client.simple_query(query)?;
|
||||
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
|
||||
info!("handle neon extension upgrade (not really)");
|
||||
// DISABLED due to compute node unpinning epic
|
||||
// let query = "ALTER EXTENSION neon UPDATE";
|
||||
// info!("update neon extension version with query: {}", query);
|
||||
// client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -805,6 +807,18 @@ $$;"#,
|
||||
"",
|
||||
"",
|
||||
// Add new migrations below.
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
role_name TEXT;
|
||||
BEGIN
|
||||
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
|
||||
LOOP
|
||||
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
|
||||
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
|
||||
END LOOP;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
|
||||
@@ -12,6 +12,7 @@ clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
@@ -19,8 +19,10 @@ aws-config.workspace = true
|
||||
aws-sdk-secretsmanager.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
hyper.workspace = true
|
||||
humantime.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
227
control_plane/attachment_service/src/heartbeater.rs
Normal file
227
control_plane/attachment_service/src/heartbeater.rs
Normal file
@@ -0,0 +1,227 @@
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::Arc,
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use pageserver_api::{
|
||||
controller_api::{NodeAvailability, UtilizationScore},
|
||||
models::PageserverUtilization,
|
||||
};
|
||||
|
||||
use thiserror::Error;
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::node::Node;
|
||||
|
||||
struct HeartbeaterTask {
|
||||
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||
cancel: CancellationToken,
|
||||
|
||||
state: HashMap<NodeId, PageserverState>,
|
||||
|
||||
max_unavailable_interval: Duration,
|
||||
jwt_token: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) enum PageserverState {
|
||||
Available {
|
||||
last_seen_at: Instant,
|
||||
utilization: PageserverUtilization,
|
||||
},
|
||||
Offline,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub(crate) enum HeartbeaterError {
|
||||
#[error("Cancelled")]
|
||||
Cancel,
|
||||
}
|
||||
|
||||
struct HeartbeatRequest {
|
||||
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
|
||||
}
|
||||
|
||||
pub(crate) struct Heartbeater {
|
||||
sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
|
||||
}
|
||||
|
||||
impl Heartbeater {
|
||||
pub(crate) fn new(
|
||||
jwt_token: Option<String>,
|
||||
max_unavailable_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
|
||||
let mut heartbeater =
|
||||
HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
|
||||
tokio::task::spawn(async move { heartbeater.run().await });
|
||||
|
||||
Self { sender }
|
||||
}
|
||||
|
||||
pub(crate) async fn heartbeat(
|
||||
&self,
|
||||
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
) -> Result<AvailablityDeltas, HeartbeaterError> {
|
||||
let (sender, receiver) = tokio::sync::oneshot::channel();
|
||||
self.sender
|
||||
.send(HeartbeatRequest {
|
||||
pageservers,
|
||||
reply: sender,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
receiver.await.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl HeartbeaterTask {
|
||||
fn new(
|
||||
receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
|
||||
jwt_token: Option<String>,
|
||||
max_unavailable_interval: Duration,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
Self {
|
||||
receiver,
|
||||
cancel,
|
||||
state: HashMap::new(),
|
||||
max_unavailable_interval,
|
||||
jwt_token,
|
||||
}
|
||||
}
|
||||
|
||||
async fn run(&mut self) {
|
||||
loop {
|
||||
tokio::select! {
|
||||
request = self.receiver.recv() => {
|
||||
match request {
|
||||
Some(req) => {
|
||||
let res = self.heartbeat(req.pageservers).await;
|
||||
req.reply.send(res).unwrap();
|
||||
},
|
||||
None => { return; }
|
||||
}
|
||||
},
|
||||
_ = self.cancel.cancelled() => return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn heartbeat(
|
||||
&mut self,
|
||||
pageservers: Arc<HashMap<NodeId, Node>>,
|
||||
) -> Result<AvailablityDeltas, HeartbeaterError> {
|
||||
let mut new_state = HashMap::new();
|
||||
|
||||
let mut heartbeat_futs = FuturesUnordered::new();
|
||||
for (node_id, node) in &*pageservers {
|
||||
heartbeat_futs.push({
|
||||
let jwt_token = self.jwt_token.clone();
|
||||
let cancel = self.cancel.clone();
|
||||
|
||||
// Clone the node and mark it as available such that the request
|
||||
// goes through to the pageserver even when the node is marked offline.
|
||||
// This doesn't impact the availability observed by [`crate::service::Service`].
|
||||
let mut node = node.clone();
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
|
||||
async move {
|
||||
let response = node
|
||||
.with_client_retries(
|
||||
|client| async move { client.get_utilization().await },
|
||||
&jwt_token,
|
||||
2,
|
||||
3,
|
||||
Duration::from_secs(1),
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
let response = match response {
|
||||
Some(r) => r,
|
||||
None => {
|
||||
// This indicates cancellation of the request.
|
||||
// We ignore the node in this case.
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let status = if let Ok(utilization) = response {
|
||||
PageserverState::Available {
|
||||
last_seen_at: Instant::now(),
|
||||
utilization,
|
||||
}
|
||||
} else {
|
||||
PageserverState::Offline
|
||||
};
|
||||
|
||||
Some((*node_id, status))
|
||||
}
|
||||
});
|
||||
|
||||
loop {
|
||||
let maybe_status = tokio::select! {
|
||||
next = heartbeat_futs.next() => {
|
||||
match next {
|
||||
Some(result) => result,
|
||||
None => { break; }
|
||||
}
|
||||
},
|
||||
_ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
|
||||
};
|
||||
|
||||
if let Some((node_id, status)) = maybe_status {
|
||||
new_state.insert(node_id, status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut deltas = Vec::new();
|
||||
let now = Instant::now();
|
||||
for (node_id, ps_state) in new_state {
|
||||
use std::collections::hash_map::Entry::*;
|
||||
let entry = self.state.entry(node_id);
|
||||
|
||||
let mut needs_update = false;
|
||||
match entry {
|
||||
Occupied(ref occ) => match (occ.get(), &ps_state) {
|
||||
(PageserverState::Offline, PageserverState::Offline) => {}
|
||||
(PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
|
||||
if now - *last_seen_at >= self.max_unavailable_interval {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
needs_update = true;
|
||||
}
|
||||
},
|
||||
Vacant(_) => {
|
||||
deltas.push((node_id, ps_state.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
match entry {
|
||||
Occupied(mut occ) if needs_update => {
|
||||
(*occ.get_mut()) = ps_state;
|
||||
}
|
||||
Vacant(vac) => {
|
||||
vac.insert(ps_state);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(AvailablityDeltas(deltas))
|
||||
}
|
||||
}
|
||||
@@ -10,9 +10,11 @@ use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::auth::{Scope, SwappableJwtAuth};
|
||||
use utils::failpoint_support::failpoints_handler;
|
||||
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
|
||||
use utils::http::request::{must_get_query_param, parse_request_param};
|
||||
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
@@ -26,11 +28,11 @@ use utils::{
|
||||
};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
|
||||
};
|
||||
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
|
||||
|
||||
use control_plane::attachment_service::{AttachHookRequest, InspectRequest};
|
||||
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
|
||||
|
||||
/// State available to HTTP request handlers
|
||||
#[derive(Clone)]
|
||||
@@ -174,14 +176,14 @@ async fn handle_tenant_location_config(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
|
||||
check_permissions(&req, Scope::PageServerApi)?;
|
||||
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_location_config(tenant_id, config_req)
|
||||
.tenant_location_config(tenant_shard_id, config_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
@@ -246,8 +248,10 @@ async fn handle_tenant_secondary_download(
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
service.tenant_secondary_download(tenant_id).await?;
|
||||
json_response(StatusCode::OK, ())
|
||||
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
|
||||
|
||||
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
@@ -387,7 +391,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
state.service.node_configure(config_req).await?,
|
||||
state
|
||||
.service
|
||||
.node_configure(
|
||||
config_req.node_id,
|
||||
config_req.availability.map(NodeAvailability::from),
|
||||
config_req.scheduling,
|
||||
)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -554,6 +565,9 @@ pub fn make_router(
|
||||
.post("/debug/v1/consistency_check", |r| {
|
||||
request_span(r, handle_consistency_check)
|
||||
})
|
||||
.put("/debug/v1/failpoints", |r| {
|
||||
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
|
||||
})
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
@@ -587,7 +601,7 @@ pub fn make_router(
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_config_get)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
|
||||
|
||||
54
control_plane/attachment_service/src/id_lock_map.rs
Normal file
54
control_plane/attachment_service/src/id_lock_map.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
|
||||
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
|
||||
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
|
||||
/// is needed at a tenant-wide granularity.
|
||||
pub(crate) struct IdLockMap<T>
|
||||
where
|
||||
T: Eq + PartialEq + std::hash::Hash,
|
||||
{
|
||||
/// A synchronous lock for getting/setting the async locks that our callers will wait on.
|
||||
entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
|
||||
}
|
||||
|
||||
impl<T> IdLockMap<T>
|
||||
where
|
||||
T: Eq + PartialEq + std::hash::Hash,
|
||||
{
|
||||
pub(crate) fn shared(
|
||||
&self,
|
||||
key: T,
|
||||
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
let entry = locked.entry(key).or_default();
|
||||
entry.clone().read_owned()
|
||||
}
|
||||
|
||||
pub(crate) fn exclusive(
|
||||
&self,
|
||||
key: T,
|
||||
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
let entry = locked.entry(key).or_default();
|
||||
entry.clone().write_owned()
|
||||
}
|
||||
|
||||
/// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
|
||||
/// periodic housekeeping to avoid the map growing indefinitely
|
||||
pub(crate) fn housekeeping(&self) {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
locked.retain(|_k, lock| lock.try_write().is_err())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Default for IdLockMap<T>
|
||||
where
|
||||
T: Eq + PartialEq + std::hash::Hash,
|
||||
{
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
entities: std::sync::Mutex::new(HashMap::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3,7 +3,9 @@ use utils::seqwait::MonotonicCounter;
|
||||
|
||||
mod auth;
|
||||
mod compute_hook;
|
||||
mod heartbeater;
|
||||
pub mod http;
|
||||
mod id_lock_map;
|
||||
pub mod metrics;
|
||||
mod node;
|
||||
pub mod persistence;
|
||||
|
||||
@@ -1,14 +1,8 @@
|
||||
/// The attachment service mimics the aspects of the control plane API
|
||||
/// that are required for a pageserver to operate.
|
||||
///
|
||||
/// This enables running & testing pageservers without a full-blown
|
||||
/// deployment of the Neon cloud platform.
|
||||
///
|
||||
use anyhow::{anyhow, Context};
|
||||
use attachment_service::http::make_router;
|
||||
use attachment_service::metrics::preinitialize_metrics;
|
||||
use attachment_service::persistence::Persistence;
|
||||
use attachment_service::service::{Config, Service};
|
||||
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
|
||||
use aws_config::{BehaviorVersion, Region};
|
||||
use camino::Utf8PathBuf;
|
||||
use clap::Parser;
|
||||
@@ -60,6 +54,10 @@ struct Cli {
|
||||
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
|
||||
#[arg(long)]
|
||||
database_url: Option<String>,
|
||||
|
||||
/// Grace period before marking unresponsive pageserver offline
|
||||
#[arg(long)]
|
||||
max_unavailable_interval: Option<humantime::Duration>,
|
||||
}
|
||||
|
||||
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
|
||||
@@ -212,6 +210,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let default_panic = std::panic::take_hook();
|
||||
std::panic::set_hook(Box::new(move |info| {
|
||||
default_panic(info);
|
||||
std::process::exit(1);
|
||||
}));
|
||||
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
// We use spawn_blocking for database operations, so require approximately
|
||||
// as many blocking threads as we will open database connections.
|
||||
@@ -249,6 +253,10 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
jwt_token: secrets.jwt_token,
|
||||
control_plane_jwt_token: secrets.control_plane_jwt_token,
|
||||
compute_hook_url: args.compute_hook_url,
|
||||
max_unavailable_interval: args
|
||||
.max_unavailable_interval
|
||||
.map(humantime::Duration::into)
|
||||
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
|
||||
};
|
||||
|
||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||
|
||||
@@ -12,7 +12,7 @@ use serde::Serialize;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{backoff, id::NodeId};
|
||||
|
||||
use crate::persistence::NodePersistence;
|
||||
use crate::{persistence::NodePersistence, scheduler::MaySchedule};
|
||||
|
||||
/// Represents the in-memory description of a Node.
|
||||
///
|
||||
@@ -83,29 +83,38 @@ impl Node {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_availability(
|
||||
&mut self,
|
||||
availability: NodeAvailability,
|
||||
) -> AvailabilityTransition {
|
||||
use NodeAvailability::*;
|
||||
let transition = match (self.availability, availability) {
|
||||
(Offline, Active) => {
|
||||
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
|
||||
match self.get_availability_transition(availability) {
|
||||
AvailabilityTransition::ToActive => {
|
||||
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
|
||||
// users of previously-cloned copies of the node will still see the old cancellation
|
||||
// state. For example, Reconcilers in flight will have to complete and be spawned
|
||||
// again to realize that the node has become available.
|
||||
self.cancel = CancellationToken::new();
|
||||
AvailabilityTransition::ToActive
|
||||
}
|
||||
(Active, Offline) => {
|
||||
AvailabilityTransition::ToOffline => {
|
||||
// Fire the node's cancellation token to cancel any in-flight API requests to it
|
||||
self.cancel.cancel();
|
||||
AvailabilityTransition::ToOffline
|
||||
}
|
||||
_ => AvailabilityTransition::Unchanged,
|
||||
};
|
||||
AvailabilityTransition::Unchanged => {}
|
||||
}
|
||||
self.availability = availability;
|
||||
transition
|
||||
}
|
||||
|
||||
/// Without modifying the availability of the node, convert the intended availability
|
||||
/// into a description of the transition.
|
||||
pub(crate) fn get_availability_transition(
|
||||
&self,
|
||||
availability: NodeAvailability,
|
||||
) -> AvailabilityTransition {
|
||||
use AvailabilityTransition::*;
|
||||
use NodeAvailability::*;
|
||||
|
||||
match (self.availability, availability) {
|
||||
(Offline, Active(_)) => ToActive,
|
||||
(Active(_), Offline) => ToOffline,
|
||||
_ => Unchanged,
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether we may send API requests to this node.
|
||||
@@ -114,21 +123,21 @@ impl Node {
|
||||
// a reference to the original Node's cancellation status. Checking both of these results
|
||||
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
|
||||
// when we cloned it, or if the original Node instance's cancellation token was fired.
|
||||
matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
|
||||
matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
|
||||
}
|
||||
|
||||
/// Is this node elegible to have work scheduled onto it?
|
||||
pub(crate) fn may_schedule(&self) -> bool {
|
||||
match self.availability {
|
||||
NodeAvailability::Active => {}
|
||||
NodeAvailability::Offline => return false,
|
||||
}
|
||||
pub(crate) fn may_schedule(&self) -> MaySchedule {
|
||||
let score = match self.availability {
|
||||
NodeAvailability::Active(score) => score,
|
||||
NodeAvailability::Offline => return MaySchedule::No,
|
||||
};
|
||||
|
||||
match self.scheduling {
|
||||
NodeSchedulingPolicy::Active => true,
|
||||
NodeSchedulingPolicy::Draining => false,
|
||||
NodeSchedulingPolicy::Filling => true,
|
||||
NodeSchedulingPolicy::Pause => false,
|
||||
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Draining => MaySchedule::No,
|
||||
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
|
||||
NodeSchedulingPolicy::Pause => MaySchedule::No,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -146,8 +155,7 @@ impl Node {
|
||||
listen_pg_addr,
|
||||
listen_pg_port,
|
||||
scheduling: NodeSchedulingPolicy::Filling,
|
||||
// TODO: we shouldn't really call this Active until we've heartbeated it.
|
||||
availability: NodeAvailability::Active,
|
||||
availability: NodeAvailability::Offline,
|
||||
cancel: CancellationToken::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,9 @@ use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::ShardConfigError;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::ShardStripeSize;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::generation::Generation;
|
||||
@@ -20,7 +23,7 @@ use crate::node::Node;
|
||||
|
||||
/// ## What do we store?
|
||||
///
|
||||
/// The attachment service does not store most of its state durably.
|
||||
/// The storage controller service does not store most of its state durably.
|
||||
///
|
||||
/// The essential things to store durably are:
|
||||
/// - generation numbers, as these must always advance monotonically to ensure data safety.
|
||||
@@ -34,7 +37,7 @@ use crate::node::Node;
|
||||
///
|
||||
/// ## Performance/efficiency
|
||||
///
|
||||
/// The attachment service does not go via the database for most things: there are
|
||||
/// The storage controller service does not go via the database for most things: there are
|
||||
/// a couple of places where we must, and where efficiency matters:
|
||||
/// - Incrementing generation numbers: the Reconciler has to wait for this to complete
|
||||
/// before it can attach a tenant, so this acts as a bound on how fast things like
|
||||
@@ -72,6 +75,14 @@ pub(crate) enum DatabaseError {
|
||||
Logical(String),
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
pub(crate) enum AbortShardSplitStatus {
|
||||
/// We aborted the split in the database by reverting to the parent shards
|
||||
Aborted,
|
||||
/// The split had already been persisted.
|
||||
Complete,
|
||||
}
|
||||
|
||||
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
|
||||
|
||||
impl Persistence {
|
||||
@@ -570,6 +581,51 @@ impl Persistence {
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Used when the remote part of a shard split failed: we will revert the database state to have only
|
||||
/// the parent shards, with SplitState::Idle.
|
||||
pub(crate) async fn abort_shard_split(
|
||||
&self,
|
||||
split_tenant_id: TenantId,
|
||||
new_shard_count: ShardCount,
|
||||
) -> DatabaseResult<AbortShardSplitStatus> {
|
||||
use crate::schema::tenant_shards::dsl::*;
|
||||
self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
|
||||
// Clear the splitting state on parent shards
|
||||
let updated = diesel::update(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.ne(new_shard_count.literal() as i32))
|
||||
.set((splitting.eq(0),))
|
||||
.execute(conn)?;
|
||||
|
||||
// Parent shards are already gone: we cannot abort.
|
||||
if updated == 0 {
|
||||
return Ok(AbortShardSplitStatus::Complete);
|
||||
}
|
||||
|
||||
// Sanity check: if parent shards were present, their cardinality should
|
||||
// be less than the number of child shards.
|
||||
if updated >= new_shard_count.count() as usize {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"Unexpected parent shard count {updated} while aborting split to \
|
||||
count {new_shard_count:?} on tenant {split_tenant_id}"
|
||||
)));
|
||||
}
|
||||
|
||||
// Erase child shards
|
||||
diesel::delete(tenant_shards)
|
||||
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||
.filter(shard_count.eq(new_shard_count.literal() as i32))
|
||||
.execute(conn)?;
|
||||
|
||||
Ok(AbortShardSplitStatus::Aborted)
|
||||
})?;
|
||||
|
||||
Ok(aborted)
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
|
||||
@@ -604,6 +660,28 @@ pub(crate) struct TenantShardPersistence {
|
||||
pub(crate) config: String,
|
||||
}
|
||||
|
||||
impl TenantShardPersistence {
|
||||
pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
|
||||
if self.shard_count == 0 {
|
||||
Ok(ShardIdentity::unsharded())
|
||||
} else {
|
||||
Ok(ShardIdentity::new(
|
||||
ShardNumber(self.shard_number as u8),
|
||||
ShardCount::new(self.shard_count as u8),
|
||||
ShardStripeSize(self.shard_stripe_size as u32),
|
||||
)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
|
||||
Ok(TenantShardId {
|
||||
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(self.shard_number as u8),
|
||||
shard_count: ShardCount::new(self.shard_count as u8),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of [`crate::node::Node`] that are stored durably
|
||||
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
|
||||
#[diesel(table_name = crate::schema::nodes)]
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::persistence::Persistence;
|
||||
use crate::service;
|
||||
use hyper::StatusCode;
|
||||
use pageserver_api::models::{
|
||||
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
|
||||
};
|
||||
@@ -7,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TimelineId};
|
||||
@@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError};
|
||||
use crate::node::Node;
|
||||
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
|
||||
|
||||
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
|
||||
|
||||
/// Object with the lifetime of the background reconcile task that is created
|
||||
/// for tenants which have a difference between their intent and observed states.
|
||||
pub(super) struct Reconciler {
|
||||
@@ -255,22 +258,81 @@ impl Reconciler {
|
||||
tenant_shard_id: TenantShardId,
|
||||
node: &Node,
|
||||
) -> Result<(), ReconcileError> {
|
||||
match node
|
||||
.with_client_retries(
|
||||
|client| async move { client.tenant_secondary_download(tenant_shard_id).await },
|
||||
&self.service_config.jwt_token,
|
||||
1,
|
||||
1,
|
||||
Duration::from_secs(60),
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
None => Err(ReconcileError::Cancel),
|
||||
Some(Ok(_)) => Ok(()),
|
||||
Some(Err(e)) => {
|
||||
tracing::info!(" (skipping destination download: {})", e);
|
||||
Ok(())
|
||||
// This is not the timeout for a request, but the total amount of time we're willing to wait
|
||||
// for a secondary location to get up to date before
|
||||
const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
|
||||
|
||||
// This the long-polling interval for the secondary download requests we send to destination pageserver
|
||||
// during a migration.
|
||||
const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
let started_at = Instant::now();
|
||||
|
||||
loop {
|
||||
let (status, progress) = match node
|
||||
.with_client_retries(
|
||||
|client| async move {
|
||||
client
|
||||
.tenant_secondary_download(
|
||||
tenant_shard_id,
|
||||
Some(REQUEST_DOWNLOAD_TIMEOUT),
|
||||
)
|
||||
.await
|
||||
},
|
||||
&self.service_config.jwt_token,
|
||||
1,
|
||||
3,
|
||||
REQUEST_DOWNLOAD_TIMEOUT * 2,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
None => Err(ReconcileError::Cancel),
|
||||
Some(Ok(v)) => Ok(v),
|
||||
Some(Err(e)) => {
|
||||
// Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
|
||||
// attaching, but we should not let an issue with a secondary location stop us proceeding
|
||||
// with a live migration.
|
||||
tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
|
||||
return Ok(());
|
||||
}
|
||||
}?;
|
||||
|
||||
if status == StatusCode::OK {
|
||||
tracing::info!(
|
||||
"Downloads to {} complete: {}/{} layers, {}/{} bytes",
|
||||
node,
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
return Ok(());
|
||||
} else if status == StatusCode::ACCEPTED {
|
||||
let total_runtime = started_at.elapsed();
|
||||
if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
|
||||
tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
|
||||
total_runtime.as_millis(),
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
// Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
|
||||
// it just makes the I/O performance for users less good.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
|
||||
// to the pageserver is a long-poll.
|
||||
tracing::info!(
|
||||
"Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
|
||||
node,
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -485,17 +547,29 @@ impl Reconciler {
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Ok(observed)) => observed,
|
||||
Some(Ok(observed)) => Some(observed),
|
||||
Some(Err(mgmt_api::Error::ApiError(status, _msg)))
|
||||
if status == StatusCode::NOT_FOUND =>
|
||||
{
|
||||
None
|
||||
}
|
||||
Some(Err(e)) => return Err(e.into()),
|
||||
None => return Err(ReconcileError::Cancel),
|
||||
};
|
||||
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
|
||||
self.observed.locations.insert(
|
||||
attached_node.get_id(),
|
||||
ObservedStateLocation {
|
||||
conf: observed_conf,
|
||||
},
|
||||
);
|
||||
match observed_conf {
|
||||
Some(conf) => {
|
||||
// Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
|
||||
// if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
|
||||
self.observed
|
||||
.locations
|
||||
.insert(attached_node.get_id(), ObservedStateLocation { conf });
|
||||
}
|
||||
None => {
|
||||
// Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
|
||||
self.observed.locations.remove(&attached_node.get_id());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -525,7 +599,12 @@ impl Reconciler {
|
||||
)));
|
||||
};
|
||||
|
||||
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
let mut wanted_conf = attached_location_conf(
|
||||
generation,
|
||||
&self.shard,
|
||||
&self.config,
|
||||
!self.intent.secondary.is_empty(),
|
||||
);
|
||||
match self.observed.locations.get(&node.get_id()) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
// Nothing to do
|
||||
@@ -662,10 +741,26 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
/// We tweak the externally-set TenantConfig while configuring
|
||||
/// locations, using our awareness of whether secondary locations
|
||||
/// are in use to automatically enable/disable heatmap uploads.
|
||||
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
|
||||
let mut config = config.clone();
|
||||
if has_secondaries {
|
||||
if config.heatmap_period.is_none() {
|
||||
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
|
||||
}
|
||||
} else {
|
||||
config.heatmap_period = None;
|
||||
}
|
||||
config
|
||||
}
|
||||
|
||||
pub(crate) fn attached_location_conf(
|
||||
generation: Generation,
|
||||
shard: &ShardIdentity,
|
||||
config: &TenantConfig,
|
||||
has_secondaries: bool,
|
||||
) -> LocationConfig {
|
||||
LocationConfig {
|
||||
mode: LocationConfigMode::AttachedSingle,
|
||||
@@ -674,7 +769,7 @@ pub(crate) fn attached_location_conf(
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
tenant_conf: ha_aware_config(config, has_secondaries),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -689,6 +784,6 @@ pub(crate) fn secondary_location_conf(
|
||||
shard_number: shard.number.0,
|
||||
shard_count: shard.count.literal(),
|
||||
shard_stripe_size: shard.stripe_size.0,
|
||||
tenant_conf: config.clone(),
|
||||
tenant_conf: ha_aware_config(config, true),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::{node::Node, tenant_state::TenantState};
|
||||
use pageserver_api::controller_api::UtilizationScore;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use utils::{http::error::ApiError, id::NodeId};
|
||||
@@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError {
|
||||
}
|
||||
|
||||
#[derive(Serialize, Eq, PartialEq)]
|
||||
pub enum MaySchedule {
|
||||
Yes(UtilizationScore),
|
||||
No,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct SchedulerNode {
|
||||
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
|
||||
shard_count: usize,
|
||||
|
||||
/// Whether this node is currently elegible to have new shards scheduled (this is derived
|
||||
/// from a node's availability state and scheduling policy).
|
||||
may_schedule: bool,
|
||||
may_schedule: MaySchedule,
|
||||
}
|
||||
|
||||
impl PartialEq for SchedulerNode {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
let may_schedule_matches = matches!(
|
||||
(&self.may_schedule, &other.may_schedule),
|
||||
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
|
||||
);
|
||||
|
||||
may_schedule_matches && self.shard_count == other.shard_count
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for SchedulerNode {}
|
||||
|
||||
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
|
||||
/// on which to run.
|
||||
///
|
||||
@@ -186,13 +206,15 @@ impl Scheduler {
|
||||
return None;
|
||||
}
|
||||
|
||||
// TODO: When the utilization score returned by the pageserver becomes meaningful,
|
||||
// schedule based on that instead of the shard count.
|
||||
let node = nodes
|
||||
.iter()
|
||||
.map(|node_id| {
|
||||
let may_schedule = self
|
||||
.nodes
|
||||
.get(node_id)
|
||||
.map(|n| n.may_schedule)
|
||||
.map(|n| n.may_schedule != MaySchedule::No)
|
||||
.unwrap_or(false);
|
||||
(*node_id, may_schedule)
|
||||
})
|
||||
@@ -211,7 +233,7 @@ impl Scheduler {
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
if hard_exclude.contains(k) || !v.may_schedule {
|
||||
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
|
||||
None
|
||||
} else {
|
||||
Some((*k, v.shard_count))
|
||||
@@ -230,7 +252,7 @@ impl Scheduler {
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule,
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
@@ -255,6 +277,7 @@ impl Scheduler {
|
||||
pub(crate) mod test_utils {
|
||||
|
||||
use crate::node::Node;
|
||||
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
|
||||
use std::collections::HashMap;
|
||||
use utils::id::NodeId;
|
||||
/// Test helper: synthesize the requested number of nodes, all in active state.
|
||||
@@ -264,13 +287,14 @@ pub(crate) mod test_utils {
|
||||
(1..n + 1)
|
||||
.map(|i| {
|
||||
(NodeId(i), {
|
||||
let node = Node::new(
|
||||
let mut node = Node::new(
|
||||
NodeId(i),
|
||||
format!("httphost-{i}"),
|
||||
80 + i as u16,
|
||||
format!("pghost-{i}"),
|
||||
5432 + i as u16,
|
||||
);
|
||||
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
|
||||
assert!(node.is_available());
|
||||
node
|
||||
})
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -577,7 +577,12 @@ impl TenantState {
|
||||
.generation
|
||||
.expect("Attempted to enter attached state without a generation");
|
||||
|
||||
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
let wanted_conf = attached_location_conf(
|
||||
generation,
|
||||
&self.shard,
|
||||
&self.config,
|
||||
!self.intent.secondary.is_empty(),
|
||||
);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
@@ -617,7 +622,7 @@ impl TenantState {
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
|
||||
pub(crate) fn maybe_reconcile(
|
||||
&mut self,
|
||||
result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
|
||||
pageservers: &Arc<HashMap<NodeId, Node>>,
|
||||
compute_hook: &Arc<ComputeHook>,
|
||||
service_config: &service::Config,
|
||||
@@ -729,6 +734,7 @@ impl TenantState {
|
||||
tenant_id=%reconciler.tenant_shard_id.tenant_id,
|
||||
shard_id=%reconciler.tenant_shard_id.shard_slug());
|
||||
metrics::RECONCILER.spawned.inc();
|
||||
let result_tx = result_tx.clone();
|
||||
let join_handle = tokio::task::spawn(
|
||||
async move {
|
||||
// Wait for any previous reconcile task to complete before we start
|
||||
|
||||
@@ -8,11 +8,11 @@
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
|
||||
use compute_api::spec::ComputeMode;
|
||||
use control_plane::attachment_service::AttachmentService;
|
||||
use control_plane::endpoint::ComputeControlPlane;
|
||||
use control_plane::local_env::{InitForceMode, LocalEnv};
|
||||
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::StorageController;
|
||||
use control_plane::{broker, local_env};
|
||||
use pageserver_api::controller_api::{
|
||||
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
@@ -138,7 +138,7 @@ fn main() -> Result<()> {
|
||||
"start" => rt.block_on(handle_start_all(sub_args, &env)),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"attachment_service" => rt.block_on(handle_attachment_service(sub_args, &env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||
"safekeeper" => rt.block_on(handle_safekeeper(sub_args, &env)),
|
||||
"endpoint" => rt.block_on(handle_endpoint(sub_args, &env)),
|
||||
"mappings" => handle_mappings(sub_args, &mut env),
|
||||
@@ -445,14 +445,14 @@ async fn handle_tenant(
|
||||
// If tenant ID was not specified, generate one
|
||||
let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);
|
||||
|
||||
// We must register the tenant with the attachment service, so
|
||||
// We must register the tenant with the storage controller, so
|
||||
// that when the pageserver restarts, it will be re-attached.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_create(TenantCreateRequest {
|
||||
// Note that ::unsharded here isn't actually because the tenant is unsharded, its because the
|
||||
// attachment service expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
||||
// type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
|
||||
// storage controller expecfs a shard-naive tenant_id in this attribute, and the TenantCreateRequest
|
||||
// type is used both in storage controller (for creating tenants) and in pageserver (for creating shards)
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters {
|
||||
@@ -476,9 +476,9 @@ async fn handle_tenant(
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
// FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
|
||||
// different shards picking different start lsns. Maybe we have to teach attachment service
|
||||
// different shards picking different start lsns. Maybe we have to teach storage controller
|
||||
// to let shard 0 branch first and then propagate the chosen LSN to other shards.
|
||||
attachment_service
|
||||
storage_controller
|
||||
.tenant_timeline_create(
|
||||
tenant_id,
|
||||
TimelineCreateRequest {
|
||||
@@ -528,8 +528,8 @@ async fn handle_tenant(
|
||||
let new_pageserver = get_pageserver(env, matches)?;
|
||||
let new_pageserver_id = new_pageserver.conf.id;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_migrate(tenant_shard_id, new_pageserver_id)
|
||||
.await?;
|
||||
|
||||
@@ -543,8 +543,8 @@ async fn handle_tenant(
|
||||
|
||||
let mut tenant_synthetic_size = None;
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
|
||||
let pageserver =
|
||||
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
|
||||
|
||||
@@ -585,10 +585,14 @@ async fn handle_tenant(
|
||||
Some(("shard-split", matches)) => {
|
||||
let tenant_id = get_tenant_id(matches, env)?;
|
||||
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
||||
let shard_stripe_size: Option<ShardStripeSize> = matches
|
||||
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let result = attachment_service
|
||||
.tenant_split(tenant_id, shard_count)
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let result = storage_controller
|
||||
.tenant_split(tenant_id, shard_count, shard_stripe_size)
|
||||
.await?;
|
||||
println!(
|
||||
"Split tenant {} into shards {}",
|
||||
@@ -613,7 +617,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
|
||||
match timeline_match.subcommand() {
|
||||
Some(("list", list_match)) => {
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
|
||||
let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
|
||||
@@ -633,7 +637,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
let new_timeline_id_opt = parse_timeline_id(create_match)?;
|
||||
let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: None,
|
||||
@@ -641,7 +645,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
ancestor_start_lsn: None,
|
||||
pg_version: Some(pg_version),
|
||||
};
|
||||
let timeline_info = attachment_service
|
||||
let timeline_info = storage_controller
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
|
||||
@@ -730,7 +734,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
.transpose()
|
||||
.context("Failed to parse ancestor start Lsn from the request")?;
|
||||
let new_timeline_id = TimelineId::generate();
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let create_req = TimelineCreateRequest {
|
||||
new_timeline_id,
|
||||
ancestor_timeline_id: Some(ancestor_timeline_id),
|
||||
@@ -738,7 +742,7 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
|
||||
ancestor_start_lsn: start_lsn,
|
||||
pg_version: None,
|
||||
};
|
||||
let timeline_info = attachment_service
|
||||
let timeline_info = storage_controller
|
||||
.tenant_timeline_create(tenant_id, create_req)
|
||||
.await?;
|
||||
|
||||
@@ -767,7 +771,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
|
||||
match sub_name {
|
||||
"list" => {
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
|
||||
// TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the storage controller
|
||||
// where shard 0 is attached, and query there.
|
||||
let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
|
||||
let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
|
||||
@@ -952,21 +956,21 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
(
|
||||
vec![(parsed.0, parsed.1.unwrap_or(5432))],
|
||||
// If caller is telling us what pageserver to use, this is not a tenant which is
|
||||
// full managed by attachment service, therefore not sharded.
|
||||
// full managed by storage controller, therefore not sharded.
|
||||
ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
)
|
||||
} else {
|
||||
// Look up the currently attached location of the tenant, and its striping metadata,
|
||||
// to pass these on to postgres.
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
|
||||
let pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
@@ -1015,8 +1019,8 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
pageserver.pg_connection_config.port(),
|
||||
)]
|
||||
} else {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.tenant_locate(endpoint.tenant_id)
|
||||
.await?
|
||||
.shards
|
||||
@@ -1024,7 +1028,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported malformed host"),
|
||||
.expect("Storage controller reported malformed host"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
@@ -1100,9 +1104,8 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
let register = subcommand_args.get_one::<bool>("register").unwrap_or(&true);
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(&pageserver_config_overrides(subcommand_args), *register)
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1131,7 +1134,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(subcommand_args), false)
|
||||
.start(&pageserver_config_overrides(subcommand_args))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
@@ -1144,8 +1147,8 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
let scheduling = subcommand_args.get_one("scheduling");
|
||||
let availability = subcommand_args.get_one("availability");
|
||||
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
attachment_service
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
storage_controller
|
||||
.node_configure(NodeConfigureRequest {
|
||||
node_id: pageserver.conf.id,
|
||||
scheduling: scheduling.cloned(),
|
||||
@@ -1170,11 +1173,11 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_attachment_service(
|
||||
async fn handle_storage_controller(
|
||||
sub_match: &ArgMatches,
|
||||
env: &local_env::LocalEnv,
|
||||
) -> Result<()> {
|
||||
let svc = AttachmentService::from_env(env);
|
||||
let svc = StorageController::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start().await {
|
||||
@@ -1194,8 +1197,8 @@ async fn handle_attachment_service(
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
Some((sub_name, _)) => bail!("Unexpected attachment_service subcommand '{}'", sub_name),
|
||||
None => bail!("no attachment_service subcommand provided"),
|
||||
Some((sub_name, _)) => bail!("Unexpected storage_controller subcommand '{}'", sub_name),
|
||||
None => bail!("no storage_controller subcommand provided"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1280,11 +1283,11 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
|
||||
broker::start_broker_process(env).await?;
|
||||
|
||||
// Only start the attachment service if the pageserver is configured to need it
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.start().await {
|
||||
eprintln!("attachment_service start failed: {:#}", e);
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.start().await {
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
}
|
||||
@@ -1293,7 +1296,7 @@ async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver
|
||||
.start(&pageserver_config_overrides(sub_match), true)
|
||||
.start(&pageserver_config_overrides(sub_match))
|
||||
.await
|
||||
{
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
@@ -1356,9 +1359,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
}
|
||||
|
||||
if env.control_plane_api.is_some() {
|
||||
let attachment_service = AttachmentService::from_env(env);
|
||||
if let Err(e) = attachment_service.stop(immediate).await {
|
||||
eprintln!("attachment service stop failed: {e:#}");
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.stop(immediate).await {
|
||||
eprintln!("storage controller stop failed: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1586,6 +1589,7 @@ fn cli() -> Command {
|
||||
.about("Increase the number of shards in the tenant")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
@@ -1596,11 +1600,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone()).arg(Arg::new("register")
|
||||
.long("register")
|
||||
.default_value("true").required(false)
|
||||
.value_parser(value_parser!(bool))
|
||||
.value_name("register"))
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
@@ -1618,9 +1618,9 @@ fn cli() -> Command {
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("attachment_service")
|
||||
Command::new("storage_controller")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage attachment_service")
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start local pageserver").arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
|
||||
@@ -57,9 +57,9 @@ use serde::{Deserialize, Serialize};
|
||||
use url::Host;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage_controller::StorageController;
|
||||
|
||||
use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeFeature, ComputeMode, ComputeSpec};
|
||||
@@ -750,17 +750,17 @@ impl Endpoint {
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
// If we weren't given explicit pageservers, query the attachment service
|
||||
// If we weren't given explicit pageservers, query the storage controller
|
||||
if pageservers.is_empty() {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
|
||||
let storage_controller = StorageController::from_env(&self.env);
|
||||
let locate_result = storage_controller.tenant_locate(self.tenant_id).await?;
|
||||
pageservers = locate_result
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|shard| {
|
||||
(
|
||||
Host::parse(&shard.listen_pg_addr)
|
||||
.expect("Attachment service reported bad hostname"),
|
||||
.expect("Storage controller reported bad hostname"),
|
||||
shard.listen_pg_port,
|
||||
)
|
||||
})
|
||||
@@ -774,7 +774,10 @@ impl Endpoint {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
.unwrap();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
//! local installations.
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod attachment_service;
|
||||
mod background_process;
|
||||
pub mod broker;
|
||||
pub mod endpoint;
|
||||
@@ -14,3 +13,4 @@ pub mod local_env;
|
||||
pub mod pageserver;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage_controller;
|
||||
|
||||
@@ -72,13 +72,13 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
|
||||
// Control plane upcall API for pageserver: if None, we will not run attachment_service. If set, this will
|
||||
// Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will
|
||||
// be propagated into each pageserver's configuration.
|
||||
#[serde(default)]
|
||||
pub control_plane_api: Option<Url>,
|
||||
|
||||
// Control plane upcall API for attachment service. If set, this will be propagated into the
|
||||
// attachment service's configuration.
|
||||
// Control plane upcall API for storage controller. If set, this will be propagated into the
|
||||
// storage controller's configuration.
|
||||
#[serde(default)]
|
||||
pub control_plane_compute_hook_api: Option<Url>,
|
||||
|
||||
@@ -114,7 +114,7 @@ impl NeonBroker {
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
#[serde(default, deny_unknown_fields)]
|
||||
pub struct PageServerConf {
|
||||
// node id
|
||||
pub id: NodeId,
|
||||
@@ -126,6 +126,9 @@ pub struct PageServerConf {
|
||||
// auth type used for the PG and HTTP ports
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
|
||||
pub(crate) virtual_file_io_engine: String,
|
||||
pub(crate) get_vectored_impl: String,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -136,6 +139,9 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
// FIXME: use the ones exposed by pageserver crate
|
||||
virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
|
||||
get_vectored_impl: "sequential".to_owned(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -227,10 +233,10 @@ impl LocalEnv {
|
||||
self.neon_distrib_dir.join("pageserver")
|
||||
}
|
||||
|
||||
pub fn attachment_service_bin(&self) -> PathBuf {
|
||||
// Irrespective of configuration, attachment service binary is always
|
||||
pub fn storage_controller_bin(&self) -> PathBuf {
|
||||
// Irrespective of configuration, storage controller binary is always
|
||||
// run from the same location as neon_local. This means that for compatibility
|
||||
// tests that run old pageserver/safekeeper, they still run latest attachment service.
|
||||
// tests that run old pageserver/safekeeper, they still run latest storage controller.
|
||||
let neon_local_bin_dir = env::current_exe().unwrap().parent().unwrap().to_owned();
|
||||
neon_local_bin_dir.join("storage_controller")
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ use std::time::Duration;
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::controller_api::NodeRegisterRequest;
|
||||
use pageserver_api::models::{
|
||||
self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
|
||||
};
|
||||
@@ -31,7 +30,6 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::attachment_service::AttachmentService;
|
||||
use crate::local_env::PageServerConf;
|
||||
use crate::{background_process, local_env::LocalEnv};
|
||||
|
||||
@@ -80,18 +78,31 @@ impl PageServerNode {
|
||||
///
|
||||
/// These all end up on the command line of the `pageserver` binary.
|
||||
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
|
||||
let id = format!("id={}", self.conf.id);
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
|
||||
let pg_distrib_dir_param = format!(
|
||||
"pg_distrib_dir='{}'",
|
||||
self.env.pg_distrib_dir_raw().display()
|
||||
);
|
||||
|
||||
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
|
||||
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
|
||||
let PageServerConf {
|
||||
id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
virtual_file_io_engine,
|
||||
get_vectored_impl,
|
||||
} = &self.conf;
|
||||
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
|
||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
|
||||
let id = format!("id={}", id);
|
||||
|
||||
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
|
||||
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
|
||||
|
||||
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
|
||||
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
|
||||
let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
|
||||
let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");
|
||||
|
||||
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
|
||||
|
||||
@@ -103,6 +114,8 @@ impl PageServerNode {
|
||||
listen_http_addr_param,
|
||||
listen_pg_addr_param,
|
||||
broker_endpoint_param,
|
||||
virtual_file_io_engine,
|
||||
get_vectored_impl,
|
||||
];
|
||||
|
||||
if let Some(control_plane_api) = &self.env.control_plane_api {
|
||||
@@ -111,9 +124,9 @@ impl PageServerNode {
|
||||
control_plane_api.as_str()
|
||||
));
|
||||
|
||||
// Attachment service uses the same auth as pageserver: if JWT is enabled
|
||||
// Storage controller uses the same auth as pageserver: if JWT is enabled
|
||||
// for us, we will also need it to talk to them.
|
||||
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
|
||||
if matches!(http_auth_type, AuthType::NeonJWT) {
|
||||
let jwt_token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
|
||||
@@ -131,8 +144,7 @@ impl PageServerNode {
|
||||
));
|
||||
}
|
||||
|
||||
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
|
||||
{
|
||||
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
|
||||
// Keys are generated in the toplevel repo dir, pageservers' workdirs
|
||||
// are one level below that, so refer to keys with ../
|
||||
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
|
||||
@@ -163,8 +175,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, config_overrides: &[&str], register: bool) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false, register).await
|
||||
pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
self.start_node(config_overrides, false).await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
|
||||
@@ -202,6 +214,28 @@ impl PageServerNode {
|
||||
String::from_utf8_lossy(&init_output.stderr),
|
||||
);
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
// the storage controller
|
||||
let metadata_path = datadir.join("metadata.json");
|
||||
|
||||
let (_http_host, http_port) =
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
std::fs::write(
|
||||
metadata_path,
|
||||
serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": self.pg_connection_config.port(),
|
||||
"http_host": "localhost",
|
||||
"http_port": http_port,
|
||||
}))
|
||||
.unwrap(),
|
||||
)
|
||||
.expect("Failed to write metadata file");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -209,27 +243,7 @@ impl PageServerNode {
|
||||
&self,
|
||||
config_overrides: &[&str],
|
||||
update_config: bool,
|
||||
register: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
// Register the node with the storage controller before starting pageserver: pageserver must be registered to
|
||||
// successfully call /re-attach and finish starting up.
|
||||
if register {
|
||||
let attachment_service = AttachmentService::from_env(&self.env);
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&self.conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&self.conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
attachment_service
|
||||
.node_register(NodeRegisterRequest {
|
||||
node_id: self.conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
@@ -554,13 +568,6 @@ impl PageServerNode {
|
||||
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_secondary_download(*tenant_id)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_create(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -10,7 +10,7 @@ use pageserver_api::{
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::TenantShardId,
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -24,7 +24,7 @@ use utils::{
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
|
||||
pub struct AttachmentService {
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
@@ -36,7 +36,10 @@ pub struct AttachmentService {
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
|
||||
const ATTACHMENT_SERVICE_POSTGRES_VERSION: u32 = 16;
|
||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
@@ -59,7 +62,7 @@ pub struct InspectResponse {
|
||||
pub attachment: Option<(u32, NodeId)>,
|
||||
}
|
||||
|
||||
impl AttachmentService {
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
@@ -136,27 +139,27 @@ impl AttachmentService {
|
||||
}
|
||||
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store attachment service state
|
||||
/// PIDFile for the postgres instance used to store storage controller state
|
||||
fn postgres_pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join("attachment_service_postgres.pid"),
|
||||
.join("storage_controller_postgres.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [ATTACHMENT_SERVICE_POSTGRES_VERSION, 15, 14];
|
||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
@@ -189,7 +192,7 @@ impl AttachmentService {
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
const DB_NAME: &str = "attachment_service";
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -219,10 +222,10 @@ impl AttachmentService {
|
||||
}
|
||||
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the attachment service for persistence.
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachment_service_db");
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
@@ -245,7 +248,7 @@ impl AttachmentService {
|
||||
.await?;
|
||||
};
|
||||
|
||||
println!("Starting attachment service database...");
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
@@ -256,7 +259,7 @@ impl AttachmentService {
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"attachment_service_db",
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
@@ -269,6 +272,8 @@ impl AttachmentService {
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&self.listen,
|
||||
@@ -276,6 +281,8 @@ impl AttachmentService {
|
||||
self.path.as_ref(),
|
||||
"--database-url",
|
||||
&database_url,
|
||||
"--max-unavailable-interval",
|
||||
&max_unavailable.to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
@@ -300,7 +307,7 @@ impl AttachmentService {
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.attachment_service_bin(),
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
[(
|
||||
"NEON_REPO_DIR".to_string(),
|
||||
@@ -322,10 +329,10 @@ impl AttachmentService {
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("attachment_service_db");
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
println!("Stopping attachment service database...");
|
||||
println!("Stopping storage controller database...");
|
||||
let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"];
|
||||
let stop_status = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_stop_args)
|
||||
@@ -344,10 +351,10 @@ impl AttachmentService {
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Attachment service data base is already stopped");
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop attachment service database: {stop_status}")
|
||||
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -368,7 +375,7 @@ impl AttachmentService {
|
||||
}
|
||||
}
|
||||
|
||||
/// Simple HTTP request wrapper for calling into attachment service
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: hyper::Method,
|
||||
@@ -496,11 +503,15 @@ impl AttachmentService {
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
new_shard_count: u8,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<TenantShardSplitResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||
Some(TenantShardSplitRequest { new_shard_count }),
|
||||
Some(TenantShardSplitRequest {
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -70,9 +70,9 @@ Should only be used e.g. for status check/tenant creation/list.
|
||||
Should only be used e.g. for status check.
|
||||
Currently also used for connection from any pageserver to any safekeeper.
|
||||
|
||||
"generations_api": Provides access to the upcall APIs served by the attachment service or the control plane.
|
||||
"generations_api": Provides access to the upcall APIs served by the storage controller or the control plane.
|
||||
|
||||
"admin": Provides access to the control plane and admin APIs of the attachment service.
|
||||
"admin": Provides access to the control plane and admin APIs of the storage controller.
|
||||
|
||||
### CLI
|
||||
CLI generates a key pair during call to `neon_local init` with the following commands:
|
||||
|
||||
408
docs/rfcs/031-sharding-static.md
Normal file
408
docs/rfcs/031-sharding-static.md
Normal file
@@ -0,0 +1,408 @@
|
||||
# Sharding Phase 1: Static Key-space Sharding
|
||||
|
||||
## Summary
|
||||
|
||||
To enable databases with sizes approaching the capacity of a pageserver's disk,
|
||||
it is necessary to break up the storage for the database, or _shard_ it.
|
||||
|
||||
Sharding in general is a complex area. This RFC aims to define an initial
|
||||
capability that will permit creating large-capacity databases using a static configuration
|
||||
defined at time of Tenant creation.
|
||||
|
||||
## Motivation
|
||||
|
||||
Currently, all data for a Tenant, including all its timelines, is stored on a single
|
||||
pageserver. The local storage required may be several times larger than the actual
|
||||
database size, due to LSM write inflation.
|
||||
|
||||
If a database is larger than what one pageserver can hold, then it becomes impossible
|
||||
for the pageserver to hold it in local storage, as it must do to provide service to
|
||||
clients.
|
||||
|
||||
### Prior art
|
||||
|
||||
In Neon:
|
||||
|
||||
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
|
||||
- Layer File SPreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
|
||||
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
|
||||
|
||||
Prior art in other distributed systems is too broad to capture here: pretty much
|
||||
any scale out storage system does something like this.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Enable creating a large (for example, 16TiB) database without requiring dedicated
|
||||
pageserver nodes.
|
||||
- Share read/write bandwidth costs for large databases across pageservers, as well
|
||||
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
|
||||
that disrupt service to other tenants.
|
||||
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
|
||||
does not write out a single contiguous ranges of page numbers.
|
||||
|
||||
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
|
||||
that a user might create on a current-gen enterprise SSD should also work well on
|
||||
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
|
||||
pageserver backend is not the limiting factor in the database size_.
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Independently distributing timelines within the same tenant. If a tenant has many
|
||||
timelines, then sharding may be a less efficient mechanism for distributing load than
|
||||
sharing out timelines between pageservers.
|
||||
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
|
||||
based on the idea that separate mechanisms will make sense for each dimension.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, control plane, postgres/smgr
|
||||
|
||||
## Terminology
|
||||
|
||||
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
|
||||
the page number is the key in that store. `Key` is a literal data type in existing code.
|
||||
|
||||
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
|
||||
of keys and LSNs as a two dimensional space.
|
||||
|
||||
## Implementation
|
||||
|
||||
### Key sharding vs. LSN sharding
|
||||
|
||||
When we think of sharding across the two dimensional key/lsn space, this is an
|
||||
opportunity to think about how the two dimensions differ:
|
||||
|
||||
- Sharding the key space distributes the _write_ workload of ingesting data
|
||||
and compacting. This work must be carefully managed so that exactly one
|
||||
node owns a given key.
|
||||
- Sharding the LSN space distributes the _historical read_ workload. This work
|
||||
can be done by anyone without any special coordination, as long as they can
|
||||
see the remote index and layers.
|
||||
|
||||
The key sharding is the harder part, and also the more urgent one, to support larger
|
||||
capacity databases. Because distributing historical LSN read work is a relatively
|
||||
simpler problem that most users don't have, we defer it to future work. It is anticipated
|
||||
that some quite simple P2P offload model will enable distributing work for historical
|
||||
reads: a node which is low on space can call out to peer to ask it to download and
|
||||
serve reads from a historical layer.
|
||||
|
||||
### Key mapping scheme
|
||||
|
||||
Having decided to focus on key sharding, we must next decide how we will map
|
||||
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
|
||||
between data locality and avoiding entire large relations mapping to the same shard.
|
||||
|
||||
We will define two spaces:
|
||||
|
||||
- Key space: unsigned integer
|
||||
- Shard space: integer from 0 to N-1, where we have N shards.
|
||||
|
||||
### Key -> Shard mapping
|
||||
|
||||
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
|
||||
|
||||
```
|
||||
pub struct Key {
|
||||
pub field1: u8,
|
||||
pub field2: u32,
|
||||
pub field3: u32,
|
||||
pub field4: u32,
|
||||
pub field5: u8,
|
||||
pub field6: u32,
|
||||
}
|
||||
|
||||
|
||||
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
|
||||
shards. For distribution purposes, we only care about user data keys_
|
||||
|
||||
The properties we want from our Key->Shard mapping are:
|
||||
|
||||
- Locality in `blknum`, such that adjacent `blknum` will usually map to
|
||||
the same stripe and consequently land on the same shard, even though the overall
|
||||
collection of blocks in a relation will be spread over many stripes and therefore
|
||||
many shards.
|
||||
- Avoid the same blknum on different relations landing on the same stripe, so that
|
||||
with many small relations we do not end up aliasing data to the same stripe/shard.
|
||||
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
|
||||
if there are patterns in the value of `relnode`, these do not manifest as patterns
|
||||
in data placement.
|
||||
|
||||
To accomplish this, the blknum is used to select a stripe, and stripes are
|
||||
assigned to shards in a pseudorandom order via a hash. The motivation for
|
||||
pseudo-random distribution (rather than sequential mapping of stripe to shard)
|
||||
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
|
||||
all relations' stripes to touch pageservers in the same order.
|
||||
|
||||
To map a `Key` to a shard:
|
||||
|
||||
- Hash the `Key` field 4 (relNode).
|
||||
- Divide field 6 (`blknum`) field by the stripe size in pages, and combine the
|
||||
hash of this with the hash from the previous step.
|
||||
- The total hash modulo the shard count gives the shard holding this key.
|
||||
|
||||
Why don't we use the other fields in the Key?
|
||||
|
||||
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
|
||||
in the same relation, and we would like to keep the data in a relation together.
|
||||
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
|
||||
database's blocks differ only by spcNode and dbNode from the original. To enable running
|
||||
this type of creation without cross-pageserver communication, we must ensure that these
|
||||
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
|
||||
|
||||
### Data placement examples
|
||||
|
||||
For example, consider the extreme large databases cases of postgres data layout in a system with 8 shards
|
||||
and a stripe size of 32k pages:
|
||||
|
||||
- A single large relation: `blknum` division will break the data up into 4096
|
||||
stripes, which will be scattered across the shards.
|
||||
- 4096 relations of of 32k pages each: each relation will map to exactly one stripe,
|
||||
and that stripe will be placed according to the hash of the key fields 4. The
|
||||
data placement will be statistically uniform across shards.
|
||||
|
||||
Data placement will be more uneven on smaller databases:
|
||||
|
||||
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
|
||||
that both relations land on the same shard and no data lands on the other shard.
|
||||
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
|
||||
the data of the other four shards.
|
||||
|
||||
These uneven cases for small amounts of data do not matter, as long as the stripe size
|
||||
is an order of magnitude smaller than the amount of data we are comfortable holding
|
||||
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
|
||||
a tenant has some shards with 256MB size and some shards with 512MB size, even though
|
||||
the standard deviation of shard size within the tenant is very high. Our key mapping
|
||||
scheme provides a statistical guarantee that as the tenant's overall data size increases,
|
||||
uniformity of placement will improve.
|
||||
|
||||
### Important Types
|
||||
|
||||
#### `ShardIdentity`
|
||||
|
||||
Provides the information needed to know whether a particular key belongs
|
||||
to a particular shard:
|
||||
|
||||
- Layout version
|
||||
- Stripe size
|
||||
- Shard count
|
||||
- Shard index
|
||||
|
||||
This structure's size is constant. Note that if we had used a differnet key
|
||||
mapping scheme such as consistent hashing with explicit hash ranges assigned
|
||||
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
|
||||
key mapping scheme used here enables a small fixed size ShardIdentity.
|
||||
|
||||
### Pageserver changes
|
||||
|
||||
#### Structural
|
||||
|
||||
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
|
||||
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
|
||||
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
|
||||
covers the whole keyspace.
|
||||
|
||||
When the pageserver writes layers and index_part.json to remote storage, it must
|
||||
include the shard index & count in the name, to avoid collisions (the count is
|
||||
necessary for future-proofing: the count will vary in time). These keys
|
||||
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
|
||||
exactly the same for TenantShards as it does for Tenants today: each shard will have
|
||||
its own generation number.
|
||||
|
||||
#### Storage Format: Keys
|
||||
|
||||
For tenants with >1 shard, layer files implicitly become sparse: within the key
|
||||
range described in the layer name, the layer file for a shard will only hold the
|
||||
content relevant to stripes assigned to the shard.
|
||||
|
||||
For this reason, the LayerFileName within a tenant is no longer unique: different shards
|
||||
may use the same LayerFileName to refer to different data. We may solve this simply
|
||||
by including the shard number in the keys used for layers.
|
||||
|
||||
The shard number will be included as a prefix (as part of tenant ID), like this:
|
||||
|
||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
|
||||
|
||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
|
||||
|
||||
Reasons for this particular format:
|
||||
|
||||
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
|
||||
we construct a layer file name), and enables efficient listing of index_parts within
|
||||
a particular shard-timeline prefix.
|
||||
- Including the shard _count_ as well as shard number means that in future when we implement
|
||||
shard splitting, it will be possible for a parent shard and one of its children to write
|
||||
the same layer file without a name collision. For example, a parent shard 0_1 might split
|
||||
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
|
||||
that is distinct from what shard 0_1 would have written at the same place.
|
||||
|
||||
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
|
||||
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
|
||||
for example a single-shard tenant's prefix will be `0001`.
|
||||
|
||||
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
|
||||
and use this as a cue to construct paths with no prefix at all.
|
||||
|
||||
#### Storage Format: Indices
|
||||
|
||||
In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
|
||||
when we implement shard splitting in future, it will be useful to enable shards to reference layers
|
||||
written by other shards (specifically the parent shard during a split), so that shards don't
|
||||
have to exhaustively copy all data into their own shard-prefixed keys.
|
||||
|
||||
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
|
||||
tuple on each layer, such that it can construct paths for layers written by other shards. This
|
||||
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
|
||||
will be addressed in phase 2.
|
||||
|
||||
For backward compatibility, any index entry without shard information will be assumed to be
|
||||
in the legacy shardidentity.
|
||||
|
||||
#### WAL Ingest
|
||||
|
||||
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
|
||||
it down to the pages relevant to their shard:
|
||||
|
||||
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
|
||||
- For metadata describing relations etc, all shards retain these writes.
|
||||
|
||||
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
|
||||
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
|
||||
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
|
||||
expensive: if the safekeeper can be made shard-aware then it could be taught to use
|
||||
the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
|
||||
|
||||
#### Compaction/GC
|
||||
|
||||
No changes needed.
|
||||
|
||||
The pageserver doesn't have to do anything special during compaction
|
||||
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
|
||||
This will result in sparse layer files, containing keys only in the stripes that this
|
||||
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
|
||||
the key range, these should be updated to ignore gaps that are due to sharding, to
|
||||
avoid spuriously splitting up layers ito stripe-sized pieces.
|
||||
|
||||
### Compute Endpoints
|
||||
|
||||
Compute endpoints will need to:
|
||||
|
||||
- Accept a vector of connection strings as part of their configuration from the control plane
|
||||
- Route pageserver requests according to mapping the hash of key to the correct
|
||||
entry in the vector of connection strings.
|
||||
|
||||
Doing this in compute rather than routing requests via a single pageserver is
|
||||
necessary to enable sharding tenants without adding latency from extra hops.
|
||||
|
||||
### Control Plane
|
||||
|
||||
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
|
||||
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
|
||||
tenants.
|
||||
|
||||
Tenant lifecycle operations like deletion will require fanning-out to all the shards
|
||||
in the tenant. The same goes for timeline creation and deletion: a timeline should
|
||||
not be considered created until it has been created in all shards.
|
||||
|
||||
#### Selectively enabling sharding for large tenants
|
||||
|
||||
Initially, we will explicitly enable sharding for large tenants only.
|
||||
|
||||
In future, this hint mechanism will become optional when we implement automatic
|
||||
re-sharding of tenants.
|
||||
|
||||
## Future Phases
|
||||
|
||||
This section exists to indicate what will likely come next after this phase.
|
||||
|
||||
Phases 2a and 2b are amenable to execution in parallel.
|
||||
|
||||
### Phase 2a: WAL fan-out
|
||||
|
||||
**Problem**: when all shards consume the whole WAL, the network bandwidth used
|
||||
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
|
||||
of the shard count.
|
||||
|
||||
Network bandwidth is not our most pressing bottleneck, but it is likely to become
|
||||
a problem if we set a modest shard count (~8) on a significant number of tenants,
|
||||
especially as those larger tenants which we shard are also likely to have higher
|
||||
write bandwidth than average.
|
||||
|
||||
### Phase 2b: Shard Splitting
|
||||
|
||||
**Problem**: the number of shards in a tenant is defined at creation time and cannot
|
||||
be changed. This causes excessive sharding for most small tenants, and an upper
|
||||
bound on scale for very large tenants.
|
||||
|
||||
To address this, a _splitting_ feature will later be added. One shard can split its
|
||||
data into a number of children by doing a special compaction operation to generate
|
||||
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
|
||||
each child. This will then require external coordination (by the control plane) to
|
||||
safely attach these new child shards and then move them around to distribute work.
|
||||
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
|
||||
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
|
||||
the risk/complexity of implementing such a rarely-encountered scenario.
|
||||
|
||||
### Phase N (future): distributed historical reads
|
||||
|
||||
**Problem**: while sharding based on key is good for handling changes in overall
|
||||
database size, it is less suitable for spiky/unpredictable changes in the read
|
||||
workload to historical layers. Sudden increases in historical reads could result
|
||||
in sudden increases in local disk capacity required for a TenantShard.
|
||||
|
||||
Example: the extreme case of this would be to run a tenant for a year, then create branches
|
||||
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
|
||||
the on-disk capacity footprint of a TenantShard, since it would be serving reads
|
||||
from all those disparate historical layers.
|
||||
|
||||
If we can respond fast enough, then key-sharding a tenant more finely can help with
|
||||
this, but splitting may be a relatively expensive operation and the increased historical
|
||||
read load may be transient.
|
||||
|
||||
A separate mechanism for handling heavy historical reads could be something like
|
||||
a gossip mechanism for pageservers to communicate
|
||||
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
|
||||
ask another to go read the necessary layers from remote storage to serve the read. This
|
||||
requires relativly little coordination because it is read-only: any node can service any
|
||||
read. All reads to a particular shard would still flow through one node, but the
|
||||
disk capactity & I/O impact of servicing the read would be distributed.
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
|
||||
|
||||
When a database is growing under a write workload, writes may predominantly hit the
|
||||
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
|
||||
is intensively re-writing a particular relation, if that relation lived in a particular
|
||||
shard then it would not achieve our goal of distributing the write work across shards.
|
||||
|
||||
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
|
||||
|
||||
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
|
||||
database would still cause a load hotspot on the pageserver routing its read requests.
|
||||
2. The additional hop through the "proxy" pageserver would add latency and overall
|
||||
resource cost (CPU, network bandwidth)
|
||||
|
||||
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
|
||||
|
||||
In this model, there would be no explicit sharding of work, but the pageserver to which
|
||||
a tenant is attached would not hold all layers on its disk: instead, it would call out
|
||||
to peers to have them store some layers, and call out to those peers to request reads
|
||||
in those layers.
|
||||
|
||||
This mechanism will work well for distributing work in the LSN dimension, but in the key
|
||||
space dimension it has the major limitation of requiring one node to handle all
|
||||
incoming writes, and compactions. Even if the write workload for a large database
|
||||
fits in one pageserver, it will still be a hotspot and such tenants may still
|
||||
de-facto require their own pageserver.
|
||||
479
docs/rfcs/032-shard-splitting.md
Normal file
479
docs/rfcs/032-shard-splitting.md
Normal file
@@ -0,0 +1,479 @@
|
||||
# Shard splitting
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes a new pageserver API for splitting an existing tenant shard into
|
||||
multiple shards, and describes how to use this API to safely increase the total
|
||||
shard count of a tenant.
|
||||
|
||||
## Motivation
|
||||
|
||||
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
|
||||
tenants beyond the capacity of a single pageserver by breaking up the key space
|
||||
into stripes, and distributing these stripes across many pageservers. However,
|
||||
the shard count was defined once at tenant creation time and not varied thereafter.
|
||||
|
||||
In practice, the expected size of a database is rarely known at creation time, and
|
||||
it is inefficient to enable sharding for very small tenants: we need to be
|
||||
able to create a tenant with a small number of shards (such as 1), and later expand
|
||||
when it becomes clear that the tenant has grown in size to a point where sharding
|
||||
is beneficial.
|
||||
|
||||
### Prior art
|
||||
|
||||
Many distributed systems have the problem of choosing how many shards to create for
|
||||
tenants that do not specify an expected size up-front. There are a couple of general
|
||||
approaches:
|
||||
|
||||
- Write to a key space in order, and start a new shard when the highest key advances
|
||||
past some point. This doesn't work well for Neon, because we write to our key space
|
||||
in many different contiguous ranges (per relation), rather than in one contiguous
|
||||
range. To adapt to this kind of model, we would need a sharding scheme where each
|
||||
relation had its own range of shards, which would be inefficient for the common
|
||||
case of databases with many small relations.
|
||||
- Monitor the system, and automatically re-shard at some size threshold. For
|
||||
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
|
||||
component monitors the size of each RADOS Pool, and adjusts the number of Placement
|
||||
Groups (Ceph's shard equivalent).
|
||||
|
||||
## Requirements
|
||||
|
||||
- A configurable capacity limit per-shard is enforced.
|
||||
- Changes in shard count do not interrupt service beyond requiring postgres
|
||||
to reconnect (i.e. milliseconds).
|
||||
- Human being does not have to choose shard count
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Shard splitting is always a tenant-global operation: we will not enable splitting
|
||||
one shard while leaving others intact.
|
||||
- The inverse operation (shard merging) is not described in this RFC. This is a lower
|
||||
priority than splitting, because databases grow more often than they shrink, and
|
||||
a database with many shards will still work properly if the stored data shrinks, just
|
||||
with slightly more overhead (e.g. redundant WAL replication)
|
||||
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
|
||||
a tenant based on load will make sense for some medium-capacity, high-load workloads,
|
||||
but is more complex to reason about and likely is not desirable until we have
|
||||
shard merging to reduce the shard count again if the database becomes less busy.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, storage controller
|
||||
|
||||
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
|
||||
|
||||
## Terminology
|
||||
|
||||
**Parent** shards are the shards that exist before a split. **Child** shards are
|
||||
the new shards created during a split.
|
||||
|
||||
**Shard** is synonymous with _tenant shard_.
|
||||
|
||||
**Shard Index** is the 2-tuple of shard number and shard count, written in
|
||||
paths as {:02x}{:02x}, e.g. `0001`.
|
||||
|
||||
## Background
|
||||
|
||||
In the implementation section, a couple of existing aspects of sharding are important
|
||||
to remember:
|
||||
|
||||
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
|
||||
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
|
||||
storage paths, and remote index metadata.
|
||||
- Remote layer file paths contain the shard index of the shard that created them, and
|
||||
remote indices contain the same index to enable building the layer file path. A shard's
|
||||
index may reference layers that were created by another shard.
|
||||
- Local tenant shard directories include the shard index. All layers downloaded by
|
||||
a tenant shard are stored in this shard-prefixed path, even if those layers were
|
||||
initially created by another shard: tenant shards do not read and write one anothers'
|
||||
paths.
|
||||
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
|
||||
This is for historical reasons and will be cleaned up in future, but the existing
|
||||
name is used here to help comprehension when reading code.
|
||||
|
||||
## Implementation
|
||||
|
||||
Note: this section focuses on the correctness of the core split process. This will
|
||||
be fairly inefficient in a naive implementation, and several important optimizations
|
||||
are described in a later section.
|
||||
|
||||
There are broadly two parts to the implementation:
|
||||
|
||||
1. The pageserver split API, which splits one shard on one pageserver
|
||||
2. The overall tenant split proccess which is coordinated by the storage controller,
|
||||
and calls into the pageserver split API as needed.
|
||||
|
||||
### Pageserver Split API
|
||||
|
||||
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
|
||||
that takes the new total shard count in the body.
|
||||
|
||||
The pageserver split API operates on one tenant shard, on one pageserver. External
|
||||
coordination is required to use it safely, this is described in the later
|
||||
'Split procedure' section.
|
||||
|
||||
#### Preparation
|
||||
|
||||
First identify the shard indices for the new child shards. These are deterministic,
|
||||
calculated from the parent shard's index, and the number of children being created (this
|
||||
is an input to the API, and validated to be a power of two). In a trivial example, splitting
|
||||
0001 in two always results in 0002 and 0102.
|
||||
|
||||
Child shard indices are chosen such that the childrens' parts of the keyspace will
|
||||
be subsets of the parent's parts of the keyspace.
|
||||
|
||||
#### Step 1: write new remote indices
|
||||
|
||||
In remote storage, splitting is very simple: we may just write new index_part.json
|
||||
objects for each child shard, containing exactly the same layers as the parent shard.
|
||||
|
||||
The children will have more data than they need, but this avoids any exhausive
|
||||
re-writing or copying of layer files.
|
||||
|
||||
The index key path includes a generation number: the parent shard's current
|
||||
attached generation number will also be used for the child shards' indices. This
|
||||
makes the operation safely retryable: if everything crashes and restarts, we may
|
||||
call the split API again on the parent shard, and the result will be some new remote
|
||||
indices for the child shards, under a higher generation number.
|
||||
|
||||
#### Step 2: start new `Tenant` objects
|
||||
|
||||
A new `Tenant` object may be instantiated for each child shard, while the parent
|
||||
shard still exists. When calling the tenant_spawn function for this object,
|
||||
the remote index from step 1 will be read, and the child shard will start
|
||||
to ingest WAL to catch up from whatever was in the remote storage at step 1.
|
||||
|
||||
We now wait for child shards' WAL ingestion to catch up with the parent shard,
|
||||
so that we can safely tear down the parent shard without risking an availability
|
||||
gap to clients reading recent LSNs.
|
||||
|
||||
#### Step 3: tear down parent `Tenant` object
|
||||
|
||||
Once child shards are running and have caught up with WAL ingest, we no longer
|
||||
need the parent shard. Note that clients may still be using it -- when we
|
||||
shut it down, any page_service handlers will also shut down, causing clients
|
||||
to disconnect. When the client reconnects, it will re-lookup the tenant,
|
||||
and hit the child shard instead of the parent (shard lookup from page_service
|
||||
should bias toward higher ShardCount shards).
|
||||
|
||||
Note that at this stage the page service client has not yet been notified of
|
||||
any split. In the trivial single split example:
|
||||
|
||||
- Shard 0001 is gone: Tenant object torn down
|
||||
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
|
||||
- Clients will continue to connect to that server thinking that shard 0001 is there,
|
||||
and all requests will work, because any key that was in shard 0001 is definitely
|
||||
available in either shard 0002 or shard 0102.
|
||||
- Eventually, the storage controller (not the pageserver) will decide to migrate
|
||||
some child shards away: at that point it will do a live migration, ensuring
|
||||
that the client has an updated configuration before it detaches anything
|
||||
from the original server.
|
||||
|
||||
#### Complete
|
||||
|
||||
When we send a 200 response to the split request, we are promising the caller:
|
||||
|
||||
- That the child shards are persistent in remote storage
|
||||
- That the parent shard has been shut down
|
||||
|
||||
This enables the caller to proceed with the overall shard split operation, which
|
||||
may involve other shards on other pageservers.
|
||||
|
||||
### Storage Controller Split procedure
|
||||
|
||||
Splitting a tenant requires calling the pageserver split API, and tracking
|
||||
enough state to ensure recovery + completion in the event of any component (pageserver
|
||||
or storage controller) crashing (or request timing out) during the split.
|
||||
|
||||
1. call the split API on all existing shards. Ensure that the resulting
|
||||
child shards are pinned to their pageservers until _all_ the split calls are done.
|
||||
This pinning may be implemented as a "split bit" on the tenant shards, that
|
||||
blocks any migrations, and also acts as a sign that if we restart, we must go
|
||||
through some recovery steps to resume the split.
|
||||
2. Once all the split calls are done, we may unpin the child shards (clear
|
||||
the split bit). The split is now complete: subsequent steps are just migrations,
|
||||
not strictly part of the split.
|
||||
3. Try to schedule new pageserver locations for the child shards, using
|
||||
a soft anti-affinity constraint to place shards from the same tenant onto different
|
||||
pageservers.
|
||||
|
||||
Updating computes about the new shard count is not necessary until we migrate
|
||||
any of the child shards away from the parent's location.
|
||||
|
||||
### Recovering from failures
|
||||
|
||||
#### Rolling back an incomplete split
|
||||
|
||||
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
|
||||
and detaching child shards. This will lose any WAL ingested into the children after the parents
|
||||
were detached earlier, but the parents will catch up.
|
||||
|
||||
No special pageserver API is needed for this. From the storage controllers point of view, the
|
||||
procedure is:
|
||||
|
||||
1. For all parent shards in the tenant, ensure they are attached
|
||||
2. For all child shards, ensure they are not attached
|
||||
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
|
||||
|
||||
Any remote storage content for child shards is left behind. This is similar to other cases where
|
||||
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
|
||||
index that references it). Future online scrub/cleanup functionality can remove these objects, or
|
||||
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
|
||||
which would include any child shards that were rolled back.
|
||||
|
||||
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
|
||||
this, we will **block timeline creation during splitting**, so that we can safely roll back until
|
||||
the split is complete, without risking losing timelines.
|
||||
|
||||
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
|
||||
reason, and will not be accessible via an API:
|
||||
|
||||
- A pageserver fails to complete its split API request after too many retries
|
||||
- A pageserver returns a fatal unexpected error such as 400 or 500
|
||||
- The storage controller database returns a non-retryable error
|
||||
- Some internal invariant is violated in the storage controller split code
|
||||
|
||||
#### Rolling back a complete split
|
||||
|
||||
A complete shard split may be rolled back similarly to an incomplete split, with the following
|
||||
modifications:
|
||||
|
||||
- The parent shards will no longer exist in the storage controller database, so these must
|
||||
be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This
|
||||
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
|
||||
shards in the storage controller database.
|
||||
- Any timelines that were created after the split complete will disappear when rolling back
|
||||
to the tenant shards. For this reason, rolling back after a complete split should only
|
||||
be done due to serious issues where loss of recently created timelines is acceptable, or
|
||||
in cases where we have confirmed that no timelines were created in the intervening period.
|
||||
- Parent shards' layers must not have been deleted: this property will come "for free" when
|
||||
we first roll out sharding, by simply not implementing deletion of parent layers after
|
||||
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
|
||||
Optimizations section), it should apply a TTL to layers such that we have a
|
||||
defined walltime window in which rollback will be possible.
|
||||
|
||||
The storage controller will expose an API for rolling back a complete split, for use
|
||||
in the field if we encounter some critical bug with a post-split tenant.
|
||||
|
||||
#### Retrying API calls during Pageserver Restart
|
||||
|
||||
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
|
||||
child shards from an ongoing split. This does not intrinsically break anything, and the
|
||||
pageserver may include all these shards in its `/re-attach` request to the storage controller.
|
||||
|
||||
In order to support such restarts, it is important that the storage controller stores
|
||||
persistent records of each child shard before it calls into a pageserver, as these child shards
|
||||
may require generation increments via a `/re-attach` request.
|
||||
|
||||
The pageserver restart will also result in a failed API call from the storage controller's point
|
||||
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
|
||||
complete, and all shards must remain pinned to their current pageserver locations until the
|
||||
split is done.
|
||||
|
||||
The pageserver API calls during splitting will retry on transient errors, so that
|
||||
short availability gaps do not result in a failure of the overall operation. The
|
||||
split in progress will be automatically rolled back if the threshold for API
|
||||
retries is reached (e.g. if a pageserver stays offline for longer than a typical
|
||||
restart).
|
||||
|
||||
#### Rollback on Storage Controller Restart
|
||||
|
||||
On startup, the storage controller will inspect the split bit for tenant shards that
|
||||
it loads from the database. If any splits are in progress:
|
||||
|
||||
- Database content will be reverted to the parent shards
|
||||
- Child shards will be dropped from memory
|
||||
- The parent and child shards will be included in the general startup reconciliation that
|
||||
the storage controller does: any child shards will be detached from pageservers because
|
||||
they don't exist in the storage controller's expected set of shards, and parent shards
|
||||
will be attached if they aren't already.
|
||||
|
||||
#### Storage controller API request failures/retries
|
||||
|
||||
The split request handler will implement idempotency: if the [`Tenant`] requested to split
|
||||
doesn't exist, we will check for the would-be child shards, and if they already exist,
|
||||
we consider the request complete.
|
||||
|
||||
If a request is retried while the original request is still underway, then the split
|
||||
request handler will notice an InProgress marker in TenantManager, and return 503
|
||||
to encourage the client to backoff/retry. This is the same as the general pageserver
|
||||
API handling for calls that try to act on an InProgress shard.
|
||||
|
||||
#### Compute start/restart during a split
|
||||
|
||||
If a compute starts up during split, it will be configured with the old sharding
|
||||
configuration. This will work for reads irrespective of the progress of the split
|
||||
as long as no child hards have been migrated away from their original location, and
|
||||
this is guaranteed in the split procedure (see earlier section).
|
||||
|
||||
#### Pageserver fails permanently during a split
|
||||
|
||||
If a pageserver permanently fails (i.e. the storage controller availability state for it
|
||||
goes to Offline) while a split is in progress, the splitting operation will roll back, and
|
||||
during the roll back it will skip any API calls to the offline pageserver. If the offline
|
||||
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
|
||||
|
||||
### Handling secondary locations
|
||||
|
||||
For correctness, it is not necessary to split secondary locations. We can simply detach
|
||||
the secondary locations for parent shards, and then attach new secondary locations
|
||||
for child shards.
|
||||
|
||||
Clearly this is not optimal, as it will result in re-downloads of layer files that
|
||||
were already present on disk. See "Splitting secondary locations"
|
||||
|
||||
### Conditions to trigger a split
|
||||
|
||||
The pageserver will expose a new API for reporting on shards that are candidates
|
||||
for split: this will return a top-N report of the largest tenant shards by
|
||||
physical size (remote size). This should exclude any tenants that are already
|
||||
at the maximum configured shard count.
|
||||
|
||||
The API would look something like:
|
||||
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
|
||||
|
||||
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
|
||||
|
||||
A split operation will be started when the tenant exceeds some threshold. This threshold
|
||||
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
|
||||
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
|
||||
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
|
||||
tenant size distribution may be useful here: if we can make a statement like "usually, if
|
||||
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
|
||||
make our policy to split a tenant at 20GiB.
|
||||
|
||||
The finest split we can do is by factors of two, but we can do higher-cardinality splits
|
||||
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
|
||||
as it grows. An example of a very simple heuristic for early deployment of the splitting
|
||||
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
|
||||
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
|
||||
split a tenant, it will not need re-splitting soon after.
|
||||
|
||||
## Optimizations
|
||||
|
||||
### Flush parent shard to remote storage during split
|
||||
|
||||
Any data that is in WAL but not remote storage at time of split will need
|
||||
to be replayed by child shards when they start for the first time. To minimize
|
||||
this work, we may flush the parent shard to remote storage before writing the
|
||||
remote indices for child shards.
|
||||
|
||||
It is important that this flush is subject to some time bounds: we may be splitting
|
||||
in response to a surge of write ingest, so it may be time-critical to split. A
|
||||
few seconds to flush latest data should be sufficient to optimize common cases without
|
||||
running the risk of holding up a split for a harmful length of time when a parent
|
||||
shard is being written heavily. If the flush doesn't complete in time, we may proceed
|
||||
to shut down the parent shard and carry on with the split.
|
||||
|
||||
### Hard linking parent layers into child shard directories
|
||||
|
||||
Before we start the Tenant objects for child shards, we may pre-populate their
|
||||
local storage directories with hard links to the layer files already present
|
||||
in the parent shard's local directory. When the child shard starts and downloads
|
||||
its remote index, it will find all those layer files already present on local disk.
|
||||
|
||||
This avoids wasting download capacity and makes splitting faster, but more importantly
|
||||
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
|
||||
|
||||
This mechanism will work well in typical flows where shards are migrated away
|
||||
promptly after a split, but for the general case including what happens when
|
||||
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
|
||||
section below.
|
||||
|
||||
### Filtering during compaction
|
||||
|
||||
Compaction, especially image layer generation, should skip any keys that are
|
||||
present in a shard's layer files, but do not match the shard's ShardIdentity's
|
||||
is_key_local() check. This avoids carrying around data for longer than necessary
|
||||
in post-split compactions.
|
||||
|
||||
This was already implemented in https://github.com/neondatabase/neon/pull/6246
|
||||
|
||||
### Proactive compaction
|
||||
|
||||
In remote storage, there is little reason to rewrite any data on a shard split:
|
||||
all the children can reference parent layers via the very cheap write of the child
|
||||
index_part.json.
|
||||
|
||||
In local storage, things are more nuanced. During the initial split there is no
|
||||
capacity cost to duplicating parent layers, if we implement the hard linking
|
||||
optimization described above. However, as soon as any layers are evicted from
|
||||
local disk and re-downloaded, the downloaded layers will not be hard-links any more:
|
||||
they'll have real capacity footprint. That isn't a problem if we migrate child shards
|
||||
away from the parent node swiftly, but it risks a significant over-use of local disk
|
||||
space if we do not.
|
||||
|
||||
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
|
||||
the shards elsewhere, then churned all the layers in all the shards via eviction,
|
||||
then we would blow up the storage capacity used on the node by 8x. If we're splitting
|
||||
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
|
||||
|
||||
To avoid this scenario, we could implement a special compaction mode where we just
|
||||
read historic layers, drop unwanted keys, and write back the layer file. This
|
||||
is pretty expensive, but useful if we have split a large shard and are not going to
|
||||
migrate the child shards away.
|
||||
|
||||
The heuristic conditions for triggering such a compaction are:
|
||||
|
||||
- A) eviction plus time: if a child shard
|
||||
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
|
||||
- B) resident size plus time: we may inspect the resident layers and calculate how
|
||||
many of them include the overhead of storing pre-split keys. After some time
|
||||
threshold (different to the one in case A) we still have such layers occupying
|
||||
local disk space, then we should proactively compact them.
|
||||
|
||||
### Cleaning up parent-shard layers
|
||||
|
||||
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
|
||||
They would be cleaned up in the event of the tenant's deletion.
|
||||
|
||||
As an optimization to avoid leaking remote storage capacity (which costs money), we may
|
||||
lazily clean up parent shard layers once no child shards reference them.
|
||||
|
||||
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
|
||||
|
||||
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
|
||||
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard*count < max(shard_count) over all shards)`, and those shard prefixes which do have the latest shard count (_current shards_)
|
||||
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
|
||||
may drop out now.
|
||||
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
|
||||
- for all ancestral shards, list objects in the prefix and delete any layer which was not
|
||||
referenced by a current shard.
|
||||
|
||||
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
|
||||
|
||||
The cleanup may be done by the scrubber (external process), or we may choose to have
|
||||
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
|
||||
reading the other shard's indices at runtime, and we do not require visibility of the
|
||||
latest index writes.
|
||||
|
||||
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
|
||||
that we retain the option to roll back a split in case of bugs.
|
||||
|
||||
### Splitting secondary locations
|
||||
|
||||
We may implement a pageserver API similar to the main splitting API, which does a simpler
|
||||
operation for secondary locations: it would not write anything to S3, instead it would simply
|
||||
create the child shard directory on local disk, hard link in directories from the parent,
|
||||
and set up the in memory (TenantSlot) state for the children.
|
||||
|
||||
Similar to attached locations, a subset of secondary locations will probably need re-locating
|
||||
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
|
||||
where they may use excessive space for the tenant.
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### What should the thresholds be set to?
|
||||
|
||||
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
|
||||
|
||||
Max shard count:
|
||||
|
||||
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
|
||||
the un-filtered WAL is sent to all shards. To avoid this growing out of control,
|
||||
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
|
||||
on the safekeeper.
|
||||
- there is also little benefit to increasing the shard count beyond the number
|
||||
of pageservers in a region.
|
||||
|
||||
### Is it worth just rewriting all the data during a split to simplify reasoning about space?
|
||||
@@ -29,7 +29,6 @@ pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod more_process_metrics;
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
//! Helpers for observing duration on `HistogramVec` / `CounterVec` / `GaugeVec` / `MetricVec<T>`.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
pub trait DurationResultObserver {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
||||
}
|
||||
|
||||
pub async fn observe_async_block_duration_by_result<
|
||||
T,
|
||||
E,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
O: DurationResultObserver,
|
||||
>(
|
||||
observer: &O,
|
||||
block: F,
|
||||
) -> Result<T, E> {
|
||||
let start = Instant::now();
|
||||
let result = block.await;
|
||||
let duration = start.elapsed();
|
||||
observer.observe_result(&result, duration);
|
||||
result
|
||||
}
|
||||
@@ -35,7 +35,7 @@ pub struct NodeRegisterRequest {
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub availability: Option<NodeAvailabilityWrapper>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
@@ -66,30 +66,82 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
pub fn worst() -> Self {
|
||||
UtilizationScore(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
Active(UtilizationScore),
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl PartialEq for NodeAvailability {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
use NodeAvailability::*;
|
||||
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for NodeAvailability {}
|
||||
|
||||
// This wrapper provides serde functionality and it should only be used to
|
||||
// communicate with external callers which don't know or care about the
|
||||
// utilisation score of the pageserver it is targeting.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum NodeAvailabilityWrapper {
|
||||
Active,
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||
fn from(val: NodeAvailabilityWrapper) -> Self {
|
||||
match val {
|
||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||
// the heartbeats.
|
||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
fn from(val: NodeAvailability) -> Self {
|
||||
match val {
|
||||
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
|
||||
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
// This is used when parsing node configuration requests from neon-local.
|
||||
// Assume the worst possible utilisation score
|
||||
// and let it get updated via the heartbeats.
|
||||
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
|
||||
/// type needs to be defined with diesel traits in there.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
pub enum NodeSchedulingPolicy {
|
||||
Active,
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod utilization;
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
@@ -198,6 +199,13 @@ pub struct TimelineCreateRequest {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantShardSplitRequest {
|
||||
pub new_shard_count: u8,
|
||||
|
||||
// A tenant's stripe size is only meaningful the first time their shard count goes
|
||||
// above 1: therefore during a split from 1->N shards, we may modify the stripe size.
|
||||
//
|
||||
// If this is set while the stripe count is being increased from an already >1 value,
|
||||
// then the request will fail with 400.
|
||||
pub new_stripe_size: Option<ShardStripeSize>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -419,7 +427,7 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
pub tenant_id: TenantShardId,
|
||||
pub tenant_id: Option<TenantShardId>,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
@@ -570,7 +578,7 @@ pub struct TimelineInfo {
|
||||
pub walreceiver_status: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerMapInfo {
|
||||
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
||||
pub historic_layers: Vec<HistoricLayerInfo>,
|
||||
@@ -588,7 +596,7 @@ pub enum LayerAccessKind {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStatFullDetails {
|
||||
pub when_millis_since_epoch: u64,
|
||||
pub task_kind: &'static str,
|
||||
pub task_kind: Cow<'static, str>,
|
||||
pub access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
@@ -647,23 +655,23 @@ impl LayerResidenceEvent {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStats {
|
||||
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
||||
pub task_kind_access_flag: Vec<&'static str>,
|
||||
pub task_kind_access_flag: Vec<Cow<'static, str>>,
|
||||
pub first: Option<LayerAccessStatFullDetails>,
|
||||
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum InMemoryLayerInfo {
|
||||
Open { lsn_start: Lsn },
|
||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum HistoricLayerInfo {
|
||||
Delta {
|
||||
@@ -685,6 +693,32 @@ pub enum HistoricLayerInfo {
|
||||
},
|
||||
}
|
||||
|
||||
impl HistoricLayerInfo {
|
||||
pub fn layer_file_name(&self) -> &str {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name, ..
|
||||
} => layer_file_name,
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name, ..
|
||||
} => layer_file_name,
|
||||
}
|
||||
}
|
||||
pub fn is_remote(&self) -> bool {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta { remote, .. } => *remote,
|
||||
HistoricLayerInfo::Image { remote, .. } => *remote,
|
||||
}
|
||||
}
|
||||
pub fn set_remote(&mut self, value: bool) {
|
||||
let field = match self {
|
||||
HistoricLayerInfo::Delta { remote, .. } => remote,
|
||||
HistoricLayerInfo::Image { remote, .. } => remote,
|
||||
};
|
||||
*field = value;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
@@ -717,6 +751,52 @@ pub struct WalRedoManagerStatus {
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||
/// what's happening.
|
||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct SecondaryProgress {
|
||||
/// The remote storage LastModified time of the heatmap object we last downloaded.
|
||||
#[serde(
|
||||
serialize_with = "opt_ser_rfc3339_millis",
|
||||
deserialize_with = "opt_deser_rfc3339_millis"
|
||||
)]
|
||||
pub heatmap_mtime: Option<SystemTime>,
|
||||
|
||||
/// The number of layers currently on-disk
|
||||
pub layers_downloaded: usize,
|
||||
/// The number of layers in the most recently seen heatmap
|
||||
pub layers_total: usize,
|
||||
|
||||
/// The number of layer bytes currently on-disk
|
||||
pub bytes_downloaded: u64,
|
||||
/// The number of layer bytes in the most recently seen heatmap
|
||||
pub bytes_total: u64,
|
||||
}
|
||||
|
||||
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &Option<SystemTime>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
match ts {
|
||||
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
|
||||
None => serializer.serialize_none(),
|
||||
}
|
||||
}
|
||||
|
||||
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
match s {
|
||||
None => Ok(None),
|
||||
Some(s) => humantime::parse_rfc3339(&s)
|
||||
.map_err(serde::de::Error::custom)
|
||||
.map(Some),
|
||||
}
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
#[derive(
|
||||
Copy,
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::time::SystemTime;
|
||||
///
|
||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||
/// not handle full u64 values properly.
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
||||
pub struct PageserverUtilization {
|
||||
/// Used disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
@@ -21,7 +21,10 @@ pub struct PageserverUtilization {
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||
#[serde(
|
||||
serialize_with = "ser_rfc3339_millis",
|
||||
deserialize_with = "deser_rfc3339_millis"
|
||||
)]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
@@ -32,6 +35,14 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||
///
|
||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||
|
||||
@@ -6,11 +6,18 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::NodeId;
|
||||
|
||||
use crate::shard::TenantShardId;
|
||||
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
|
||||
|
||||
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
|
||||
/// startup.
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
/// Optional inline self-registration: this is useful with the storage controller,
|
||||
/// if the node already has a node_id set.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub register: Option<NodeRegisterRequest>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -18,6 +18,7 @@ camino.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
|
||||
@@ -157,9 +157,8 @@ impl AzureBlobStorage {
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
etag = Some(part.blob.properties.etag);
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
@@ -174,6 +173,16 @@ impl AzureBlobStorage {
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
}
|
||||
|
||||
if bufs.is_empty() {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Azure GET response contained no buffers"
|
||||
)));
|
||||
}
|
||||
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
||||
let etag = etag.unwrap();
|
||||
let last_modified = last_modified.unwrap();
|
||||
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
|
||||
@@ -42,6 +42,9 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
||||
pub use azure_core::Etag;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
@@ -291,9 +294,9 @@ pub type DownloadStream =
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: Option<SystemTime>,
|
||||
pub last_modified: SystemTime,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Option<String>,
|
||||
pub etag: Etag,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::{
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime},
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
@@ -30,6 +30,7 @@ use crate::{
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
use crate::Etag;
|
||||
|
||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
@@ -406,35 +407,37 @@ impl RemoteStorage for LocalFs {
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
let file_metadata = file_metadata(&target_path).await?;
|
||||
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let etag = mock_etag(&file_metadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: file_metadata
|
||||
.modified()
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||
etag,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -452,50 +455,51 @@ impl RemoteStorage for LocalFs {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||
}
|
||||
}
|
||||
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let len = source
|
||||
.metadata()
|
||||
.await
|
||||
.context("query file length")
|
||||
.map_err(DownloadError::Other)?
|
||||
.len();
|
||||
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
let file_metadata = file_metadata(&target_path).await?;
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let len = source
|
||||
.metadata()
|
||||
.await
|
||||
.context("query file length")
|
||||
.map_err(DownloadError::Other)?
|
||||
.len();
|
||||
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let etag = mock_etag(&file_metadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: file_metadata
|
||||
.modified()
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||
etag,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
@@ -610,13 +614,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
if file_path.exists() {
|
||||
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
|
||||
tokio::fs::metadata(&file_path).await.map_err(|e| {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
DownloadError::NotFound
|
||||
} else {
|
||||
DownloadError::BadInput(e.into())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
|
||||
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
|
||||
// quickly, with less overhead than using a mock S3 server.
|
||||
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
|
||||
let mtime = meta.modified().expect("Filesystem mtime missing");
|
||||
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use hyper::Body;
|
||||
@@ -287,8 +287,17 @@ impl S3Bucket {
|
||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag;
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
let etag = object_output
|
||||
.e_tag
|
||||
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
|
||||
.into();
|
||||
let last_modified = object_output
|
||||
.last_modified
|
||||
.ok_or(DownloadError::Other(anyhow::anyhow!(
|
||||
"Missing LastModified header"
|
||||
)))?
|
||||
.try_into()
|
||||
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
|
||||
@@ -17,6 +17,7 @@ use remote_storage::{
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
@@ -117,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let last_modified = dl.last_modified;
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
@@ -484,32 +485,33 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
let file_len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
|
||||
|
||||
{
|
||||
let mut stream = ctx
|
||||
let stream = ctx
|
||||
.client
|
||||
.download(&path, &cancel)
|
||||
.await
|
||||
.expect("download succeeds")
|
||||
.download_stream;
|
||||
|
||||
let first = stream
|
||||
.next()
|
||||
.await
|
||||
.expect("should have the first blob")
|
||||
.expect("should have succeeded");
|
||||
let mut reader = std::pin::pin!(tokio_util::io::StreamReader::new(stream));
|
||||
|
||||
tracing::info!(len = first.len(), "downloaded first chunk");
|
||||
let first = reader.fill_buf().await.expect("should have the first blob");
|
||||
|
||||
let len = first.len();
|
||||
tracing::info!(len, "downloaded first chunk");
|
||||
|
||||
assert!(
|
||||
first.len() < len,
|
||||
first.len() < file_len,
|
||||
"uploaded file is too small, we downloaded all on first chunk"
|
||||
);
|
||||
|
||||
reader.consume(len);
|
||||
|
||||
cancel.cancel();
|
||||
|
||||
let next = stream.next().await.expect("stream should have more");
|
||||
let next = reader.fill_buf().await;
|
||||
|
||||
let e = next.expect_err("expected an error, but got a chunk?");
|
||||
|
||||
@@ -520,6 +522,10 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
|
||||
.is_some_and(|e| matches!(e, DownloadError::Cancelled)),
|
||||
"{inner:?}"
|
||||
);
|
||||
|
||||
let e = DownloadError::from(e);
|
||||
|
||||
assert!(matches!(e, DownloadError::Cancelled), "{e:?}");
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct SerdeRepr<T> {
|
||||
buffer: Vec<T>,
|
||||
buffer_size: usize,
|
||||
drop_count: u64,
|
||||
}
|
||||
|
||||
@@ -61,6 +62,7 @@ where
|
||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||
SerdeRepr {
|
||||
buffer: buffer.iter().cloned().collect(),
|
||||
buffer_size: L,
|
||||
drop_count: *drop_count,
|
||||
}
|
||||
}
|
||||
@@ -78,19 +80,52 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
||||
where
|
||||
T: Clone + serde::Deserialize<'de>,
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let SerdeRepr {
|
||||
buffer: des_buffer,
|
||||
drop_count,
|
||||
buffer_size,
|
||||
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
||||
if buffer_size != L {
|
||||
use serde::de::Error;
|
||||
return Err(D::Error::custom(format!(
|
||||
"invalid buffer_size, expecting {L} got {buffer_size}"
|
||||
)));
|
||||
}
|
||||
let mut buffer = HistoryBuffer::new();
|
||||
buffer.extend(des_buffer);
|
||||
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::HistoryBufferWithDropCounter;
|
||||
|
||||
#[test]
|
||||
fn test_basics() {
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
||||
b.write(1);
|
||||
b.write(2);
|
||||
b.write(3);
|
||||
assert!(b.iter().any(|e| *e == 2));
|
||||
assert!(b.iter().any(|e| *e == 3));
|
||||
assert!(!b.iter().any(|e| *e == 1));
|
||||
|
||||
// round-trip serde
|
||||
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
||||
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
||||
b.iter().cloned().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -123,6 +123,12 @@ impl PageserverFeedback {
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
b"shard_number" => {
|
||||
let len = buf.get_i32();
|
||||
// TODO: this will be implemented in the next update,
|
||||
// for now, we just skip the value.
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
|
||||
@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or returns an unique initialization
|
||||
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
|
||||
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
|
||||
// It looks like OnceCell::get_or_init could be implemented using this method instead of
|
||||
// duplication. However, that makes the future be !Send due to possibly holding on to the
|
||||
// MutexGuard over an await point.
|
||||
loop {
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
{
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire().await
|
||||
};
|
||||
|
||||
let Ok(permit) = permit else {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
||||
// there was a take_and_deinit in between
|
||||
continue;
|
||||
}
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
};
|
||||
|
||||
permit.forget();
|
||||
}
|
||||
|
||||
let permit = InitPermit(sem);
|
||||
return Err(permit);
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
@@ -481,4 +524,39 @@ mod tests {
|
||||
|
||||
assert_eq!("t1", *cell.get().unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn detached_init_smoke() {
|
||||
let target = OnceCell::default();
|
||||
|
||||
let Err(permit) = target.get_or_init_detached().await else {
|
||||
unreachable!("it is not initialized")
|
||||
};
|
||||
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
|
||||
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
|
||||
)
|
||||
.await
|
||||
.expect_err("should timeout since we are already holding the permit");
|
||||
|
||||
target.set(42, permit);
|
||||
|
||||
let (_answer, permit) = {
|
||||
let mut guard = target
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(*guard, 42);
|
||||
|
||||
guard.take_and_deinit()
|
||||
};
|
||||
|
||||
assert!(target.get().is_none());
|
||||
|
||||
target.set(11, permit);
|
||||
|
||||
assert_eq!(*target.get().unwrap(), 11);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
(*api).process_safekeeper_feedback(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -142,7 +142,7 @@ pub trait ApiImpl {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
||||
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
|
||||
@@ -89,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
|
||||
@@ -169,7 +169,7 @@ impl Client {
|
||||
self.request(Method::GET, uri, ()).await
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
@@ -181,7 +181,16 @@ impl Client {
|
||||
} else {
|
||||
req
|
||||
};
|
||||
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
|
||||
req.json(&body).send().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let res = self.request_noerror(method, uri, body).await?;
|
||||
let response = res.error_from_body().await?;
|
||||
Ok(response)
|
||||
}
|
||||
@@ -240,13 +249,26 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
pub async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
wait: Option<std::time::Duration>,
|
||||
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{}/secondary/download",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
);
|
||||
self.request(Method::POST, &uri, ()).await?;
|
||||
Ok(())
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
if let Some(wait) = wait {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_ms", &format!("{}", wait.as_millis()));
|
||||
}
|
||||
|
||||
let response = self.request(Method::POST, path, ()).await?;
|
||||
let status = response.status();
|
||||
let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
|
||||
Ok((status, progress))
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
@@ -257,7 +279,7 @@ impl Client {
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest {
|
||||
tenant_id: tenant_shard_id,
|
||||
tenant_id: Some(tenant_shard_id),
|
||||
config,
|
||||
};
|
||||
|
||||
@@ -416,4 +438,77 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_map_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<LayerMapInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
|
||||
);
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_evict(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &str,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||
);
|
||||
let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
StatusCode::NOT_MODIFIED => Ok(false),
|
||||
// TODO: dedupe this pattern / introduce separate error variant?
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn layer_ondemand_download(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &str,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||
);
|
||||
let resp = self.request_noerror(Method::GET, &uri, ()).await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
StatusCode::NOT_MODIFIED => Ok(false),
|
||||
// TODO: dedupe this pattern / introduce separate error variant?
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
272
pageserver/pagebench/src/cmd/ondemand_download_churn.rs
Normal file
272
pageserver/pagebench/src/cmd/ondemand_download_churn.rs
Normal file
@@ -0,0 +1,272 @@
|
||||
use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
|
||||
|
||||
use pageserver_client::mgmt_api;
|
||||
use rand::seq::SliceRandom;
|
||||
use tracing::{debug, info};
|
||||
use utils::id::{TenantTimelineId, TimelineId};
|
||||
|
||||
use tokio::{
|
||||
sync::{mpsc, OwnedSemaphorePermit},
|
||||
task::JoinSet,
|
||||
};
|
||||
|
||||
use std::{
|
||||
num::NonZeroUsize,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
/// Evict & on-demand download random layers.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long, default_value = "1")]
|
||||
tasks_per_target: NonZeroUsize,
|
||||
#[clap(long, default_value = "1")]
|
||||
concurrency_per_target: NonZeroUsize,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
/// Before starting the benchmark, live-reconfigure the pageserver to use the given
|
||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||
#[clap(long)]
|
||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let task = rt.spawn(main_impl(args));
|
||||
rt.block_on(task).unwrap().unwrap();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
evictions: AtomicU64,
|
||||
downloads: AtomicU64,
|
||||
timeline_restarts: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn eviction_done(&self) {
|
||||
self.evictions.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn download_done(&self) {
|
||||
self.downloads.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn timeline_restart_done(&self) {
|
||||
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
if let Some(engine_str) = &args.set_io_engine {
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
}
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut tasks = JoinSet::new();
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
tasks.spawn({
|
||||
let live_stats = Arc::clone(&live_stats);
|
||||
async move {
|
||||
let mut last_at = Instant::now();
|
||||
loop {
|
||||
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
||||
let now = Instant::now();
|
||||
let delta: Duration = now - last_at;
|
||||
last_at = now;
|
||||
|
||||
let LiveStats {
|
||||
evictions,
|
||||
downloads,
|
||||
timeline_restarts,
|
||||
} = &*live_stats;
|
||||
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
||||
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
for tl in timelines {
|
||||
for _ in 0..args.tasks_per_target.get() {
|
||||
tasks.spawn(timeline_actor(
|
||||
args,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
tl,
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
res.unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn timeline_actor(
|
||||
args: &'static Args,
|
||||
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
||||
timeline: TenantTimelineId,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
// TODO: support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
|
||||
struct Timeline {
|
||||
joinset: JoinSet<()>,
|
||||
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
||||
concurrency: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
loop {
|
||||
debug!("restarting timeline");
|
||||
let layer_map_info = mgmt_api_client
|
||||
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
let concurrency = Arc::new(tokio::sync::Semaphore::new(
|
||||
args.concurrency_per_target.get(),
|
||||
));
|
||||
|
||||
let mut joinset = JoinSet::new();
|
||||
let layers = layer_map_info
|
||||
.historic_layers
|
||||
.into_iter()
|
||||
.map(|historic_layer| {
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
joinset.spawn(layer_actor(
|
||||
tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
historic_layer,
|
||||
rx,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
tx
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut timeline = Timeline {
|
||||
joinset,
|
||||
layers,
|
||||
concurrency,
|
||||
};
|
||||
|
||||
live_stats.timeline_restart_done();
|
||||
|
||||
loop {
|
||||
assert!(!timeline.joinset.is_empty());
|
||||
if let Some(res) = timeline.joinset.try_join_next() {
|
||||
debug!(?res, "a layer actor exited, should not happen");
|
||||
timeline.joinset.shutdown().await;
|
||||
break;
|
||||
}
|
||||
|
||||
let mut permit = Some(
|
||||
Arc::clone(&timeline.concurrency)
|
||||
.acquire_owned()
|
||||
.await
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
loop {
|
||||
let layer_tx = {
|
||||
let mut rng = rand::thread_rng();
|
||||
timeline.layers.choose_mut(&mut rng).expect("no layers")
|
||||
};
|
||||
match layer_tx.try_send(permit.take().unwrap()) {
|
||||
Ok(_) => break,
|
||||
Err(e) => match e {
|
||||
mpsc::error::TrySendError::Full(back) => {
|
||||
// TODO: retrying introduces bias away from slow downloaders
|
||||
permit.replace(back);
|
||||
}
|
||||
mpsc::error::TrySendError::Closed(_) => panic!(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn layer_actor(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
mut layer: HistoricLayerInfo,
|
||||
mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
|
||||
mgmt_api_client: Arc<mgmt_api::Client>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
#[derive(Clone, Copy)]
|
||||
enum Action {
|
||||
Evict,
|
||||
OnDemandDownload,
|
||||
}
|
||||
|
||||
while let Some(_permit) = rx.recv().await {
|
||||
let action = if layer.is_remote() {
|
||||
Action::OnDemandDownload
|
||||
} else {
|
||||
Action::Evict
|
||||
};
|
||||
|
||||
let did_it = match action {
|
||||
Action::Evict => {
|
||||
let did_it = mgmt_api_client
|
||||
.layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.eviction_done();
|
||||
did_it
|
||||
}
|
||||
Action::OnDemandDownload => {
|
||||
let did_it = mgmt_api_client
|
||||
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.download_done();
|
||||
did_it
|
||||
}
|
||||
};
|
||||
if !did_it {
|
||||
debug!("local copy of layer map appears out of sync, re-downloading");
|
||||
return;
|
||||
}
|
||||
debug!("did it");
|
||||
layer.set_remote(match action {
|
||||
Action::Evict => true,
|
||||
Action::OnDemandDownload => false,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,7 @@ mod util {
|
||||
mod cmd {
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
@@ -25,6 +26,7 @@ enum Args {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -43,6 +45,7 @@ fn main() {
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![recursion_limit = "300"]
|
||||
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use std::env::{var, VarError};
|
||||
@@ -118,6 +120,9 @@ fn main() -> anyhow::Result<()> {
|
||||
&[("node_id", &conf.id.to_string())],
|
||||
);
|
||||
|
||||
// after setting up logging, log the effective IO engine choice
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||
@@ -312,6 +317,7 @@ fn start_pageserver(
|
||||
let http_listener = tcp_listener::bind(http_addr)?;
|
||||
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
@@ -544,7 +550,7 @@ fn start_pageserver(
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
conf,
|
||||
tenant_manager,
|
||||
tenant_manager.clone(),
|
||||
http_auth.clone(),
|
||||
remote_storage.clone(),
|
||||
broker_client.clone(),
|
||||
@@ -688,6 +694,7 @@ fn start_pageserver(
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
|
||||
@@ -7,8 +7,9 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use serde;
|
||||
use serde::de::IntoDeserializer;
|
||||
use std::env;
|
||||
use std::{collections::HashMap, env};
|
||||
use storage_broker::Uri;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -29,18 +30,17 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::virtual_file;
|
||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||
use crate::{tenant::config::TenantConf, virtual_file};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
@@ -83,6 +83,10 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring";
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs";
|
||||
|
||||
pub const DEFAULT_GET_VECTORED_IMPL: &str = "sequential";
|
||||
@@ -286,21 +290,49 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
// and avoid possible confusion with nested options
|
||||
#[derive(Clone, Default)]
|
||||
pub enum BuilderValue<T> {
|
||||
Set(T),
|
||||
#[default]
|
||||
NotSet,
|
||||
}
|
||||
|
||||
impl<T> BuilderValue<T> {
|
||||
pub fn ok_or<E>(self, err: E) -> Result<T, E> {
|
||||
impl<T: Clone> BuilderValue<T> {
|
||||
pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
|
||||
match self {
|
||||
Self::Set(v) => Ok(v),
|
||||
Self::NotSet => Err(err),
|
||||
Self::Set(v) => Ok(v.clone()),
|
||||
Self::NotSet => match default {
|
||||
BuilderValue::Set(v) => Ok(v.clone()),
|
||||
BuilderValue::NotSet => {
|
||||
anyhow::bail!("missing config value {field_name:?}")
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Certain metadata (e.g. externally-addressable name, AZ) is delivered
|
||||
// as a separate structure. This information is not neeed by the pageserver
|
||||
// itself, it is only used for registering the pageserver with the control
|
||||
// plane and/or storage controller.
|
||||
//
|
||||
#[derive(serde::Deserialize)]
|
||||
pub(crate) struct NodeMetadata {
|
||||
#[serde(rename = "host")]
|
||||
pub(crate) postgres_host: String,
|
||||
#[serde(rename = "port")]
|
||||
pub(crate) postgres_port: u16,
|
||||
pub(crate) http_host: String,
|
||||
pub(crate) http_port: u16,
|
||||
|
||||
// Deployment tools may write fields to the metadata file beyond what we
|
||||
// use in this type: this type intentionally only names fields that require.
|
||||
#[serde(flatten)]
|
||||
pub(crate) other: HashMap<String, serde_json::Value>,
|
||||
}
|
||||
|
||||
// needed to simplify config construction
|
||||
#[derive(Default)]
|
||||
struct PageServerConfigBuilder {
|
||||
listen_pg_addr: BuilderValue<String>,
|
||||
|
||||
@@ -368,8 +400,9 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
fn default() -> Self {
|
||||
impl PageServerConfigBuilder {
|
||||
#[inline(always)]
|
||||
fn default_values() -> Self {
|
||||
use self::BuilderValue::*;
|
||||
use defaults::*;
|
||||
Self {
|
||||
@@ -622,125 +655,96 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?;
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
availability_zone: self
|
||||
.availability_zone
|
||||
.ok_or(anyhow!("missing availability_zone"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
wal_redo_timeout: self
|
||||
.wal_redo_timeout
|
||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
||||
page_cache_size: self
|
||||
.page_cache_size
|
||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
||||
max_file_descriptors: self
|
||||
.max_file_descriptors
|
||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
http_auth_type: self
|
||||
.http_auth_type
|
||||
.ok_or(anyhow!("missing http_auth_type"))?,
|
||||
pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
remote_storage_config: self
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: self
|
||||
.broker_endpoint
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_keepalive_interval: self
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
metric_collection_interval: self
|
||||
.metric_collection_interval
|
||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
||||
cached_metric_collection_interval: self
|
||||
.cached_metric_collection_interval
|
||||
.ok_or(anyhow!("missing cached_metric_collection_interval"))?,
|
||||
metric_collection_endpoint: self
|
||||
.metric_collection_endpoint
|
||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
||||
synthetic_size_calculation_interval: self
|
||||
.synthetic_size_calculation_interval
|
||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
||||
disk_usage_based_eviction: self
|
||||
.disk_usage_based_eviction
|
||||
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
ondemand_download_behavior_treat_error_as_warn: self
|
||||
.ondemand_download_behavior_treat_error_as_warn
|
||||
.ok_or(anyhow!(
|
||||
"missing ondemand_download_behavior_treat_error_as_warn"
|
||||
))?,
|
||||
background_task_maximum_delay: self
|
||||
.background_task_maximum_delay
|
||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
||||
control_plane_api: self
|
||||
.control_plane_api
|
||||
.ok_or(anyhow!("missing control_plane_api"))?,
|
||||
control_plane_api_token: self
|
||||
.control_plane_api_token
|
||||
.ok_or(anyhow!("missing control_plane_api_token"))?,
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
heatmap_upload_concurrency: self
|
||||
.heatmap_upload_concurrency
|
||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
||||
secondary_download_concurrency: self
|
||||
.secondary_download_concurrency
|
||||
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
get_vectored_impl: self
|
||||
.get_vectored_impl
|
||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
||||
max_vectored_read_bytes: self
|
||||
.max_vectored_read_bytes
|
||||
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
|
||||
validate_vectored_get: self
|
||||
.validate_vectored_get
|
||||
.ok_or(anyhow!("missing validate_vectored_get"))?,
|
||||
})
|
||||
let default = Self::default_values();
|
||||
|
||||
macro_rules! conf {
|
||||
(USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
|
||||
PageServerConf {
|
||||
$(
|
||||
$field: self.$field.ok_or(stringify!($field), default.$field)?,
|
||||
)*
|
||||
$(
|
||||
$custom_field: $custom_value,
|
||||
)*
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Ok(conf!(
|
||||
USING DEFAULT
|
||||
{
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
workdir,
|
||||
pg_distrib_dir,
|
||||
http_auth_type,
|
||||
pg_auth_type,
|
||||
auth_validation_public_key_path,
|
||||
remote_storage_config,
|
||||
id,
|
||||
broker_endpoint,
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
control_plane_api_token,
|
||||
control_plane_emergency_mode,
|
||||
heatmap_upload_concurrency,
|
||||
secondary_download_concurrency,
|
||||
ingest_batch_size,
|
||||
get_vectored_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new({
|
||||
self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or("concurrent_tenant_warmpup",
|
||||
default.concurrent_tenant_warmup)?
|
||||
}),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or("concurrent_tenant_size_logical_size_queries",
|
||||
default.concurrent_tenant_size_logical_size_queries.clone())?
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
// re-use `concurrent_tenant_size_logical_size_queries`
|
||||
self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or("eviction_task_immitated_concurrent_logical_size_queries",
|
||||
default.concurrent_tenant_size_logical_size_queries.clone())?,
|
||||
),
|
||||
virtual_file_io_engine: match self.virtual_file_io_engine {
|
||||
BuilderValue::Set(v) => v,
|
||||
BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
|
||||
io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
|
||||
io_engine::FeatureTestResult::Worse { engine, remark } => {
|
||||
// TODO: bubble this up to the caller so we can tracing::warn! it.
|
||||
eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
|
||||
engine
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -757,6 +761,10 @@ impl PageServerConf {
|
||||
self.workdir.join("deletion")
|
||||
}
|
||||
|
||||
pub fn metadata_path(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("metadata.json")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
@@ -816,18 +824,7 @@ impl PageServerConf {
|
||||
.join(timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub fn timeline_uninit_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
pub(crate) fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -838,7 +835,10 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
pub(crate) fn tenant_deleted_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::collections::HashMap;
|
||||
|
||||
use futures::Future;
|
||||
use pageserver_api::{
|
||||
controller_api::NodeRegisterRequest,
|
||||
shard::TenantShardId,
|
||||
upcall_api::{
|
||||
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
|
||||
@@ -12,7 +13,10 @@ use tokio_util::sync::CancellationToken;
|
||||
use url::Url;
|
||||
use utils::{backoff, generation::Generation, id::NodeId};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::{
|
||||
config::{NodeMetadata, PageServerConf},
|
||||
virtual_file::on_fatal_io_error,
|
||||
};
|
||||
|
||||
/// The Pageserver's client for using the control plane API: this is a small subset
|
||||
/// of the overall control plane API, for dealing with generations (see docs/rfcs/025-generation-numbers.md)
|
||||
@@ -32,6 +36,7 @@ pub enum RetryForeverError {
|
||||
pub trait ControlPlaneGenerationsApi {
|
||||
fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
|
||||
fn validate(
|
||||
&self,
|
||||
@@ -110,13 +115,59 @@ impl ControlPlaneClient {
|
||||
|
||||
impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
/// Block until we get a successful response, or error out if we are shut down
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
let re_attach_path = self
|
||||
.base_url
|
||||
.join("re-attach")
|
||||
.expect("Failed to build re-attach path");
|
||||
|
||||
// Include registration content in the re-attach request if a metadata file is readable
|
||||
let metadata_path = conf.metadata_path();
|
||||
let register = match tokio::fs::read_to_string(&metadata_path).await {
|
||||
Ok(metadata_str) => match serde_json::from_str::<NodeMetadata>(&metadata_str) {
|
||||
Ok(m) => {
|
||||
// Since we run one time at startup, be generous in our logging and
|
||||
// dump all metadata.
|
||||
tracing::info!(
|
||||
"Loaded node metadata: postgres {}:{}, http {}:{}, other fields: {:?}",
|
||||
m.postgres_host,
|
||||
m.postgres_port,
|
||||
m.http_host,
|
||||
m.http_port,
|
||||
m.other
|
||||
);
|
||||
|
||||
Some(NodeRegisterRequest {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Unreadable metadata in {metadata_path}: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// This is legal: we may have been deployed with some external script
|
||||
// doing registration for us.
|
||||
tracing::info!("Metadata file not found at {metadata_path}");
|
||||
} else {
|
||||
on_fatal_io_error(&e, &format!("Loading metadata at {metadata_path}"))
|
||||
}
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let request = ReAttachRequest {
|
||||
node_id: self.node_id,
|
||||
register,
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-re-attach");
|
||||
|
||||
@@ -831,7 +831,10 @@ mod test {
|
||||
}
|
||||
|
||||
impl ControlPlaneGenerationsApi for MockControlPlane {
|
||||
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
async fn re_attach(
|
||||
&self,
|
||||
_conf: &PageServerConf,
|
||||
) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
|
||||
unimplemented!()
|
||||
}
|
||||
async fn validate(
|
||||
|
||||
@@ -567,9 +567,9 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
/v1/tenant/{tenant_id}/location_config:
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
@@ -932,6 +932,75 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/heatmap_upload:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
If the location is in an attached mode, upload the current state to the remote heatmap
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: wait_ms
|
||||
description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
post:
|
||||
description: |
|
||||
If the location is in secondary mode, download latest heatmap and layers
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"202":
|
||||
description: Download has started but not yet finished
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1314,10 +1383,11 @@ components:
|
||||
TenantLocationConfigRequest:
|
||||
type: object
|
||||
required:
|
||||
- tenant_id
|
||||
- mode
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
description: Not used, scheduled for removal.
|
||||
mode:
|
||||
type: string
|
||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
||||
@@ -1391,7 +1461,7 @@ components:
|
||||
trace_read_requests:
|
||||
type: boolean
|
||||
heatmap_period:
|
||||
type: integer
|
||||
type: string
|
||||
TenantConfigResponse:
|
||||
type: object
|
||||
properties:
|
||||
@@ -1569,6 +1639,37 @@ components:
|
||||
Lower is better score for how good this pageserver would be for the next tenant.
|
||||
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
||||
|
||||
SecondaryProgress:
|
||||
type: object
|
||||
required:
|
||||
- heatmap_mtime
|
||||
- layers_downloaded
|
||||
- layers_total
|
||||
- bytes_downloaded
|
||||
- bytes_total
|
||||
properties:
|
||||
heatmap_mtime:
|
||||
type: string
|
||||
format: date-time
|
||||
description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
|
||||
layers_downloaded:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many layers from the latest layer heatmap are present on disk
|
||||
bytes_downloaded:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many bytes of layer content from the latest layer heatmap are present on disk
|
||||
layers_total:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many layers were in the latest layer heatmap
|
||||
bytes_total:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many bytes of layer content were in the latest layer heatmap
|
||||
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -535,9 +535,9 @@ async fn timeline_create_handler(
|
||||
)
|
||||
}
|
||||
Err(
|
||||
tenant::CreateTimelineError::Conflict
|
||||
| tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, ()),
|
||||
e @ tenant::CreateTimelineError::Conflict
|
||||
| e @ tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
@@ -1151,7 +1151,12 @@ async fn tenant_shard_split_handler(
|
||||
|
||||
let new_shards = state
|
||||
.tenant_manager
|
||||
.shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
|
||||
.shard_split(
|
||||
tenant_shard_id,
|
||||
ShardCount::new(req.new_shard_count),
|
||||
req.new_stripe_size,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -1982,13 +1987,42 @@ async fn secondary_download_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_shard_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// We don't need this to issue the download request, but:
|
||||
// - it enables us to cleanly return 404 if we get a request for an absent shard
|
||||
// - we will use this to provide status feedback in the response
|
||||
let Some(secondary_tenant) = state
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(tenant_shard_id)
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
|
||||
));
|
||||
};
|
||||
|
||||
let timeout = wait.unwrap_or(Duration::MAX);
|
||||
|
||||
let status = match tokio::time::timeout(
|
||||
timeout,
|
||||
state.secondary_controller.download_tenant(tenant_shard_id),
|
||||
)
|
||||
.await
|
||||
{
|
||||
// Download job ran to completion.
|
||||
Ok(Ok(())) => StatusCode::OK,
|
||||
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||
// was detached between our check above and executing the download job.
|
||||
Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
|
||||
// A timeout is not an error: we have started the download, we're just not done
|
||||
// yet. The caller will get a response body indicating status.
|
||||
Err(_) => StatusCode::ACCEPTED,
|
||||
};
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -2048,6 +2082,10 @@ async fn get_utilization(
|
||||
r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
fail::fail_point!("get-utilization-http-handler", |_| {
|
||||
Err(ApiError::ResourceUnavailable("failpoint".into()))
|
||||
});
|
||||
|
||||
// this probably could be completely public, but lets make that change later.
|
||||
check_permission(&r, None)?;
|
||||
|
||||
@@ -2103,6 +2141,16 @@ where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
|
||||
{
|
||||
if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
|
||||
fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
|
||||
"failpoint".into()
|
||||
)));
|
||||
|
||||
fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
|
||||
anyhow::anyhow!("failpoint")
|
||||
)));
|
||||
}
|
||||
|
||||
// Spawn a new task to handle the request, to protect the handler from unexpected
|
||||
// async cancellations. Most pageserver functions are not async cancellation safe.
|
||||
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
|
||||
@@ -2247,7 +2295,7 @@ pub fn make_router(
|
||||
.get("/v1/location_config", |r| {
|
||||
api_handler(r, list_location_config_handler)
|
||||
})
|
||||
.get("/v1/location_config/:tenant_id", |r| {
|
||||
.get("/v1/location_config/:tenant_shard_id", |r| {
|
||||
api_handler(r, get_location_config_handler)
|
||||
})
|
||||
.put(
|
||||
|
||||
@@ -31,6 +31,7 @@ pub mod walredo;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::mgr::TenantManager;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
||||
pub async fn shutdown_pageserver(
|
||||
tenant_manager: &TenantManager,
|
||||
deletion_queue: Option<DeletionQueue>,
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
timed(
|
||||
tenant::mgr::shutdown_all_tenants(),
|
||||
tenant_manager.shutdown(),
|
||||
"shutdown all tenants",
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
@@ -114,27 +119,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
/// tenant path while in secondary mode.
|
||||
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
|
||||
/// A suffix used for various temporary files. Any temporary files found in the
|
||||
/// data directory at pageserver startup can be automatically removed.
|
||||
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
/// A marker file to mark that a timeline directory was not fully initialized.
|
||||
/// If a timeline directory with this marker is encountered at pageserver startup,
|
||||
/// the timeline directory and the marker file are both removed.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
@@ -161,11 +166,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
||||
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
||||
// from the name.
|
||||
|
||||
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
pub fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use enum_map::EnumMap;
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
|
||||
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
@@ -168,7 +167,7 @@ impl GetVectoredLatency {
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
"Time spent in get_vectored",
|
||||
"Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
@@ -1283,11 +1282,65 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|
|
||||
})
|
||||
});
|
||||
|
||||
impl DurationResultObserver for BasebackupQueryTime {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
parent: &'a BasebackupQueryTime,
|
||||
ctx: &'c RequestContext,
|
||||
start: std::time::Instant,
|
||||
}
|
||||
|
||||
impl BasebackupQueryTime {
|
||||
pub(crate) fn start_recording<'c: 'a, 'a>(
|
||||
&'a self,
|
||||
ctx: &'c RequestContext,
|
||||
) -> BasebackupQueryTimeOngoingRecording<'_, '_> {
|
||||
let start = Instant::now();
|
||||
match ctx.micros_spent_throttled.open() {
|
||||
Ok(()) => (),
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
BasebackupQueryTimeOngoingRecording {
|
||||
parent: self,
|
||||
ctx,
|
||||
start,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
pub(crate) fn observe<T, E>(self, res: &Result<T, E>) {
|
||||
let elapsed = self.start.elapsed();
|
||||
let ex_throttled = self
|
||||
.ctx
|
||||
.micros_spent_throttled
|
||||
.close_and_checked_sub_from(elapsed);
|
||||
let ex_throttled = match ex_throttled {
|
||||
Ok(ex_throttled) => ex_throttled,
|
||||
Err(error) => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
||||
metric.observe(duration.as_secs_f64());
|
||||
let metric = self
|
||||
.parent
|
||||
.0
|
||||
.get_metric_with_label_values(&[label_value])
|
||||
.unwrap();
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1964,10 +2017,8 @@ impl TimelineMetrics {
|
||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
pub(crate) fn shutdown(&self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let shard_id = &self.shard_id;
|
||||
@@ -2414,7 +2465,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub mod tokio_epoll_uring {
|
||||
use metrics::UIntGauge;
|
||||
use metrics::{register_int_counter, UIntGauge};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub struct Collector {
|
||||
descs: Vec<metrics::core::Desc>,
|
||||
@@ -2422,15 +2474,13 @@ pub mod tokio_epoll_uring {
|
||||
systems_destroyed: UIntGauge,
|
||||
}
|
||||
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
impl metrics::core::Collector for Collector {
|
||||
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
||||
self.descs.iter().collect()
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||
let mut mfs = Vec::with_capacity(NMETRICS);
|
||||
let mut mfs = Vec::with_capacity(Self::NMETRICS);
|
||||
let tokio_epoll_uring::metrics::Metrics {
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
@@ -2444,6 +2494,8 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
|
||||
impl Collector {
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
#[allow(clippy::new_without_default)]
|
||||
pub fn new() -> Self {
|
||||
let mut descs = Vec::new();
|
||||
@@ -2477,6 +2529,22 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
|
||||
"Number of times where thread_local_system creation spanned multiple executor threads",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
|
||||
"Number of times thread_local_system creation failed and was retried after back-off.",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) mod tenant_throttling {
|
||||
@@ -2605,6 +2673,8 @@ pub fn preinitialize_metrics() {
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
@@ -2623,6 +2693,12 @@ pub fn preinitialize_metrics() {
|
||||
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||
Lazy::force(&disk_usage_based_eviction::METRICS);
|
||||
|
||||
for state_name in pageserver_api::models::TenantState::VARIANTS {
|
||||
// initialize the metric for all gauges, otherwise the time series might seemingly show
|
||||
// values from last restart.
|
||||
TENANT_STATE_METRIC.with_label_values(&[state_name]).set(0);
|
||||
}
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
|
||||
@@ -1199,7 +1199,7 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1214,7 +1214,7 @@ impl PageServerHandler {
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
info!("waiting for {}", lsn);
|
||||
timeline.wait_lsn(lsn, &ctx).await?;
|
||||
timeline.wait_lsn(lsn, ctx).await?;
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
@@ -1236,7 +1236,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else {
|
||||
@@ -1257,7 +1257,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// shutdown the encoder to ensure the gzip footer is written
|
||||
@@ -1269,7 +1269,7 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
&ctx,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -1449,25 +1449,25 @@ where
|
||||
false
|
||||
};
|
||||
|
||||
::metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -1563,7 +1563,7 @@ where
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
ctx,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
|
||||
//
|
||||
// There are four runtimes:
|
||||
//
|
||||
@@ -272,9 +270,6 @@ pub enum TaskKind {
|
||||
// Task that uploads a file to remote storage
|
||||
RemoteUploadTask,
|
||||
|
||||
// Task that downloads a file from remote storage
|
||||
RemoteDownloadTask,
|
||||
|
||||
// task that handles the initial downloading of all tenants
|
||||
InitialLoad,
|
||||
|
||||
@@ -456,7 +451,7 @@ async fn task_finish(
|
||||
}
|
||||
|
||||
if shutdown_process {
|
||||
shutdown_pageserver(None, 1).await;
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,8 +55,8 @@ use self::mgr::GetTenantError;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::upload::upload_index_part;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineCreateGuard;
|
||||
use self::timeline::uninit::TimelineExclusionError;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
@@ -565,9 +565,8 @@ impl Tenant {
|
||||
// avoiding holding it across awaits
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
match timelines_accessor.entry(timeline_id) {
|
||||
// We should never try and load the same timeline twice during startup
|
||||
Entry::Occupied(_) => {
|
||||
// The uninit mark file acts as a lock that prevents another task from
|
||||
// initializing the timeline at the same time.
|
||||
unreachable!(
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||
);
|
||||
@@ -1064,8 +1063,7 @@ impl Tenant {
|
||||
let entry_path = entry.path();
|
||||
|
||||
let purge = if crate::is_temporary(entry_path)
|
||||
// TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
|
||||
// covered by the check that the timeline must exist in remote storage.
|
||||
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
|
||||
|| is_uninit_mark(entry_path)
|
||||
|| crate::is_delete_mark(entry_path)
|
||||
{
|
||||
@@ -1298,11 +1296,6 @@ impl Tenant {
|
||||
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
|
||||
/// and the timeline will fail to load at a restart.
|
||||
///
|
||||
/// That's why we add an uninit mark file, and wrap it together witht the Timeline
|
||||
/// in-memory object into UninitializedTimeline.
|
||||
/// Once the caller is done setting up the timeline, they should call
|
||||
/// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
|
||||
///
|
||||
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||
/// minimum amount of keys required to get a writable timeline.
|
||||
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||
@@ -1318,7 +1311,9 @@ impl Tenant {
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
|
||||
// Protect against concurrent attempts to use this TimelineId
|
||||
let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
|
||||
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent LSN to 0, The caller must import some data to
|
||||
// make it valid, before calling finish_creation()
|
||||
@@ -1333,7 +1328,7 @@ impl Tenant {
|
||||
self.prepare_new_timeline(
|
||||
new_timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
)
|
||||
@@ -1421,9 +1416,8 @@ impl Tenant {
|
||||
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||
|
||||
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
|
||||
// and that no other creation attempts will be allowed in while we are working. The
|
||||
// uninit_mark is a guard.
|
||||
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
|
||||
// and that no other creation attempts will be allowed in while we are working.
|
||||
let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
|
||||
Ok(m) => m,
|
||||
Err(TimelineExclusionError::AlreadyCreating) => {
|
||||
// Creation is in progress, we cannot create it again, and we cannot
|
||||
@@ -1466,6 +1460,8 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
pausable_failpoint!("timeline-creation-after-uninit");
|
||||
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = self
|
||||
@@ -1513,7 +1509,7 @@ impl Tenant {
|
||||
&ancestor_timeline,
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
uninit_mark,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -1523,7 +1519,7 @@ impl Tenant {
|
||||
new_timeline_id,
|
||||
pg_version,
|
||||
load_existing_initdb,
|
||||
uninit_mark,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -1846,6 +1842,8 @@ impl Tenant {
|
||||
// Wait for any in-flight operations to complete
|
||||
self.gate.close().await;
|
||||
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2868,9 +2866,9 @@ impl Tenant {
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
|
||||
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
|
||||
let tl = self
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
|
||||
.await?;
|
||||
tl.set_state(TimelineState::Active);
|
||||
Ok(tl)
|
||||
@@ -2884,10 +2882,10 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -2896,7 +2894,7 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
@@ -2980,7 +2978,7 @@ impl Tenant {
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_uninit_mark,
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)
|
||||
@@ -3012,12 +3010,12 @@ impl Tenant {
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
|
||||
let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
|
||||
self.bootstrap_timeline(
|
||||
timeline_id,
|
||||
pg_version,
|
||||
load_existing_initdb,
|
||||
uninit_mark,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3081,7 +3079,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
@@ -3093,13 +3091,14 @@ impl Tenant {
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
// an uninit mark was placed before, nothing else can access this timeline files
|
||||
// current initdb was not run yet, so remove whatever was left from the previous runs
|
||||
// Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
|
||||
// we won't race with other creations or existent timelines with the same path.
|
||||
if pgdata_path.exists() {
|
||||
fs::remove_dir_all(&pgdata_path).with_context(|| {
|
||||
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
||||
})?;
|
||||
}
|
||||
|
||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||
scopeguard::defer! {
|
||||
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||
@@ -3176,7 +3175,7 @@ impl Tenant {
|
||||
.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
timeline_create_guard,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)
|
||||
@@ -3248,13 +3247,12 @@ impl Tenant {
|
||||
///
|
||||
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
|
||||
/// uninit mark file.
|
||||
/// `finish_creation` to insert the Timeline into the timelines map.
|
||||
async fn prepare_new_timeline<'a>(
|
||||
&'a self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
uninit_mark: TimelineUninitMark<'a>,
|
||||
create_guard: TimelineCreateGuard<'a>,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
@@ -3277,9 +3275,12 @@ impl Tenant {
|
||||
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
|
||||
if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
|
||||
if let Err(e) = self
|
||||
.create_timeline_files(&create_guard.timeline_path)
|
||||
.await
|
||||
{
|
||||
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
cleanup_timeline_directory(create_guard);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -3290,41 +3291,31 @@ impl Tenant {
|
||||
Ok(UninitializedTimeline::new(
|
||||
self,
|
||||
new_timeline_id,
|
||||
Some((timeline_struct, uninit_mark)),
|
||||
Some((timeline_struct, create_guard)),
|
||||
))
|
||||
}
|
||||
|
||||
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
fail::fail_point!("after-timeline-dir-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-dir-creation");
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Attempts to create an uninit mark file for the timeline initialization.
|
||||
/// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
|
||||
///
|
||||
/// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
|
||||
fn create_timeline_uninit_mark(
|
||||
/// Get a guard that provides exclusive access to the timeline directory, preventing
|
||||
/// concurrent attempts to create the same timeline.
|
||||
fn create_timeline_create_guard(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<TimelineUninitMark, TimelineExclusionError> {
|
||||
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let uninit_mark_path = self
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
|
||||
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
|
||||
let uninit_mark = TimelineUninitMark::new(
|
||||
self,
|
||||
timeline_id,
|
||||
uninit_mark_path.clone(),
|
||||
timeline_path.clone(),
|
||||
)?;
|
||||
let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
|
||||
|
||||
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
||||
// for creation.
|
||||
@@ -3340,23 +3331,7 @@ impl Tenant {
|
||||
)));
|
||||
}
|
||||
|
||||
// Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
|
||||
// that during process runtime, colliding creations will be caught in-memory without getting
|
||||
// as far as failing to write a file.
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(&uninit_mark_path)
|
||||
.context("Failed to create uninit mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&uninit_mark_path)
|
||||
.context("Failed to fsync uninit mark file")
|
||||
})
|
||||
.with_context(|| {
|
||||
format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
Ok(uninit_mark)
|
||||
Ok(create_guard)
|
||||
}
|
||||
|
||||
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
||||
@@ -3557,11 +3532,6 @@ async fn run_initdb(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Drop for Tenant {
|
||||
fn drop(&mut self) {
|
||||
remove_tenant_metrics(&self.tenant_shard_id);
|
||||
}
|
||||
}
|
||||
/// Dump contents of a layer file to stdout.
|
||||
pub async fn dump_layerfile_from_path(
|
||||
path: &Utf8Path,
|
||||
@@ -4628,10 +4598,7 @@ mod tests {
|
||||
drop(guard);
|
||||
|
||||
// Pick a big LSN such that we query over all the changes.
|
||||
// Technically, u64::MAX - 1 is the largest LSN supported by the read path,
|
||||
// but there seems to be a bug on the non-vectored search path which surfaces
|
||||
// in that case.
|
||||
let reads_lsn = Lsn(u64::MAX - 1000);
|
||||
let reads_lsn = Lsn(u64::MAX - 1);
|
||||
|
||||
for read in reads {
|
||||
info!("Doing vectored read on {:?}", read);
|
||||
@@ -5105,15 +5072,15 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
||||
let name = "test_uninit_mark_crash";
|
||||
async fn test_create_guard_crash() -> anyhow::Result<()> {
|
||||
let name = "test_create_guard_crash";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
// Keeps uninit mark in place
|
||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown()
|
||||
@@ -5141,10 +5108,24 @@ mod tests {
|
||||
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
assert!(!harness
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
|
||||
.exists());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_at_max_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_read_at_max_lsn")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x08), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let lsn = Lsn(0x10);
|
||||
bulk_insert_compact_gc(tline.clone(), &ctx, lsn, 50, 10000).await?;
|
||||
|
||||
let test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let read_lsn = Lsn(u64::MAX - 1);
|
||||
|
||||
assert!(tline.get(test_key, read_lsn, &ctx).await.is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -354,6 +354,7 @@ pub struct TenantConf {
|
||||
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
|
||||
/// may be disabled if a Tenant will not have secondary locations: only secondary
|
||||
/// locations will use the heatmap uploaded by attached locations.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heatmap_period: Duration,
|
||||
|
||||
/// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
|
||||
|
||||
@@ -296,6 +296,7 @@ impl DeleteTenantFlow {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -303,7 +304,9 @@ impl DeleteTenantFlow {
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
if let Err(e) =
|
||||
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
|
||||
{
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
|
||||
// Though sounds scary, different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(
|
||||
conf,
|
||||
remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
|
||||
conf,
|
||||
remote_storage.as_ref(),
|
||||
&tenant.tenant_shard_id,
|
||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
||||
&CancellationToken::new(),
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -6,7 +6,9 @@ use futures::stream::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||
use pageserver_api::shard::{
|
||||
ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
|
||||
};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -100,7 +102,7 @@ pub(crate) enum TenantsMap {
|
||||
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
|
||||
/// New tenants can be added using [`tenant_map_acquire_slot`].
|
||||
Open(BTreeMap<TenantShardId, TenantSlot>),
|
||||
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
|
||||
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
|
||||
/// Existing tenants are still accessible, but no new tenants can be created.
|
||||
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
|
||||
}
|
||||
@@ -259,6 +261,12 @@ pub struct TenantManager {
|
||||
// See https://github.com/neondatabase/neon/issues/5796
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
resources: TenantSharedResources,
|
||||
|
||||
// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
|
||||
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
|
||||
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
|
||||
// when the tenant detaches.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
fn emergency_generations(
|
||||
@@ -295,7 +303,7 @@ async fn init_load_generations(
|
||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
||||
info!("Calling control plane API to re-attach tenants");
|
||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||
match client.re_attach().await {
|
||||
match client.re_attach(conf).await {
|
||||
Ok(tenants) => tenants,
|
||||
Err(RetryForeverError::ShuttingDown) => {
|
||||
anyhow::bail!("Shut down while waiting for control plane re-attach response")
|
||||
@@ -618,6 +626,7 @@ pub async fn init_tenant_mgr(
|
||||
conf,
|
||||
tenants: &TENANTS,
|
||||
resources,
|
||||
cancel: CancellationToken::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -678,21 +687,6 @@ pub(crate) fn tenant_spawn(
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
||||
/// from the tenants map, the management API would return 404 for these tenants,
|
||||
/// because TenantsMap::get() now returns `None`.
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn shutdown_all_tenants() {
|
||||
shutdown_all_tenants0(&TENANTS).await
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
let mut join_set = JoinSet::new();
|
||||
|
||||
@@ -1426,6 +1420,7 @@ impl TenantManager {
|
||||
self.resources.remote_storage.clone(),
|
||||
&TENANTS,
|
||||
tenant,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1439,11 +1434,41 @@ impl TenantManager {
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let r = self
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one. This is
|
||||
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
|
||||
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
|
||||
// setting it broken probably won't help either.
|
||||
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r
|
||||
}
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
// Validate the incoming request
|
||||
if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
|
||||
anyhow::bail!("Requested shard count is not an increase");
|
||||
}
|
||||
@@ -1452,10 +1477,18 @@ impl TenantManager {
|
||||
anyhow::bail!("Requested split is not a power of two");
|
||||
}
|
||||
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
if let Some(new_stripe_size) = new_stripe_size {
|
||||
if tenant.get_shard_stripe_size() != new_stripe_size
|
||||
&& tenant_shard_id.shard_count.count() > 1
|
||||
{
|
||||
// This tenant already has multiple shards, it is illegal to try and change its stripe size
|
||||
anyhow::bail!(
|
||||
"Shard stripe size may not be modified once tenant has multiple shards"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Plan: identify what the new child shards will be
|
||||
let child_shards = tenant_shard_id.split(new_shard_count);
|
||||
tracing::info!(
|
||||
"Shard {} splits into: {}",
|
||||
@@ -1466,6 +1499,14 @@ impl TenantManager {
|
||||
.join(",")
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
|
||||
// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
|
||||
if let Err(e) = tenant.split_prepare(&child_shards).await {
|
||||
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||
@@ -1475,6 +1516,10 @@ impl TenantManager {
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
@@ -1496,11 +1541,16 @@ impl TenantManager {
|
||||
anyhow::bail!("Detached parent shard in the middle of split!")
|
||||
}
|
||||
};
|
||||
|
||||
fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
@@ -1515,6 +1565,9 @@ impl TenantManager {
|
||||
// Phase 3: Spawn the child shards
|
||||
for child_shard in &child_shards {
|
||||
let mut child_shard_identity = parent_shard_identity;
|
||||
if let Some(new_stripe_size) = new_stripe_size {
|
||||
child_shard_identity.stripe_size = new_stripe_size;
|
||||
}
|
||||
child_shard_identity.count = child_shard.shard_count;
|
||||
child_shard_identity.number = child_shard.shard_number;
|
||||
|
||||
@@ -1537,6 +1590,10 @@ impl TenantManager {
|
||||
.await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Phase 4: wait for child chards WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
@@ -1569,6 +1626,10 @@ impl TenantManager {
|
||||
timeline.timeline_id,
|
||||
target_lsn
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
@@ -1614,6 +1675,10 @@ impl TenantManager {
|
||||
},
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
@@ -1745,6 +1810,23 @@ impl TenantManager {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
||||
/// from the tenants map, the management API would return 404 for these tenants,
|
||||
/// because TenantsMap::get() now returns `None`.
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
self.cancel.cancel();
|
||||
|
||||
shutdown_all_tenants0(self.tenants).await
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>(
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let (mut destination_file, bytes_amount) = download_retry(
|
||||
|| async {
|
||||
let destination_file = tokio::fs::File::create(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut destination_file =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
let destination_file = destination_file.into_inner();
|
||||
Ok((destination_file, bytes_amount))
|
||||
}
|
||||
Err(e) => {
|
||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
},
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| format!("flush source file at {temp_file_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
@@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>(
|
||||
)));
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {temp_file_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
Err(DownloadError::Other(anyhow!(
|
||||
"remote-storage-download-pre-rename failpoint triggered"
|
||||
@@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>(
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
|
||||
/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
|
||||
///
|
||||
/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
|
||||
/// (Note that the directory entry for the inode is not made durable.)
|
||||
/// The file size in bytes is returned.
|
||||
///
|
||||
/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
|
||||
/// The unlinking has _not_ been made durable.
|
||||
async fn download_object<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let res = match crate::virtual_file::io_engine::get() {
|
||||
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
|
||||
crate::virtual_file::io_engine::IoEngine::StdFs => {
|
||||
async {
|
||||
let destination_file = tokio::fs::File::create(dst_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(src_path, cancel).await?;
|
||||
|
||||
let mut buf_writer =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
|
||||
buf_writer.flush().await?;
|
||||
|
||||
let mut destination_file = buf_writer.into_inner();
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| format!("flush source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
.await
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
|
||||
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
|
||||
async {
|
||||
let destination_file = VirtualFile::create(dst_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut download = storage.download(src_path, cancel).await?;
|
||||
|
||||
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
|
||||
// There's chunks_vectored() on the stream.
|
||||
let (bytes_amount, destination_file) = async {
|
||||
let size_tracking = size_tracking_writer::Writer::new(destination_file);
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<
|
||||
{ super::BUFFER_SIZE },
|
||||
_,
|
||||
>::new(size_tracking);
|
||||
while let Some(res) =
|
||||
futures::StreamExt::next(&mut download.download_stream).await
|
||||
{
|
||||
let chunk = match res {
|
||||
Ok(chunk) => chunk,
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
buffered
|
||||
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
|
||||
.await?;
|
||||
}
|
||||
let size_tracking = buffered.flush_and_into_inner().await?;
|
||||
Ok(size_tracking.into_inner())
|
||||
}
|
||||
.await?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
.await
|
||||
}
|
||||
};
|
||||
|
||||
// in case the download failed, clean up
|
||||
match res {
|
||||
Ok(bytes_amount) => Ok(bytes_amount),
|
||||
Err(e) => {
|
||||
if let Err(e) = tokio::fs::remove_file(dst_path).await {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
|
||||
}
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
|
||||
@@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant {
|
||||
shard_identity: ShardIdentity,
|
||||
tenant_conf: std::sync::Mutex<TenantConfOpt>,
|
||||
|
||||
// Internal state used by the Downloader.
|
||||
detail: std::sync::Mutex<SecondaryDetail>,
|
||||
|
||||
// Public state indicating overall progress of downloads relative to the last heatmap seen
|
||||
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
|
||||
}
|
||||
|
||||
impl SecondaryTenant {
|
||||
@@ -118,6 +122,8 @@ impl SecondaryTenant {
|
||||
tenant_conf: std::sync::Mutex::new(tenant_conf),
|
||||
|
||||
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
||||
|
||||
progress: std::sync::Mutex::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -247,9 +253,12 @@ impl SecondaryTenant {
|
||||
}
|
||||
|
||||
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
||||
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
|
||||
/// where we want to immediately upload/download for a particular tenant. In normal operation
|
||||
/// uploads & downloads are autonomous and not driven by this interface.
|
||||
/// and heatmap uploads. This is not a hot data path: it's used for:
|
||||
/// - Live migrations, where we want to ensure a migration destination has the freshest possible
|
||||
/// content before trying to cut over.
|
||||
/// - Tests, where we want to immediately upload/download for a particular tenant.
|
||||
///
|
||||
/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
|
||||
pub struct SecondaryController {
|
||||
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
||||
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
||||
|
||||
@@ -41,14 +41,16 @@ use crate::tenant::{
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
|
||||
id::TimelineId,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -128,6 +130,7 @@ pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<Instant>,
|
||||
last_etag: Option<Etag>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
@@ -138,11 +141,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
|
||||
datetime.format("%d/%m/%Y %T")
|
||||
}
|
||||
|
||||
/// Information returned from download function when it detects the heatmap has changed
|
||||
struct HeatMapModified {
|
||||
etag: Etag,
|
||||
last_modified: SystemTime,
|
||||
bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
enum HeatMapDownload {
|
||||
// The heatmap's etag has changed: return the new etag, mtime and the body bytes
|
||||
Modified(HeatMapModified),
|
||||
// The heatmap's etag is unchanged
|
||||
Unmodified,
|
||||
}
|
||||
|
||||
impl SecondaryDetail {
|
||||
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
last_etag: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
@@ -477,11 +495,31 @@ impl<'a> TenantDownloader<'a> {
|
||||
};
|
||||
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// We will use the etag from last successful download to make the download conditional on changes
|
||||
let last_etag = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_etag
|
||||
.clone();
|
||||
|
||||
// Download the tenant's heatmap
|
||||
let heatmap_bytes = tokio::select!(
|
||||
bytes = self.download_heatmap() => {bytes?},
|
||||
let HeatMapModified {
|
||||
last_modified: heatmap_mtime,
|
||||
etag: heatmap_etag,
|
||||
bytes: heatmap_bytes,
|
||||
} = match tokio::select!(
|
||||
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
);
|
||||
) {
|
||||
HeatMapDownload::Unmodified => {
|
||||
tracing::info!("Heatmap unchanged since last successful download");
|
||||
return Ok(());
|
||||
}
|
||||
HeatMapDownload::Modified(m) => m,
|
||||
};
|
||||
|
||||
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
|
||||
|
||||
@@ -498,6 +536,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
// phase to initialize our SecondaryProgress.
|
||||
{
|
||||
*self.secondary_state.progress.lock().unwrap() =
|
||||
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
|
||||
}
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
@@ -515,30 +561,159 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Only update last_etag after a full successful download: this way will not skip
|
||||
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
|
||||
/// Do any fast local cleanup that comes before the much slower process of downloading
|
||||
/// layers from remote storage. In the process, initialize the SecondaryProgress object
|
||||
/// that will later be updated incrementally as we download layers.
|
||||
async fn prepare_timelines(
|
||||
&self,
|
||||
heatmap: &HeatMapTenant,
|
||||
heatmap_mtime: SystemTime,
|
||||
) -> Result<SecondaryProgress, UpdateError> {
|
||||
let heatmap_stats = heatmap.get_stats();
|
||||
// We will construct a progress object, and then populate its initial "downloaded" numbers
|
||||
// while iterating through local layer state in [`Self::prepare_timelines`]
|
||||
let mut progress = SecondaryProgress {
|
||||
layers_total: heatmap_stats.layers,
|
||||
bytes_total: heatmap_stats.bytes,
|
||||
heatmap_mtime: Some(heatmap_mtime),
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
{
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
for (timeline_id, timeline_state) in &mut detail.timelines {
|
||||
let Some(heatmap_timeline_index) = heatmap
|
||||
.timelines
|
||||
.iter()
|
||||
.position(|t| t.timeline_id == *timeline_id)
|
||||
else {
|
||||
// This timeline is no longer referenced in the heatmap: delete it locally
|
||||
delete_timelines.push(*timeline_id);
|
||||
continue;
|
||||
};
|
||||
|
||||
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let mut layer_count = layers_on_disk.len();
|
||||
let mut layer_byte_count: u64 = timeline_state
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size())
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
layer_count -= 1;
|
||||
layer_byte_count -= timeline_state
|
||||
.on_disk_layers
|
||||
.get(layer)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size();
|
||||
|
||||
delete_layers.push((*timeline_id, (*layer).clone()));
|
||||
}
|
||||
|
||||
progress.bytes_downloaded += layer_byte_count;
|
||||
progress.layers_downloaded += layer_count;
|
||||
}
|
||||
}
|
||||
|
||||
// Execute accumulated deletions
|
||||
for (timeline_id, layer_name) in delete_layers {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
let local_path = timeline_path.join(layer_name.to_string());
|
||||
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
|
||||
// Update in-memory housekeeping to reflect the absence of the deleted layer
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
|
||||
continue;
|
||||
};
|
||||
timeline_state.on_disk_layers.remove(&layer_name);
|
||||
}
|
||||
|
||||
for timeline_id in delete_timelines {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
tracing::info!(timeline_id=%timeline_id,
|
||||
"Timeline no longer in heatmap, removing from secondary location"
|
||||
);
|
||||
tokio::fs::remove_dir_all(&timeline_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary timeline")?;
|
||||
}
|
||||
|
||||
Ok(progress)
|
||||
}
|
||||
|
||||
/// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object
|
||||
/// still matches `prev_etag`.
|
||||
async fn download_heatmap(
|
||||
&self,
|
||||
prev_etag: Option<&Etag>,
|
||||
) -> Result<HeatMapDownload, UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
// TODO: make download conditional on ETag having changed since last download
|
||||
// TODO: pull up etag check into the request, to do a conditional GET rather than
|
||||
// issuing a GET and then maybe ignoring the response body
|
||||
// (https://github.com/neondatabase/neon/issues/6199)
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
let cancel = &self.secondary_state.cancel;
|
||||
|
||||
let heatmap_bytes = backoff::retry(
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path, cancel)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
Ok(heatmap_bytes)
|
||||
|
||||
if Some(&download.etag) == prev_etag {
|
||||
Ok(HeatMapDownload::Unmodified)
|
||||
} else {
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
Ok(HeatMapDownload::Modified(HeatMapModified {
|
||||
etag: download.etag,
|
||||
last_modified: download.last_modified,
|
||||
bytes: heatmap_bytes,
|
||||
}))
|
||||
}
|
||||
},
|
||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
@@ -548,11 +723,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UpdateError::Cancelled)
|
||||
.and_then(|x| x)?;
|
||||
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
|
||||
Ok(heatmap_bytes)
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
||||
@@ -593,27 +764,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
};
|
||||
|
||||
let layers_in_heatmap = timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
let local_path = timeline_path.join(layer.to_string());
|
||||
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
}
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
@@ -662,6 +812,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// Failpoint for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"secondary-layer-download-sleep",
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
@@ -701,6 +857,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
} else {
|
||||
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.bytes_downloaded += downloaded_bytes;
|
||||
progress.layers_downloaded += 1;
|
||||
}
|
||||
|
||||
SECONDARY_MODE.download_layer.inc();
|
||||
|
||||
@@ -62,3 +62,25 @@ impl HeatMapTimeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct HeatMapStats {
|
||||
pub(crate) bytes: u64,
|
||||
pub(crate) layers: usize,
|
||||
}
|
||||
|
||||
impl HeatMapTenant {
|
||||
pub(crate) fn get_stats(&self) -> HeatMapStats {
|
||||
let mut stats = HeatMapStats {
|
||||
bytes: 0,
|
||||
layers: 0,
|
||||
};
|
||||
for timeline in &self.timelines {
|
||||
for layer in &timeline.layers {
|
||||
stats.layers += 1;
|
||||
stats.bytes += layer.metadata.file_size;
|
||||
}
|
||||
}
|
||||
|
||||
stats
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +183,13 @@ pub(super) async fn gather_inputs(
|
||||
// new gc run, which we have no control over. however differently from `Timeline::gc`
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
|
||||
//
|
||||
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
|
||||
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
|
||||
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// horizon_cutoff.
|
||||
let mut next_gc_cutoff = gc_info.pitr_cutoff;
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
|
||||
@@ -20,6 +20,7 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::models::{
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
@@ -427,7 +428,7 @@ impl LayerAccessStatFullDetails {
|
||||
} = self;
|
||||
pageserver_api::models::LayerAccessStatFullDetails {
|
||||
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
|
||||
task_kind: task_kind.into(), // into static str, powered by strum_macros
|
||||
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
|
||||
access_kind: *access_kind,
|
||||
}
|
||||
}
|
||||
@@ -525,7 +526,7 @@ impl LayerAccessStats {
|
||||
.collect(),
|
||||
task_kind_access_flag: task_kind_flag
|
||||
.iter()
|
||||
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
||||
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
|
||||
.collect(),
|
||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
||||
|
||||
@@ -536,6 +536,18 @@ impl Drop for LayerInner {
|
||||
// carry this until we are finished for [`Layer::wait_drop`] support
|
||||
let _status = status;
|
||||
|
||||
let Some(timeline) = timeline.upgrade() else {
|
||||
// no need to nag that timeline is gone: under normal situation on
|
||||
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
return;
|
||||
};
|
||||
|
||||
let Ok(_guard) = timeline.gate.enter() else {
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
return;
|
||||
};
|
||||
|
||||
let removed = match std::fs::remove_file(path) {
|
||||
Ok(()) => true,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
@@ -554,32 +566,26 @@ impl Drop for LayerInner {
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(timeline) = timeline.upgrade() {
|
||||
if removed {
|
||||
timeline.metrics.resident_physical_size_sub(file_size);
|
||||
}
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
||||
if removed {
|
||||
timeline.metrics.resident_physical_size_sub(file_size);
|
||||
}
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
|
||||
|
||||
if let Err(e) = res {
|
||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||
// demonstrating this deadlock (without spawn_blocking): stop will drop
|
||||
// queued items, which will have ResidentLayer's, and those drops would try
|
||||
// to re-entrantly lock the RemoteTimelineClient inner state.
|
||||
if !timeline.is_active() {
|
||||
tracing::info!("scheduling deletion on drop failed: {e:#}");
|
||||
} else {
|
||||
tracing::warn!("scheduling deletion on drop failed: {e:#}");
|
||||
}
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
|
||||
if let Err(e) = res {
|
||||
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
|
||||
// demonstrating this deadlock (without spawn_blocking): stop will drop
|
||||
// queued items, which will have ResidentLayer's, and those drops would try
|
||||
// to re-entrantly lock the RemoteTimelineClient inner state.
|
||||
if !timeline.is_active() {
|
||||
tracing::info!("scheduling deletion on drop failed: {e:#}");
|
||||
} else {
|
||||
LAYER_IMPL_METRICS.inc_completed_deletes();
|
||||
tracing::warn!("scheduling deletion on drop failed: {e:#}");
|
||||
}
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
|
||||
} else {
|
||||
LAYER_IMPL_METRICS.inc_completed_deletes();
|
||||
}
|
||||
} else {
|
||||
// no need to nag that timeline is gone: under normal situation on
|
||||
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
|
||||
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
|
||||
}
|
||||
});
|
||||
}
|
||||
@@ -704,10 +710,6 @@ impl LayerInner {
|
||||
// disable any scheduled but not yet running eviction deletions for this
|
||||
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// count cancellations, which currently remain largely unexpected
|
||||
let init_cancelled =
|
||||
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||
|
||||
// no need to make the evict_and_wait wait for the actual download to complete
|
||||
drop(self.status.send(Status::Downloaded));
|
||||
|
||||
@@ -716,7 +718,9 @@ impl LayerInner {
|
||||
.upgrade()
|
||||
.ok_or_else(|| DownloadError::TimelineShutdown)?;
|
||||
|
||||
// FIXME: grab a gate
|
||||
// count cancellations, which currently remain largely unexpected
|
||||
let init_cancelled =
|
||||
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||
|
||||
let can_ever_evict = timeline.remote_client.as_ref().is_some();
|
||||
|
||||
@@ -725,9 +729,17 @@ impl LayerInner {
|
||||
let needs_download = self
|
||||
.needs_download()
|
||||
.await
|
||||
.map_err(DownloadError::PreStatFailed)?;
|
||||
.map_err(DownloadError::PreStatFailed);
|
||||
|
||||
let permit = if let Some(reason) = needs_download {
|
||||
let needs_download = match needs_download {
|
||||
Ok(reason) => reason,
|
||||
Err(e) => {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
let (permit, downloaded) = if let Some(reason) = needs_download {
|
||||
if let NeedsDownload::NotFile(ft) = reason {
|
||||
return Err(DownloadError::NotFile(ft));
|
||||
}
|
||||
@@ -738,36 +750,59 @@ impl LayerInner {
|
||||
self.wanted_evicted.store(false, Ordering::Release);
|
||||
|
||||
if !can_ever_evict {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(DownloadError::NoRemoteStorage);
|
||||
}
|
||||
|
||||
if let Some(ctx) = ctx {
|
||||
self.check_expected_download(ctx)?;
|
||||
let res = self.check_expected_download(ctx);
|
||||
if let Err(e) = res {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
if !allow_download {
|
||||
// this does look weird, but for LayerInner the "downloading" means also changing
|
||||
// internal once related state ...
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(DownloadError::DownloadRequired);
|
||||
}
|
||||
|
||||
tracing::info!(%reason, "downloading on-demand");
|
||||
|
||||
self.spawn_download_and_wait(timeline, permit).await?
|
||||
let permit = self.spawn_download_and_wait(timeline, permit).await;
|
||||
|
||||
let permit = match permit {
|
||||
Ok(permit) => permit,
|
||||
Err(e) => {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
(permit, true)
|
||||
} else {
|
||||
// the file is present locally, probably by a previous but cancelled call to
|
||||
// get_or_maybe_download. alternatively we might be running without remote storage.
|
||||
LAYER_IMPL_METRICS.inc_init_needed_no_download();
|
||||
|
||||
permit
|
||||
(permit, false)
|
||||
};
|
||||
|
||||
let since_last_eviction =
|
||||
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
|
||||
if let Some(since_last_eviction) = since_last_eviction {
|
||||
// FIXME: this will not always be recorded correctly until #6028 (the no
|
||||
// download needed branch above)
|
||||
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
|
||||
if downloaded {
|
||||
let since_last_eviction = self
|
||||
.last_evicted_at
|
||||
.lock()
|
||||
.unwrap()
|
||||
.take()
|
||||
.map(|ts| ts.elapsed());
|
||||
|
||||
if let Some(since_last_eviction) = since_last_eviction {
|
||||
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
|
||||
}
|
||||
}
|
||||
|
||||
let res = Arc::new(DownloadedLayer {
|
||||
@@ -789,8 +824,6 @@ impl LayerInner {
|
||||
);
|
||||
}
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
|
||||
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
||||
}
|
||||
.instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
|
||||
@@ -880,23 +913,18 @@ impl LayerInner {
|
||||
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let task_name = format!("download layer {}", self);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
|
||||
// block tenant::mgr::remove_tenant_from_memory.
|
||||
|
||||
let this: Arc<Self> = self.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||
Some(self.desc.tenant_shard_id),
|
||||
Some(self.desc.timeline_id),
|
||||
&task_name,
|
||||
false,
|
||||
async move {
|
||||
let guard = timeline
|
||||
.gate
|
||||
.enter()
|
||||
.map_err(|_| DownloadError::DownloadCancelled)?;
|
||||
|
||||
tokio::task::spawn(async move {
|
||||
|
||||
let _guard = guard;
|
||||
|
||||
let client = timeline
|
||||
.remote_client
|
||||
@@ -906,7 +934,7 @@ impl LayerInner {
|
||||
let result = client.download_layer_file(
|
||||
&this.desc.filename(),
|
||||
&this.metadata(),
|
||||
&crate::task_mgr::shutdown_token()
|
||||
&timeline.cancel
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -929,7 +957,6 @@ impl LayerInner {
|
||||
|
||||
tokio::select! {
|
||||
_ = tokio::time::sleep(backoff) => {},
|
||||
_ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
|
||||
_ = timeline.cancel.cancelled() => {},
|
||||
};
|
||||
|
||||
@@ -959,11 +986,10 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
|
||||
match rx.await {
|
||||
Ok((Ok(()), permit)) => {
|
||||
if let Some(reason) = self
|
||||
@@ -1102,6 +1128,10 @@ impl LayerInner {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
let Ok(_gate) = timeline.gate.enter() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
};
|
||||
|
||||
// to avoid starting a new download while we evict, keep holding on to the
|
||||
// permit.
|
||||
let _permit = {
|
||||
|
||||
@@ -130,10 +130,10 @@ where
|
||||
self.inner.load().config.steady_rps()
|
||||
}
|
||||
|
||||
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
|
||||
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
if !inner.task_kinds.contains(ctx.task_kind()) {
|
||||
return;
|
||||
return None;
|
||||
};
|
||||
let start = std::time::Instant::now();
|
||||
let mut did_throttle = false;
|
||||
@@ -170,6 +170,9 @@ where
|
||||
});
|
||||
}
|
||||
}
|
||||
Some(wait_time)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -634,6 +634,8 @@ impl Timeline {
|
||||
/// If a remote layer file is needed, it is downloaded as part of this
|
||||
/// call.
|
||||
///
|
||||
/// This method enforces [`Self::timeline_get_throttle`] internally.
|
||||
///
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The
|
||||
/// abstraction above this needs to store suitable metadata to track what
|
||||
/// data exists with what keys, in separate metadata entries. If a
|
||||
@@ -644,18 +646,27 @@ impl Timeline {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
#[inline(always)]
|
||||
pub(crate) async fn get(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
self.get_impl(key, lsn, ctx).await
|
||||
}
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
async fn get_impl(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
|
||||
}
|
||||
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
|
||||
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
|
||||
// already checked the key against the shard_identity when looking up the Timeline from
|
||||
// page_service.
|
||||
@@ -752,10 +763,6 @@ impl Timeline {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
|
||||
self.timeline_get_throttle
|
||||
.throttle(ctx, key_count as usize)
|
||||
.await;
|
||||
|
||||
for range in &keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
@@ -772,11 +779,18 @@ impl Timeline {
|
||||
self.conf.get_vectored_impl
|
||||
);
|
||||
|
||||
let _timer = crate::metrics::GET_VECTORED_LATENCY
|
||||
let start = crate::metrics::GET_VECTORED_LATENCY
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(|t| t.start_timer());
|
||||
.map(|metric| (metric, Instant::now()));
|
||||
|
||||
match self.conf.get_vectored_impl {
|
||||
// start counting after throttle so that throttle time
|
||||
// is always less than observation time
|
||||
let throttled = self
|
||||
.timeline_get_throttle
|
||||
.throttle(ctx, key_count as usize)
|
||||
.await;
|
||||
|
||||
let res = match self.conf.get_vectored_impl {
|
||||
GetVectoredImpl::Sequential => {
|
||||
self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
|
||||
}
|
||||
@@ -790,9 +804,33 @@ impl Timeline {
|
||||
|
||||
vectored_res
|
||||
}
|
||||
};
|
||||
|
||||
if let Some((metric, start)) = start {
|
||||
let elapsed = start.elapsed();
|
||||
let ex_throttled = if let Some(throttled) = throttled {
|
||||
elapsed.checked_sub(throttled)
|
||||
} else {
|
||||
Some(elapsed)
|
||||
};
|
||||
|
||||
if let Some(ex_throttled) = ex_throttled {
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
} else {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!("error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn get_vectored_sequential_impl(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -803,7 +841,7 @@ impl Timeline {
|
||||
for range in keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
let block = self.get(key, lsn, ctx).await;
|
||||
let block = self.get_impl(key, lsn, ctx).await;
|
||||
|
||||
use PageReconstructError::*;
|
||||
match block {
|
||||
@@ -853,6 +891,7 @@ impl Timeline {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn validate_get_vectored_impl(
|
||||
&self,
|
||||
vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
|
||||
@@ -1257,6 +1296,8 @@ impl Timeline {
|
||||
|
||||
// Finally wait until any gate-holders are complete
|
||||
self.gate.close().await;
|
||||
|
||||
self.metrics.shutdown();
|
||||
}
|
||||
|
||||
pub(crate) fn set_state(&self, new_state: TimelineState) {
|
||||
@@ -2476,7 +2517,7 @@ impl Timeline {
|
||||
// 'prev_lsn' tracks the last LSN that we were at in our search. It's used
|
||||
// to check that each iteration make some progress, to break infinite
|
||||
// looping if something goes wrong.
|
||||
let mut prev_lsn = Lsn(u64::MAX);
|
||||
let mut prev_lsn = None;
|
||||
|
||||
let mut result = ValueReconstructResult::Continue;
|
||||
let mut cont_lsn = Lsn(request_lsn.0 + 1);
|
||||
@@ -2496,18 +2537,20 @@ impl Timeline {
|
||||
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
|
||||
return Ok(traversal_path);
|
||||
}
|
||||
if prev_lsn <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(layer_traversal_error(format!(
|
||||
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
|
||||
key,
|
||||
Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
timeline.ancestor_lsn
|
||||
), traversal_path));
|
||||
if let Some(prev) = prev_lsn {
|
||||
if prev <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(layer_traversal_error(format!(
|
||||
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
|
||||
key,
|
||||
Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
timeline.ancestor_lsn
|
||||
), traversal_path));
|
||||
}
|
||||
}
|
||||
prev_lsn = cont_lsn;
|
||||
prev_lsn = Some(cont_lsn);
|
||||
}
|
||||
ValueReconstructResult::Missing => {
|
||||
return Err(layer_traversal_error(
|
||||
@@ -2537,7 +2580,7 @@ impl Timeline {
|
||||
|
||||
timeline_owned = timeline.get_ready_ancestor_timeline(ctx).await?;
|
||||
timeline = &*timeline_owned;
|
||||
prev_lsn = Lsn(u64::MAX);
|
||||
prev_lsn = None;
|
||||
continue 'outer;
|
||||
}
|
||||
|
||||
@@ -2963,7 +3006,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
trace!("waking up");
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
let flush_counter = *layer_flush_start_rx.borrow();
|
||||
let result = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -2974,6 +3016,8 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
|
||||
let layer_to_flush = {
|
||||
let guard = self.layers.read().await;
|
||||
guard.layer_map().frozen_layers.front().cloned()
|
||||
@@ -2995,13 +3039,12 @@ impl Timeline {
|
||||
break err;
|
||||
}
|
||||
}
|
||||
timer.stop_and_record();
|
||||
};
|
||||
// Notify any listeners that we're done
|
||||
let _ = self
|
||||
.layer_flush_done_tx
|
||||
.send_replace((flush_counter, result));
|
||||
|
||||
timer.stop_and_record();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3069,6 +3112,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), FlushLayerError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
@@ -3740,8 +3784,11 @@ impl Timeline {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
// but what it really means is that there hasn't been
|
||||
// any commits since the cutoff timestamp.
|
||||
//
|
||||
// In this case we should use the LSN of the most recent commit,
|
||||
// which is implicitly the last LSN in the log.
|
||||
debug!("future({})", lsn);
|
||||
cutoff_horizon
|
||||
self.get_last_record_lsn()
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
|
||||
@@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use tracing::{error, info, info_span, warn};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
|
||||
use tracing::{error, info, info_span};
|
||||
use utils::{fs_ext, id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
|
||||
|
||||
@@ -11,22 +11,22 @@ use super::Timeline;
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
|
||||
/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
|
||||
/// to be removed on next restart.
|
||||
/// its local files are removed. If we crash while this class exists, then the timeline's local
|
||||
/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
|
||||
///
|
||||
/// The caller is responsible for proper timeline data filling before the final init.
|
||||
#[must_use]
|
||||
pub struct UninitializedTimeline<'t> {
|
||||
pub(crate) owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
|
||||
}
|
||||
|
||||
impl<'t> UninitializedTimeline<'t> {
|
||||
pub(crate) fn new(
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
|
||||
) -> Self {
|
||||
Self {
|
||||
owning_tenant,
|
||||
@@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Finish timeline creation: insert it into the Tenant's timelines map and remove the
|
||||
/// uninit mark file.
|
||||
/// Finish timeline creation: insert it into the Tenant's timelines map
|
||||
///
|
||||
/// This function launches the flush loop if not already done.
|
||||
///
|
||||
@@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
Entry::Vacant(v) => {
|
||||
// after taking here should be no fallible operations, because the drop guard will not
|
||||
// cleanup after and would block for example the tenant deletion
|
||||
let (new_timeline, uninit_mark) =
|
||||
let (new_timeline, _create_guard) =
|
||||
self.raw_timeline.take().expect("already checked");
|
||||
|
||||
// this is the mutual exclusion between different retries to create the timeline;
|
||||
// this should be an assertion.
|
||||
uninit_mark.remove_uninit_mark().with_context(|| {
|
||||
format!(
|
||||
"Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
|
||||
)
|
||||
})?;
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
@@ -120,8 +112,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
.await
|
||||
.context("Failed to flush after basebackup import")?;
|
||||
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines
|
||||
// map and remove the uninit mark file.
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines map
|
||||
let tl = self.finish_creation()?;
|
||||
tl.activate(broker_client, None, ctx);
|
||||
Ok(tl)
|
||||
@@ -143,37 +134,35 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
|
||||
impl Drop for UninitializedTimeline<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
||||
if let Some((_, create_guard)) = self.raw_timeline.take() {
|
||||
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
|
||||
error!("Timeline got dropped without initializing, cleaning its files");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
cleanup_timeline_directory(create_guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
|
||||
let timeline_path = &uninit_mark.timeline_path;
|
||||
pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
|
||||
let timeline_path = &create_guard.timeline_path;
|
||||
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
|
||||
Ok(()) => {
|
||||
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
|
||||
info!("Timeline dir {timeline_path:?} removed successfully")
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
|
||||
}
|
||||
}
|
||||
drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
|
||||
// Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other
|
||||
// timeline creation attempts under this TimelineId to proceed
|
||||
drop(create_guard);
|
||||
}
|
||||
|
||||
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
|
||||
/// or gets removed eventually.
|
||||
///
|
||||
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
|
||||
/// A guard for timeline creations in process: as long as this object exists, the timeline ID
|
||||
/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
|
||||
#[must_use]
|
||||
pub(crate) struct TimelineUninitMark<'t> {
|
||||
pub(crate) struct TimelineCreateGuard<'t> {
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
uninit_mark_deleted: bool,
|
||||
uninit_mark_path: Utf8PathBuf,
|
||||
pub(crate) timeline_path: Utf8PathBuf,
|
||||
}
|
||||
|
||||
@@ -190,11 +179,10 @@ pub(crate) enum TimelineExclusionError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl<'t> TimelineUninitMark<'t> {
|
||||
impl<'t> TimelineCreateGuard<'t> {
|
||||
pub(crate) fn new(
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
uninit_mark_path: Utf8PathBuf,
|
||||
timeline_path: Utf8PathBuf,
|
||||
) -> Result<Self, TimelineExclusionError> {
|
||||
// Lock order: this is the only place we take both locks. During drop() we only
|
||||
@@ -214,56 +202,14 @@ impl<'t> TimelineUninitMark<'t> {
|
||||
Ok(Self {
|
||||
owning_tenant,
|
||||
timeline_id,
|
||||
uninit_mark_deleted: false,
|
||||
uninit_mark_path,
|
||||
timeline_path,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
|
||||
if !self.uninit_mark_deleted {
|
||||
self.delete_mark_file_if_present()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
|
||||
let uninit_mark_file = &self.uninit_mark_path;
|
||||
let uninit_mark_parent = uninit_mark_file
|
||||
.parent()
|
||||
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
|
||||
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
|
||||
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
|
||||
})?;
|
||||
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
|
||||
self.uninit_mark_deleted = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineUninitMark<'_> {
|
||||
impl Drop for TimelineCreateGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
if !self.uninit_mark_deleted {
|
||||
if self.timeline_path.exists() {
|
||||
error!(
|
||||
"Uninit mark {} is not removed, timeline {} stays uninitialized",
|
||||
self.uninit_mark_path, self.timeline_path
|
||||
)
|
||||
} else {
|
||||
// unblock later timeline creation attempts
|
||||
warn!(
|
||||
"Removing intermediate uninit mark file {}",
|
||||
self.uninit_mark_path
|
||||
);
|
||||
if let Err(e) = self.delete_mark_file_if_present() {
|
||||
error!("Failed to remove the uninit mark file: {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.owning_tenant
|
||||
.timelines_creating
|
||||
.lock()
|
||||
|
||||
@@ -28,12 +28,31 @@ use tokio::time::Instant;
|
||||
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
pub(crate) mod io_engine;
|
||||
pub use io_engine::feature_test as io_engine_feature_test;
|
||||
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
|
||||
mod metadata;
|
||||
mod open_options;
|
||||
pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use metadata::Metadata;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
|
||||
pub(crate) mod owned_buffers_io {
|
||||
//! Abstractions for IO with owned buffers.
|
||||
//!
|
||||
//! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary
|
||||
//! reason we need this abstraction.
|
||||
//!
|
||||
//! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`,
|
||||
//! but for the time being we're proving out the primitives in the neon.git repo
|
||||
//! for faster iteration.
|
||||
|
||||
pub(crate) mod write;
|
||||
pub(crate) mod util {
|
||||
pub(crate) mod size_tracking_writer;
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
|
||||
@@ -6,6 +6,11 @@
|
||||
//! Initialize using [`init`].
|
||||
//!
|
||||
//! Then use [`get`] and [`super::OpenOptions`].
|
||||
//!
|
||||
//!
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub(super) mod tokio_epoll_uring_ext;
|
||||
|
||||
use tokio_epoll_uring::{IoBuf, Slice};
|
||||
use tracing::Instrument;
|
||||
@@ -145,7 +150,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.read(file_guard, offset, buf).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -160,7 +165,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.fsync(file_guard).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -178,7 +183,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.fdatasync(file_guard).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -197,7 +202,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.statx(file_guard).await;
|
||||
(
|
||||
resources,
|
||||
@@ -220,7 +225,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.write(file_guard, offset, buf).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -253,3 +258,82 @@ impl IoEngine {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum FeatureTestResult {
|
||||
PlatformPreferred(IoEngineKind),
|
||||
Worse {
|
||||
engine: IoEngineKind,
|
||||
remark: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl FeatureTestResult {
|
||||
#[cfg(target_os = "linux")]
|
||||
const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring;
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs;
|
||||
}
|
||||
|
||||
impl From<FeatureTestResult> for IoEngineKind {
|
||||
fn from(val: FeatureTestResult) -> Self {
|
||||
match val {
|
||||
FeatureTestResult::PlatformPreferred(e) => e,
|
||||
FeatureTestResult::Worse { engine, .. } => engine,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Somewhat costly under the hood, do only once.
|
||||
/// Panics if we can't set up the feature test.
|
||||
pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
|
||||
std::thread::spawn(|| {
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
{
|
||||
Ok(FeatureTestResult::PlatformPreferred(
|
||||
FeatureTestResult::PLATFORM_PREFERRED,
|
||||
))
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
Ok(match rt.block_on(tokio_epoll_uring::System::launch()) {
|
||||
Ok(_) => FeatureTestResult::PlatformPreferred({
|
||||
assert!(matches!(
|
||||
IoEngineKind::TokioEpollUring,
|
||||
FeatureTestResult::PLATFORM_PREFERRED
|
||||
));
|
||||
FeatureTestResult::PLATFORM_PREFERRED
|
||||
}),
|
||||
Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => {
|
||||
let remark = match e.raw_os_error() {
|
||||
Some(nix::libc::EPERM) => {
|
||||
// fall back
|
||||
"creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled "
|
||||
.to_string()
|
||||
}
|
||||
Some(nix::libc::EFAULT) => {
|
||||
// fail feature test
|
||||
anyhow::bail!(
|
||||
"creating tokio-epoll-uring fails with EFAULT, might have corrupted memory"
|
||||
);
|
||||
}
|
||||
Some(_) | None => {
|
||||
// fall back
|
||||
format!("creating tokio-epoll-uring fails with error: {e:#}")
|
||||
}
|
||||
};
|
||||
FeatureTestResult::Worse {
|
||||
engine: IoEngineKind::StdFs,
|
||||
remark,
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
.join()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
194
pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
Normal file
194
pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs
Normal file
@@ -0,0 +1,194 @@
|
||||
//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific
|
||||
//! handling in case the instance can't launched.
|
||||
//!
|
||||
//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation
|
||||
//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series.
|
||||
//! See <https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391> for more details.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
|
||||
use tokio_epoll_uring::{System, SystemHandle};
|
||||
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
|
||||
use crate::metrics::tokio_epoll_uring as metrics;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ThreadLocalState(Arc<ThreadLocalStateInner>);
|
||||
|
||||
struct ThreadLocalStateInner {
|
||||
cell: tokio::sync::OnceCell<SystemHandle>,
|
||||
launch_attempts: AtomicU32,
|
||||
/// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
|
||||
thread_local_state_id: u64,
|
||||
}
|
||||
|
||||
impl ThreadLocalState {
|
||||
pub fn new() -> Self {
|
||||
Self(Arc::new(ThreadLocalStateInner {
|
||||
cell: tokio::sync::OnceCell::default(),
|
||||
launch_attempts: AtomicU32::new(0),
|
||||
thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn make_id_string(&self) -> String {
|
||||
format!("{}", self.0.thread_local_state_id)
|
||||
}
|
||||
}
|
||||
|
||||
static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
thread_local! {
|
||||
static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new();
|
||||
}
|
||||
|
||||
/// Panics if we cannot [`System::launch`].
|
||||
pub async fn thread_local_system() -> Handle {
|
||||
let fake_cancel = CancellationToken::new();
|
||||
loop {
|
||||
let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone());
|
||||
let inner = &thread_local_state.0;
|
||||
let get_or_init_res = inner
|
||||
.cell
|
||||
.get_or_try_init(|| async {
|
||||
let attempt_no = inner
|
||||
.launch_attempts
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no);
|
||||
async {
|
||||
// Rate-limit retries per thread-local.
|
||||
// NB: doesn't yield to executor at attempt_no=0.
|
||||
utils::backoff::exponential_backoff(
|
||||
attempt_no,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&fake_cancel,
|
||||
)
|
||||
.await;
|
||||
let res = System::launch()
|
||||
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it
|
||||
.await;
|
||||
match res {
|
||||
Ok(system) => {
|
||||
info!("successfully launched system");
|
||||
metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
|
||||
Ok(system)
|
||||
}
|
||||
Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
|
||||
warn!("not enough locked memory to tokio-epoll-uring, will retry");
|
||||
info_span!("stats").in_scope(|| {
|
||||
emit_launch_failure_process_stats();
|
||||
});
|
||||
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
|
||||
Err(())
|
||||
}
|
||||
// abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
|
||||
// This is equivalent to a fatal IO error.
|
||||
Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => {
|
||||
error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process");
|
||||
info_span!("stats").in_scope(|| {
|
||||
emit_launch_failure_process_stats();
|
||||
});
|
||||
on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring");
|
||||
},
|
||||
}
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
})
|
||||
.await;
|
||||
if get_or_init_res.is_ok() {
|
||||
return Handle(thread_local_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn emit_launch_failure_process_stats() {
|
||||
// tokio-epoll-uring stats
|
||||
// vmlck + rlimit
|
||||
// number of threads
|
||||
// rss / system memory usage generally
|
||||
|
||||
let tokio_epoll_uring::metrics::Metrics {
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
} = tokio_epoll_uring::metrics::global();
|
||||
info!(systems_created, systems_destroyed, "tokio-epoll-uring");
|
||||
|
||||
match procfs::process::Process::myself() {
|
||||
Ok(myself) => {
|
||||
match myself.limits() {
|
||||
Ok(limits) => {
|
||||
info!(?limits.max_locked_memory, "/proc/self/limits");
|
||||
}
|
||||
Err(error) => {
|
||||
info!(%error, "no limit stats due to error");
|
||||
}
|
||||
}
|
||||
|
||||
match myself.status() {
|
||||
Ok(status) => {
|
||||
let procfs::process::Status {
|
||||
vmsize,
|
||||
vmlck,
|
||||
vmpin,
|
||||
vmrss,
|
||||
rssanon,
|
||||
rssfile,
|
||||
rssshmem,
|
||||
vmdata,
|
||||
vmstk,
|
||||
vmexe,
|
||||
vmlib,
|
||||
vmpte,
|
||||
threads,
|
||||
..
|
||||
} = status;
|
||||
info!(
|
||||
vmsize,
|
||||
vmlck,
|
||||
vmpin,
|
||||
vmrss,
|
||||
rssanon,
|
||||
rssfile,
|
||||
rssshmem,
|
||||
vmdata,
|
||||
vmstk,
|
||||
vmexe,
|
||||
vmlib,
|
||||
vmpte,
|
||||
threads,
|
||||
"/proc/self/status"
|
||||
);
|
||||
}
|
||||
Err(error) => {
|
||||
info!(%error, "no status status due to error");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
info!(%error, "no process stats due to error");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Handle(ThreadLocalState);
|
||||
|
||||
impl std::ops::Deref for Handle {
|
||||
type Target = SystemHandle;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0
|
||||
.0
|
||||
.cell
|
||||
.get()
|
||||
.expect("must be already initialized when using this")
|
||||
}
|
||||
}
|
||||
@@ -98,7 +98,7 @@ impl OpenOptions {
|
||||
OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await;
|
||||
system.open(path, x).await.map_err(|e| match e {
|
||||
tokio_epoll_uring::Error::Op(e) => e,
|
||||
tokio_epoll_uring::Error::System(system) => {
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf};
|
||||
|
||||
pub struct Writer {
|
||||
dst: VirtualFile,
|
||||
bytes_amount: u64,
|
||||
}
|
||||
|
||||
impl Writer {
|
||||
pub fn new(dst: VirtualFile) -> Self {
|
||||
Self {
|
||||
dst,
|
||||
bytes_amount: 0,
|
||||
}
|
||||
}
|
||||
/// Returns the wrapped `VirtualFile` object as well as the number
|
||||
/// of bytes that were written to it through this object.
|
||||
pub fn into_inner(self) -> (u64, VirtualFile) {
|
||||
(self.bytes_amount, self.dst)
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Writer {
|
||||
#[inline(always)]
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let (buf, res) = self.dst.write_all(buf).await;
|
||||
let nwritten = res?;
|
||||
self.bytes_amount += u64::try_from(nwritten).unwrap();
|
||||
Ok((nwritten, buf))
|
||||
}
|
||||
}
|
||||
206
pageserver/src/virtual_file/owned_buffers_io/write.rs
Normal file
206
pageserver/src/virtual_file/owned_buffers_io/write.rs
Normal file
@@ -0,0 +1,206 @@
|
||||
use bytes::BytesMut;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
|
||||
/// A trait for doing owned-buffer write IO.
|
||||
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
|
||||
pub trait OwnedAsyncWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)>;
|
||||
}
|
||||
|
||||
/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers
|
||||
/// into `BUFFER_SIZE`-sized writes.
|
||||
///
|
||||
/// # Passthrough Of Large Writers
|
||||
///
|
||||
/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
|
||||
/// buffer to be flushed, even if it is not full yet. Then, the large
|
||||
/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`].
|
||||
///
|
||||
/// This pass-through is generally beneficial for throughput, but if
|
||||
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
|
||||
/// unlimited large writes may cause latency or fairness issues.
|
||||
///
|
||||
/// In such cases, a different implementation that always buffers in memory
|
||||
/// may be preferable.
|
||||
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
|
||||
writer: W,
|
||||
// invariant: always remains Some(buf)
|
||||
// with buf.capacity() == BUFFER_SIZE except
|
||||
// - while IO is ongoing => goes back to Some() once the IO completed successfully
|
||||
// - after an IO error => stays `None` forever
|
||||
// In these exceptional cases, it's `None`.
|
||||
buf: Option<BytesMut>,
|
||||
}
|
||||
|
||||
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
|
||||
where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
pub fn new(writer: W) -> Self {
|
||||
Self {
|
||||
writer,
|
||||
buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
|
||||
self.flush().await?;
|
||||
let Self { buf, writer } = self;
|
||||
assert!(buf.is_some());
|
||||
Ok(writer)
|
||||
}
|
||||
|
||||
pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
|
||||
where
|
||||
B: IoBuf + Send,
|
||||
{
|
||||
// avoid memcpy for the middle of the chunk
|
||||
if chunk.len() >= BUFFER_SIZE {
|
||||
self.flush().await?;
|
||||
// do a big write, bypassing `buf`
|
||||
assert_eq!(
|
||||
self.buf
|
||||
.as_ref()
|
||||
.expect("must not use after an error")
|
||||
.len(),
|
||||
0
|
||||
);
|
||||
let chunk_len = chunk.len();
|
||||
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
|
||||
assert_eq!(nwritten, chunk_len);
|
||||
drop(chunk);
|
||||
return Ok(());
|
||||
}
|
||||
// in-memory copy the < BUFFER_SIZED tail of the chunk
|
||||
assert!(chunk.len() < BUFFER_SIZE);
|
||||
let mut chunk = &chunk[..];
|
||||
while !chunk.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
let need = BUFFER_SIZE - buf.len();
|
||||
let have = chunk.len();
|
||||
let n = std::cmp::min(need, have);
|
||||
buf.extend_from_slice(&chunk[..n]);
|
||||
chunk = &chunk[n..];
|
||||
if buf.len() >= BUFFER_SIZE {
|
||||
assert_eq!(buf.len(), BUFFER_SIZE);
|
||||
self.flush().await?;
|
||||
}
|
||||
}
|
||||
assert!(chunk.is_empty(), "by now we should have drained the chunk");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> std::io::Result<()> {
|
||||
let buf = self.buf.take().expect("must not use after an error");
|
||||
if buf.is_empty() {
|
||||
self.buf = Some(buf);
|
||||
return std::io::Result::Ok(());
|
||||
}
|
||||
let buf_len = buf.len();
|
||||
let (nwritten, mut buf) = self.writer.write_all(buf).await?;
|
||||
assert_eq!(nwritten, buf_len);
|
||||
buf.clear();
|
||||
self.buf = Some(buf);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Vec<u8> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
return Ok((0, Slice::into_inner(buf.slice_full())));
|
||||
}
|
||||
let buf = buf.slice(0..nbytes);
|
||||
self.extend_from_slice(&buf[..]);
|
||||
Ok((buf.len(), Slice::into_inner(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RecorderWriter {
|
||||
writes: Vec<Vec<u8>>,
|
||||
}
|
||||
impl OwnedAsyncWriter for RecorderWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
self.writes.push(vec![]);
|
||||
return Ok((0, Slice::into_inner(buf.slice_full())));
|
||||
}
|
||||
let buf = buf.slice(0..nbytes);
|
||||
self.writes.push(Vec::from(&buf[..]));
|
||||
Ok((buf.len(), Slice::into_inner(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! write {
|
||||
($writer:ident, $data:literal) => {{
|
||||
$writer
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_full())
|
||||
.await?;
|
||||
}};
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_buffered_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"b");
|
||||
write!(writer, b"c");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
write!(writer, b"abc");
|
||||
write!(writer, b"de");
|
||||
write!(writer, b"");
|
||||
write!(writer, b"fghijk");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"bc");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
|
||||
struct sysinfo si;
|
||||
Size total;
|
||||
if (sysinfo(&si) < 0)
|
||||
elog(ERROR, "Failed to get amount of RAM: %n");
|
||||
elog(ERROR, "Failed to get amount of RAM: %m");
|
||||
|
||||
total = si.totalram*si.mem_unit;
|
||||
if ((Size)NBuffers*BLCKSZ + requested >= total)
|
||||
|
||||
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -38,7 +38,6 @@ neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if (handle == NULL)
|
||||
|
||||
@@ -316,6 +316,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
TimestampTz now;
|
||||
uint64_t us_since_last_connect;
|
||||
bool broke_from_loop = false;
|
||||
|
||||
Assert(page_servers[shard_no].conn == NULL);
|
||||
|
||||
@@ -418,7 +419,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
|
||||
neon_shard_log(shard_no, elevel, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
/* Returning from inside PG_TRY is bad, so we break/return later */
|
||||
broke_from_loop = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -431,6 +434,11 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (broke_from_loop)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
neon_shard_log(shard_no, LOG, "libpagestore: connected to '%s'", connstr);
|
||||
page_servers[shard_no].conn = conn;
|
||||
page_servers[shard_no].wes = wes;
|
||||
|
||||
6
pgxn/neon/neon--1.1--1.0.sql
Normal file
6
pgxn/neon/neon--1.1--1.0.sql
Normal file
@@ -0,0 +1,6 @@
|
||||
-- the order of operations is important here
|
||||
-- because the view depends on the function
|
||||
|
||||
DROP VIEW IF EXISTS neon_lfc_stats CASCADE;
|
||||
|
||||
DROP FUNCTION IF EXISTS neon_get_lfc_stats CASCADE;
|
||||
1
pgxn/neon/neon--1.2--1.1.sql
Normal file
1
pgxn/neon/neon--1.2--1.1.sql
Normal file
@@ -0,0 +1 @@
|
||||
DROP VIEW IF EXISTS NEON_STAT_FILE_CACHE CASCADE;
|
||||
1
pgxn/neon/neon--1.3--1.2.sql
Normal file
1
pgxn/neon/neon--1.3--1.2.sql
Normal file
@@ -0,0 +1 @@
|
||||
DROP FUNCTION IF EXISTS approximate_working_set_size(bool) CASCADE;
|
||||
@@ -95,7 +95,6 @@ get_num_snap_files_lsn_threshold(void)
|
||||
DIR *dirdesc;
|
||||
struct dirent *de;
|
||||
char *snap_path = "pg_logical/snapshots/";
|
||||
int cnt = 0;
|
||||
int lsns_allocated = 1024;
|
||||
int lsns_num = 0;
|
||||
XLogRecPtr *lsns;
|
||||
@@ -161,9 +160,6 @@ get_num_snap_files_lsn_threshold(void)
|
||||
PGDLLEXPORT void
|
||||
LogicalSlotsMonitorMain(Datum main_arg)
|
||||
{
|
||||
TimestampTz now,
|
||||
last_checked;
|
||||
|
||||
/* Establish signal handlers. */
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
|
||||
@@ -1888,7 +1888,6 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
int nblocks, bool skipFsync)
|
||||
{
|
||||
const PGAlignedBlock buffer = {0};
|
||||
BlockNumber curblocknum = blocknum;
|
||||
int remblocks = nblocks;
|
||||
XLogRecPtr lsn = 0;
|
||||
|
||||
|
||||
@@ -1220,7 +1220,7 @@ PrepareAppendRequest(WalProposer *wp, AppendRequestHeader *req, XLogRecPtr begin
|
||||
req->epochStartLsn = wp->propEpochStartLsn;
|
||||
req->beginLsn = beginLsn;
|
||||
req->endLsn = endLsn;
|
||||
req->commitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
req->commitLsn = wp->commitLsn;
|
||||
req->truncateLsn = wp->truncateLsn;
|
||||
req->proposerId = wp->greetRequest.proposerId;
|
||||
}
|
||||
@@ -1405,7 +1405,7 @@ static bool
|
||||
RecvAppendResponses(Safekeeper *sk)
|
||||
{
|
||||
WalProposer *wp = sk->wp;
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr newCommitLsn;
|
||||
bool readAnything = false;
|
||||
|
||||
while (true)
|
||||
@@ -1444,18 +1444,19 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
if (!readAnything)
|
||||
return sk->state == SS_ACTIVE;
|
||||
|
||||
HandleSafekeeperResponse(wp);
|
||||
|
||||
/* update commit_lsn */
|
||||
newCommitLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
/*
|
||||
* Also send the new commit lsn to all the safekeepers.
|
||||
* Send the new value to all safekeepers.
|
||||
*/
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
if (minQuorumLsn > wp->lastSentCommitLsn)
|
||||
if (newCommitLsn > wp->commitLsn)
|
||||
{
|
||||
wp->commitLsn = newCommitLsn;
|
||||
BroadcastAppendRequest(wp);
|
||||
wp->lastSentCommitLsn = minQuorumLsn;
|
||||
}
|
||||
|
||||
HandleSafekeeperResponse(wp);
|
||||
|
||||
return sk->state == SS_ACTIVE;
|
||||
}
|
||||
|
||||
@@ -1632,11 +1633,9 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)
|
||||
static void
|
||||
HandleSafekeeperResponse(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr minQuorumLsn;
|
||||
XLogRecPtr candidateTruncateLsn;
|
||||
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition(wp);
|
||||
wp->api.process_safekeeper_feedback(wp, minQuorumLsn);
|
||||
wp->api.process_safekeeper_feedback(wp);
|
||||
|
||||
/*
|
||||
* Try to advance truncateLsn -- the last record flushed to all
|
||||
@@ -1649,7 +1648,7 @@ HandleSafekeeperResponse(WalProposer *wp)
|
||||
* can't commit entries from previous term' in Raft); 2)
|
||||
*/
|
||||
candidateTruncateLsn = CalculateMinFlushLsn(wp);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, minQuorumLsn);
|
||||
candidateTruncateLsn = Min(candidateTruncateLsn, wp->commitLsn);
|
||||
if (candidateTruncateLsn > wp->truncateLsn)
|
||||
{
|
||||
wp->truncateLsn = candidateTruncateLsn;
|
||||
|
||||
@@ -564,7 +564,7 @@ typedef struct walproposer_api
|
||||
* backpressure feedback and to confirm WAL persistence (has been commited
|
||||
* on the quorum of safekeepers).
|
||||
*/
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
|
||||
void (*process_safekeeper_feedback) (WalProposer *wp);
|
||||
|
||||
/*
|
||||
* Write a log message to the internal log processor. This is used only
|
||||
@@ -646,8 +646,8 @@ typedef struct WalProposer
|
||||
/* WAL has been generated up to this point */
|
||||
XLogRecPtr availableLsn;
|
||||
|
||||
/* last commitLsn broadcasted to safekeepers */
|
||||
XLogRecPtr lastSentCommitLsn;
|
||||
/* cached GetAcknowledgedByQuorumWALPosition result */
|
||||
XLogRecPtr commitLsn;
|
||||
|
||||
ProposerGreeting greetRequest;
|
||||
|
||||
|
||||
@@ -68,6 +68,8 @@ static WalproposerShmemState *walprop_shared;
|
||||
static WalProposerConfig walprop_config;
|
||||
static XLogRecPtr sentPtr = InvalidXLogRecPtr;
|
||||
static const walproposer_api walprop_pg;
|
||||
static volatile sig_atomic_t got_SIGUSR2 = false;
|
||||
static bool reported_sigusr2 = false;
|
||||
|
||||
static void nwp_shmem_startup_hook(void);
|
||||
static void nwp_register_gucs(void);
|
||||
@@ -101,6 +103,8 @@ static void add_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
|
||||
|
||||
static void CheckGracefulShutdown(WalProposer *wp);
|
||||
|
||||
static XLogRecPtr GetLogRepRestartLSN(WalProposer *wp);
|
||||
|
||||
static void
|
||||
@@ -492,6 +496,24 @@ walprop_pg_init_standalone_sync_safekeepers(void)
|
||||
BackgroundWorkerUnblockSignals();
|
||||
}
|
||||
|
||||
/*
|
||||
* We pretend to be a walsender process, and the lifecycle of a walsender is
|
||||
* slightly different than other procesess. At shutdown, walsender processes
|
||||
* stay alive until the very end, after the checkpointer has written the
|
||||
* shutdown checkpoint. When the checkpointer exits, the postmaster sends all
|
||||
* remaining walsender processes SIGUSR2. On receiving SIGUSR2, we try to send
|
||||
* the remaining WAL, and then exit. This ensures that the checkpoint record
|
||||
* reaches durable storage (in safekeepers), before the server shuts down
|
||||
* completely.
|
||||
*/
|
||||
static void
|
||||
walprop_sigusr2(SIGNAL_ARGS)
|
||||
{
|
||||
got_SIGUSR2 = true;
|
||||
|
||||
SetLatch(MyLatch);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_init_bgworker(void)
|
||||
{
|
||||
@@ -503,6 +525,7 @@ walprop_pg_init_bgworker(void)
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGTERM, die);
|
||||
pqsignal(SIGUSR2, walprop_sigusr2);
|
||||
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
@@ -1026,7 +1049,7 @@ static void
|
||||
StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
{
|
||||
XLogRecPtr FlushPtr;
|
||||
TimeLineID currTLI;
|
||||
__attribute__((unused)) TimeLineID currTLI;
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
@@ -1075,14 +1098,26 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When we first start replication the standby will be behind the primary.
|
||||
* For some applications, for example synchronous replication, it is
|
||||
* important to have a clear state for this initial catchup mode, so we
|
||||
* can trigger actions when we change streaming state later. We may stay
|
||||
* in this state for a long time, which is exactly why we want to be able
|
||||
* to monitor whether or not we are still here.
|
||||
* XXX: Move straight to STOPPING state, skipping the STREAMING state.
|
||||
*
|
||||
* This is a bit weird. Normal walsenders stay in STREAMING state, until
|
||||
* the checkpointer signals them that it is about to start writing the
|
||||
* shutdown checkpoint. The walsenders acknowledge that they have received
|
||||
* that signal by switching to STOPPING state. That tells the walsenders
|
||||
* that they must not write any new WAL.
|
||||
*
|
||||
* However, we cannot easily intercept that signal from the checkpointer.
|
||||
* It's sent by WalSndInitStopping(), using
|
||||
* SendProcSignal(PROCSIGNAL_WALSND_INIT_STOPPING). It's received by
|
||||
* HandleWalSndInitStopping, which sets a process-local got_STOPPING flag.
|
||||
* However, that's all private to walsender.c.
|
||||
*
|
||||
* We don't need to do anything special upon receiving the signal, the
|
||||
* walproposer doesn't write any WAL anyway, so we skip the STREAMING
|
||||
* state and go directly to STOPPING mode. That way, the checkpointer
|
||||
* won't wait for us.
|
||||
*/
|
||||
WalSndSetState(WALSNDSTATE_CATCHUP);
|
||||
WalSndSetState(WALSNDSTATE_STOPPING);
|
||||
|
||||
/*
|
||||
* Don't allow a request to stream from a future point in WAL that hasn't
|
||||
@@ -1122,6 +1157,8 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
static void
|
||||
WalSndLoop(WalProposer *wp)
|
||||
{
|
||||
XLogRecPtr flushPtr;
|
||||
|
||||
/* Clear any already-pending wakeups */
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
@@ -1130,9 +1167,6 @@ WalSndLoop(WalProposer *wp)
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
XLogBroadcastWalProposer(wp);
|
||||
|
||||
if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
|
||||
WalSndSetState(WALSNDSTATE_STREAMING);
|
||||
WalProposerPoll(wp);
|
||||
}
|
||||
}
|
||||
@@ -1230,7 +1264,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
||||
TimeLineID timeline;
|
||||
XLogRecPtr startpos;
|
||||
XLogRecPtr endpos;
|
||||
uint64 download_range_mb;
|
||||
|
||||
startpos = GetLogRepRestartLSN(wp);
|
||||
if (startpos == InvalidXLogRecPtr)
|
||||
@@ -1745,6 +1778,9 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
|
||||
{
|
||||
ConditionVariableCancelSleep();
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
|
||||
*events = WL_LATCH_SET;
|
||||
return 1;
|
||||
}
|
||||
@@ -1798,6 +1834,41 @@ walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like vanilla walsender, on sigusr2 send all remaining WAL and exit.
|
||||
*
|
||||
* Note that unlike sync-safekeepers waiting here is not reliable: we
|
||||
* don't check that majority of safekeepers received and persisted
|
||||
* commit_lsn -- only that walproposer reached it (which immediately
|
||||
* broadcasts new value). Doing that without incurring redundant control
|
||||
* file syncing would need wp -> sk protocol change. OTOH unlike
|
||||
* sync-safekeepers which must bump commit_lsn or basebackup will fail,
|
||||
* this catchup is important only for tests where safekeepers/network
|
||||
* don't crash on their own.
|
||||
*/
|
||||
static void
|
||||
CheckGracefulShutdown(WalProposer *wp)
|
||||
{
|
||||
if (got_SIGUSR2)
|
||||
{
|
||||
if (!reported_sigusr2)
|
||||
{
|
||||
XLogRecPtr flushPtr = walprop_pg_get_flush_rec_ptr(wp);
|
||||
|
||||
wpg_log(LOG, "walproposer will send and wait for remaining WAL between %X/%X and %X/%X",
|
||||
LSN_FORMAT_ARGS(wp->commitLsn), LSN_FORMAT_ARGS(flushPtr));
|
||||
reported_sigusr2 = true;
|
||||
}
|
||||
|
||||
if (wp->commitLsn >= walprop_pg_get_flush_rec_ptr(wp))
|
||||
{
|
||||
wpg_log(LOG, "walproposer sent all WAL up to %X/%X, exiting",
|
||||
LSN_FORMAT_ARGS(wp->commitLsn));
|
||||
proc_exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Choose most advanced PageserverFeedback and set it to *rf.
|
||||
*/
|
||||
@@ -1878,7 +1949,7 @@ CombineHotStanbyFeedbacks(HotStandbyFeedback *hs, WalProposer *wp)
|
||||
* None of that is functional in sync-safekeepers.
|
||||
*/
|
||||
static void
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
walprop_pg_process_safekeeper_feedback(WalProposer *wp)
|
||||
{
|
||||
HotStandbyFeedback hsFeedback;
|
||||
XLogRecPtr oldDiskConsistentLsn;
|
||||
@@ -1893,10 +1964,10 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
replication_feedback_set(&quorumFeedback.rf);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
|
||||
if (commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
if (wp->commitLsn > quorumFeedback.flushLsn || oldDiskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
{
|
||||
if (commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = commitLsn;
|
||||
if (wp->commitLsn > quorumFeedback.flushLsn)
|
||||
quorumFeedback.flushLsn = wp->commitLsn;
|
||||
|
||||
/*
|
||||
* Advance the replication slot to commitLsn. WAL before it is
|
||||
@@ -1929,6 +2000,8 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
XidFromFullTransactionId(hsFeedback.catalog_xmin),
|
||||
EpochFromFullTransactionId(hsFeedback.catalog_xmin));
|
||||
}
|
||||
|
||||
CheckGracefulShutdown(wp);
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
|
||||
@@ -182,8 +182,6 @@ test_consume_memory(PG_FUNCTION_ARGS)
|
||||
Datum
|
||||
test_release_memory(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TimestampTz start;
|
||||
|
||||
if (PG_ARGISNULL(0))
|
||||
{
|
||||
if (consume_cxt)
|
||||
|
||||
@@ -220,6 +220,9 @@ enter_seccomp_mode(void)
|
||||
}
|
||||
#endif /* HAVE_LIBSECCOMP */
|
||||
|
||||
PGDLLEXPORT void
|
||||
WalRedoMain(int argc, char *argv[]);
|
||||
|
||||
/*
|
||||
* Entry point for the WAL redo process.
|
||||
*
|
||||
|
||||
94
poetry.lock
generated
94
poetry.lock
generated
@@ -2182,7 +2182,6 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2529,6 +2528,87 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
|
||||
optional = ["python-socks", "wsaccel"]
|
||||
test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "websockets"
|
||||
version = "12.0"
|
||||
description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"},
|
||||
{file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"},
|
||||
{file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"},
|
||||
{file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"},
|
||||
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"},
|
||||
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"},
|
||||
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"},
|
||||
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"},
|
||||
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"},
|
||||
{file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"},
|
||||
{file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"},
|
||||
{file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"},
|
||||
{file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"},
|
||||
{file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"},
|
||||
{file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"},
|
||||
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"},
|
||||
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"},
|
||||
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"},
|
||||
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"},
|
||||
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"},
|
||||
{file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"},
|
||||
{file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"},
|
||||
{file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"},
|
||||
{file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"},
|
||||
{file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"},
|
||||
{file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"},
|
||||
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"},
|
||||
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"},
|
||||
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"},
|
||||
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"},
|
||||
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"},
|
||||
{file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"},
|
||||
{file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"},
|
||||
{file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"},
|
||||
{file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"},
|
||||
{file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"},
|
||||
{file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"},
|
||||
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"},
|
||||
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"},
|
||||
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"},
|
||||
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"},
|
||||
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"},
|
||||
{file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"},
|
||||
{file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"},
|
||||
{file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"},
|
||||
{file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"},
|
||||
{file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"},
|
||||
{file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"},
|
||||
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"},
|
||||
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"},
|
||||
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"},
|
||||
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"},
|
||||
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"},
|
||||
{file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"},
|
||||
{file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"},
|
||||
{file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"},
|
||||
{file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.1"
|
||||
@@ -2572,16 +2652,6 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -2819,4 +2889,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
|
||||
content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41"
|
||||
|
||||
@@ -25,13 +25,16 @@ pub async fn authenticate_cleartext(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause();
|
||||
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let auth_outcome = AuthFlow::new(client)
|
||||
let auth_flow = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword(secret))
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
drop(paused);
|
||||
// cleartext auth is only allowed to the ws/http protocol.
|
||||
// If we're here, we already received the password in the first message.
|
||||
// Scram protocol will be executed on the proxy side.
|
||||
let auth_outcome = auth_flow.authenticate().await?;
|
||||
|
||||
let keys = match auth_outcome {
|
||||
sasl::Outcome::Success(key) => key,
|
||||
@@ -56,7 +59,7 @@ pub async fn password_hack_no_authentication(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause();
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
|
||||
@@ -143,7 +143,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
let Scram(secret, ctx) = self.state;
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause();
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
// Initial client message contains the chosen auth method's name.
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
|
||||
@@ -73,7 +73,7 @@ pub mod errors {
|
||||
// Status 406: endpoint is disabled (we don't allow connections).
|
||||
format!("{REQUEST_FAILED}: endpoint is disabled")
|
||||
}
|
||||
http::StatusCode::LOCKED => {
|
||||
http::StatusCode::LOCKED | http::StatusCode::UNPROCESSABLE_ENTITY => {
|
||||
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
|
||||
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
|
||||
}
|
||||
@@ -91,6 +91,12 @@ pub mod errors {
|
||||
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
|
||||
..
|
||||
} => crate::error::ErrorKind::User,
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
text,
|
||||
} if text.contains("compute time quota of non-primary branches is exceeded") => {
|
||||
crate::error::ErrorKind::User
|
||||
}
|
||||
ApiError::Console {
|
||||
status: http::StatusCode::LOCKED,
|
||||
text,
|
||||
@@ -120,6 +126,11 @@ pub mod errors {
|
||||
status: http::StatusCode::BAD_REQUEST,
|
||||
..
|
||||
} => true,
|
||||
// don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
status: http::StatusCode::UNPROCESSABLE_ENTITY,
|
||||
ref text,
|
||||
} => !text.contains("compute time quota of non-primary branches is exceeded"),
|
||||
// locked can be returned when the endpoint was in transition
|
||||
// or when quotas are exceeded. don't retry when quotas are exceeded
|
||||
Self::Console {
|
||||
|
||||
@@ -6,7 +6,9 @@ use super::{
|
||||
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret,
|
||||
NodeInfo,
|
||||
};
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
|
||||
use crate::{
|
||||
auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
|
||||
};
|
||||
use crate::{
|
||||
cache::Cached,
|
||||
context::RequestMonitoring,
|
||||
@@ -72,7 +74,9 @@ impl Api {
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
@@ -132,7 +136,9 @@ impl Api {
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = parse_body::<WakeCompute>(response).await?;
|
||||
|
||||
@@ -244,6 +250,8 @@ impl super::Api for Api {
|
||||
// which means that we might cache it to reduce the load and latency.
|
||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||
info!(key = &*key, "found cached compute node info");
|
||||
info!("cold_start_info=warm");
|
||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
||||
return Ok(cached);
|
||||
}
|
||||
|
||||
@@ -254,6 +262,8 @@ impl super::Api for Api {
|
||||
if permit.should_check_cache() {
|
||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||
info!(key = &*key, "found cached compute node info");
|
||||
info!("cold_start_info=warm");
|
||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
||||
return Ok(cached);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,11 +15,12 @@ use crate::{
|
||||
BranchId, DbName, EndpointId, ProjectId, RoleName,
|
||||
};
|
||||
|
||||
use self::parquet::RequestData;
|
||||
|
||||
pub mod parquet;
|
||||
|
||||
static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
|
||||
static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Context data for a single request to connect to a database.
|
||||
///
|
||||
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
|
||||
@@ -46,7 +47,7 @@ pub struct RequestMonitoring {
|
||||
|
||||
// extra
|
||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||
sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
|
||||
sender: Option<mpsc::UnboundedSender<RequestData>>,
|
||||
pub latency_timer: LatencyTimer,
|
||||
}
|
||||
|
||||
@@ -111,6 +112,10 @@ impl RequestMonitoring {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
||||
self.cold_start_info = Some(info);
|
||||
}
|
||||
|
||||
pub fn set_project(&mut self, x: MetricsAuxInfo) {
|
||||
self.set_endpoint_id(x.endpoint_id);
|
||||
self.branch = Some(x.branch_id);
|
||||
@@ -168,7 +173,7 @@ impl RequestMonitoring {
|
||||
impl Drop for RequestMonitoring {
|
||||
fn drop(&mut self) {
|
||||
if let Some(tx) = self.sender.take() {
|
||||
let _: Result<(), _> = tx.send(self.clone());
|
||||
let _: Result<(), _> = tx.send(RequestData::from(&*self));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user