Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-05 03:30:36 +00:00)

Compare commits: lr-tests-c...release-51 (312 commits)
| Author | SHA1 | Date |
|---|---|---|
Commits in this comparison: 7b860b837c … df7a9d1407 (312 total).
1  .github/workflows/build_and_test.yml  (vendored)

@@ -461,6 +461,7 @@ jobs:
      - name: Pytest regression tests
        uses: ./.github/actions/run-python-test-set
        timeout-minutes: 60
        with:
          build_type: ${{ matrix.build_type }}
          test_selection: regress
8  Cargo.lock  (generated)

@@ -282,8 +282,10 @@ dependencies = [
 "control_plane",
 "diesel",
 "diesel_migrations",
 "fail",
 "futures",
 "git-version",
 "hex",
 "humantime",
 "hyper",
 "metrics",
@@ -1344,6 +1346,7 @@ dependencies = [
 "futures",
 "git-version",
 "hex",
 "humantime",
 "hyper",
 "nix 0.27.1",
 "once_cell",
@@ -3527,6 +3530,7 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
 "procfs",
 "rand 0.8.5",
 "regex",
 "remote_storage",
@@ -5884,7 +5888,7 @@ dependencies = [
[[package]]
name = "tokio-epoll-uring"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
 "futures",
 "nix 0.26.4",
@@ -6421,7 +6425,7 @@ dependencies = [
[[package]]
name = "uring-common"
version = "0.1.0"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#342ddd197a060a8354e8f11f4d12994419fff939"
dependencies = [
 "bytes",
 "io-uring",
@@ -238,6 +238,14 @@ If you encounter errors during setting up the initial tenant, it's best to stop

## Running tests

### Rust unit tests

We are using [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Workflows.
Some crates no longer support plain `cargo test`; prefer `cargo nextest run` instead.
You can install `cargo-nextest` with `cargo install cargo-nextest`.

### Integration tests

Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).

```sh
@@ -2,6 +2,8 @@ disallowed-methods = [
    "tokio::task::block_in_place",
    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
    # use tokio_epoll_uring_ext instead
    "tokio_epoll_uring::thread_local_system",
]

disallowed-macros = [
@@ -743,19 +743,21 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // which may happen in two cases:
    // - extension was just installed
    // - extension was already installed and is up to date
    let query = "ALTER EXTENSION neon UPDATE";
    info!("update neon extension version with query: {}", query);
    client.simple_query(query)?;
    // DISABLED due to compute node unpinning epic
    // let query = "ALTER EXTENSION neon UPDATE";
    // info!("update neon extension version with query: {}", query);
    // client.simple_query(query)?;

    Ok(())
}

#[instrument(skip_all)]
pub fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
    info!("handle neon extension upgrade");
    let query = "ALTER EXTENSION neon UPDATE";
    info!("update neon extension version with query: {}", query);
    client.simple_query(query)?;
pub fn handle_neon_extension_upgrade(_client: &mut Client) -> Result<()> {
    info!("handle neon extension upgrade (not really)");
    // DISABLED due to compute node unpinning epic
    // let query = "ALTER EXTENSION neon UPDATE";
    // info!("update neon extension version with query: {}", query);
    // client.simple_query(query)?;

    Ok(())
}
@@ -12,6 +12,7 @@ clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
humantime.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true

@@ -19,8 +19,10 @@ aws-config.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
once_cell.workspace = true
227  control_plane/attachment_service/src/heartbeater.rs  (Normal file)

@@ -0,0 +1,227 @@
use futures::{stream::FuturesUnordered, StreamExt};
use std::{
    collections::HashMap,
    sync::Arc,
    time::{Duration, Instant},
};
use tokio_util::sync::CancellationToken;

use pageserver_api::{
    controller_api::{NodeAvailability, UtilizationScore},
    models::PageserverUtilization,
};

use thiserror::Error;
use utils::id::NodeId;

use crate::node::Node;

struct HeartbeaterTask {
    receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
    cancel: CancellationToken,

    state: HashMap<NodeId, PageserverState>,

    max_unavailable_interval: Duration,
    jwt_token: Option<String>,
}

#[derive(Debug, Clone)]
pub(crate) enum PageserverState {
    Available {
        last_seen_at: Instant,
        utilization: PageserverUtilization,
    },
    Offline,
}

#[derive(Debug)]
pub(crate) struct AvailablityDeltas(pub Vec<(NodeId, PageserverState)>);

#[derive(Debug, Error)]
pub(crate) enum HeartbeaterError {
    #[error("Cancelled")]
    Cancel,
}

struct HeartbeatRequest {
    pageservers: Arc<HashMap<NodeId, Node>>,
    reply: tokio::sync::oneshot::Sender<Result<AvailablityDeltas, HeartbeaterError>>,
}

pub(crate) struct Heartbeater {
    sender: tokio::sync::mpsc::UnboundedSender<HeartbeatRequest>,
}

impl Heartbeater {
    pub(crate) fn new(
        jwt_token: Option<String>,
        max_unavailable_interval: Duration,
        cancel: CancellationToken,
    ) -> Self {
        let (sender, receiver) = tokio::sync::mpsc::unbounded_channel::<HeartbeatRequest>();
        let mut heartbeater =
            HeartbeaterTask::new(receiver, jwt_token, max_unavailable_interval, cancel);
        tokio::task::spawn(async move { heartbeater.run().await });

        Self { sender }
    }

    pub(crate) async fn heartbeat(
        &self,
        pageservers: Arc<HashMap<NodeId, Node>>,
    ) -> Result<AvailablityDeltas, HeartbeaterError> {
        let (sender, receiver) = tokio::sync::oneshot::channel();
        self.sender
            .send(HeartbeatRequest {
                pageservers,
                reply: sender,
            })
            .unwrap();

        receiver.await.unwrap()
    }
}

impl HeartbeaterTask {
    fn new(
        receiver: tokio::sync::mpsc::UnboundedReceiver<HeartbeatRequest>,
        jwt_token: Option<String>,
        max_unavailable_interval: Duration,
        cancel: CancellationToken,
    ) -> Self {
        Self {
            receiver,
            cancel,
            state: HashMap::new(),
            max_unavailable_interval,
            jwt_token,
        }
    }

    async fn run(&mut self) {
        loop {
            tokio::select! {
                request = self.receiver.recv() => {
                    match request {
                        Some(req) => {
                            let res = self.heartbeat(req.pageservers).await;
                            req.reply.send(res).unwrap();
                        },
                        None => { return; }
                    }
                },
                _ = self.cancel.cancelled() => return
            }
        }
    }

    async fn heartbeat(
        &mut self,
        pageservers: Arc<HashMap<NodeId, Node>>,
    ) -> Result<AvailablityDeltas, HeartbeaterError> {
        let mut new_state = HashMap::new();

        let mut heartbeat_futs = FuturesUnordered::new();
        for (node_id, node) in &*pageservers {
            heartbeat_futs.push({
                let jwt_token = self.jwt_token.clone();
                let cancel = self.cancel.clone();

                // Clone the node and mark it as available such that the request
                // goes through to the pageserver even when the node is marked offline.
                // This doesn't impact the availability observed by [`crate::service::Service`].
                let mut node = node.clone();
                node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));

                async move {
                    let response = node
                        .with_client_retries(
                            |client| async move { client.get_utilization().await },
                            &jwt_token,
                            2,
                            3,
                            Duration::from_secs(1),
                            &cancel,
                        )
                        .await;

                    let response = match response {
                        Some(r) => r,
                        None => {
                            // This indicates cancellation of the request.
                            // We ignore the node in this case.
                            return None;
                        }
                    };

                    let status = if let Ok(utilization) = response {
                        PageserverState::Available {
                            last_seen_at: Instant::now(),
                            utilization,
                        }
                    } else {
                        PageserverState::Offline
                    };

                    Some((*node_id, status))
                }
            });

            loop {
                let maybe_status = tokio::select! {
                    next = heartbeat_futs.next() => {
                        match next {
                            Some(result) => result,
                            None => { break; }
                        }
                    },
                    _ = self.cancel.cancelled() => { return Err(HeartbeaterError::Cancel); }
                };

                if let Some((node_id, status)) = maybe_status {
                    new_state.insert(node_id, status);
                }
            }
        }

        let mut deltas = Vec::new();
        let now = Instant::now();
        for (node_id, ps_state) in new_state {
            use std::collections::hash_map::Entry::*;
            let entry = self.state.entry(node_id);

            let mut needs_update = false;
            match entry {
                Occupied(ref occ) => match (occ.get(), &ps_state) {
                    (PageserverState::Offline, PageserverState::Offline) => {}
                    (PageserverState::Available { last_seen_at, .. }, PageserverState::Offline) => {
                        if now - *last_seen_at >= self.max_unavailable_interval {
                            deltas.push((node_id, ps_state.clone()));
                            needs_update = true;
                        }
                    }
                    _ => {
                        deltas.push((node_id, ps_state.clone()));
                        needs_update = true;
                    }
                },
                Vacant(_) => {
                    deltas.push((node_id, ps_state.clone()));
                }
            }

            match entry {
                Occupied(mut occ) if needs_update => {
                    (*occ.get_mut()) = ps_state;
                }
                Vacant(vac) => {
                    vac.insert(ps_state);
                }
                _ => {}
            }
        }

        Ok(AvailablityDeltas(deltas))
    }
}
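For orientation only, here is a hedged sketch of the caller side of this request/reply protocol; it is crate-internal, and the function name, logging, and error handling are assumptions rather than part of the diff above.

```rust
// Sketch: run one heartbeat round against the current set of pageservers and
// react to the availability deltas it reports. (Hypothetical caller, not from the diff.)
async fn heartbeat_once(heartbeater: &Heartbeater, pageservers: Arc<HashMap<NodeId, Node>>) {
    match heartbeater.heartbeat(pageservers).await {
        Ok(AvailablityDeltas(deltas)) => {
            for (node_id, state) in deltas {
                // A real caller would update its in-memory node state here.
                tracing::info!("pageserver {node_id} changed availability: {state:?}");
            }
        }
        // Returned when the controller's cancellation token fires during the round.
        Err(HeartbeaterError::Cancel) => {}
    }
}
```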
@@ -10,9 +10,11 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use utils::auth::{Scope, SwappableJwtAuth};
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};

use utils::{
@@ -26,7 +28,7 @@ use utils::{
};

use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};

@@ -174,14 +176,14 @@ async fn handle_tenant_location_config(
    service: Arc<Service>,
    mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
    check_permissions(&req, Scope::PageServerApi)?;

    let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
    json_response(
        StatusCode::OK,
        service
            .tenant_location_config(tenant_id, config_req)
            .tenant_location_config(tenant_shard_id, config_req)
            .await?,
    )
}
@@ -246,8 +248,10 @@ async fn handle_tenant_secondary_download(
    req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    service.tenant_secondary_download(tenant_id).await?;
    json_response(StatusCode::OK, ())
    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);

    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
    json_response(status, progress)
}

async fn handle_tenant_delete(
@@ -387,7 +391,14 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,

    json_response(
        StatusCode::OK,
        state.service.node_configure(config_req).await?,
        state
            .service
            .node_configure(
                config_req.node_id,
                config_req.availability.map(NodeAvailability::from),
                config_req.scheduling,
            )
            .await?,
    )
}

@@ -554,6 +565,9 @@ pub fn make_router(
        .post("/debug/v1/consistency_check", |r| {
            request_span(r, handle_consistency_check)
        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
        .get("/control/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(r, handle_tenant_locate)
        })
@@ -587,7 +601,7 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            tenant_service_handler(r, handle_tenant_config_get)
        })
        .put("/v1/tenant/:tenant_id/location_config", |r| {
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            tenant_service_handler(r, handle_tenant_location_config)
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
54  control_plane/attachment_service/src/id_lock_map.rs  (Normal file)

@@ -0,0 +1,54 @@
use std::{collections::HashMap, sync::Arc};

/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
/// is needed at a tenant-wide granularity.
pub(crate) struct IdLockMap<T>
where
    T: Eq + PartialEq + std::hash::Hash,
{
    /// A synchronous lock for getting/setting the async locks that our callers will wait on.
    entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
}

impl<T> IdLockMap<T>
where
    T: Eq + PartialEq + std::hash::Hash,
{
    pub(crate) fn shared(
        &self,
        key: T,
    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
        let mut locked = self.entities.lock().unwrap();
        let entry = locked.entry(key).or_default();
        entry.clone().read_owned()
    }

    pub(crate) fn exclusive(
        &self,
        key: T,
    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
        let mut locked = self.entities.lock().unwrap();
        let entry = locked.entry(key).or_default();
        entry.clone().write_owned()
    }

    /// Rather than building a lock guard that re-takes the [`Self::entities`] lock, we just do
    /// periodic housekeeping to avoid the map growing indefinitely
    pub(crate) fn housekeeping(&self) {
        let mut locked = self.entities.lock().unwrap();
        locked.retain(|_k, lock| lock.try_write().is_err())
    }
}

impl<T> Default for IdLockMap<T>
where
    T: Eq + PartialEq + std::hash::Hash,
{
    fn default() -> Self {
        Self {
            entities: std::sync::Mutex::new(HashMap::new()),
        }
    }
}
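As a hedged illustration of the doc comment above, a crate-internal caller might take a tenant-wide exclusive lock like this; the `TenantId` key type and the function shape are assumptions for the sketch, not part of the diff.

```rust
// Sketch (hypothetical caller): serialize a tenant-wide operation while
// per-shard work uses shared() on the same IdLockMap.
async fn with_tenant_exclusive(locks: &IdLockMap<TenantId>, tenant_id: TenantId) {
    // Waits until all shared() holders for this tenant have dropped their guards.
    let _guard = locks.exclusive(tenant_id).await;

    // ... perform the tenant-wide mutation here ...

    // When the guard drops, a later housekeeping() pass can evict the now-unlocked
    // entry so the map does not grow indefinitely.
}
```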
@@ -3,7 +3,9 @@ use utils::seqwait::MonotonicCounter;

mod auth;
mod compute_hook;
mod heartbeater;
pub mod http;
mod id_lock_map;
pub mod metrics;
mod node;
pub mod persistence;
@@ -2,7 +2,7 @@ use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use aws_config::{BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
@@ -54,6 +54,10 @@ struct Cli {
    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
    database_url: Option<String>,

    /// Grace period before marking unresponsive pageserver offline
    #[arg(long)]
    max_unavailable_interval: Option<humantime::Duration>,
}

/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -206,6 +210,12 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
}

fn main() -> anyhow::Result<()> {
    let default_panic = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |info| {
        default_panic(info);
        std::process::exit(1);
    }));

    tokio::runtime::Builder::new_current_thread()
        // We use spawn_blocking for database operations, so require approximately
        // as many blocking threads as we will open database connections.
@@ -243,6 +253,10 @@ async fn async_main() -> anyhow::Result<()> {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
        compute_hook_url: args.compute_hook_url,
        max_unavailable_interval: args
            .max_unavailable_interval
            .map(humantime::Duration::into)
            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
@@ -12,7 +12,7 @@ use serde::Serialize;
use tokio_util::sync::CancellationToken;
use utils::{backoff, id::NodeId};

use crate::persistence::NodePersistence;
use crate::{persistence::NodePersistence, scheduler::MaySchedule};

/// Represents the in-memory description of a Node.
///
@@ -83,29 +83,38 @@ impl Node {
        }
    }

    pub(crate) fn set_availability(
        &mut self,
        availability: NodeAvailability,
    ) -> AvailabilityTransition {
        use NodeAvailability::*;
        let transition = match (self.availability, availability) {
            (Offline, Active) => {
    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
        match self.get_availability_transition(availability) {
            AvailabilityTransition::ToActive => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state. For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
                AvailabilityTransition::ToActive
            }
            (Active, Offline) => {
            AvailabilityTransition::ToOffline => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
                AvailabilityTransition::ToOffline
            }
            _ => AvailabilityTransition::Unchanged,
        };
            AvailabilityTransition::Unchanged => {}
        }
        self.availability = availability;
        transition
    }

    /// Without modifying the availability of the node, convert the intended availability
    /// into a description of the transition.
    pub(crate) fn get_availability_transition(
        &self,
        availability: NodeAvailability,
    ) -> AvailabilityTransition {
        use AvailabilityTransition::*;
        use NodeAvailability::*;

        match (self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
            _ => Unchanged,
        }
    }

    /// Whether we may send API requests to this node.
@@ -114,21 +123,21 @@ impl Node {
        // a reference to the original Node's cancellation status. Checking both of these results
        // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
        // when we cloned it, or if the original Node instance's cancellation token was fired.
        matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
        matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
    }

    /// Is this node elegible to have work scheduled onto it?
    pub(crate) fn may_schedule(&self) -> bool {
        match self.availability {
            NodeAvailability::Active => {}
            NodeAvailability::Offline => return false,
        }
    pub(crate) fn may_schedule(&self) -> MaySchedule {
        let score = match self.availability {
            NodeAvailability::Active(score) => score,
            NodeAvailability::Offline => return MaySchedule::No,
        };

        match self.scheduling {
            NodeSchedulingPolicy::Active => true,
            NodeSchedulingPolicy::Draining => false,
            NodeSchedulingPolicy::Filling => true,
            NodeSchedulingPolicy::Pause => false,
            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
            NodeSchedulingPolicy::Draining => MaySchedule::No,
            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
            NodeSchedulingPolicy::Pause => MaySchedule::No,
        }
    }

@@ -146,8 +155,7 @@ impl Node {
            listen_pg_addr,
            listen_pg_port,
            scheduling: NodeSchedulingPolicy::Filling,
            // TODO: we shouldn't really call this Active until we've heartbeated it.
            availability: NodeAvailability::Active,
            availability: NodeAvailability::Offline,
            cancel: CancellationToken::new(),
        }
    }
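To make the cancellation-token comments above concrete, here is a hedged, crate-internal sketch; the host/port constructor arguments are placeholders and the snippet assumes the module's own imports, so it is illustrative rather than authoritative.

```rust
// Sketch: a copy of a Node shares its cancellation token at clone time. Marking the
// original Offline fires that token; marking it Active again installs a *new* token,
// so the stale copy stays unavailable until its holder re-fetches the node.
fn availability_transition_sketch() {
    let mut node = Node::new(NodeId(1), "ps-http".to_string(), 9898, "ps-pg".to_string(), 6400);
    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));

    let in_flight_copy = node.clone(); // e.g. held by a Reconciler that is still running

    node.set_availability(NodeAvailability::Offline); // ToOffline: fires the shared token
    node.set_availability(NodeAvailability::Active(UtilizationScore::worst())); // ToActive: fresh token

    assert!(node.is_available());
    assert!(!in_flight_copy.is_available()); // still sees the token fired above
}
```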
@@ -11,6 +11,9 @@ use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
@@ -72,6 +75,14 @@ pub(crate) enum DatabaseError {
    Logical(String),
}

#[must_use]
pub(crate) enum AbortShardSplitStatus {
    /// We aborted the split in the database by reverting to the parent shards
    Aborted,
    /// The split had already been persisted.
    Complete,
}

pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;

impl Persistence {
@@ -570,6 +581,51 @@ impl Persistence {
        })
        .await
    }

    /// Used when the remote part of a shard split failed: we will revert the database state to have only
    /// the parent shards, with SplitState::Idle.
    pub(crate) async fn abort_shard_split(
        &self,
        split_tenant_id: TenantId,
        new_shard_count: ShardCount,
    ) -> DatabaseResult<AbortShardSplitStatus> {
        use crate::schema::tenant_shards::dsl::*;
        self.with_conn(move |conn| -> DatabaseResult<AbortShardSplitStatus> {
            let aborted = conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
                // Clear the splitting state on parent shards
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(split_tenant_id.to_string()))
                    .filter(shard_count.ne(new_shard_count.literal() as i32))
                    .set((splitting.eq(0),))
                    .execute(conn)?;

                // Parent shards are already gone: we cannot abort.
                if updated == 0 {
                    return Ok(AbortShardSplitStatus::Complete);
                }

                // Sanity check: if parent shards were present, their cardinality should
                // be less than the number of child shards.
                if updated >= new_shard_count.count() as usize {
                    return Err(DatabaseError::Logical(format!(
                        "Unexpected parent shard count {updated} while aborting split to \
                         count {new_shard_count:?} on tenant {split_tenant_id}"
                    )));
                }

                // Erase child shards
                diesel::delete(tenant_shards)
                    .filter(tenant_id.eq(split_tenant_id.to_string()))
                    .filter(shard_count.eq(new_shard_count.literal() as i32))
                    .execute(conn)?;

                Ok(AbortShardSplitStatus::Aborted)
            })?;

            Ok(aborted)
        })
        .await
    }
}

/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
@@ -604,6 +660,28 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) config: String,
}

impl TenantShardPersistence {
    pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
        if self.shard_count == 0 {
            Ok(ShardIdentity::unsharded())
        } else {
            Ok(ShardIdentity::new(
                ShardNumber(self.shard_number as u8),
                ShardCount::new(self.shard_count as u8),
                ShardStripeSize(self.shard_stripe_size as u32),
            )?)
        }
    }

    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
        Ok(TenantShardId {
            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
            shard_number: ShardNumber(self.shard_number as u8),
            shard_count: ShardCount::new(self.shard_count as u8),
        })
    }
}

/// Parts of [`crate::node::Node`] that are stored durably
#[derive(Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq)]
#[diesel(table_name = crate::schema::nodes)]
@@ -1,5 +1,6 @@
use crate::persistence::Persistence;
use crate::service;
use hyper::StatusCode;
use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -7,7 +8,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_client::mgmt_api;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
@@ -18,6 +19,8 @@ use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};

const DEFAULT_HEATMAP_PERIOD: &str = "60s";

/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
@@ -255,22 +258,81 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> Result<(), ReconcileError> {
        match node
            .with_client_retries(
                |client| async move { client.tenant_secondary_download(tenant_shard_id).await },
                &self.service_config.jwt_token,
                1,
                1,
                Duration::from_secs(60),
                &self.cancel,
            )
            .await
        {
            None => Err(ReconcileError::Cancel),
            Some(Ok(_)) => Ok(()),
            Some(Err(e)) => {
                tracing::info!(" (skipping destination download: {})", e);
                Ok(())
        // This is not the timeout for a request, but the total amount of time we're willing to wait
        // for a secondary location to get up to date before
        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);

        // This the long-polling interval for the secondary download requests we send to destination pageserver
        // during a migration.
        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);

        let started_at = Instant::now();

        loop {
            let (status, progress) = match node
                .with_client_retries(
                    |client| async move {
                        client
                            .tenant_secondary_download(
                                tenant_shard_id,
                                Some(REQUEST_DOWNLOAD_TIMEOUT),
                            )
                            .await
                    },
                    &self.service_config.jwt_token,
                    1,
                    3,
                    REQUEST_DOWNLOAD_TIMEOUT * 2,
                    &self.cancel,
                )
                .await
            {
                None => Err(ReconcileError::Cancel),
                Some(Ok(v)) => Ok(v),
                Some(Err(e)) => {
                    // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
                    // attaching, but we should not let an issue with a secondary location stop us proceeding
                    // with a live migration.
                    tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
                    return Ok(());
                }
            }?;

            if status == StatusCode::OK {
                tracing::info!(
                    "Downloads to {} complete: {}/{} layers, {}/{} bytes",
                    node,
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                return Ok(());
            } else if status == StatusCode::ACCEPTED {
                let total_runtime = started_at.elapsed();
                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
                    tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
                        total_runtime.as_millis(),
                        progress.layers_downloaded,
                        progress.layers_total,
                        progress.bytes_downloaded,
                        progress.bytes_total
                    );
                    // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
                    // it just makes the I/O performance for users less good.
                    return Ok(());
                }

                // Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
                // to the pageserver is a long-poll.
                tracing::info!(
                    "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
                    node,
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
            }
        }
    }
@@ -485,17 +547,29 @@ impl Reconciler {
            )
            .await
        {
            Some(Ok(observed)) => observed,
            Some(Ok(observed)) => Some(observed),
            Some(Err(mgmt_api::Error::ApiError(status, _msg)))
                if status == StatusCode::NOT_FOUND =>
            {
                None
            }
            Some(Err(e)) => return Err(e.into()),
            None => return Err(ReconcileError::Cancel),
        };
        tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
        self.observed.locations.insert(
            attached_node.get_id(),
            ObservedStateLocation {
                conf: observed_conf,
            },
        );
        match observed_conf {
            Some(conf) => {
                // Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
                // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
                self.observed
                    .locations
                    .insert(attached_node.get_id(), ObservedStateLocation { conf });
            }
            None => {
                // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
                self.observed.locations.remove(&attached_node.get_id());
            }
        }
    }

    Ok(())
@@ -525,7 +599,12 @@ impl Reconciler {
            )));
        };

        let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
        let mut wanted_conf = attached_location_conf(
            generation,
            &self.shard,
            &self.config,
            !self.intent.secondary.is_empty(),
        );
        match self.observed.locations.get(&node.get_id()) {
            Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                // Nothing to do
@@ -662,10 +741,26 @@ impl Reconciler {
    }
}

/// We tweak the externally-set TenantConfig while configuring
/// locations, using our awareness of whether secondary locations
/// are in use to automatically enable/disable heatmap uploads.
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
    let mut config = config.clone();
    if has_secondaries {
        if config.heatmap_period.is_none() {
            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
        }
    } else {
        config.heatmap_period = None;
    }
    config
}

pub(crate) fn attached_location_conf(
    generation: Generation,
    shard: &ShardIdentity,
    config: &TenantConfig,
    has_secondaries: bool,
) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
@@ -674,7 +769,7 @@ pub(crate) fn attached_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
        tenant_conf: ha_aware_config(config, has_secondaries),
    }
}

@@ -689,6 +784,6 @@ pub(crate) fn secondary_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
        tenant_conf: ha_aware_config(config, true),
    }
}
@@ -1,4 +1,5 @@
use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
use utils::{http::error::ApiError, id::NodeId};
@@ -19,15 +20,34 @@ impl From<ScheduleError> for ApiError {
}

#[derive(Serialize, Eq, PartialEq)]
pub enum MaySchedule {
    Yes(UtilizationScore),
    No,
}

#[derive(Serialize)]
struct SchedulerNode {
    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
    shard_count: usize,

    /// Whether this node is currently elegible to have new shards scheduled (this is derived
    /// from a node's availability state and scheduling policy).
    may_schedule: bool,
    may_schedule: MaySchedule,
}

impl PartialEq for SchedulerNode {
    fn eq(&self, other: &Self) -> bool {
        let may_schedule_matches = matches!(
            (&self.may_schedule, &other.may_schedule),
            (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
        );

        may_schedule_matches && self.shard_count == other.shard_count
    }
}

impl Eq for SchedulerNode {}

/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
@@ -186,13 +206,15 @@ impl Scheduler {
            return None;
        }

        // TODO: When the utilization score returned by the pageserver becomes meaningful,
        // schedule based on that instead of the shard count.
        let node = nodes
            .iter()
            .map(|node_id| {
                let may_schedule = self
                    .nodes
                    .get(node_id)
                    .map(|n| n.may_schedule)
                    .map(|n| n.may_schedule != MaySchedule::No)
                    .unwrap_or(false);
                (*node_id, may_schedule)
            })
@@ -211,7 +233,7 @@ impl Scheduler {
            .nodes
            .iter()
            .filter_map(|(k, v)| {
                if hard_exclude.contains(k) || !v.may_schedule {
                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
                    None
                } else {
                    Some((*k, v.shard_count))
@@ -230,7 +252,7 @@ impl Scheduler {
        for (node_id, node) in &self.nodes {
            tracing::info!(
                "Node {node_id}: may_schedule={} shards={}",
                node.may_schedule,
                node.may_schedule != MaySchedule::No,
                node.shard_count
            );
        }
@@ -255,6 +277,7 @@ impl Scheduler {
pub(crate) mod test_utils {

    use crate::node::Node;
    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -264,13 +287,14 @@ pub(crate) mod test_utils {
        (1..n + 1)
            .map(|i| {
                (NodeId(i), {
                    let node = Node::new(
                    let mut node = Node::new(
                        NodeId(i),
                        format!("httphost-{i}"),
                        80 + i as u16,
                        format!("pghost-{i}"),
                        5432 + i as u16,
                    );
                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                    assert!(node.is_available());
                    node
                })
File diff suppressed because it is too large
@@ -577,7 +577,12 @@ impl TenantState {
            .generation
            .expect("Attempted to enter attached state without a generation");

        let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
        let wanted_conf = attached_location_conf(
            generation,
            &self.shard,
            &self.config,
            !self.intent.secondary.is_empty(),
        );
        match self.observed.locations.get(&node_id) {
            Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
            Some(_) | None => {
@@ -114,7 +114,7 @@ impl NeonBroker {
}

#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
#[serde(default, deny_unknown_fields)]
pub struct PageServerConf {
    // node id
    pub id: NodeId,
@@ -126,6 +126,9 @@ pub struct PageServerConf {
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,

    pub(crate) virtual_file_io_engine: String,
    pub(crate) get_vectored_impl: String,
}

impl Default for PageServerConf {
@@ -136,6 +139,9 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
            // FIXME: use the ones exposed by pageserver crate
            virtual_file_io_engine: "tokio-epoll-uring".to_owned(),
            get_vectored_impl: "sequential".to_owned(),
        }
    }
}
@@ -78,18 +78,31 @@ impl PageServerNode {
    ///
    /// These all end up on the command line of the `pageserver` binary.
    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );

        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
        let PageServerConf {
            id,
            listen_pg_addr,
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
            virtual_file_io_engine,
            get_vectored_impl,
        } = &self.conf;

        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
        let id = format!("id={}", id);

        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);

        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
        let virtual_file_io_engine = format!("virtual_file_io_engine='{virtual_file_io_engine}'");
        let get_vectored_impl = format!("get_vectored_impl='{get_vectored_impl}'");

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -101,6 +114,8 @@ impl PageServerNode {
            listen_http_addr_param,
            listen_pg_addr_param,
            broker_endpoint_param,
            virtual_file_io_engine,
            get_vectored_impl,
        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -111,7 +126,7 @@ impl PageServerNode {

        // Storage controller uses the same auth as pageserver: if JWT is enabled
        // for us, we will also need it to talk to them.
        if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
        if matches!(http_auth_type, AuthType::NeonJWT) {
            let jwt_token = self
                .env
                .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -129,8 +144,7 @@ impl PageServerNode {
            ));
        }

        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -554,13 +568,6 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }

    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
        Ok(self
            .http_client
            .tenant_secondary_download(*tenant_id)
            .await?)
    }

    pub async fn timeline_create(
        &self,
        tenant_shard_id: TenantShardId,
@@ -38,6 +38,9 @@ const COMMAND: &str = "storage_controller";

const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);

#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -269,6 +272,8 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();

        let mut args = vec![
            "-l",
            &self.listen,
@@ -276,6 +281,8 @@ impl StorageController {
            self.path.as_ref(),
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
408
docs/rfcs/031-sharding-static.md
Normal file
408
docs/rfcs/031-sharding-static.md
Normal file
@@ -0,0 +1,408 @@
|
||||
# Sharding Phase 1: Static Key-space Sharding
|
||||
|
||||
## Summary
|
||||
|
||||
To enable databases with sizes approaching the capacity of a pageserver's disk,
|
||||
it is necessary to break up the storage for the database, or _shard_ it.
|
||||
|
||||
Sharding in general is a complex area. This RFC aims to define an initial
|
||||
capability that will permit creating large-capacity databases using a static configuration
|
||||
defined at time of Tenant creation.
|
||||
|
||||
## Motivation
|
||||
|
||||
Currently, all data for a Tenant, including all its timelines, is stored on a single
|
||||
pageserver. The local storage required may be several times larger than the actual
|
||||
database size, due to LSM write inflation.
|
||||
|
||||
If a database is larger than what one pageserver can hold, then it becomes impossible
|
||||
for the pageserver to hold it in local storage, as it must do to provide service to
|
||||
clients.
|
||||
|
||||
### Prior art
|
||||
|
||||
In Neon:
|
||||
|
||||
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
|
||||
- Layer File SPreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
|
||||
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
|
||||
|
||||
Prior art in other distributed systems is too broad to capture here: pretty much
|
||||
any scale out storage system does something like this.
|
||||
|
||||
## Requirements
|
||||
|
||||
- Enable creating a large (for example, 16TiB) database without requiring dedicated
|
||||
pageserver nodes.
|
||||
- Share read/write bandwidth costs for large databases across pageservers, as well
|
||||
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
|
||||
that disrupt service to other tenants.
|
||||
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
|
||||
does not write out a single contiguous ranges of page numbers.
|
||||
|
||||
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
|
||||
that a user might create on a current-gen enterprise SSD should also work well on
|
||||
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
|
||||
pageserver backend is not the limiting factor in the database size_.
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Independently distributing timelines within the same tenant. If a tenant has many
|
||||
timelines, then sharding may be a less efficient mechanism for distributing load than
|
||||
sharing out timelines between pageservers.
|
||||
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
|
||||
based on the idea that separate mechanisms will make sense for each dimension.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, control plane, postgres/smgr
|
||||
|
||||
## Terminology
|
||||
|
||||
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
|
||||
the page number is the key in that store. `Key` is a literal data type in existing code.
|
||||
|
||||
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
|
||||
of keys and LSNs as a two dimensional space.
|
||||
|
||||
## Implementation
|
||||
|
||||
### Key sharding vs. LSN sharding
|
||||
|
||||
When we think of sharding across the two dimensional key/lsn space, this is an
|
||||
opportunity to think about how the two dimensions differ:
|
||||
|
||||
- Sharding the key space distributes the _write_ workload of ingesting data
|
||||
and compacting. This work must be carefully managed so that exactly one
|
||||
node owns a given key.
|
||||
- Sharding the LSN space distributes the _historical read_ workload. This work
|
||||
can be done by anyone without any special coordination, as long as they can
|
||||
see the remote index and layers.
|
||||
|
||||
The key sharding is the harder part, and also the more urgent one, to support larger
|
||||
capacity databases. Because distributing historical LSN read work is a relatively
|
||||
simpler problem that most users don't have, we defer it to future work. It is anticipated
|
||||
that some quite simple P2P offload model will enable distributing work for historical
|
||||
reads: a node which is low on space can call out to peer to ask it to download and
|
||||
serve reads from a historical layer.
|
||||
|
||||
### Key mapping scheme
|
||||
|
||||
Having decided to focus on key sharding, we must next decide how we will map
|
||||
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
|
||||
between data locality and avoiding entire large relations mapping to the same shard.
|
||||
|
||||
We will define two spaces:
|
||||
|
||||
- Key space: unsigned integer
|
||||
- Shard space: integer from 0 to N-1, where we have N shards.
|
||||
|
||||
### Key -> Shard mapping
|
||||
|
||||
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
|
||||
|
||||
```
|
||||
pub struct Key {
|
||||
pub field1: u8,
|
||||
pub field2: u32,
|
||||
pub field3: u32,
|
||||
pub field4: u32,
|
||||
pub field5: u8,
|
||||
pub field6: u32,
|
||||
}
|
||||
|
||||
|
||||
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
field5: rel.forknum,
|
||||
field6: blknum,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
|
||||
shards. For distribution purposes, we only care about user data keys_.
|
||||
|
||||
The properties we want from our Key->Shard mapping are:
|
||||
|
||||
- Locality in `blknum`, such that adjacent `blknum` will usually map to
|
||||
the same stripe and consequently land on the same shard, even though the overall
|
||||
collection of blocks in a relation will be spread over many stripes and therefore
|
||||
many shards.
|
||||
- Avoid the same blknum on different relations landing on the same stripe, so that
|
||||
with many small relations we do not end up aliasing data to the same stripe/shard.
|
||||
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
|
||||
if there are patterns in the value of `relnode`, these do not manifest as patterns
|
||||
in data placement.
|
||||
|
||||
To accomplish this, the blknum is used to select a stripe, and stripes are
|
||||
assigned to shards in a pseudorandom order via a hash. The motivation for
|
||||
pseudo-random distribution (rather than sequential mapping of stripe to shard)
|
||||
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
|
||||
all relations' stripes to touch pageservers in the same order.
|
||||
|
||||
To map a `Key` to a shard:
|
||||
|
||||
- Hash the `Key` field 4 (relNode).
|
||||
- Divide field 6 (`blknum`) field by the stripe size in pages, and combine the
|
||||
hash of this with the hash from the previous step.
|
||||
- The total hash modulo the shard count gives the shard holding this key.
|
||||
|
||||
Why don't we use the other fields in the Key?
|
||||
|
||||
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
|
||||
in the same relation, and we would like to keep the data in a relation together.
|
||||
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
|
||||
database's blocks differ only by spcNode and dbNode from the original. To enable running
|
||||
this type of creation without cross-pageserver communication, we must ensure that these
|
||||
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
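
As an illustrative sketch of this mapping (the function name, signature and hash mix below are assumptions for clarity, not the pageserver's actual implementation; a real implementation must use a stable, well-distributed hash that is identical on every node):

```
/// Sketch only: map a (relnode, blknum) pair to a shard index.
fn key_to_shard(relnode: u32, blknum: u32, stripe_size: u32, shard_count: u32) -> u32 {
    // Adjacent block numbers fall into the same stripe, preserving locality.
    let stripe = blknum / stripe_size;
    // Hash relnode and the stripe number together so that stripes are assigned
    // to shards in a pseudo-random order (simple FNV-1a mix, for illustration).
    let mut h: u64 = 0xcbf29ce484222325; // FNV offset basis
    for byte in relnode.to_le_bytes().iter().chain(stripe.to_le_bytes().iter()) {
        h ^= *byte as u64;
        h = h.wrapping_mul(0x100000001b3); // FNV prime
    }
    (h % shard_count as u64) as u32
}
```

With a 32k-page stripe size, blocks 0-32767 of a relation stay on one shard, while block 32768 starts a new stripe that will usually hash to a different shard.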
|
||||
|
||||
### Data placement examples
|
||||
|
||||
For example, consider two extreme cases of postgres data layout for a large database, in a system with 8 shards
and a stripe size of 32k pages:
|
||||
|
||||
- A single large relation (e.g. 1TiB, i.e. 128M 8KiB pages): `blknum` division will break the data up into 4096
stripes, which will be scattered across the shards.
|
||||
- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
and that stripe will be placed according to the hash of key field 4 (`relnode`). The
data placement will be statistically uniform across shards.
|
||||
|
||||
Data placement will be more uneven on smaller databases:
|
||||
|
||||
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
|
||||
that both relations land on the same shard and no data lands on the other shard.
|
||||
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
|
||||
the data of the other four shards.
|
||||
|
||||
These uneven cases for small amounts of data do not matter, as long as the stripe size
|
||||
is an order of magnitude smaller than the amount of data we are comfortable holding
|
||||
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
|
||||
a tenant has some shards with 256MB size and some shards with 512MB size, even though
|
||||
the standard deviation of shard size within the tenant is very high. Our key mapping
|
||||
scheme provides a statistical guarantee that as the tenant's overall data size increases,
|
||||
uniformity of placement will improve.
|
||||
|
||||
### Important Types
|
||||
|
||||
#### `ShardIdentity`
|
||||
|
||||
Provides the information needed to know whether a particular key belongs
|
||||
to a particular shard:
|
||||
|
||||
- Layout version
|
||||
- Stripe size
|
||||
- Shard count
|
||||
- Shard index
|
||||
|
||||
This structure's size is constant. Note that if we had used a different key
mapping scheme such as consistent hashing with explicit hash ranges assigned
|
||||
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
|
||||
key mapping scheme used here enables a small fixed size ShardIdentity.
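
For illustration, a minimal sketch of such a structure (field names, types and the `is_key_local` signature are assumptions; `key_to_shard` is the sketch from the key-mapping section above):

```
/// Sketch only: constant-size description of which part of the keyspace a shard owns.
pub struct ShardIdentity {
    pub layout_version: u8,
    pub stripe_size: u32, // in pages
    pub shard_count: u8,
    pub shard_number: u8,
}

impl ShardIdentity {
    /// True if this shard owns the stripe containing the given relation block.
    pub fn is_key_local(&self, relnode: u32, blknum: u32) -> bool {
        if self.shard_count <= 1 {
            return true; // un-sharded tenant: one shard covers the whole keyspace
        }
        key_to_shard(relnode, blknum, self.stripe_size, self.shard_count as u32)
            == self.shard_number as u32
    }
}
```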
|
||||
|
||||
### Pageserver changes
|
||||
|
||||
#### Structural
|
||||
|
||||
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
|
||||
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
|
||||
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
|
||||
covers the whole keyspace.
|
||||
|
||||
When the pageserver writes layers and index_part.json to remote storage, it must
|
||||
include the shard index & count in the name, to avoid collisions (the count is
|
||||
necessary for future-proofing: the count will vary over time). These keys
|
||||
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
|
||||
exactly the same for TenantShards as it does for Tenants today: each shard will have
|
||||
its own generation number.
|
||||
|
||||
#### Storage Format: Keys
|
||||
|
||||
For tenants with >1 shard, layer files implicitly become sparse: within the key
|
||||
range described in the layer name, the layer file for a shard will only hold the
|
||||
content relevant to stripes assigned to the shard.
|
||||
|
||||
For this reason, the LayerFileName within a tenant is no longer unique: different shards
|
||||
may use the same LayerFileName to refer to different data. We may solve this simply
|
||||
by including the shard number in the keys used for layers.
|
||||
|
||||
The shard number will be included as a prefix (as part of tenant ID), like this:
|
||||
|
||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
|
||||
|
||||
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
|
||||
|
||||
Reasons for this particular format:
|
||||
|
||||
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
|
||||
we construct a layer file name), and enables efficient listing of index_parts within
|
||||
a particular shard-timeline prefix.
|
||||
- Including the shard _count_ as well as shard number means that in future when we implement
|
||||
shard splitting, it will be possible for a parent shard and one of its children to write
|
||||
the same layer file without a name collision. For example, a parent shard 0_1 might split
|
||||
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
|
||||
that is distinct from what shard 0_1 would have written at the same place.
|
||||
|
||||
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
|
||||
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
|
||||
for example a single-shard tenant's prefix will be `0001`.
|
||||
|
||||
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
|
||||
and use this as a cue to construct paths with no prefix at all.
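
A sketch of composing this prefix (the helper name is hypothetical; the lowercase hex formatting is an assumption, and the `shard_count == 0` branch follows the backward-compatibility note above):

```
/// Sketch only: build the shard-aware remote prefix for a tenant shard.
fn remote_tenant_prefix(tenant_id: &str, shard_number: u8, shard_count: u8) -> String {
    if shard_count == 0 {
        // Legacy un-sharded tenant: keep the original, prefix-free layout.
        format!("pageserver/v1/tenants/{tenant_id}")
    } else {
        // e.g. shard 0 of 1 => "<tenant_id>-0001"
        format!("pageserver/v1/tenants/{tenant_id}-{shard_number:02x}{shard_count:02x}")
    }
}
```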
|
||||
|
||||
#### Storage Format: Indices
|
||||
|
||||
In phase 1, as described in this RFC, shards only reference layers they write themselves. However,
|
||||
when we implement shard splitting in future, it will be useful to enable shards to reference layers
|
||||
written by other shards (specifically the parent shard during a split), so that shards don't
|
||||
have to exhaustively copy all data into their own shard-prefixed keys.
|
||||
|
||||
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
|
||||
tuple on each layer, such that it can construct paths for layers written by other shards. This
|
||||
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
|
||||
will be addressed in phase 2.
|
||||
|
||||
For backward compatibility, any index entry without shard information will be assumed to
belong to the legacy `ShardIdentity` (i.e. the un-sharded layout with no shard prefix).
|
||||
|
||||
#### WAL Ingest
|
||||
|
||||
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
|
||||
it down to the pages relevant to their shard:
|
||||
|
||||
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
|
||||
- For metadata describing relations etc, all shards retain these writes.
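
A sketch of that per-record decision (the function shape and the metadata flag are illustrative assumptions):

```
/// Sketch only: should this shard retain a decoded WAL write?
fn should_ingest(shard: &ShardIdentity, relnode: u32, blknum: u32, is_metadata: bool) -> bool {
    if is_metadata {
        // Relation metadata is mirrored to every shard.
        return true;
    }
    // Ordinary user data is retained only by the shard that owns its stripe.
    shard.is_key_local(relnode, blknum)
}
```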
|
||||
|
||||
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
|
||||
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
|
||||
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
|
||||
expensive: if the safekeeper can be made shard-aware then it could be taught to use
|
||||
the min() of all shards' remote_consistent_lsns to decide when to trim the WAL.
|
||||
|
||||
#### Compaction/GC
|
||||
|
||||
No changes needed.
|
||||
|
||||
The pageserver doesn't have to do anything special during compaction
|
||||
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
|
||||
This will result in sparse layer files, containing keys only in the stripes that this
|
||||
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
|
||||
the key range, these should be updated to ignore gaps that are due to sharding, to
|
||||
avoid spuriously splitting up layers into stripe-sized pieces.
|
||||
|
||||
### Compute Endpoints
|
||||
|
||||
Compute endpoints will need to:
|
||||
|
||||
- Accept a vector of connection strings as part of their configuration from the control plane
|
||||
- Route pageserver requests according to mapping the hash of key to the correct
|
||||
entry in the vector of connection strings.
|
||||
|
||||
Doing this in compute rather than routing requests via a single pageserver is
|
||||
necessary to enable sharding tenants without adding latency from extra hops.
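
A sketch of the routing decision (connection handling is elided; the signature and the reuse of the earlier `key_to_shard` sketch are assumptions, not the real compute-side code):

```
/// Sketch only: pick the pageserver connection string for a given page.
fn connstr_for_page<'a>(
    conn_strings: &'a [String], // one entry per shard, ordered by shard number
    relnode: u32,
    blknum: u32,
    stripe_size: u32,
) -> &'a str {
    let shard = key_to_shard(relnode, blknum, stripe_size, conn_strings.len() as u32);
    &conn_strings[shard as usize]
}
```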
|
||||
|
||||
### Control Plane
|
||||
|
||||
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
|
||||
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
|
||||
tenants.
|
||||
|
||||
Tenant lifecycle operations like deletion will require fanning-out to all the shards
|
||||
in the tenant. The same goes for timeline creation and deletion: a timeline should
|
||||
not be considered created until it has been created in all shards.
|
||||
|
||||
#### Selectively enabling sharding for large tenants
|
||||
|
||||
Initially, we will explicitly enable sharding for large tenants only.
|
||||
|
||||
In future, this explicit opt-in will become unnecessary when we implement automatic
|
||||
re-sharding of tenants.
|
||||
|
||||
## Future Phases
|
||||
|
||||
This section exists to indicate what will likely come next after this phase.
|
||||
|
||||
Phases 2a and 2b are amenable to execution in parallel.
|
||||
|
||||
### Phase 2a: WAL fan-out
|
||||
|
||||
**Problem**: when all shards consume the whole WAL, the network bandwidth used
|
||||
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
|
||||
of the shard count.
|
||||
|
||||
Network bandwidth is not our most pressing bottleneck, but it is likely to become
|
||||
a problem if we set a modest shard count (~8) on a significant number of tenants,
|
||||
especially as those larger tenants which we shard are also likely to have higher
|
||||
write bandwidth than average.
|
||||
|
||||
### Phase 2b: Shard Splitting
|
||||
|
||||
**Problem**: the number of shards in a tenant is defined at creation time and cannot
|
||||
be changed. This causes excessive sharding for most small tenants, and an upper
|
||||
bound on scale for very large tenants.
|
||||
|
||||
To address this, a _splitting_ feature will later be added. One shard can split its
|
||||
data into a number of children by doing a special compaction operation to generate
|
||||
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
|
||||
each child. This will then require external coordination (by the control plane) to
|
||||
safely attach these new child shards and then move them around to distribute work.
|
||||
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
|
||||
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
|
||||
the risk/complexity of implementing such a rarely-encountered scenario.
|
||||
|
||||
### Phase N (future): distributed historical reads
|
||||
|
||||
**Problem**: while sharding based on key is good for handling changes in overall
|
||||
database size, it is less suitable for spiky/unpredictable changes in the read
|
||||
workload to historical layers. Sudden increases in historical reads could result
|
||||
in sudden increases in local disk capacity required for a TenantShard.
|
||||
|
||||
Example: the extreme case of this would be to run a tenant for a year, then create branches
|
||||
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
|
||||
the on-disk capacity footprint of a TenantShard, since it would be serving reads
|
||||
from all those disparate historical layers.
|
||||
|
||||
If we can respond fast enough, then key-sharding a tenant more finely can help with
|
||||
this, but splitting may be a relatively expensive operation and the increased historical
|
||||
read load may be transient.
|
||||
|
||||
A separate mechanism for handling heavy historical reads could be something like
|
||||
a gossip mechanism for pageservers to communicate
|
||||
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
|
||||
ask another to go read the necessary layers from remote storage to serve the read. This
|
||||
requires relatively little coordination because it is read-only: any node can service any
read. All reads to a particular shard would still flow through one node, but the
disk capacity & I/O impact of servicing the read would be distributed.
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
|
||||
|
||||
When a database is growing under a write workload, writes may predominantly hit the
|
||||
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
is intensively re-writing a particular relation and that relation lived entirely in one
shard, we would not achieve our goal of distributing the write work across shards.
|
||||
|
||||
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
|
||||
|
||||
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
|
||||
database would still cause a load hotspot on the pageserver routing its read requests.
|
||||
2. The additional hop through the "proxy" pageserver would add latency and overall
|
||||
resource cost (CPU, network bandwidth).
|
||||
|
||||
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
|
||||
|
||||
In this model, there would be no explicit sharding of work, but the pageserver to which
|
||||
a tenant is attached would not hold all layers on its disk: instead, it would call out
|
||||
to peers to have them store some layers, and call out to those peers to request reads
|
||||
in those layers.
|
||||
|
||||
This mechanism will work well for distributing work in the LSN dimension, but in the key
|
||||
space dimension it has the major limitation of requiring one node to handle all
|
||||
incoming writes and compactions. Even if the write workload for a large database
fits in one pageserver, it will still be a hotspot, and such tenants may still
de facto require their own pageserver.
|
||||
docs/rfcs/032-shard-splitting.md (new file, 479 lines)
@@ -0,0 +1,479 @@
|
||||
# Shard splitting
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes a new pageserver API for splitting an existing tenant shard into
|
||||
multiple shards, and describes how to use this API to safely increase the total
|
||||
shard count of a tenant.
|
||||
|
||||
## Motivation
|
||||
|
||||
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
|
||||
tenants beyond the capacity of a single pageserver by breaking up the key space
|
||||
into stripes, and distributing these stripes across many pageservers. However,
|
||||
the shard count was defined once at tenant creation time and not varied thereafter.
|
||||
|
||||
In practice, the expected size of a database is rarely known at creation time, and
|
||||
it is inefficient to enable sharding for very small tenants: we need to be
|
||||
able to create a tenant with a small number of shards (such as 1), and later expand
|
||||
when it becomes clear that the tenant has grown in size to a point where sharding
|
||||
is beneficial.
|
||||
|
||||
### Prior art
|
||||
|
||||
Many distributed systems have the problem of choosing how many shards to create for
|
||||
tenants that do not specify an expected size up-front. There are a couple of general
|
||||
approaches:
|
||||
|
||||
- Write to a key space in order, and start a new shard when the highest key advances
|
||||
past some point. This doesn't work well for Neon, because we write to our key space
|
||||
in many different contiguous ranges (per relation), rather than in one contiguous
|
||||
range. To adapt to this kind of model, we would need a sharding scheme where each
|
||||
relation had its own range of shards, which would be inefficient for the common
|
||||
case of databases with many small relations.
|
||||
- Monitor the system, and automatically re-shard at some size threshold. For
|
||||
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
|
||||
component monitors the size of each RADOS Pool, and adjusts the number of Placement
|
||||
Groups (Ceph's shard equivalent).
|
||||
|
||||
## Requirements
|
||||
|
||||
- A configurable capacity limit per-shard is enforced.
|
||||
- Changes in shard count do not interrupt service beyond requiring postgres
|
||||
to reconnect (i.e. milliseconds).
|
||||
- A human being does not have to choose the shard count.
|
||||
|
||||
## Non Goals
|
||||
|
||||
- Shard splitting is always a tenant-global operation: we will not enable splitting
|
||||
one shard while leaving others intact.
|
||||
- The inverse operation (shard merging) is not described in this RFC. This is a lower
|
||||
priority than splitting, because databases grow more often than they shrink, and
|
||||
a database with many shards will still work properly if the stored data shrinks, just
|
||||
with slightly more overhead (e.g. redundant WAL replication)
|
||||
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
|
||||
a tenant based on load will make sense for some medium-capacity, high-load workloads,
|
||||
but is more complex to reason about and likely is not desirable until we have
|
||||
shard merging to reduce the shard count again if the database becomes less busy.
|
||||
|
||||
## Impacted Components
|
||||
|
||||
pageserver, storage controller
|
||||
|
||||
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
|
||||
|
||||
## Terminology
|
||||
|
||||
**Parent** shards are the shards that exist before a split. **Child** shards are
|
||||
the new shards created during a split.
|
||||
|
||||
**Shard** is synonymous with _tenant shard_.
|
||||
|
||||
**Shard Index** is the 2-tuple of shard number and shard count, written in
|
||||
paths as `{:02x}{:02x}`, e.g. `0001`.
|
||||
|
||||
## Background
|
||||
|
||||
In the implementation section, a couple of existing aspects of sharding are important
|
||||
to remember:
|
||||
|
||||
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
|
||||
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
|
||||
storage paths, and remote index metadata.
|
||||
- Remote layer file paths contain the shard index of the shard that created them, and
|
||||
remote indices contain the same index to enable building the layer file path. A shard's
|
||||
index may reference layers that were created by another shard.
|
||||
- Local tenant shard directories include the shard index. All layers downloaded by
|
||||
a tenant shard are stored in this shard-prefixed path, even if those layers were
|
||||
initially created by another shard: tenant shards do not read and write one another's
|
||||
paths.
|
||||
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
|
||||
This is for historical reasons and will be cleaned up in future, but the existing
|
||||
name is used here to help comprehension when reading code.
|
||||
|
||||
## Implementation
|
||||
|
||||
Note: this section focuses on the correctness of the core split process. This will
|
||||
be fairly inefficient in a naive implementation, and several important optimizations
|
||||
are described in a later section.
|
||||
|
||||
There are broadly two parts to the implementation:
|
||||
|
||||
1. The pageserver split API, which splits one shard on one pageserver
|
||||
2. The overall tenant split process, which is coordinated by the storage controller
and calls into the pageserver split API as needed.
|
||||
|
||||
### Pageserver Split API
|
||||
|
||||
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
|
||||
that takes the new total shard count in the body.
|
||||
|
||||
The pageserver split API operates on one tenant shard, on one pageserver. External
|
||||
coordination is required to use it safely; this is described in the
'Split procedure' section below.
|
||||
|
||||
#### Preparation
|
||||
|
||||
First identify the shard indices for the new child shards. These are deterministic,
|
||||
calculated from the parent shard's index, and the number of children being created (this
|
||||
is an input to the API, and validated to be a power of two). In a trivial example, splitting
|
||||
0001 in two always results in 0002 and 0102.
|
||||
|
||||
Child shard indices are chosen such that the children's parts of the keyspace will
be subsets of the parent's parts of the keyspace.
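
A sketch of that calculation (the function name and validation are assumptions; the convention matches the `0001 -> 0002, 0102` example above):

```
/// Sketch only: derive child shard indices (number, count) from a parent shard.
fn child_shard_indices(parent_number: u8, parent_count: u8, new_count: u8) -> Vec<(u8, u8)> {
    assert!(parent_count >= 1 && new_count % parent_count == 0);
    let children_per_parent = new_count / parent_count;
    assert!(children_per_parent.is_power_of_two());
    // Children own subsets of the parent's stripes: shard numbers congruent to the
    // parent's number modulo the old shard count.
    (0..children_per_parent)
        .map(|i| (parent_number + i * parent_count, new_count))
        .collect()
}
```

Under this convention, splitting shard `0102` again to a total count of 4 would yield `0104` and `0304`.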
|
||||
|
||||
#### Step 1: write new remote indices
|
||||
|
||||
In remote storage, splitting is very simple: we may just write new index_part.json
|
||||
objects for each child shard, containing exactly the same layers as the parent shard.
|
||||
|
||||
The children will have more data than they need, but this avoids any exhaustive
|
||||
re-writing or copying of layer files.
|
||||
|
||||
The index key path includes a generation number: the parent shard's current
|
||||
attached generation number will also be used for the child shards' indices. This
|
||||
makes the operation safely retryable: if everything crashes and restarts, we may
|
||||
call the split API again on the parent shard, and the result will be some new remote
|
||||
indices for the child shards, under a higher generation number.
|
||||
|
||||
#### Step 2: start new `Tenant` objects
|
||||
|
||||
A new `Tenant` object may be instantiated for each child shard, while the parent
|
||||
shard still exists. When calling the tenant_spawn function for this object,
|
||||
the remote index from step 1 will be read, and the child shard will start
|
||||
to ingest WAL to catch up from whatever was in the remote storage at step 1.
|
||||
|
||||
We now wait for child shards' WAL ingestion to catch up with the parent shard,
|
||||
so that we can safely tear down the parent shard without risking an availability
|
||||
gap to clients reading recent LSNs.
|
||||
|
||||
#### Step 3: tear down parent `Tenant` object
|
||||
|
||||
Once child shards are running and have caught up with WAL ingest, we no longer
|
||||
need the parent shard. Note that clients may still be using it -- when we
|
||||
shut it down, any page_service handlers will also shut down, causing clients
|
||||
to disconnect. When the client reconnects, it will re-lookup the tenant,
|
||||
and hit the child shard instead of the parent (shard lookup from page_service
|
||||
should bias toward higher ShardCount shards).
|
||||
|
||||
Note that at this stage the page service client has not yet been notified of
|
||||
any split. In the trivial single split example:
|
||||
|
||||
- Shard 0001 is gone: Tenant object torn down
|
||||
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
|
||||
- Clients will continue to connect to that server thinking that shard 0001 is there,
|
||||
and all requests will work, because any key that was in shard 0001 is definitely
|
||||
available in either shard 0002 or shard 0102.
|
||||
- Eventually, the storage controller (not the pageserver) will decide to migrate
|
||||
some child shards away: at that point it will do a live migration, ensuring
|
||||
that the client has an updated configuration before it detaches anything
|
||||
from the original server.
|
||||
|
||||
#### Complete
|
||||
|
||||
When we send a 200 response to the split request, we are promising the caller:
|
||||
|
||||
- That the child shards are persistent in remote storage
|
||||
- That the parent shard has been shut down
|
||||
|
||||
This enables the caller to proceed with the overall shard split operation, which
|
||||
may involve other shards on other pageservers.
|
||||
|
||||
### Storage Controller Split procedure
|
||||
|
||||
Splitting a tenant requires calling the pageserver split API, and tracking
|
||||
enough state to ensure recovery + completion in the event of any component (pageserver
|
||||
or storage controller) crashing (or request timing out) during the split.
|
||||
|
||||
1. Call the split API on all existing shards. Ensure that the resulting
|
||||
child shards are pinned to their pageservers until _all_ the split calls are done.
|
||||
This pinning may be implemented as a "split bit" on the tenant shards, that
|
||||
blocks any migrations, and also acts as a sign that if we restart, we must go
|
||||
through some recovery steps to resume the split.
|
||||
2. Once all the split calls are done, we may unpin the child shards (clear
|
||||
the split bit). The split is now complete: subsequent steps are just migrations,
|
||||
not strictly part of the split.
|
||||
3. Try to schedule new pageserver locations for the child shards, using
|
||||
a soft anti-affinity constraint to place shards from the same tenant onto different
|
||||
pageservers.
|
||||
|
||||
Updating computes about the new shard count is not necessary until we migrate
|
||||
any of the child shards away from the parent's location.
|
||||
|
||||
### Recovering from failures
|
||||
|
||||
#### Rolling back an incomplete split
|
||||
|
||||
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
|
||||
and detaching child shards. This will lose any WAL ingested into the children after the parents
|
||||
were detached earlier, but the parents will catch up.
|
||||
|
||||
No special pageserver API is needed for this. From the storage controller's point of view, the
|
||||
procedure is:
|
||||
|
||||
1. For all parent shards in the tenant, ensure they are attached
|
||||
2. For all child shards, ensure they are not attached
|
||||
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
|
||||
|
||||
Any remote storage content for child shards is left behind. This is similar to other cases where
|
||||
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
|
||||
index that references it). Future online scrub/cleanup functionality can remove these objects, or
|
||||
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
|
||||
which would include any child shards that were rolled back.
|
||||
|
||||
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
|
||||
this, we will **block timeline creation during splitting**, so that we can safely roll back until
|
||||
the split is complete, without risking losing timelines.
|
||||
|
||||
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
|
||||
reason, and will not be accessible via an API:
|
||||
|
||||
- A pageserver fails to complete its split API request after too many retries
|
||||
- A pageserver returns a fatal unexpected error such as 400 or 500
|
||||
- The storage controller database returns a non-retryable error
|
||||
- Some internal invariant is violated in the storage controller split code
|
||||
|
||||
#### Rolling back a complete split
|
||||
|
||||
A complete shard split may be rolled back similarly to an incomplete split, with the following
|
||||
modifications:
|
||||
|
||||
- The parent shards will no longer exist in the storage controller database, so these must
|
||||
be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
|
||||
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
|
||||
shards in the storage controller database.
|
||||
- Any timelines that were created after the split completed will disappear when rolling back
|
||||
to the tenant shards. For this reason, rolling back after a complete split should only
|
||||
be done due to serious issues where loss of recently created timelines is acceptable, or
|
||||
in cases where we have confirmed that no timelines were created in the intervening period.
|
||||
- Parent shards' layers must not have been deleted: this property will come "for free" when
|
||||
we first roll out sharding, by simply not implementing deletion of parent layers after
|
||||
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
|
||||
Optimizations section), it should apply a TTL to layers such that we have a
|
||||
defined walltime window in which rollback will be possible.
|
||||
|
||||
The storage controller will expose an API for rolling back a complete split, for use
|
||||
in the field if we encounter some critical bug with a post-split tenant.
|
||||
|
||||
#### Retrying API calls during Pageserver Restart
|
||||
|
||||
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
|
||||
child shards from an ongoing split. This does not intrinsically break anything, and the
|
||||
pageserver may include all these shards in its `/re-attach` request to the storage controller.
|
||||
|
||||
In order to support such restarts, it is important that the storage controller stores
|
||||
persistent records of each child shard before it calls into a pageserver, as these child shards
|
||||
may require generation increments via a `/re-attach` request.
|
||||
|
||||
The pageserver restart will also result in a failed API call from the storage controller's point
|
||||
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
|
||||
complete, and all shards must remain pinned to their current pageserver locations until the
|
||||
split is done.
|
||||
|
||||
The pageserver API calls during splitting will retry on transient errors, so that
|
||||
short availability gaps do not result in a failure of the overall operation. The
|
||||
split in progress will be automatically rolled back if the threshold for API
|
||||
retries is reached (e.g. if a pageserver stays offline for longer than a typical
|
||||
restart).
|
||||
|
||||
#### Rollback on Storage Controller Restart
|
||||
|
||||
On startup, the storage controller will inspect the split bit for tenant shards that
|
||||
it loads from the database. If any splits are in progress:
|
||||
|
||||
- Database content will be reverted to the parent shards
|
||||
- Child shards will be dropped from memory
|
||||
- The parent and child shards will be included in the general startup reconciliation that
|
||||
the storage controller does: any child shards will be detached from pageservers because
|
||||
they don't exist in the storage controller's expected set of shards, and parent shards
|
||||
will be attached if they aren't already.
|
||||
|
||||
#### Storage controller API request failures/retries
|
||||
|
||||
The split request handler will implement idempotency: if the [`Tenant`] requested to split
|
||||
doesn't exist, we will check for the would-be child shards, and if they already exist,
|
||||
we consider the request complete.
|
||||
|
||||
If a request is retried while the original request is still underway, then the split
|
||||
request handler will notice an InProgress marker in TenantManager, and return 503
|
||||
to encourage the client to backoff/retry. This is the same as the general pageserver
|
||||
API handling for calls that try to act on an InProgress shard.
|
||||
|
||||
#### Compute start/restart during a split
|
||||
|
||||
If a compute starts up during split, it will be configured with the old sharding
|
||||
configuration. This will work for reads irrespective of the progress of the split
|
||||
as long as no child shards have been migrated away from their original location, and
|
||||
this is guaranteed in the split procedure (see earlier section).
|
||||
|
||||
#### Pageserver fails permanently during a split
|
||||
|
||||
If a pageserver permanently fails (i.e. the storage controller availability state for it
|
||||
goes to Offline) while a split is in progress, the splitting operation will roll back, and
|
||||
during the roll back it will skip any API calls to the offline pageserver. If the offline
|
||||
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
|
||||
|
||||
### Handling secondary locations
|
||||
|
||||
For correctness, it is not necessary to split secondary locations. We can simply detach
|
||||
the secondary locations for parent shards, and then attach new secondary locations
|
||||
for child shards.
|
||||
|
||||
Clearly this is not optimal, as it will result in re-downloads of layer files that
|
||||
were already present on disk. See "Splitting secondary locations" below.
|
||||
|
||||
### Conditions to trigger a split
|
||||
|
||||
The pageserver will expose a new API for reporting on shards that are candidates
|
||||
for split: this will return a top-N report of the largest tenant shards by
|
||||
physical size (remote size). This should exclude any tenants that are already
|
||||
at the maximum configured shard count.
|
||||
|
||||
The API would look something like:
|
||||
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
|
||||
|
||||
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
|
||||
|
||||
A split operation will be started when the tenant exceeds some threshold. This threshold
|
||||
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
|
||||
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
|
||||
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
|
||||
tenant size distribution may be useful here: if we can make a statement like "usually, if
|
||||
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
|
||||
make our policy to split a tenant at 20GiB.
|
||||
|
||||
The finest split we can do is by factors of two, but we can do higher-cardinality splits
|
||||
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
|
||||
as it grows. An example of a very simple heuristic for early deployment of the splitting
|
||||
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
|
||||
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
|
||||
split a tenant, it will not need re-splitting soon after.
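
A sketch of that simple early heuristic (the thresholds are the illustrative values from the paragraph above, not decided policy):

```
/// Sketch only: decide a new shard count for a tenant based on physical size.
fn desired_shard_count(current_count: u8, physical_size_bytes: u64) -> u8 {
    const SPLIT_THRESHOLD_BYTES: u64 = 64 * 1024 * 1024 * 1024; // 64 GiB
    if current_count == 1 && physical_size_bytes > SPLIT_THRESHOLD_BYTES {
        8 // one-shot split into 8 shards
    } else {
        current_count // leave already-split tenants alone for now
    }
}
```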
|
||||
|
||||
## Optimizations
|
||||
|
||||
### Flush parent shard to remote storage during split
|
||||
|
||||
Any data that is in WAL but not remote storage at time of split will need
|
||||
to be replayed by child shards when they start for the first time. To minimize
|
||||
this work, we may flush the parent shard to remote storage before writing the
|
||||
remote indices for child shards.
|
||||
|
||||
It is important that this flush is subject to some time bounds: we may be splitting
|
||||
in response to a surge of write ingest, so it may be time-critical to split. A
|
||||
few seconds to flush latest data should be sufficient to optimize common cases without
|
||||
running the risk of holding up a split for a harmful length of time when a parent
|
||||
shard is being written heavily. If the flush doesn't complete in time, we may proceed
|
||||
to shut down the parent shard and carry on with the split.
|
||||
|
||||
### Hard linking parent layers into child shard directories
|
||||
|
||||
Before we start the Tenant objects for child shards, we may pre-populate their
|
||||
local storage directories with hard links to the layer files already present
|
||||
in the parent shard's local directory. When the child shard starts and downloads
|
||||
its remote index, it will find all those layer files already present on local disk.
|
||||
|
||||
This avoids wasting download capacity and makes splitting faster, but more importantly
|
||||
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
|
||||
|
||||
This mechanism will work well in typical flows where shards are migrated away
|
||||
promptly after a split, but for the general case including what happens when
|
||||
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
|
||||
section below.
|
||||
|
||||
### Filtering during compaction
|
||||
|
||||
Compaction, especially image layer generation, should skip any keys that are
|
||||
present in a shard's layer files, but do not match the shard's ShardIdentity's
|
||||
is_key_local() check. This avoids carrying around data for longer than necessary
|
||||
in post-split compactions.
|
||||
|
||||
This was already implemented in https://github.com/neondatabase/neon/pull/6246
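
A minimal sketch of the filtering idea (this is not the code from that PR; the types and helper are illustrative):

```
/// Sketch only: drop keys owned by sibling shards while rewriting an image layer.
fn filter_for_shard(
    shard: &ShardIdentity,
    pages: Vec<((u32, u32), Vec<u8>)>, // ((relnode, blknum), page image)
) -> Vec<((u32, u32), Vec<u8>)> {
    pages
        .into_iter()
        .filter(|((relnode, blknum), _)| shard.is_key_local(*relnode, *blknum))
        .collect()
}
```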
|
||||
|
||||
### Proactive compaction
|
||||
|
||||
In remote storage, there is little reason to rewrite any data on a shard split:
|
||||
all the children can reference parent layers via the very cheap write of the child
|
||||
index_part.json.
|
||||
|
||||
In local storage, things are more nuanced. During the initial split there is no
|
||||
capacity cost to duplicating parent layers, if we implement the hard linking
|
||||
optimization described above. However, as soon as any layers are evicted from
|
||||
local disk and re-downloaded, the downloaded layers will no longer be hard links:
|
||||
they'll have real capacity footprint. That isn't a problem if we migrate child shards
|
||||
away from the parent node swiftly, but it risks a significant over-use of local disk
|
||||
space if we do not.
|
||||
|
||||
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
|
||||
the shards elsewhere, then churned all the layers in all the shards via eviction,
|
||||
then we would blow up the storage capacity used on the node by 8x. If we're splitting
|
||||
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
|
||||
|
||||
To avoid this scenario, we could implement a special compaction mode where we just
|
||||
read historic layers, drop unwanted keys, and write back the layer file. This
|
||||
is pretty expensive, but useful if we have split a large shard and are not going to
|
||||
migrate the child shards away.
|
||||
|
||||
The heuristic conditions for triggering such a compaction are:
|
||||
|
||||
- A) eviction plus time: if a child shard
|
||||
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
|
||||
- B) resident size plus time: we may inspect the resident layers and calculate how
|
||||
many of them include the overhead of storing pre-split keys. If, after some time
threshold (different to the one in case A), we still have such layers occupying
local disk space, then we should proactively compact them.
|
||||
|
||||
### Cleaning up parent-shard layers
|
||||
|
||||
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
|
||||
They would be cleaned up in the event of the tenant's deletion.
|
||||
|
||||
As an optimization to avoid leaking remote storage capacity (which costs money), we may
|
||||
lazily clean up parent shard layers once no child shards reference them.
|
||||
|
||||
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
|
||||
|
||||
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
|
||||
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
|
||||
may drop out now.
|
||||
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
|
||||
- for all ancestral shards, list objects in the prefix and delete any layer which was not
|
||||
referenced by a current shard.
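
A sketch of the selection logic above (all types and names here are illustrative, purely to make the steps concrete):

```
use std::collections::HashSet;

/// Sketch only: one shard prefix found under the tenant, with its listed layer paths.
struct ShardListing {
    shard_count: u8,
    layer_paths: Vec<String>,
}

/// Decide which ancestral-shard layers can be deleted.
fn deletable_ancestral_layers(
    shards: &[ShardListing],
    referenced_by_current: &HashSet<String>, // ancestral layers referenced by current indices
) -> Vec<String> {
    // "Current" shards are those with the highest shard_count; everything else is ancestral.
    let max_count = shards.iter().map(|s| s.shard_count).max().unwrap_or(0);
    shards
        .iter()
        .filter(|s| s.shard_count < max_count)
        .flat_map(|s| s.layer_paths.iter())
        .filter(|path| !referenced_by_current.contains(*path))
        .cloned()
        .collect()
}
```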
|
||||
|
||||
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
|
||||
|
||||
The cleanup may be done by the scrubber (external process), or we may choose to have
|
||||
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
|
||||
reading the other shards' indices at runtime, and we do not require visibility of the
|
||||
latest index writes.
|
||||
|
||||
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
|
||||
that we retain the option to roll back a split in case of bugs.
|
||||
|
||||
### Splitting secondary locations
|
||||
|
||||
We may implement a pageserver API similar to the main splitting API, which does a simpler
|
||||
operation for secondary locations: it would not write anything to S3, instead it would simply
|
||||
create the child shard directory on local disk, hard link in directories from the parent,
|
||||
and set up the in memory (TenantSlot) state for the children.
|
||||
|
||||
Similar to attached locations, a subset of secondary locations will probably need re-locating
|
||||
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
|
||||
where they may use excessive space for the tenant.
|
||||
|
||||
## FAQ/Alternatives
|
||||
|
||||
### What should the thresholds be set to?
|
||||
|
||||
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
|
||||
|
||||
Max shard count:
|
||||
|
||||
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
|
||||
the un-filtered WAL is sent to all shards. To avoid this growing out of control,
|
||||
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
|
||||
on the safekeeper.
|
||||
- There is also little benefit to increasing the shard count beyond the number
|
||||
of pageservers in a region.
|
||||
|
||||
### Is it worth just rewriting all the data during a split to simplify reasoning about space?
|
||||
@@ -35,7 +35,7 @@ pub struct NodeRegisterRequest {
|
||||
pub struct NodeConfigureRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
pub availability: Option<NodeAvailability>,
|
||||
pub availability: Option<NodeAvailabilityWrapper>,
|
||||
pub scheduling: Option<NodeSchedulingPolicy>,
|
||||
}
|
||||
|
||||
@@ -66,22 +66,76 @@ pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
|
||||
/// Utilisation score indicating how good a candidate a pageserver
|
||||
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
|
||||
/// Lower values are better.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub struct UtilizationScore(pub u64);
|
||||
|
||||
impl UtilizationScore {
|
||||
pub fn worst() -> Self {
|
||||
UtilizationScore(u64::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
// Normal, happy state
|
||||
Active,
|
||||
Active(UtilizationScore),
|
||||
// Offline: Tenants shouldn't try to attach here, but they may assume that their
|
||||
// secondary locations on this node still exist. Newly added nodes are in this
|
||||
// state until we successfully contact them.
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl PartialEq for NodeAvailability {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
use NodeAvailability::*;
|
||||
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for NodeAvailability {}
|
||||
|
||||
// This wrapper provides serde functionality and it should only be used to
|
||||
// communicate with external callers which don't know or care about the
|
||||
// utilisation score of the pageserver it is targeting.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum NodeAvailabilityWrapper {
|
||||
Active,
|
||||
Offline,
|
||||
}
|
||||
|
||||
impl From<NodeAvailabilityWrapper> for NodeAvailability {
|
||||
fn from(val: NodeAvailabilityWrapper) -> Self {
|
||||
match val {
|
||||
// Assume the worst utilisation score to begin with. It will later be updated by
|
||||
// the heartbeats.
|
||||
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
|
||||
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
fn from(val: NodeAvailability) -> Self {
|
||||
match val {
|
||||
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
|
||||
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for NodeAvailability {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"active" => Ok(Self::Active),
|
||||
// This is used when parsing node configuration requests from neon-local.
|
||||
// Assume the worst possible utilisation score
|
||||
// and let it get updated via the heartbeats.
|
||||
"active" => Ok(Self::Active(UtilizationScore::worst())),
|
||||
"offline" => Ok(Self::Offline),
|
||||
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ pub mod utilization;
|
||||
pub use utilization::PageserverUtilization;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU64, NonZeroUsize},
|
||||
@@ -426,7 +427,7 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
pub tenant_id: TenantShardId,
|
||||
pub tenant_id: Option<TenantShardId>,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
@@ -577,7 +578,7 @@ pub struct TimelineInfo {
|
||||
pub walreceiver_status: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerMapInfo {
|
||||
pub in_memory_layers: Vec<InMemoryLayerInfo>,
|
||||
pub historic_layers: Vec<HistoricLayerInfo>,
|
||||
@@ -595,7 +596,7 @@ pub enum LayerAccessKind {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStatFullDetails {
|
||||
pub when_millis_since_epoch: u64,
|
||||
pub task_kind: &'static str,
|
||||
pub task_kind: Cow<'static, str>,
|
||||
pub access_kind: LayerAccessKind,
|
||||
}
|
||||
|
||||
@@ -654,23 +655,23 @@ impl LayerResidenceEvent {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerAccessStats {
|
||||
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
|
||||
pub task_kind_access_flag: Vec<&'static str>,
|
||||
pub task_kind_access_flag: Vec<Cow<'static, str>>,
|
||||
pub first: Option<LayerAccessStatFullDetails>,
|
||||
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
|
||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum InMemoryLayerInfo {
|
||||
Open { lsn_start: Lsn },
|
||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum HistoricLayerInfo {
|
||||
Delta {
|
||||
@@ -692,6 +693,32 @@ pub enum HistoricLayerInfo {
|
||||
},
|
||||
}
|
||||
|
||||
impl HistoricLayerInfo {
|
||||
pub fn layer_file_name(&self) -> &str {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta {
|
||||
layer_file_name, ..
|
||||
} => layer_file_name,
|
||||
HistoricLayerInfo::Image {
|
||||
layer_file_name, ..
|
||||
} => layer_file_name,
|
||||
}
|
||||
}
|
||||
pub fn is_remote(&self) -> bool {
|
||||
match self {
|
||||
HistoricLayerInfo::Delta { remote, .. } => *remote,
|
||||
HistoricLayerInfo::Image { remote, .. } => *remote,
|
||||
}
|
||||
}
|
||||
pub fn set_remote(&mut self, value: bool) {
|
||||
let field = match self {
|
||||
HistoricLayerInfo::Delta { remote, .. } => remote,
|
||||
HistoricLayerInfo::Image { remote, .. } => remote,
|
||||
};
|
||||
*field = value;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
@@ -724,6 +751,52 @@ pub struct WalRedoManagerStatus {
|
||||
pub pid: Option<u32>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
|
||||
/// what's happening.
|
||||
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct SecondaryProgress {
|
||||
/// The remote storage LastModified time of the heatmap object we last downloaded.
|
||||
#[serde(
|
||||
serialize_with = "opt_ser_rfc3339_millis",
|
||||
deserialize_with = "opt_deser_rfc3339_millis"
|
||||
)]
|
||||
pub heatmap_mtime: Option<SystemTime>,
|
||||
|
||||
/// The number of layers currently on-disk
|
||||
pub layers_downloaded: usize,
|
||||
/// The number of layers in the most recently seen heatmap
|
||||
pub layers_total: usize,
|
||||
|
||||
/// The number of layer bytes currently on-disk
|
||||
pub bytes_downloaded: u64,
|
||||
/// The number of layer bytes in the most recently seen heatmap
|
||||
pub bytes_total: u64,
|
||||
}
|
||||
|
||||
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
|
||||
ts: &Option<SystemTime>,
|
||||
serializer: S,
|
||||
) -> Result<S::Ok, S::Error> {
|
||||
match ts {
|
||||
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
|
||||
None => serializer.serialize_none(),
|
||||
}
|
||||
}
|
||||
|
||||
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
match s {
|
||||
None => Ok(None),
|
||||
Some(s) => humantime::parse_rfc3339(&s)
|
||||
.map_err(serde::de::Error::custom)
|
||||
.map(Some),
|
||||
}
|
||||
}
|
||||
|
||||
pub mod virtual_file {
|
||||
#[derive(
|
||||
Copy,
|
||||
|
||||
@@ -7,7 +7,7 @@ use std::time::SystemTime;
|
||||
///
|
||||
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
|
||||
/// not handle full u64 values properly.
|
||||
#[derive(serde::Serialize, Debug)]
|
||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
|
||||
pub struct PageserverUtilization {
|
||||
/// Used disk space
|
||||
#[serde(serialize_with = "ser_saturating_u63")]
|
||||
@@ -21,7 +21,10 @@ pub struct PageserverUtilization {
|
||||
/// When was this snapshot captured, pageserver local time.
|
||||
///
|
||||
/// Use millis to give confidence that the value is regenerated often enough.
|
||||
#[serde(serialize_with = "ser_rfc3339_millis")]
|
||||
#[serde(
|
||||
serialize_with = "ser_rfc3339_millis",
|
||||
deserialize_with = "deser_rfc3339_millis"
|
||||
)]
|
||||
pub captured_at: SystemTime,
|
||||
}
|
||||
|
||||
@@ -32,6 +35,14 @@ fn ser_rfc3339_millis<S: serde::Serializer>(
|
||||
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
|
||||
}
|
||||
|
||||
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
|
||||
where
|
||||
D: serde::de::Deserializer<'de>,
|
||||
{
|
||||
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
|
||||
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
|
||||
}
|
||||
|
||||
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
|
||||
///
|
||||
/// Instead of newtype, use this because a newtype would get require handling deserializing values
|
||||
|
||||
@@ -18,6 +18,7 @@ camino.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
futures.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
|
||||
@@ -157,9 +157,8 @@ impl AzureBlobStorage {
|
||||
let mut bufs = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part?;
|
||||
let etag_str: &str = part.blob.properties.etag.as_ref();
|
||||
if etag.is_none() {
|
||||
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
|
||||
etag = Some(part.blob.properties.etag);
|
||||
}
|
||||
if last_modified.is_none() {
|
||||
last_modified = Some(part.blob.properties.last_modified.into());
|
||||
@@ -174,6 +173,16 @@ impl AzureBlobStorage {
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
bufs.push(data);
|
||||
}
|
||||
|
||||
if bufs.is_empty() {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Azure GET response contained no buffers"
|
||||
)));
|
||||
}
|
||||
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
|
||||
let etag = etag.unwrap();
|
||||
let last_modified = last_modified.unwrap();
|
||||
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
|
||||
etag,
|
||||
|
||||
@@ -42,6 +42,9 @@ pub use self::{
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
|
||||
pub use azure_core::Etag;
|
||||
|
||||
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
|
||||
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
@@ -291,9 +294,9 @@ pub type DownloadStream =
|
||||
pub struct Download {
|
||||
pub download_stream: DownloadStream,
|
||||
/// The last time the file was modified (`last-modified` HTTP header)
|
||||
pub last_modified: Option<SystemTime>,
|
||||
pub last_modified: SystemTime,
|
||||
/// A way to identify this specific version of the resource (`etag` HTTP header)
|
||||
pub etag: Option<String>,
|
||||
pub etag: Etag,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::{
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime},
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
@@ -30,6 +30,7 @@ use crate::{
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
use crate::Etag;
|
||||
|
||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
@@ -406,35 +407,37 @@ impl RemoteStorage for LocalFs {
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
let file_metadata = file_metadata(&target_path).await?;
|
||||
|
||||
let source = ReaderStream::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let etag = mock_etag(&file_metadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: file_metadata
|
||||
.modified()
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||
etag,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -452,50 +455,51 @@ impl RemoteStorage for LocalFs {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||
}
|
||||
}
|
||||
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let len = source
|
||||
.metadata()
|
||||
.await
|
||||
.context("query file length")
|
||||
.map_err(DownloadError::Other)?
|
||||
.len();
|
||||
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: None,
|
||||
etag: None,
|
||||
download_stream: Box::pin(source),
|
||||
let file_metadata = file_metadata(&target_path).await?;
|
||||
let mut source = tokio::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let len = source
|
||||
.metadata()
|
||||
.await
|
||||
.context("query file length")
|
||||
.map_err(DownloadError::Other)?
|
||||
.len();
|
||||
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||
let source = ReaderStream::new(source);
|
||||
|
||||
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
|
||||
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
|
||||
|
||||
let etag = mock_etag(&file_metadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
last_modified: file_metadata
|
||||
.modified()
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
|
||||
etag,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
@@ -610,13 +614,22 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
if file_path.exists() {
|
||||
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
|
||||
tokio::fs::metadata(&file_path).await.map_err(|e| {
|
||||
if e.kind() == ErrorKind::NotFound {
|
||||
DownloadError::NotFound
|
||||
} else {
|
||||
DownloadError::BadInput(e.into())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Use mtime as stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
|
||||
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
|
||||
// quickly, with less overhead than using a mock S3 server.
|
||||
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
|
||||
let mtime = meta.modified().expect("Filesystem mtime missing");
|
||||
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
|
||||
};
|
||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||
|
||||
use aws_smithy_types::byte_stream::ByteStream;
|
||||
use aws_smithy_types::{body::SdkBody, DateTime};
|
||||
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
|
||||
use bytes::Bytes;
|
||||
use futures::stream::Stream;
|
||||
use hyper::Body;
|
||||
@@ -287,8 +287,17 @@ impl S3Bucket {
|
||||
let remaining = self.timeout.saturating_sub(started_at.elapsed());
|
||||
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
let etag = object_output.e_tag;
|
||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||
let etag = object_output
|
||||
.e_tag
|
||||
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
|
||||
.into();
|
||||
let last_modified = object_output
|
||||
.last_modified
|
||||
.ok_or(DownloadError::Other(anyhow::anyhow!(
|
||||
"Missing LastModified header"
|
||||
)))?
|
||||
.try_into()
|
||||
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
|
||||
|
||||
let body = object_output.body;
|
||||
let body = ByteStreamAsStream::from(body);
|
||||
|
||||
@@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
// A little check to ensure that our clock is not too far off from the S3 clock
|
||||
{
|
||||
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
|
||||
let last_modified = dl.last_modified.unwrap();
|
||||
let last_modified = dl.last_modified;
|
||||
let half_wt = WAIT_TIME.mul_f32(0.5);
|
||||
let t0_hwt = t0 + half_wt;
|
||||
let t1_hwt = t1 - half_wt;
|
||||
|
||||
@@ -47,9 +47,10 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct SerdeRepr<T> {
|
||||
buffer: Vec<T>,
|
||||
buffer_size: usize,
|
||||
drop_count: u64,
|
||||
}
|
||||
|
||||
@@ -61,6 +62,7 @@ where
|
||||
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
|
||||
SerdeRepr {
|
||||
buffer: buffer.iter().cloned().collect(),
|
||||
buffer_size: L,
|
||||
drop_count: *drop_count,
|
||||
}
|
||||
}
|
||||
@@ -78,19 +80,52 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
|
||||
where
|
||||
T: Clone + serde::Deserialize<'de>,
|
||||
{
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
let SerdeRepr {
|
||||
buffer: des_buffer,
|
||||
drop_count,
|
||||
buffer_size,
|
||||
} = SerdeRepr::<T>::deserialize(deserializer)?;
|
||||
if buffer_size != L {
|
||||
use serde::de::Error;
|
||||
return Err(D::Error::custom(format!(
|
||||
"invalid buffer_size, expecting {L} got {buffer_size}"
|
||||
)));
|
||||
}
|
||||
let mut buffer = HistoryBuffer::new();
|
||||
buffer.extend(des_buffer);
|
||||
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::HistoryBufferWithDropCounter;
|
||||
|
||||
#[test]
|
||||
fn test_basics() {
|
||||
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
|
||||
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
|
||||
b.write(1);
|
||||
b.write(2);
|
||||
b.write(3);
|
||||
assert!(b.iter().any(|e| *e == 2));
|
||||
assert!(b.iter().any(|e| *e == 3));
|
||||
assert!(!b.iter().any(|e| *e == 1));
|
||||
|
||||
// round-trip serde
|
||||
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
|
||||
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
|
||||
assert_eq!(
|
||||
round_tripped.iter().cloned().collect::<Vec<_>>(),
|
||||
b.iter().cloned().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -110,6 +110,49 @@ impl<T> OnceCell<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or returns an unique initialization
|
||||
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
|
||||
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
|
||||
// It looks like OnceCell::get_or_init could be implemented using this method instead of
|
||||
// duplication. However, that makes the future be !Send due to possibly holding on to the
|
||||
// MutexGuard over an await point.
|
||||
loop {
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
{
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire().await
|
||||
};
|
||||
|
||||
let Ok(permit) = permit else {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
||||
// there was a take_and_deinit in between
|
||||
continue;
|
||||
}
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
};
|
||||
|
||||
permit.forget();
|
||||
}
|
||||
|
||||
let permit = InitPermit(sem);
|
||||
return Err(permit);
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
@@ -481,4 +524,39 @@ mod tests {
|
||||
|
||||
assert_eq!("t1", *cell.get().unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn detached_init_smoke() {
|
||||
let target = OnceCell::default();
|
||||
|
||||
let Err(permit) = target.get_or_init_detached().await else {
|
||||
unreachable!("it is not initialized")
|
||||
};
|
||||
|
||||
tokio::time::timeout(
|
||||
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
|
||||
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
|
||||
)
|
||||
.await
|
||||
.expect_err("should timeout since we are already holding the permit");
|
||||
|
||||
target.set(42, permit);
|
||||
|
||||
let (_answer, permit) = {
|
||||
let mut guard = target
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(*guard, 42);
|
||||
|
||||
guard.take_and_deinit()
|
||||
};
|
||||
|
||||
assert!(target.get().is_none());
|
||||
|
||||
target.set(11, permit);
|
||||
|
||||
assert_eq!(*target.get().unwrap(), 11);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -89,6 +89,9 @@ enumset = { workspace = true, features = ["serde"]}
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
[target.'cfg(target_os = "linux")'.dependencies]
|
||||
procfs.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
|
||||
@@ -169,7 +169,7 @@ impl Client {
|
||||
self.request(Method::GET, uri, ()).await
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
@@ -181,7 +181,16 @@ impl Client {
|
||||
} else {
|
||||
req
|
||||
};
|
||||
let res = req.json(&body).send().await.map_err(Error::ReceiveBody)?;
|
||||
req.json(&body).send().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
|
||||
&self,
|
||||
method: Method,
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let res = self.request_noerror(method, uri, body).await?;
|
||||
let response = res.error_from_body().await?;
|
||||
Ok(response)
|
||||
}
|
||||
@@ -240,13 +249,26 @@ impl Client {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
pub async fn tenant_secondary_download(
|
||||
&self,
|
||||
tenant_id: TenantShardId,
|
||||
wait: Option<std::time::Duration>,
|
||||
) -> Result<(StatusCode, SecondaryProgress)> {
|
||||
let mut path = reqwest::Url::parse(&format!(
|
||||
"{}/v1/tenant/{}/secondary/download",
|
||||
self.mgmt_api_endpoint, tenant_id
|
||||
);
|
||||
self.request(Method::POST, &uri, ()).await?;
|
||||
Ok(())
|
||||
))
|
||||
.expect("Cannot build URL");
|
||||
|
||||
if let Some(wait) = wait {
|
||||
path.query_pairs_mut()
|
||||
.append_pair("wait_ms", &format!("{}", wait.as_millis()));
|
||||
}
|
||||
|
||||
let response = self.request(Method::POST, path, ()).await?;
|
||||
let status = response.status();
|
||||
let progress: SecondaryProgress = response.json().await.map_err(Error::ReceiveBody)?;
|
||||
Ok((status, progress))
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
@@ -257,7 +279,7 @@ impl Client {
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest {
|
||||
tenant_id: tenant_shard_id,
|
||||
tenant_id: Some(tenant_shard_id),
|
||||
config,
|
||||
};
|
||||
|
||||
@@ -416,4 +438,77 @@ impl Client {
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_map_info(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<LayerMapInfo> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id,
|
||||
);
|
||||
self.get(&uri)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn layer_evict(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &str,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||
);
|
||||
let resp = self.request_noerror(Method::DELETE, &uri, ()).await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
StatusCode::NOT_MODIFIED => Ok(false),
|
||||
// TODO: dedupe this pattern / introduce separate error variant?
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn layer_ondemand_download(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
layer_file_name: &str,
|
||||
) -> Result<bool> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/timeline/{}/layer/{}",
|
||||
self.mgmt_api_endpoint, tenant_shard_id, timeline_id, layer_file_name
|
||||
);
|
||||
let resp = self.request_noerror(Method::GET, &uri, ()).await?;
|
||||
match resp.status() {
|
||||
StatusCode::OK => Ok(true),
|
||||
StatusCode::NOT_MODIFIED => Ok(false),
|
||||
// TODO: dedupe this pattern / introduce separate error variant?
|
||||
status => Err(match resp.json::<HttpErrorBody>().await {
|
||||
Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
|
||||
Err(_) => {
|
||||
Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), uri))
|
||||
}
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
272
pageserver/pagebench/src/cmd/ondemand_download_churn.rs
Normal file
272
pageserver/pagebench/src/cmd/ondemand_download_churn.rs
Normal file
@@ -0,0 +1,272 @@
|
||||
use pageserver_api::{models::HistoricLayerInfo, shard::TenantShardId};
|
||||
|
||||
use pageserver_client::mgmt_api;
|
||||
use rand::seq::SliceRandom;
|
||||
use tracing::{debug, info};
|
||||
use utils::id::{TenantTimelineId, TimelineId};
|
||||
|
||||
use tokio::{
|
||||
sync::{mpsc, OwnedSemaphorePermit},
|
||||
task::JoinSet,
|
||||
};
|
||||
|
||||
use std::{
|
||||
num::NonZeroUsize,
|
||||
sync::{
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
/// Evict & on-demand download random layers.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
#[clap(long, default_value = "1")]
|
||||
tasks_per_target: NonZeroUsize,
|
||||
#[clap(long, default_value = "1")]
|
||||
concurrency_per_target: NonZeroUsize,
|
||||
/// Probability for sending `latest=true` in the request (uniform distribution).
|
||||
#[clap(long)]
|
||||
limit_to_first_n_targets: Option<usize>,
|
||||
/// Before starting the benchmark, live-reconfigure the pageserver to use the given
|
||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||
#[clap(long)]
|
||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let task = rt.spawn(main_impl(args));
|
||||
rt.block_on(task).unwrap().unwrap();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
evictions: AtomicU64,
|
||||
downloads: AtomicU64,
|
||||
timeline_restarts: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn eviction_done(&self) {
|
||||
self.evictions.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn download_done(&self) {
|
||||
self.downloads.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
fn timeline_restart_done(&self) {
|
||||
self.timeline_restarts.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
if let Some(engine_str) = &args.set_io_engine {
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
}
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
crate::util::cli::targets::Spec {
|
||||
limit_to_first_n_targets: args.limit_to_first_n_targets,
|
||||
targets: args.targets.clone(),
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut tasks = JoinSet::new();
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
tasks.spawn({
|
||||
let live_stats = Arc::clone(&live_stats);
|
||||
async move {
|
||||
let mut last_at = Instant::now();
|
||||
loop {
|
||||
tokio::time::sleep_until((last_at + Duration::from_secs(1)).into()).await;
|
||||
let now = Instant::now();
|
||||
let delta: Duration = now - last_at;
|
||||
last_at = now;
|
||||
|
||||
let LiveStats {
|
||||
evictions,
|
||||
downloads,
|
||||
timeline_restarts,
|
||||
} = &*live_stats;
|
||||
let evictions = evictions.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let downloads = downloads.swap(0, Ordering::Relaxed) as f64 / delta.as_secs_f64();
|
||||
let timeline_restarts = timeline_restarts.swap(0, Ordering::Relaxed);
|
||||
info!("evictions={evictions:.2}/s downloads={downloads:.2}/s timeline_restarts={timeline_restarts}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
for tl in timelines {
|
||||
for _ in 0..args.tasks_per_target.get() {
|
||||
tasks.spawn(timeline_actor(
|
||||
args,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
tl,
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
res.unwrap();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn timeline_actor(
|
||||
args: &'static Args,
|
||||
mgmt_api_client: Arc<pageserver_client::mgmt_api::Client>,
|
||||
timeline: TenantTimelineId,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
// TODO: support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(timeline.tenant_id);
|
||||
|
||||
struct Timeline {
|
||||
joinset: JoinSet<()>,
|
||||
layers: Vec<mpsc::Sender<OwnedSemaphorePermit>>,
|
||||
concurrency: Arc<tokio::sync::Semaphore>,
|
||||
}
|
||||
loop {
|
||||
debug!("restarting timeline");
|
||||
let layer_map_info = mgmt_api_client
|
||||
.layer_map_info(tenant_shard_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
let concurrency = Arc::new(tokio::sync::Semaphore::new(
|
||||
args.concurrency_per_target.get(),
|
||||
));
|
||||
|
||||
let mut joinset = JoinSet::new();
|
||||
let layers = layer_map_info
|
||||
.historic_layers
|
||||
.into_iter()
|
||||
.map(|historic_layer| {
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
joinset.spawn(layer_actor(
|
||||
tenant_shard_id,
|
||||
timeline.timeline_id,
|
||||
historic_layer,
|
||||
rx,
|
||||
Arc::clone(&mgmt_api_client),
|
||||
Arc::clone(&live_stats),
|
||||
));
|
||||
tx
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let mut timeline = Timeline {
|
||||
joinset,
|
||||
layers,
|
||||
concurrency,
|
||||
};
|
||||
|
||||
live_stats.timeline_restart_done();
|
||||
|
||||
loop {
|
||||
assert!(!timeline.joinset.is_empty());
|
||||
if let Some(res) = timeline.joinset.try_join_next() {
|
||||
debug!(?res, "a layer actor exited, should not happen");
|
||||
timeline.joinset.shutdown().await;
|
||||
break;
|
||||
}
|
||||
|
||||
let mut permit = Some(
|
||||
Arc::clone(&timeline.concurrency)
|
||||
.acquire_owned()
|
||||
.await
|
||||
.unwrap(),
|
||||
);
|
||||
|
||||
loop {
|
||||
let layer_tx = {
|
||||
let mut rng = rand::thread_rng();
|
||||
timeline.layers.choose_mut(&mut rng).expect("no layers")
|
||||
};
|
||||
match layer_tx.try_send(permit.take().unwrap()) {
|
||||
Ok(_) => break,
|
||||
Err(e) => match e {
|
||||
mpsc::error::TrySendError::Full(back) => {
|
||||
// TODO: retrying introduces bias away from slow downloaders
|
||||
permit.replace(back);
|
||||
}
|
||||
mpsc::error::TrySendError::Closed(_) => panic!(),
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn layer_actor(
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
mut layer: HistoricLayerInfo,
|
||||
mut rx: mpsc::Receiver<tokio::sync::OwnedSemaphorePermit>,
|
||||
mgmt_api_client: Arc<mgmt_api::Client>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
#[derive(Clone, Copy)]
|
||||
enum Action {
|
||||
Evict,
|
||||
OnDemandDownload,
|
||||
}
|
||||
|
||||
while let Some(_permit) = rx.recv().await {
|
||||
let action = if layer.is_remote() {
|
||||
Action::OnDemandDownload
|
||||
} else {
|
||||
Action::Evict
|
||||
};
|
||||
|
||||
let did_it = match action {
|
||||
Action::Evict => {
|
||||
let did_it = mgmt_api_client
|
||||
.layer_evict(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.eviction_done();
|
||||
did_it
|
||||
}
|
||||
Action::OnDemandDownload => {
|
||||
let did_it = mgmt_api_client
|
||||
.layer_ondemand_download(tenant_shard_id, timeline_id, layer.layer_file_name())
|
||||
.await
|
||||
.unwrap();
|
||||
live_stats.download_done();
|
||||
did_it
|
||||
}
|
||||
};
|
||||
if !did_it {
|
||||
debug!("local copy of layer map appears out of sync, re-downloading");
|
||||
return;
|
||||
}
|
||||
debug!("did it");
|
||||
layer.set_remote(match action {
|
||||
Action::Evict => true,
|
||||
Action::OnDemandDownload => false,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,7 @@ mod util {
|
||||
mod cmd {
|
||||
pub(super) mod basebackup;
|
||||
pub(super) mod getpage_latest_lsn;
|
||||
pub(super) mod ondemand_download_churn;
|
||||
pub(super) mod trigger_initial_size_calculation;
|
||||
}
|
||||
|
||||
@@ -25,6 +26,7 @@ enum Args {
|
||||
Basebackup(cmd::basebackup::Args),
|
||||
GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
|
||||
TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
|
||||
OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -43,6 +45,7 @@ fn main() {
|
||||
Args::TriggerInitialSizeCalculation(args) => {
|
||||
cmd::trigger_initial_size_calculation::main(args)
|
||||
}
|
||||
Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
#![recursion_limit = "300"]
|
||||
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use std::env::{var, VarError};
|
||||
@@ -118,6 +120,9 @@ fn main() -> anyhow::Result<()> {
|
||||
&[("node_id", &conf.id.to_string())],
|
||||
);
|
||||
|
||||
// after setting up logging, log the effective IO engine choice
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||
@@ -312,6 +317,7 @@ fn start_pageserver(
|
||||
let http_listener = tcp_listener::bind(http_addr)?;
|
||||
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
@@ -544,7 +550,7 @@ fn start_pageserver(
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
conf,
|
||||
tenant_manager,
|
||||
tenant_manager.clone(),
|
||||
http_auth.clone(),
|
||||
remote_storage.clone(),
|
||||
broker_client.clone(),
|
||||
@@ -688,6 +694,7 @@ fn start_pageserver(
|
||||
let bg_remote_storage = remote_storage.clone();
|
||||
let bg_deletion_queue = deletion_queue.clone();
|
||||
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
|
||||
&tenant_manager,
|
||||
bg_remote_storage.map(|_| bg_deletion_queue),
|
||||
0,
|
||||
));
|
||||
|
||||
@@ -30,18 +30,17 @@ use utils::{
|
||||
logging::LogFormat,
|
||||
};
|
||||
|
||||
use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::timeline::GetVectoredImpl;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::virtual_file;
|
||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||
use crate::{tenant::config::TenantConf, virtual_file};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
@@ -291,16 +290,23 @@ pub static SAFEKEEPER_AUTH_TOKEN: OnceCell<Arc<String>> = OnceCell::new();
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
// and avoid possible confusion with nested options
|
||||
#[derive(Clone, Default)]
|
||||
pub enum BuilderValue<T> {
|
||||
Set(T),
|
||||
#[default]
|
||||
NotSet,
|
||||
}
|
||||
|
||||
impl<T> BuilderValue<T> {
|
||||
pub fn ok_or<E>(self, err: E) -> Result<T, E> {
|
||||
impl<T: Clone> BuilderValue<T> {
|
||||
pub fn ok_or(&self, field_name: &'static str, default: BuilderValue<T>) -> anyhow::Result<T> {
|
||||
match self {
|
||||
Self::Set(v) => Ok(v),
|
||||
Self::NotSet => Err(err),
|
||||
Self::Set(v) => Ok(v.clone()),
|
||||
Self::NotSet => match default {
|
||||
BuilderValue::Set(v) => Ok(v.clone()),
|
||||
BuilderValue::NotSet => {
|
||||
anyhow::bail!("missing config value {field_name:?}")
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -326,6 +332,7 @@ pub(crate) struct NodeMetadata {
|
||||
}
|
||||
|
||||
// needed to simplify config construction
|
||||
#[derive(Default)]
|
||||
struct PageServerConfigBuilder {
|
||||
listen_pg_addr: BuilderValue<String>,
|
||||
|
||||
@@ -393,8 +400,9 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
fn default() -> Self {
|
||||
impl PageServerConfigBuilder {
|
||||
#[inline(always)]
|
||||
fn default_values() -> Self {
|
||||
use self::BuilderValue::*;
|
||||
use defaults::*;
|
||||
Self {
|
||||
@@ -647,125 +655,96 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_warmup = self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or(anyhow!("missing concurrent_tenant_warmup"))?;
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?;
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
availability_zone: self
|
||||
.availability_zone
|
||||
.ok_or(anyhow!("missing availability_zone"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
wal_redo_timeout: self
|
||||
.wal_redo_timeout
|
||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
||||
page_cache_size: self
|
||||
.page_cache_size
|
||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
||||
max_file_descriptors: self
|
||||
.max_file_descriptors
|
||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
http_auth_type: self
|
||||
.http_auth_type
|
||||
.ok_or(anyhow!("missing http_auth_type"))?,
|
||||
pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
remote_storage_config: self
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoint: self
|
||||
.broker_endpoint
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_keepalive_interval: self
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
metric_collection_interval: self
|
||||
.metric_collection_interval
|
||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
||||
cached_metric_collection_interval: self
|
||||
.cached_metric_collection_interval
|
||||
.ok_or(anyhow!("missing cached_metric_collection_interval"))?,
|
||||
metric_collection_endpoint: self
|
||||
.metric_collection_endpoint
|
||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
||||
synthetic_size_calculation_interval: self
|
||||
.synthetic_size_calculation_interval
|
||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
||||
disk_usage_based_eviction: self
|
||||
.disk_usage_based_eviction
|
||||
.ok_or(anyhow!("missing disk_usage_based_eviction"))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
ondemand_download_behavior_treat_error_as_warn: self
|
||||
.ondemand_download_behavior_treat_error_as_warn
|
||||
.ok_or(anyhow!(
|
||||
"missing ondemand_download_behavior_treat_error_as_warn"
|
||||
))?,
|
||||
background_task_maximum_delay: self
|
||||
.background_task_maximum_delay
|
||||
.ok_or(anyhow!("missing background_task_maximum_delay"))?,
|
||||
control_plane_api: self
|
||||
.control_plane_api
|
||||
.ok_or(anyhow!("missing control_plane_api"))?,
|
||||
control_plane_api_token: self
|
||||
.control_plane_api_token
|
||||
.ok_or(anyhow!("missing control_plane_api_token"))?,
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
heatmap_upload_concurrency: self
|
||||
.heatmap_upload_concurrency
|
||||
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
|
||||
secondary_download_concurrency: self
|
||||
.secondary_download_concurrency
|
||||
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
|
||||
ingest_batch_size: self
|
||||
.ingest_batch_size
|
||||
.ok_or(anyhow!("missing ingest_batch_size"))?,
|
||||
virtual_file_io_engine: self
|
||||
.virtual_file_io_engine
|
||||
.ok_or(anyhow!("missing virtual_file_io_engine"))?,
|
||||
get_vectored_impl: self
|
||||
.get_vectored_impl
|
||||
.ok_or(anyhow!("missing get_vectored_impl"))?,
|
||||
max_vectored_read_bytes: self
|
||||
.max_vectored_read_bytes
|
||||
.ok_or(anyhow!("missing max_vectored_read_bytes"))?,
|
||||
validate_vectored_get: self
|
||||
.validate_vectored_get
|
||||
.ok_or(anyhow!("missing validate_vectored_get"))?,
|
||||
})
|
||||
let default = Self::default_values();
|
||||
|
||||
macro_rules! conf {
|
||||
(USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => {
|
||||
PageServerConf {
|
||||
$(
|
||||
$field: self.$field.ok_or(stringify!($field), default.$field)?,
|
||||
)*
|
||||
$(
|
||||
$custom_field: $custom_value,
|
||||
)*
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Ok(conf!(
|
||||
USING DEFAULT
|
||||
{
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
workdir,
|
||||
pg_distrib_dir,
|
||||
http_auth_type,
|
||||
pg_auth_type,
|
||||
auth_validation_public_key_path,
|
||||
remote_storage_config,
|
||||
id,
|
||||
broker_endpoint,
|
||||
broker_keepalive_interval,
|
||||
log_format,
|
||||
metric_collection_interval,
|
||||
cached_metric_collection_interval,
|
||||
metric_collection_endpoint,
|
||||
synthetic_size_calculation_interval,
|
||||
disk_usage_based_eviction,
|
||||
test_remote_failures,
|
||||
ondemand_download_behavior_treat_error_as_warn,
|
||||
background_task_maximum_delay,
|
||||
control_plane_api,
|
||||
control_plane_api_token,
|
||||
control_plane_emergency_mode,
|
||||
heatmap_upload_concurrency,
|
||||
secondary_download_concurrency,
|
||||
ingest_batch_size,
|
||||
get_vectored_impl,
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
concurrent_tenant_warmup: ConfigurableSemaphore::new({
|
||||
self
|
||||
.concurrent_tenant_warmup
|
||||
.ok_or("concurrent_tenant_warmpup",
|
||||
default.concurrent_tenant_warmup)?
|
||||
}),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or("concurrent_tenant_size_logical_size_queries",
|
||||
default.concurrent_tenant_size_logical_size_queries.clone())?
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
// re-use `concurrent_tenant_size_logical_size_queries`
|
||||
self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or("eviction_task_immitated_concurrent_logical_size_queries",
|
||||
default.concurrent_tenant_size_logical_size_queries.clone())?,
|
||||
),
|
||||
virtual_file_io_engine: match self.virtual_file_io_engine {
|
||||
BuilderValue::Set(v) => v,
|
||||
BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? {
|
||||
io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise
|
||||
io_engine::FeatureTestResult::Worse { engine, remark } => {
|
||||
// TODO: bubble this up to the caller so we can tracing::warn! it.
|
||||
eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}");
|
||||
engine
|
||||
}
|
||||
},
|
||||
},
|
||||
}
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -845,18 +824,7 @@ impl PageServerConf {
|
||||
.join(timeline_id.to_string())
|
||||
}
|
||||
|
||||
pub fn timeline_uninit_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_shard_id, &timeline_id),
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
pub(crate) fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -867,7 +835,10 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
pub(crate) fn tenant_deleted_mark_file_path(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(TENANT_DELETED_MARKER_FILE_NAME)
|
||||
}
|
||||
|
||||
@@ -567,9 +567,9 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
/v1/tenant/{tenant_id}/location_config:
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
- name: tenant_shard_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
@@ -965,12 +965,28 @@ paths:
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: wait_ms
|
||||
description: If set, we will wait this long for download to complete, and if it isn't complete then return 202
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
post:
|
||||
description: |
|
||||
If the location is in secondary mode, download latest heatmap and layers
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"202":
|
||||
description: Download has started but not yet finished
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -1367,10 +1383,11 @@ components:
|
||||
TenantLocationConfigRequest:
|
||||
type: object
|
||||
required:
|
||||
- tenant_id
|
||||
- mode
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
description: Not used, scheduled for removal.
|
||||
mode:
|
||||
type: string
|
||||
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
|
||||
@@ -1622,6 +1639,37 @@ components:
|
||||
Lower is better score for how good this pageserver would be for the next tenant.
|
||||
The default or maximum value can be returned in situations when a proper score cannot (yet) be calculated.
|
||||
|
||||
SecondaryProgress:
|
||||
type: object
|
||||
required:
|
||||
- heatmap_mtime
|
||||
- layers_downloaded
|
||||
- layers_total
|
||||
- bytes_downloaded
|
||||
- bytes_total
|
||||
properties:
|
||||
heatmap_mtime:
|
||||
type: string
|
||||
format: date-time
|
||||
description: Modification time of the most recently downloaded layer heatmap (RFC 3339 format)
|
||||
layers_downloaded:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many layers from the latest layer heatmap are present on disk
|
||||
bytes_downloaded:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many bytes of layer content from the latest layer heatmap are present on disk
|
||||
layers_total:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many layers were in the latest layer heatmap
|
||||
bytes_total:
|
||||
type: integer
|
||||
format: int64
|
||||
description: How many bytes of layer content were in the latest layer heatmap
|
||||
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -535,9 +535,9 @@ async fn timeline_create_handler(
|
||||
)
|
||||
}
|
||||
Err(
|
||||
tenant::CreateTimelineError::Conflict
|
||||
| tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, ()),
|
||||
e @ tenant::CreateTimelineError::Conflict
|
||||
| e @ tenant::CreateTimelineError::AlreadyCreating,
|
||||
) => json_response(StatusCode::CONFLICT, HttpErrorBody::from_msg(e.to_string())),
|
||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||
StatusCode::NOT_ACCEPTABLE,
|
||||
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||
@@ -1987,13 +1987,42 @@ async fn secondary_download_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&request);
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
state
|
||||
.secondary_controller
|
||||
.download_tenant(tenant_shard_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let wait = parse_query_param(&request, "wait_ms")?.map(Duration::from_millis);
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// We don't need this to issue the download request, but:
|
||||
// - it enables us to cleanly return 404 if we get a request for an absent shard
|
||||
// - we will use this to provide status feedback in the response
|
||||
let Some(secondary_tenant) = state
|
||||
.tenant_manager
|
||||
.get_secondary_tenant_shard(tenant_shard_id)
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Shard {} not found", tenant_shard_id).into(),
|
||||
));
|
||||
};
|
||||
|
||||
let timeout = wait.unwrap_or(Duration::MAX);
|
||||
|
||||
let status = match tokio::time::timeout(
|
||||
timeout,
|
||||
state.secondary_controller.download_tenant(tenant_shard_id),
|
||||
)
|
||||
.await
|
||||
{
|
||||
// Download job ran to completion.
|
||||
Ok(Ok(())) => StatusCode::OK,
|
||||
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
|
||||
// okay. We could get an error here in the unlikely edge case that the tenant
|
||||
// was detached between our check above and executing the download job.
|
||||
Ok(Err(e)) => return Err(ApiError::InternalServerError(e)),
|
||||
// A timeout is not an error: we have started the download, we're just not done
|
||||
// yet. The caller will get a response body indicating status.
|
||||
Err(_) => StatusCode::ACCEPTED,
|
||||
};
|
||||
|
||||
let progress = secondary_tenant.progress.lock().unwrap().clone();
|
||||
|
||||
json_response(status, progress)
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -2053,6 +2082,10 @@ async fn get_utilization(
|
||||
r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
fail::fail_point!("get-utilization-http-handler", |_| {
|
||||
Err(ApiError::ResourceUnavailable("failpoint".into()))
|
||||
});
|
||||
|
||||
// this probably could be completely public, but lets make that change later.
|
||||
check_permission(&r, None)?;
|
||||
|
||||
@@ -2108,6 +2141,16 @@ where
|
||||
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
|
||||
H: FnOnce(Request<Body>, CancellationToken) -> R + Send + Sync + 'static,
|
||||
{
|
||||
if request.uri() != &"/v1/failpoints".parse::<Uri>().unwrap() {
|
||||
fail::fail_point!("api-503", |_| Err(ApiError::ResourceUnavailable(
|
||||
"failpoint".into()
|
||||
)));
|
||||
|
||||
fail::fail_point!("api-500", |_| Err(ApiError::InternalServerError(
|
||||
anyhow::anyhow!("failpoint")
|
||||
)));
|
||||
}
|
||||
|
||||
// Spawn a new task to handle the request, to protect the handler from unexpected
|
||||
// async cancellations. Most pageserver functions are not async cancellation safe.
|
||||
// We arm a drop-guard, so that if Hyper drops the Future, we signal the task
|
||||
|
||||
@@ -31,6 +31,7 @@ pub mod walredo;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use camino::Utf8Path;
|
||||
use deletion_queue::DeletionQueue;
|
||||
use tenant::mgr::TenantManager;
|
||||
use tracing::info;
|
||||
|
||||
/// Current storage format version
|
||||
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
||||
pub async fn shutdown_pageserver(
|
||||
tenant_manager: &TenantManager,
|
||||
deletion_queue: Option<DeletionQueue>,
|
||||
exit_code: i32,
|
||||
) {
|
||||
use std::time::Duration;
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
timed(
|
||||
tenant::mgr::shutdown_all_tenants(),
|
||||
tenant_manager.shutdown(),
|
||||
"shutdown all tenants",
|
||||
Duration::from_secs(5),
|
||||
)
|
||||
@@ -114,27 +119,27 @@ pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
pub(crate) const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
/// Per-tenant configuration file.
|
||||
/// Full path: `tenants/<tenant_id>/config`.
|
||||
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
pub(crate) const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
|
||||
|
||||
/// Per-tenant copy of their remote heatmap, downloaded into the local
|
||||
/// tenant path while in secondary mode.
|
||||
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
pub(crate) const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
|
||||
|
||||
/// A suffix used for various temporary files. Any temporary files found in the
|
||||
/// data directory at pageserver startup can be automatically removed.
|
||||
pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
pub(crate) const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
/// A marker file to mark that a timeline directory was not fully initialized.
|
||||
/// If a timeline directory with this marker is encountered at pageserver startup,
|
||||
/// the timeline directory and the marker file are both removed.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
@@ -161,11 +166,11 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
||||
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
||||
// from the name.
|
||||
|
||||
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
pub(crate) fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
pub fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
pub(crate) fn is_delete_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
|
||||
}
|
||||
|
||||
|
||||
@@ -167,7 +167,7 @@ impl GetVectoredLatency {
|
||||
pub(crate) static GET_VECTORED_LATENCY: Lazy<GetVectoredLatency> = Lazy::new(|| {
|
||||
let inner = register_histogram_vec!(
|
||||
"pageserver_get_vectored_seconds",
|
||||
"Time spent in get_vectored",
|
||||
"Time spent in get_vectored, excluding time spent in timeline_get_throttle.",
|
||||
&["task_kind"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
@@ -2465,7 +2465,8 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub mod tokio_epoll_uring {
|
||||
use metrics::UIntGauge;
|
||||
use metrics::{register_int_counter, UIntGauge};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub struct Collector {
|
||||
descs: Vec<metrics::core::Desc>,
|
||||
@@ -2473,15 +2474,13 @@ pub mod tokio_epoll_uring {
|
||||
systems_destroyed: UIntGauge,
|
||||
}
|
||||
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
impl metrics::core::Collector for Collector {
|
||||
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
||||
self.descs.iter().collect()
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||
let mut mfs = Vec::with_capacity(NMETRICS);
|
||||
let mut mfs = Vec::with_capacity(Self::NMETRICS);
|
||||
let tokio_epoll_uring::metrics::Metrics {
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
@@ -2495,6 +2494,8 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
|
||||
impl Collector {
|
||||
const NMETRICS: usize = 2;
|
||||
|
||||
#[allow(clippy::new_without_default)]
|
||||
pub fn new() -> Self {
|
||||
let mut descs = Vec::new();
|
||||
@@ -2528,6 +2529,22 @@ pub mod tokio_epoll_uring {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static THREAD_LOCAL_LAUNCH_SUCCESSES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_success_count",
|
||||
"Number of times where thread_local_system creation spanned multiple executor threads",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static THREAD_LOCAL_LAUNCH_FAILURES: Lazy<metrics::IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_tokio_epoll_uring_pageserver_thread_local_launch_failures_count",
|
||||
"Number of times thread_local_system creation failed and was retried after back-off.",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) mod tenant_throttling {
|
||||
@@ -2656,6 +2673,8 @@ pub fn preinitialize_metrics() {
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_FAILURES,
|
||||
&tokio_epoll_uring::THREAD_LOCAL_LAUNCH_SUCCESSES,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
|
||||
@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
|
||||
//
|
||||
// There are four runtimes:
|
||||
//
|
||||
@@ -453,7 +451,7 @@ async fn task_finish(
|
||||
}
|
||||
|
||||
if shutdown_process {
|
||||
shutdown_pageserver(None, 1).await;
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -55,8 +55,8 @@ use self::mgr::GetTenantError;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::upload::upload_index_part;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineCreateGuard;
|
||||
use self::timeline::uninit::TimelineExclusionError;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
@@ -565,9 +565,8 @@ impl Tenant {
|
||||
// avoiding holding it across awaits
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
match timelines_accessor.entry(timeline_id) {
|
||||
// We should never try and load the same timeline twice during startup
|
||||
Entry::Occupied(_) => {
|
||||
// The uninit mark file acts as a lock that prevents another task from
|
||||
// initializing the timeline at the same time.
|
||||
unreachable!(
|
||||
"Timeline {tenant_id}/{timeline_id} already exists in the tenant map"
|
||||
);
|
||||
@@ -1064,8 +1063,7 @@ impl Tenant {
|
||||
let entry_path = entry.path();
|
||||
|
||||
let purge = if crate::is_temporary(entry_path)
|
||||
// TODO: uninit_mark isn't needed any more, since uninitialized timelines are already
|
||||
// covered by the check that the timeline must exist in remote storage.
|
||||
// TODO: remove uninit mark code (https://github.com/neondatabase/neon/issues/5718)
|
||||
|| is_uninit_mark(entry_path)
|
||||
|| crate::is_delete_mark(entry_path)
|
||||
{
|
||||
@@ -1298,11 +1296,6 @@ impl Tenant {
|
||||
/// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0))
|
||||
/// and the timeline will fail to load at a restart.
|
||||
///
|
||||
/// That's why we add an uninit mark file, and wrap it together with the Timeline
|
||||
/// in-memory object into UninitializedTimeline.
|
||||
/// Once the caller is done setting up the timeline, they should call
|
||||
/// `UninitializedTimeline::initialize_with_lock` to remove the uninit mark.
|
||||
///
|
||||
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
|
||||
/// minimum amount of keys required to get a writable timeline.
|
||||
/// (Without it, `put` might fail due to `repartition` failing.)
|
||||
@@ -1318,7 +1311,9 @@ impl Tenant {
|
||||
"Cannot create empty timelines on inactive tenant"
|
||||
);
|
||||
|
||||
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
|
||||
// Protect against concurrent attempts to use this TimelineId
|
||||
let create_guard = self.create_timeline_create_guard(new_timeline_id)?;
|
||||
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
// Initialize disk_consistent_lsn to 0. The caller must import some data to
|
||||
// make it valid, before calling finish_creation()
|
||||
@@ -1333,7 +1328,7 @@ impl Tenant {
|
||||
self.prepare_new_timeline(
|
||||
new_timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
create_guard,
|
||||
initdb_lsn,
|
||||
None,
|
||||
)
|
||||
@@ -1421,9 +1416,8 @@ impl Tenant {
|
||||
.map_err(|_| CreateTimelineError::ShuttingDown)?;
|
||||
|
||||
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
|
||||
// and that no other creation attempts will be allowed in while we are working. The
|
||||
// uninit_mark is a guard.
|
||||
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
|
||||
// and that no other creation attempts will be allowed in while we are working.
|
||||
let create_guard = match self.create_timeline_create_guard(new_timeline_id) {
|
||||
Ok(m) => m,
|
||||
Err(TimelineExclusionError::AlreadyCreating) => {
|
||||
// Creation is in progress, we cannot create it again, and we cannot
|
||||
@@ -1466,6 +1460,8 @@ impl Tenant {
|
||||
}
|
||||
};
|
||||
|
||||
pausable_failpoint!("timeline-creation-after-uninit");
|
||||
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = self
|
||||
@@ -1513,7 +1509,7 @@ impl Tenant {
|
||||
&ancestor_timeline,
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
uninit_mark,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -1523,7 +1519,7 @@ impl Tenant {
|
||||
new_timeline_id,
|
||||
pg_version,
|
||||
load_existing_initdb,
|
||||
uninit_mark,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
@@ -2870,9 +2866,9 @@ impl Tenant {
|
||||
start_lsn: Option<Lsn>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
|
||||
let create_guard = self.create_timeline_create_guard(dst_id).unwrap();
|
||||
let tl = self
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
|
||||
.branch_timeline_impl(src_timeline, dst_id, start_lsn, create_guard, ctx)
|
||||
.await?;
|
||||
tl.set_state(TimelineState::Active);
|
||||
Ok(tl)
|
||||
@@ -2886,10 +2882,10 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
|
||||
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -2898,7 +2894,7 @@ impl Tenant {
|
||||
src_timeline: &Arc<Timeline>,
|
||||
dst_id: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
let src_id = src_timeline.timeline_id;
|
||||
@@ -2982,7 +2978,7 @@ impl Tenant {
|
||||
.prepare_new_timeline(
|
||||
dst_id,
|
||||
&metadata,
|
||||
timeline_uninit_mark,
|
||||
timeline_create_guard,
|
||||
start_lsn + 1,
|
||||
Some(Arc::clone(src_timeline)),
|
||||
)
|
||||
@@ -3014,12 +3010,12 @@ impl Tenant {
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
|
||||
let create_guard = self.create_timeline_create_guard(timeline_id).unwrap();
|
||||
self.bootstrap_timeline(
|
||||
timeline_id,
|
||||
pg_version,
|
||||
load_existing_initdb,
|
||||
uninit_mark,
|
||||
create_guard,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -3083,7 +3079,7 @@ impl Tenant {
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
timeline_uninit_mark: TimelineUninitMark<'_>,
|
||||
timeline_create_guard: TimelineCreateGuard<'_>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
@@ -3095,13 +3091,14 @@ impl Tenant {
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
// an uninit mark was placed before, nothing else can access this timeline files
|
||||
// current initdb was not run yet, so remove whatever was left from the previous runs
|
||||
// Remove whatever was left from the previous runs: safe because TimelineCreateGuard guarantees
|
||||
// we won't race with other creations or existent timelines with the same path.
|
||||
if pgdata_path.exists() {
|
||||
fs::remove_dir_all(&pgdata_path).with_context(|| {
|
||||
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
||||
})?;
|
||||
}
|
||||
|
||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||
scopeguard::defer! {
|
||||
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||
@@ -3178,7 +3175,7 @@ impl Tenant {
|
||||
.prepare_new_timeline(
|
||||
timeline_id,
|
||||
&new_metadata,
|
||||
timeline_uninit_mark,
|
||||
timeline_create_guard,
|
||||
pgdata_lsn,
|
||||
None,
|
||||
)
|
||||
@@ -3250,13 +3247,12 @@ impl Tenant {
|
||||
///
|
||||
/// An empty layer map is initialized, and new data and WAL can be imported starting
|
||||
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
|
||||
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
|
||||
/// uninit mark file.
|
||||
/// `finish_creation` to insert the Timeline into the timelines map.
|
||||
async fn prepare_new_timeline<'a>(
|
||||
&'a self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: &TimelineMetadata,
|
||||
uninit_mark: TimelineUninitMark<'a>,
|
||||
create_guard: TimelineCreateGuard<'a>,
|
||||
start_lsn: Lsn,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
) -> anyhow::Result<UninitializedTimeline> {
|
||||
@@ -3279,9 +3275,12 @@ impl Tenant {
|
||||
|
||||
timeline_struct.init_empty_layer_map(start_lsn);
|
||||
|
||||
if let Err(e) = self.create_timeline_files(&uninit_mark.timeline_path).await {
|
||||
if let Err(e) = self
|
||||
.create_timeline_files(&create_guard.timeline_path)
|
||||
.await
|
||||
{
|
||||
error!("Failed to create initial files for timeline {tenant_shard_id}/{new_timeline_id}, cleaning up: {e:?}");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
cleanup_timeline_directory(create_guard);
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
@@ -3292,41 +3291,31 @@ impl Tenant {
|
||||
Ok(UninitializedTimeline::new(
|
||||
self,
|
||||
new_timeline_id,
|
||||
Some((timeline_struct, uninit_mark)),
|
||||
Some((timeline_struct, create_guard)),
|
||||
))
|
||||
}
|
||||
|
||||
async fn create_timeline_files(&self, timeline_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
crashsafe::create_dir(timeline_path).context("Failed to create timeline directory")?;
|
||||
|
||||
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
|
||||
fail::fail_point!("after-timeline-dir-creation", |_| {
|
||||
anyhow::bail!("failpoint after-timeline-dir-creation");
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Attempts to create an uninit mark file for the timeline initialization.
|
||||
/// Bails, if the timeline is already loaded into the memory (i.e. initialized before), or the uninit mark file already exists.
|
||||
///
|
||||
/// This way, we need to hold the timelines lock only for small amount of time during the mark check/creation per timeline init.
|
||||
fn create_timeline_uninit_mark(
|
||||
/// Get a guard that provides exclusive access to the timeline directory, preventing
|
||||
/// concurrent attempts to create the same timeline.
|
||||
fn create_timeline_create_guard(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<TimelineUninitMark, TimelineExclusionError> {
|
||||
) -> Result<TimelineCreateGuard, TimelineExclusionError> {
|
||||
let tenant_shard_id = self.tenant_shard_id;
|
||||
|
||||
let uninit_mark_path = self
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
|
||||
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
|
||||
|
||||
let uninit_mark = TimelineUninitMark::new(
|
||||
self,
|
||||
timeline_id,
|
||||
uninit_mark_path.clone(),
|
||||
timeline_path.clone(),
|
||||
)?;
|
||||
let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?;
|
||||
|
||||
// At this stage, we have got exclusive access to in-memory state for this timeline ID
|
||||
// for creation.
|
||||
@@ -3342,23 +3331,7 @@ impl Tenant {
|
||||
)));
|
||||
}
|
||||
|
||||
// Create the on-disk uninit mark _after_ the in-memory acquisition of the tenant ID: guarantees
|
||||
// that during process runtime, colliding creations will be caught in-memory without getting
|
||||
// as far as failing to write a file.
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(&uninit_mark_path)
|
||||
.context("Failed to create uninit mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&uninit_mark_path)
|
||||
.context("Failed to fsync uninit mark file")
|
||||
})
|
||||
.with_context(|| {
|
||||
format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
Ok(uninit_mark)
|
||||
Ok(create_guard)
|
||||
}
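The guard returned above behaves like a scoped lock on the timeline ID: acquiring it fails if another creation is in flight, and dropping it releases the ID whether creation succeeded or not. A minimal, self-contained sketch of that idea, with hypothetical types rather than the pageserver's `TimelineCreateGuard`:

```rust
use std::collections::HashSet;
use std::sync::{Arc, Mutex};

/// Hypothetical creation guard: the set of IDs currently being created lives
/// behind a mutex, and dropping the guard releases the ID again.
struct CreateGuard {
    creating: Arc<Mutex<HashSet<u64>>>,
    id: u64,
}

#[derive(Debug)]
struct AlreadyCreating;

impl CreateGuard {
    fn new(creating: Arc<Mutex<HashSet<u64>>>, id: u64) -> Result<Self, AlreadyCreating> {
        let mut set = creating.lock().unwrap();
        if !set.insert(id) {
            // Another creation attempt for this ID is already in flight.
            return Err(AlreadyCreating);
        }
        drop(set);
        Ok(Self { creating, id })
    }
}

impl Drop for CreateGuard {
    fn drop(&mut self) {
        // Whether creation succeeded or failed, the ID becomes available again.
        self.creating.lock().unwrap().remove(&self.id);
    }
}

fn main() {
    let creating = Arc::new(Mutex::new(HashSet::new()));
    let first = CreateGuard::new(creating.clone(), 42).expect("no competing creation");
    assert!(CreateGuard::new(creating.clone(), 42).is_err());
    drop(first);
    assert!(CreateGuard::new(creating, 42).is_ok());
}
```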
|
||||
|
||||
/// Gathers inputs from all of the timelines to produce a sizing model input.
|
||||
@@ -5099,15 +5072,15 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_uninit_mark_crash() -> anyhow::Result<()> {
|
||||
let name = "test_uninit_mark_crash";
|
||||
async fn test_create_guard_crash() -> anyhow::Result<()> {
|
||||
let name = "test_create_guard_crash";
|
||||
let harness = TenantHarness::create(name)?;
|
||||
{
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
// Keeps uninit mark in place
|
||||
// Leave the timeline ID in [`Tenant::timelines_creating`] to prevent attempts to create it again
|
||||
let raw_tline = tline.raw_timeline().unwrap();
|
||||
raw_tline
|
||||
.shutdown()
|
||||
@@ -5135,11 +5108,6 @@ mod tests {
|
||||
.timeline_path(&tenant.tenant_shard_id, &TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
assert!(!harness
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant.tenant_shard_id, TIMELINE_ID)
|
||||
.exists());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -296,6 +296,7 @@ impl DeleteTenantFlow {
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
@@ -303,7 +304,9 @@ impl DeleteTenantFlow {
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
if let Err(e) =
|
||||
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
|
||||
{
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
|
||||
// Though sounds scary, different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(
|
||||
conf,
|
||||
remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
||||
&CancellationToken::new(),
|
||||
)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
|
||||
conf,
|
||||
remote_storage.as_ref(),
|
||||
&tenant.tenant_shard_id,
|
||||
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
|
||||
&CancellationToken::new(),
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ pub(crate) enum TenantsMap {
|
||||
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
|
||||
/// New tenants can be added using [`tenant_map_acquire_slot`].
|
||||
Open(BTreeMap<TenantShardId, TenantSlot>),
|
||||
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
|
||||
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
|
||||
/// Existing tenants are still accessible, but no new tenants can be created.
|
||||
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
|
||||
}
|
||||
@@ -261,6 +261,12 @@ pub struct TenantManager {
|
||||
// See https://github.com/neondatabase/neon/issues/5796
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
resources: TenantSharedResources,
|
||||
|
||||
// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
|
||||
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
|
||||
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
|
||||
// when the tenant detaches.
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
fn emergency_generations(
|
||||
@@ -620,6 +626,7 @@ pub async fn init_tenant_mgr(
|
||||
conf,
|
||||
tenants: &TENANTS,
|
||||
resources,
|
||||
cancel: CancellationToken::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn(
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
||||
/// from the tenants map, the management API would return 404 for these tenants,
|
||||
/// because TenantsMap::get() now returns `None`.
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn shutdown_all_tenants() {
|
||||
shutdown_all_tenants0(&TENANTS).await
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
let mut join_set = JoinSet::new();
|
||||
|
||||
@@ -1428,6 +1420,7 @@ impl TenantManager {
|
||||
self.resources.remote_storage.clone(),
|
||||
&TENANTS,
|
||||
tenant,
|
||||
&self.cancel,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -1443,6 +1436,35 @@ impl TenantManager {
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let r = self
|
||||
.do_shard_split(tenant_shard_id, new_shard_count, new_stripe_size, ctx)
|
||||
.await;
|
||||
if r.is_err() {
|
||||
// Shard splitting might have left the original shard in a partially shut down state (it
|
||||
// stops the shard's remote timeline client). Reset it to ensure we leave things in
|
||||
// a working state.
|
||||
if self.get(tenant_shard_id).is_some() {
|
||||
tracing::warn!("Resetting {tenant_shard_id} after shard split failure");
|
||||
if let Err(e) = self.reset_tenant(tenant_shard_id, false, ctx).await {
|
||||
// Log this error because our return value will still be the original error, not this one. This is
|
||||
// a severe error: if this happens, we might be leaving behind a tenant that is not fully functional
|
||||
// (e.g. has uploads disabled). We can't do anything else: if reset fails then shutting the tenant down or
|
||||
// setting it broken probably won't help either.
|
||||
tracing::error!("Failed to reset {tenant_shard_id}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r
|
||||
}
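The wrapper above runs the split and, on failure, tries to put the parent shard back into a working state while still returning the original error. As a generic sketch of that shape (names hypothetical, not the pageserver's API):

```rust
// Run `inner`; if it fails, run `reset` as a best-effort cleanup while
// returning the original error to the caller.
async fn with_reset<T, E: std::fmt::Display>(
    inner: impl std::future::Future<Output = Result<T, E>>,
    reset: impl std::future::Future<Output = Result<(), E>>,
) -> Result<T, E> {
    let r = inner.await;
    if r.is_err() {
        if let Err(reset_err) = reset.await {
            // The caller still sees the original error; the reset failure is only logged.
            eprintln!("failed to reset after error: {reset_err}");
        }
    }
    r
}
```

In the real code the "reset" step is `reset_tenant`, which is only attempted if the shard is still attached.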
|
||||
|
||||
pub(crate) async fn do_shard_split(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||
|
||||
@@ -1477,6 +1499,10 @@ impl TenantManager {
|
||||
.join(",")
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
let parent_shard_identity = tenant.shard_identity;
|
||||
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||
let parent_generation = tenant.generation;
|
||||
@@ -1490,6 +1516,10 @@ impl TenantManager {
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-prepare", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
self.resources.deletion_queue_client.flush_advisory();
|
||||
|
||||
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||
@@ -1511,11 +1541,16 @@ impl TenantManager {
|
||||
anyhow::bail!("Detached parent shard in the middle of split!")
|
||||
}
|
||||
};
|
||||
|
||||
fail::fail_point!("shard-split-pre-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
// Optimization: hardlink layers from the parent into the children, so that they don't have to
|
||||
// re-download & duplicate the data referenced in their initial IndexPart
|
||||
self.shard_split_hardlink(parent, child_shards.clone())
|
||||
.await?;
|
||||
fail::fail_point!("shard-split-post-hardlink", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||
// child shards to reach this point.
|
||||
@@ -1555,6 +1590,10 @@ impl TenantManager {
|
||||
.await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("shard-split-post-child-conf", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
// Phase 4: wait for child shards' WAL ingest to catch up to target LSN
|
||||
for child_shard_id in &child_shards {
|
||||
let child_shard_id = *child_shard_id;
|
||||
@@ -1587,6 +1626,10 @@ impl TenantManager {
|
||||
timeline.timeline_id,
|
||||
target_lsn
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||
// Failure here might mean shutdown, in any case this part is an optimization
|
||||
// and we shouldn't hold up the split operation.
|
||||
@@ -1632,6 +1675,10 @@ impl TenantManager {
|
||||
},
|
||||
);
|
||||
|
||||
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
|
||||
"failpoint"
|
||||
)));
|
||||
|
||||
parent_slot_guard.drop_old_value()?;
|
||||
|
||||
// Phase 6: Release the InProgress on the parent shard
|
||||
@@ -1763,6 +1810,23 @@ impl TenantManager {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
/// NB: We leave the tenants in the map, so that they remain accessible through
|
||||
/// the management API until we shut it down. If we removed the shut-down tenants
|
||||
/// from the tenants map, the management API would return 404 for these tenants,
|
||||
/// because TenantsMap::get() now returns `None`.
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
self.cancel.cancel();
|
||||
|
||||
shutdown_all_tenants0(self.tenants).await
|
||||
}
|
||||
}
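A sketch of the cancellation wiring described in the comments above: one token owned by the manager, long-running operations select on it, and shutdown fires it before tearing down tenants. Types and method names here are illustrative stand-ins, not the pageserver's API.

```rust
use tokio_util::sync::CancellationToken;

struct Manager {
    cancel: CancellationToken,
}

impl Manager {
    async fn long_running_op(&self) {
        tokio::select! {
            _ = self.cancel.cancelled() => {
                println!("operation aborted by shutdown");
            }
            _ = tokio::time::sleep(std::time::Duration::from_secs(60)) => {
                println!("operation finished");
            }
        }
    }

    async fn shutdown(&self) {
        // Fire the shared token first so in-flight work (e.g. deletions) stops
        // early, then proceed with per-tenant shutdown.
        self.cancel.cancel();
    }
}

#[tokio::main]
async fn main() {
    let mgr = Manager { cancel: CancellationToken::new() };
    let op = mgr.long_running_op();
    let shut = mgr.shutdown();
    tokio::join!(op, shut);
}
```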
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
|
||||
@@ -23,7 +23,7 @@ use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::Generation;
|
||||
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -73,55 +73,13 @@ pub async fn download_layer_file<'a>(
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let (mut destination_file, bytes_amount) = download_retry(
|
||||
|| async {
|
||||
let destination_file = tokio::fs::File::create(&temp_file_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(&remote_path, cancel).await?;
|
||||
|
||||
let mut destination_file =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file).await;
|
||||
|
||||
match bytes_amount {
|
||||
Ok(bytes_amount) => {
|
||||
let destination_file = destination_file.into_inner();
|
||||
Ok((destination_file, bytes_amount))
|
||||
}
|
||||
Err(e) => {
|
||||
if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
|
||||
}
|
||||
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
},
|
||||
let bytes_amount = download_retry(
|
||||
|| async { download_object(storage, &remote_path, &temp_file_path, cancel).await },
|
||||
&format!("download {remote_path:?}"),
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that the `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
// But for additional safety let's check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| format!("flush source file at {temp_file_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let expected = layer_metadata.file_size();
|
||||
if expected != bytes_amount {
|
||||
return Err(DownloadError::Other(anyhow!(
|
||||
@@ -129,14 +87,6 @@ pub async fn download_layer_file<'a>(
|
||||
)));
|
||||
}
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {temp_file_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
Err(DownloadError::Other(anyhow!(
|
||||
"remote-storage-download-pre-rename failpoint triggered"
|
||||
@@ -169,6 +119,128 @@ pub async fn download_layer_file<'a>(
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
|
||||
/// Download the object `src_path` in the remote `storage` to local path `dst_path`.
|
||||
///
|
||||
/// If Ok() is returned, the download succeeded and the inode & data have been made durable.
|
||||
/// (Note that the directory entry for the inode is not made durable.)
|
||||
/// The file size in bytes is returned.
|
||||
///
|
||||
/// If Err() is returned, there was some error. The file at `dst_path` has been unlinked.
|
||||
/// The unlinking has _not_ been made durable.
|
||||
async fn download_object<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
src_path: &RemotePath,
|
||||
dst_path: &Utf8PathBuf,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let res = match crate::virtual_file::io_engine::get() {
|
||||
crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
|
||||
crate::virtual_file::io_engine::IoEngine::StdFs => {
|
||||
async {
|
||||
let destination_file = tokio::fs::File::create(dst_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let download = storage.download(src_path, cancel).await?;
|
||||
|
||||
let mut buf_writer =
|
||||
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
|
||||
|
||||
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
|
||||
let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
|
||||
buf_writer.flush().await?;
|
||||
|
||||
let mut destination_file = buf_writer.into_inner();
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
// we assume that the `destination_file` is fully written, i.e. there are no pending .write(...).await operations.
// But for additional safety let's check/wait for any pending operations.
|
||||
destination_file
|
||||
.flush()
|
||||
.await
|
||||
.with_context(|| format!("flush source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
.await
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
|
||||
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
|
||||
async {
|
||||
let destination_file = VirtualFile::create(dst_path)
|
||||
.await
|
||||
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut download = storage.download(src_path, cancel).await?;
|
||||
|
||||
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
|
||||
// There's chunks_vectored() on the stream.
|
||||
let (bytes_amount, destination_file) = async {
|
||||
let size_tracking = size_tracking_writer::Writer::new(destination_file);
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<
|
||||
{ super::BUFFER_SIZE },
|
||||
_,
|
||||
>::new(size_tracking);
|
||||
while let Some(res) =
|
||||
futures::StreamExt::next(&mut download.download_stream).await
|
||||
{
|
||||
let chunk = match res {
|
||||
Ok(chunk) => chunk,
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
buffered
|
||||
.write_buffered(tokio_epoll_uring::BoundedBuf::slice_full(chunk))
|
||||
.await?;
|
||||
}
|
||||
let size_tracking = buffered.flush_and_into_inner().await?;
|
||||
Ok(size_tracking.into_inner())
|
||||
}
|
||||
.await?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to fsync source file at {dst_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(bytes_amount)
|
||||
}
|
||||
.await
|
||||
}
|
||||
};
|
||||
|
||||
// in case the download failed, clean up
|
||||
match res {
|
||||
Ok(bytes_amount) => Ok(bytes_amount),
|
||||
Err(e) => {
|
||||
if let Err(e) = tokio::fs::remove_file(dst_path).await {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
|
||||
}
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
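The doc comment notes that `download_object` makes the inode and its data durable but not the directory entry. A blocking-I/O sketch (Linux semantics, plain `std::fs` rather than the pageserver's crashsafe helpers) of how a caller could finish the job after the rename that follows the download:

```rust
use std::path::Path;

// Sketch only: on Linux a directory can be opened read-only and fsynced.
fn persist_rename(temp: &Path, final_path: &Path) -> std::io::Result<()> {
    std::fs::rename(temp, final_path)?;
    // fsync the parent directory so the new directory entry survives a crash.
    let parent = final_path
        .parent()
        .expect("final path has a parent directory");
    std::fs::File::open(parent)?.sync_all()?;
    Ok(())
}
```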
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
|
||||
@@ -95,7 +95,11 @@ pub(crate) struct SecondaryTenant {
|
||||
shard_identity: ShardIdentity,
|
||||
tenant_conf: std::sync::Mutex<TenantConfOpt>,
|
||||
|
||||
// Internal state used by the Downloader.
|
||||
detail: std::sync::Mutex<SecondaryDetail>,
|
||||
|
||||
// Public state indicating overall progress of downloads relative to the last heatmap seen
|
||||
pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
|
||||
}
|
||||
|
||||
impl SecondaryTenant {
|
||||
@@ -118,6 +122,8 @@ impl SecondaryTenant {
|
||||
tenant_conf: std::sync::Mutex::new(tenant_conf),
|
||||
|
||||
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
|
||||
|
||||
progress: std::sync::Mutex::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -247,9 +253,12 @@ impl SecondaryTenant {
|
||||
}
|
||||
|
||||
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
|
||||
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
|
||||
/// where we want to immediately upload/download for a particular tenant. In normal operation
|
||||
/// uploads & downloads are autonomous and not driven by this interface.
|
||||
/// and heatmap uploads. This is not a hot data path: it's used for:
|
||||
/// - Live migrations, where we want to ensure a migration destination has the freshest possible
|
||||
/// content before trying to cut over.
|
||||
/// - Tests, where we want to immediately upload/download for a particular tenant.
|
||||
///
|
||||
/// In normal operations, outside of migrations, uploads & downloads are autonomous and not driven by this interface.
|
||||
pub struct SecondaryController {
|
||||
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
|
||||
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
|
||||
|
||||
@@ -41,14 +41,16 @@ use crate::tenant::{
|
||||
use camino::Utf8PathBuf;
|
||||
use chrono::format::{DelayedFormat, StrftimeItems};
|
||||
use futures::Future;
|
||||
use pageserver_api::models::SecondaryProgress;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::Rng;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, Etag, GenericRemoteStorage};
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{info_span, instrument, warn, Instrument};
|
||||
use utils::{
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
|
||||
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, failpoint_support, fs_ext,
|
||||
id::TimelineId,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -128,6 +130,7 @@ pub(super) struct SecondaryDetail {
|
||||
pub(super) config: SecondaryLocationConfig,
|
||||
|
||||
last_download: Option<Instant>,
|
||||
last_etag: Option<Etag>,
|
||||
next_download: Option<Instant>,
|
||||
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
|
||||
}
|
||||
@@ -138,11 +141,26 @@ fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
|
||||
datetime.format("%d/%m/%Y %T")
|
||||
}
|
||||
|
||||
/// Information returned from download function when it detects the heatmap has changed
|
||||
struct HeatMapModified {
|
||||
etag: Etag,
|
||||
last_modified: SystemTime,
|
||||
bytes: Vec<u8>,
|
||||
}
|
||||
|
||||
enum HeatMapDownload {
|
||||
// The heatmap's etag has changed: return the new etag, mtime and the body bytes
|
||||
Modified(HeatMapModified),
|
||||
// The heatmap's etag is unchanged
|
||||
Unmodified,
|
||||
}
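A condensed sketch of how this enum is consumed, with toy types standing in for the real remote-storage plumbing and `backoff::retry`:

```rust
// Illustrative-only types; the real code threads these through remote storage.
struct Heatmap {
    etag: String,
    bytes: Vec<u8>,
}

enum Download {
    Modified(Heatmap),
    Unmodified,
}

/// Compare the freshly downloaded etag against the one remembered from the
/// last successful run; skip body handling entirely when nothing changed.
fn classify(prev_etag: Option<&str>, downloaded: Heatmap) -> Download {
    if prev_etag == Some(downloaded.etag.as_str()) {
        Download::Unmodified
    } else {
        Download::Modified(downloaded)
    }
}

fn main() {
    let hm = Heatmap { etag: "abc".into(), bytes: vec![1, 2, 3] };
    match classify(Some("abc"), hm) {
        Download::Unmodified => println!("heatmap unchanged, nothing to do"),
        Download::Modified(m) => println!("new heatmap, {} bytes", m.bytes.len()),
    }
}
```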
|
||||
|
||||
impl SecondaryDetail {
|
||||
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
|
||||
Self {
|
||||
config,
|
||||
last_download: None,
|
||||
last_etag: None,
|
||||
next_download: None,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
@@ -477,11 +495,31 @@ impl<'a> TenantDownloader<'a> {
|
||||
};
|
||||
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
|
||||
// We will use the etag from last successful download to make the download conditional on changes
|
||||
let last_etag = self
|
||||
.secondary_state
|
||||
.detail
|
||||
.lock()
|
||||
.unwrap()
|
||||
.last_etag
|
||||
.clone();
|
||||
|
||||
// Download the tenant's heatmap
|
||||
let heatmap_bytes = tokio::select!(
|
||||
bytes = self.download_heatmap() => {bytes?},
|
||||
let HeatMapModified {
|
||||
last_modified: heatmap_mtime,
|
||||
etag: heatmap_etag,
|
||||
bytes: heatmap_bytes,
|
||||
} = match tokio::select!(
|
||||
bytes = self.download_heatmap(last_etag.as_ref()) => {bytes?},
|
||||
_ = self.secondary_state.cancel.cancelled() => return Ok(())
|
||||
);
|
||||
) {
|
||||
HeatMapDownload::Unmodified => {
|
||||
tracing::info!("Heatmap unchanged since last successful download");
|
||||
return Ok(());
|
||||
}
|
||||
HeatMapDownload::Modified(m) => m,
|
||||
};
|
||||
|
||||
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
|
||||
|
||||
@@ -498,6 +536,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
|
||||
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
|
||||
|
||||
// Clean up any local layers that aren't in the heatmap. We do this first for all timelines, on the general
|
||||
// principle that deletions should be done before writes wherever possible, and so that we can use this
|
||||
// phase to initialize our SecondaryProgress.
|
||||
{
|
||||
*self.secondary_state.progress.lock().unwrap() =
|
||||
self.prepare_timelines(&heatmap, heatmap_mtime).await?;
|
||||
}
|
||||
|
||||
// Download the layers in the heatmap
|
||||
for timeline in heatmap.timelines {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
@@ -515,30 +561,159 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Only update last_etag after a full, successful download: this way we will not skip
// the next download, even if the heatmap's actual etag is unchanged.
|
||||
self.secondary_state.detail.lock().unwrap().last_etag = Some(heatmap_etag);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
|
||||
/// Do any fast local cleanup that comes before the much slower process of downloading
|
||||
/// layers from remote storage. In the process, initialize the SecondaryProgress object
|
||||
/// that will later be updated incrementally as we download layers.
|
||||
async fn prepare_timelines(
|
||||
&self,
|
||||
heatmap: &HeatMapTenant,
|
||||
heatmap_mtime: SystemTime,
|
||||
) -> Result<SecondaryProgress, UpdateError> {
|
||||
let heatmap_stats = heatmap.get_stats();
|
||||
// We will construct a progress object, and then populate its initial "downloaded" numbers
|
||||
// while iterating through local layer state in [`Self::prepare_timelines`]
|
||||
let mut progress = SecondaryProgress {
|
||||
layers_total: heatmap_stats.layers,
|
||||
bytes_total: heatmap_stats.bytes,
|
||||
heatmap_mtime: Some(heatmap_mtime),
|
||||
layers_downloaded: 0,
|
||||
bytes_downloaded: 0,
|
||||
};
|
||||
// Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
|
||||
let mut delete_layers = Vec::new();
|
||||
let mut delete_timelines = Vec::new();
|
||||
{
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
for (timeline_id, timeline_state) in &mut detail.timelines {
|
||||
let Some(heatmap_timeline_index) = heatmap
|
||||
.timelines
|
||||
.iter()
|
||||
.position(|t| t.timeline_id == *timeline_id)
|
||||
else {
|
||||
// This timeline is no longer referenced in the heatmap: delete it locally
|
||||
delete_timelines.push(*timeline_id);
|
||||
continue;
|
||||
};
|
||||
|
||||
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let mut layer_count = layers_on_disk.len();
|
||||
let mut layer_byte_count: u64 = timeline_state
|
||||
.on_disk_layers
|
||||
.values()
|
||||
.map(|l| l.metadata.file_size())
|
||||
.sum();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
layer_count -= 1;
|
||||
layer_byte_count -= timeline_state
|
||||
.on_disk_layers
|
||||
.get(layer)
|
||||
.unwrap()
|
||||
.metadata
|
||||
.file_size();
|
||||
|
||||
delete_layers.push((*timeline_id, (*layer).clone()));
|
||||
}
|
||||
|
||||
progress.bytes_downloaded += layer_byte_count;
|
||||
progress.layers_downloaded += layer_count;
|
||||
}
|
||||
}
|
||||
|
||||
// Execute accumulated deletions
|
||||
for (timeline_id, layer_name) in delete_layers {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
let local_path = timeline_path.join(layer_name.to_string());
|
||||
tracing::info!(timeline_id=%timeline_id, "Removing secondary local layer {layer_name} because it's absent in heatmap",);
|
||||
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
|
||||
// Update in-memory housekeeping to reflect the absence of the deleted layer
|
||||
let mut detail = self.secondary_state.detail.lock().unwrap();
|
||||
let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
|
||||
continue;
|
||||
};
|
||||
timeline_state.on_disk_layers.remove(&layer_name);
|
||||
}
|
||||
|
||||
for timeline_id in delete_timelines {
|
||||
let timeline_path = self
|
||||
.conf
|
||||
.timeline_path(self.secondary_state.get_tenant_shard_id(), &timeline_id);
|
||||
tracing::info!(timeline_id=%timeline_id,
|
||||
"Timeline no longer in heatmap, removing from secondary location"
|
||||
);
|
||||
tokio::fs::remove_dir_all(&timeline_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary timeline")?;
|
||||
}
|
||||
|
||||
Ok(progress)
|
||||
}
|
||||
|
||||
/// Returns downloaded bytes if the etag differs from `prev_etag`, or None if the object
|
||||
/// still matches `prev_etag`.
|
||||
async fn download_heatmap(
|
||||
&self,
|
||||
prev_etag: Option<&Etag>,
|
||||
) -> Result<HeatMapDownload, UpdateError> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
// TODO: make download conditional on ETag having changed since last download
|
||||
// TODO: pull up etag check into the request, to do a conditional GET rather than
|
||||
// issuing a GET and then maybe ignoring the response body
|
||||
// (https://github.com/neondatabase/neon/issues/6199)
|
||||
tracing::debug!("Downloading heatmap for secondary tenant",);
|
||||
|
||||
let heatmap_path = remote_heatmap_path(tenant_shard_id);
|
||||
let cancel = &self.secondary_state.cancel;
|
||||
|
||||
let heatmap_bytes = backoff::retry(
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let download = self
|
||||
.remote_storage
|
||||
.download(&heatmap_path, cancel)
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
Ok(heatmap_bytes)
|
||||
|
||||
if Some(&download.etag) == prev_etag {
|
||||
Ok(HeatMapDownload::Unmodified)
|
||||
} else {
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
Ok(HeatMapDownload::Modified(HeatMapModified {
|
||||
etag: download.etag,
|
||||
last_modified: download.last_modified,
|
||||
bytes: heatmap_bytes,
|
||||
}))
|
||||
}
|
||||
},
|
||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
@@ -548,11 +723,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UpdateError::Cancelled)
|
||||
.and_then(|x| x)?;
|
||||
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
|
||||
Ok(heatmap_bytes)
|
||||
.and_then(|x| x)
|
||||
}
|
||||
|
||||
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
|
||||
@@ -593,27 +764,6 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
};
|
||||
|
||||
let layers_in_heatmap = timeline
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| &l.name)
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
.on_disk_layers
|
||||
.iter()
|
||||
.map(|l| l.0)
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
// Remove on-disk layers that are no longer present in heatmap
|
||||
for layer in layers_on_disk.difference(&layers_in_heatmap) {
|
||||
let local_path = timeline_path.join(layer.to_string());
|
||||
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.maybe_fatal_err("Removing secondary layer")?;
|
||||
}
|
||||
|
||||
// Download heatmap layers that are not present on local disk, or update their
|
||||
// access time if they are already present.
|
||||
for layer in timeline.layers {
|
||||
@@ -662,6 +812,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// Failpoint for simulating slow remote storage
|
||||
failpoint_support::sleep_millis_async!(
|
||||
"secondary-layer-download-sleep",
|
||||
&self.secondary_state.cancel
|
||||
);
|
||||
|
||||
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
|
||||
let downloaded_bytes = match download_layer_file(
|
||||
self.conf,
|
||||
@@ -701,6 +857,11 @@ impl<'a> TenantDownloader<'a> {
|
||||
tokio::fs::remove_file(&local_path)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)?;
|
||||
} else {
|
||||
tracing::info!("Downloaded layer {}, size {}", layer.name, downloaded_bytes);
|
||||
let mut progress = self.secondary_state.progress.lock().unwrap();
|
||||
progress.bytes_downloaded += downloaded_bytes;
|
||||
progress.layers_downloaded += 1;
|
||||
}
|
||||
|
||||
SECONDARY_MODE.download_layer.inc();
|
||||
|
||||
@@ -62,3 +62,25 @@ impl HeatMapTimeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct HeatMapStats {
|
||||
pub(crate) bytes: u64,
|
||||
pub(crate) layers: usize,
|
||||
}
|
||||
|
||||
impl HeatMapTenant {
|
||||
pub(crate) fn get_stats(&self) -> HeatMapStats {
|
||||
let mut stats = HeatMapStats {
|
||||
bytes: 0,
|
||||
layers: 0,
|
||||
};
|
||||
for timeline in &self.timelines {
|
||||
for layer in &timeline.layers {
|
||||
stats.layers += 1;
|
||||
stats.bytes += layer.metadata.file_size;
|
||||
}
|
||||
}
|
||||
|
||||
stats
|
||||
}
|
||||
}
|
||||
|
||||
@@ -183,7 +183,13 @@ pub(super) async fn gather_inputs(
|
||||
// new gc run, which we have no control over. however differently from `Timeline::gc`
|
||||
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
|
||||
// actually removing files.
|
||||
let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
|
||||
//
|
||||
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
|
||||
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
|
||||
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// horizon_cutoff.
|
||||
let mut next_gc_cutoff = gc_info.pitr_cutoff;
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
|
||||
@@ -20,6 +20,7 @@ use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::models::{
|
||||
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
@@ -427,7 +428,7 @@ impl LayerAccessStatFullDetails {
|
||||
} = self;
|
||||
pageserver_api::models::LayerAccessStatFullDetails {
|
||||
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
|
||||
task_kind: task_kind.into(), // into static str, powered by strum_macros
|
||||
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
|
||||
access_kind: *access_kind,
|
||||
}
|
||||
}
|
||||
@@ -525,7 +526,7 @@ impl LayerAccessStats {
|
||||
.collect(),
|
||||
task_kind_access_flag: task_kind_flag
|
||||
.iter()
|
||||
.map(|task_kind| task_kind.into()) // into static str, powered by strum_macros
|
||||
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
|
||||
.collect(),
|
||||
first: first_access.as_ref().map(|a| a.as_api_model()),
|
||||
accesses_history: last_accesses.map(|m| m.as_api_model()),
|
||||
|
||||
@@ -710,10 +710,6 @@ impl LayerInner {
|
||||
// disable any scheduled but not yet running eviction deletions for this
|
||||
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// count cancellations, which currently remain largely unexpected
|
||||
let init_cancelled =
|
||||
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||
|
||||
// no need to make the evict_and_wait wait for the actual download to complete
|
||||
drop(self.status.send(Status::Downloaded));
|
||||
|
||||
@@ -722,7 +718,9 @@ impl LayerInner {
|
||||
.upgrade()
|
||||
.ok_or_else(|| DownloadError::TimelineShutdown)?;
|
||||
|
||||
// FIXME: grab a gate
|
||||
// count cancellations, which currently remain largely unexpected
|
||||
let init_cancelled =
|
||||
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||
|
||||
let can_ever_evict = timeline.remote_client.as_ref().is_some();
|
||||
|
||||
@@ -731,9 +729,17 @@ impl LayerInner {
|
||||
let needs_download = self
|
||||
.needs_download()
|
||||
.await
|
||||
.map_err(DownloadError::PreStatFailed)?;
|
||||
.map_err(DownloadError::PreStatFailed);
|
||||
|
||||
let permit = if let Some(reason) = needs_download {
|
||||
let needs_download = match needs_download {
|
||||
Ok(reason) => reason,
|
||||
Err(e) => {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
let (permit, downloaded) = if let Some(reason) = needs_download {
|
||||
if let NeedsDownload::NotFile(ft) = reason {
|
||||
return Err(DownloadError::NotFile(ft));
|
||||
}
|
||||
@@ -744,36 +750,59 @@ impl LayerInner {
|
||||
self.wanted_evicted.store(false, Ordering::Release);
|
||||
|
||||
if !can_ever_evict {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(DownloadError::NoRemoteStorage);
|
||||
}
|
||||
|
||||
if let Some(ctx) = ctx {
|
||||
self.check_expected_download(ctx)?;
|
||||
let res = self.check_expected_download(ctx);
|
||||
if let Err(e) = res {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
if !allow_download {
|
||||
// this does look weird, but for LayerInner the "downloading" means also changing
|
||||
// internal once related state ...
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(DownloadError::DownloadRequired);
|
||||
}
|
||||
|
||||
tracing::info!(%reason, "downloading on-demand");
|
||||
|
||||
self.spawn_download_and_wait(timeline, permit).await?
|
||||
let permit = self.spawn_download_and_wait(timeline, permit).await;
|
||||
|
||||
let permit = match permit {
|
||||
Ok(permit) => permit,
|
||||
Err(e) => {
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
|
||||
(permit, true)
|
||||
} else {
|
||||
// the file is present locally, probably by a previous but cancelled call to
|
||||
// get_or_maybe_download. alternatively we might be running without remote storage.
|
||||
LAYER_IMPL_METRICS.inc_init_needed_no_download();
|
||||
|
||||
permit
|
||||
(permit, false)
|
||||
};
|
||||
|
||||
let since_last_eviction =
|
||||
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
|
||||
if let Some(since_last_eviction) = since_last_eviction {
|
||||
// FIXME: this will not always be recorded correctly until #6028 (the no
|
||||
// download needed branch above)
|
||||
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
|
||||
if downloaded {
|
||||
let since_last_eviction = self
|
||||
.last_evicted_at
|
||||
.lock()
|
||||
.unwrap()
|
||||
.take()
|
||||
.map(|ts| ts.elapsed());
|
||||
|
||||
if let Some(since_last_eviction) = since_last_eviction {
|
||||
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
|
||||
}
|
||||
}
|
||||
|
||||
let res = Arc::new(DownloadedLayer {
|
||||
@@ -795,8 +824,6 @@ impl LayerInner {
|
||||
);
|
||||
}
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
|
||||
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
||||
}
|
||||
.instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
|
||||
|
||||
@@ -130,10 +130,10 @@ where
|
||||
self.inner.load().config.steady_rps()
|
||||
}
|
||||
|
||||
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
|
||||
pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option<Duration> {
|
||||
let inner = self.inner.load_full(); // clones the `Inner` Arc
|
||||
if !inner.task_kinds.contains(ctx.task_kind()) {
|
||||
return;
|
||||
return None;
|
||||
};
|
||||
let start = std::time::Instant::now();
|
||||
let mut did_throttle = false;
|
||||
@@ -170,6 +170,9 @@ where
|
||||
});
|
||||
}
|
||||
}
|
||||
Some(wait_time)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -634,6 +634,8 @@ impl Timeline {
|
||||
/// If a remote layer file is needed, it is downloaded as part of this
|
||||
/// call.
|
||||
///
|
||||
/// This method enforces [`Self::timeline_get_throttle`] internally.
|
||||
///
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The
|
||||
/// abstraction above this needs to store suitable metadata to track what
|
||||
/// data exists with what keys, in separate metadata entries. If a
|
||||
@@ -644,18 +646,27 @@ impl Timeline {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
#[inline(always)]
|
||||
pub(crate) async fn get(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
self.get_impl(key, lsn, ctx).await
|
||||
}
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
async fn get_impl(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if !lsn.is_valid() {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
|
||||
}
|
||||
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
|
||||
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
|
||||
// already checked the key against the shard_identity when looking up the Timeline from
|
||||
// page_service.
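A stripped-down sketch of the get / get_impl split introduced here: the public accessor pays the per-key throttle, while batch paths that have already throttled once for the whole key count call the internal, unthrottled variant. All types below are stand-ins, not the pageserver's.

```rust
use std::time::Duration;

struct Timeline;

impl Timeline {
    // Stand-in for timeline_get_throttle: report how long the caller waited.
    async fn throttle(&self, key_count: usize) -> Option<Duration> {
        let wait = Duration::from_micros(10 * key_count as u64);
        tokio::time::sleep(wait).await;
        Some(wait)
    }

    // Public entry point: throttled.
    async fn get(&self, key: u64) -> Vec<u8> {
        let _ = self.throttle(1).await;
        self.get_impl(key).await
    }

    // Not throttled: batch paths (e.g. a sequential vectored get) throttle once
    // for the whole key count and then call this directly per key.
    async fn get_impl(&self, key: u64) -> Vec<u8> {
        key.to_le_bytes().to_vec()
    }
}

#[tokio::main]
async fn main() {
    let tl = Timeline;
    assert_eq!(tl.get(7).await, 7u64.to_le_bytes().to_vec());
}
```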
|
||||
@@ -752,10 +763,6 @@ impl Timeline {
|
||||
return Err(GetVectoredError::Oversized(key_count));
|
||||
}
|
||||
|
||||
self.timeline_get_throttle
|
||||
.throttle(ctx, key_count as usize)
|
||||
.await;
|
||||
|
||||
for range in &keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
@@ -772,11 +779,18 @@ impl Timeline {
|
||||
self.conf.get_vectored_impl
|
||||
);
|
||||
|
||||
let _timer = crate::metrics::GET_VECTORED_LATENCY
|
||||
let start = crate::metrics::GET_VECTORED_LATENCY
|
||||
.for_task_kind(ctx.task_kind())
|
||||
.map(|t| t.start_timer());
|
||||
.map(|metric| (metric, Instant::now()));
|
||||
|
||||
match self.conf.get_vectored_impl {
|
||||
// start counting after throttle so that throttle time
|
||||
// is always less than observation time
|
||||
let throttled = self
|
||||
.timeline_get_throttle
|
||||
.throttle(ctx, key_count as usize)
|
||||
.await;
|
||||
|
||||
let res = match self.conf.get_vectored_impl {
|
||||
GetVectoredImpl::Sequential => {
|
||||
self.get_vectored_sequential_impl(keyspace, lsn, ctx).await
|
||||
}
|
||||
@@ -790,9 +804,33 @@ impl Timeline {
|
||||
|
||||
vectored_res
|
||||
}
|
||||
};
|
||||
|
||||
if let Some((metric, start)) = start {
|
||||
let elapsed = start.elapsed();
|
||||
let ex_throttled = if let Some(throttled) = throttled {
|
||||
elapsed.checked_sub(throttled)
|
||||
} else {
|
||||
Some(elapsed)
|
||||
};
|
||||
|
||||
if let Some(ex_throttled) = ex_throttled {
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
} else {
|
||||
use utils::rate_limit::RateLimit;
|
||||
static LOGGED: Lazy<Mutex<RateLimit>> =
|
||||
Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10))));
|
||||
let mut rate_limit = LOGGED.lock().unwrap();
|
||||
rate_limit.call(|| {
|
||||
warn!("error deducting time spent throttled; this message is logged at a global rate limit");
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn get_vectored_sequential_impl(
|
||||
&self,
|
||||
keyspace: KeySpace,
|
||||
@@ -803,7 +841,7 @@ impl Timeline {
|
||||
for range in keyspace.ranges {
|
||||
let mut key = range.start;
|
||||
while key != range.end {
|
||||
let block = self.get(key, lsn, ctx).await;
|
||||
let block = self.get_impl(key, lsn, ctx).await;
|
||||
|
||||
use PageReconstructError::*;
|
||||
match block {
|
||||
@@ -853,6 +891,7 @@ impl Timeline {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Not subject to [`Self::timeline_get_throttle`].
|
||||
pub(super) async fn validate_get_vectored_impl(
|
||||
&self,
|
||||
vectored_res: &Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError>,
|
||||
@@ -2967,7 +3006,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
trace!("waking up");
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
let flush_counter = *layer_flush_start_rx.borrow();
|
||||
let result = loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -2978,6 +3016,8 @@ impl Timeline {
|
||||
return;
|
||||
}
|
||||
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
|
||||
let layer_to_flush = {
|
||||
let guard = self.layers.read().await;
|
||||
guard.layer_map().frozen_layers.front().cloned()
|
||||
@@ -2999,13 +3039,12 @@ impl Timeline {
|
||||
break err;
|
||||
}
|
||||
}
|
||||
timer.stop_and_record();
|
||||
};
|
||||
// Notify any listeners that we're done
|
||||
let _ = self
|
||||
.layer_flush_done_tx
|
||||
.send_replace((flush_counter, result));
|
||||
|
||||
timer.stop_and_record();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3073,6 +3112,7 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), FlushLayerError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
@@ -3744,8 +3784,11 @@ impl Timeline {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
// but what it really means is that there hasn't been
|
||||
// any commits since the cutoff timestamp.
|
||||
//
|
||||
// In this case we should use the LSN of the most recent commit,
|
||||
// which is implicitly the last LSN in the log.
|
||||
debug!("future({})", lsn);
|
||||
cutoff_horizon
|
||||
self.get_last_record_lsn()
|
||||
}
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
|
||||
@@ -2,8 +2,8 @@ use std::{collections::hash_map::Entry, fs, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use tracing::{error, info, info_span, warn};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
|
||||
use tracing::{error, info, info_span};
|
||||
use utils::{fs_ext, id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
|
||||
|
||||
@@ -11,22 +11,22 @@ use super::Timeline;
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
|
||||
/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
|
||||
/// to be removed on next restart.
|
||||
/// its local files are removed. If we crash while this struct exists, then the timeline's local
|
||||
/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
|
||||
///
|
||||
/// The caller is responsible for proper timeline data filling before the final init.
|
||||
#[must_use]
|
||||
pub struct UninitializedTimeline<'t> {
|
||||
pub(crate) owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
|
||||
}
|
||||
|
||||
impl<'t> UninitializedTimeline<'t> {
|
||||
pub(crate) fn new(
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
|
||||
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard<'t>)>,
|
||||
) -> Self {
|
||||
Self {
|
||||
owning_tenant,
|
||||
@@ -35,8 +35,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Finish timeline creation: insert it into the Tenant's timelines map and remove the
|
||||
/// uninit mark file.
|
||||
/// Finish timeline creation: insert it into the Tenant's timelines map
|
||||
///
|
||||
/// This function launches the flush loop if not already done.
|
||||
///
|
||||
@@ -72,16 +71,9 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
Entry::Vacant(v) => {
|
||||
// after taking here should be no fallible operations, because the drop guard will not
|
||||
// cleanup after and would block for example the tenant deletion
|
||||
let (new_timeline, uninit_mark) =
|
||||
let (new_timeline, _create_guard) =
|
||||
self.raw_timeline.take().expect("already checked");
|
||||
|
||||
// this is the mutual exclusion between different retries to create the timeline;
|
||||
// this should be an assertion.
|
||||
uninit_mark.remove_uninit_mark().with_context(|| {
|
||||
format!(
|
||||
"Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
|
||||
)
|
||||
})?;
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
@@ -120,8 +112,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
.await
|
||||
.context("Failed to flush after basebackup import")?;
|
||||
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines
|
||||
// map and remove the uninit mark file.
|
||||
// All the data has been imported. Insert the Timeline into the tenant's timelines map
|
||||
let tl = self.finish_creation()?;
|
||||
tl.activate(broker_client, None, ctx);
|
||||
Ok(tl)
|
||||
@@ -143,37 +134,35 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
|
||||
impl Drop for UninitializedTimeline<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
|
||||
if let Some((_, create_guard)) = self.raw_timeline.take() {
|
||||
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
|
||||
error!("Timeline got dropped without initializing, cleaning its files");
|
||||
cleanup_timeline_directory(uninit_mark);
|
||||
cleanup_timeline_directory(create_guard);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
|
||||
let timeline_path = &uninit_mark.timeline_path;
|
||||
pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
|
||||
let timeline_path = &create_guard.timeline_path;
|
||||
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
|
||||
Ok(()) => {
|
||||
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
|
||||
info!("Timeline dir {timeline_path:?} removed successfully")
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
|
||||
}
|
||||
}
|
||||
drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
|
||||
// Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other
|
||||
// timeline creation attempts under this TimelineId to proceed
|
||||
drop(create_guard);
|
||||
}
|
||||
|
||||
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
|
||||
/// or gets removed eventually.
|
||||
///
|
||||
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
|
||||
/// A guard for timeline creations in process: as long as this object exists, the timeline ID
|
||||
/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
|
||||
#[must_use]
|
||||
pub(crate) struct TimelineUninitMark<'t> {
|
||||
pub(crate) struct TimelineCreateGuard<'t> {
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
uninit_mark_deleted: bool,
|
||||
uninit_mark_path: Utf8PathBuf,
|
||||
pub(crate) timeline_path: Utf8PathBuf,
|
||||
}
|
||||
|
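The guard introduced above is a plain RAII mutual-exclusion pattern: hold the guard while the timeline ID sits in the tenant's "creating" set, release the slot on drop. A self-contained, hypothetical sketch of the same idea (not the pageserver types; `u64` stands in for `TimelineId`):

    use std::collections::HashSet;
    use std::sync::{Arc, Mutex};

    struct CreateGuard {
        creating: Arc<Mutex<HashSet<u64>>>,
        timeline_id: u64,
    }

    impl CreateGuard {
        fn new(creating: Arc<Mutex<HashSet<u64>>>, timeline_id: u64) -> Result<Self, &'static str> {
            let mut in_progress = creating.lock().unwrap();
            if !in_progress.insert(timeline_id) {
                return Err("creation already in progress for this timeline");
            }
            drop(in_progress);
            Ok(Self { creating, timeline_id })
        }
    }

    impl Drop for CreateGuard {
        fn drop(&mut self) {
            // releasing the id here is what unblocks later creation attempts
            self.creating.lock().unwrap().remove(&self.timeline_id);
        }
    }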
||||
@@ -190,11 +179,10 @@ pub(crate) enum TimelineExclusionError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl<'t> TimelineUninitMark<'t> {
|
||||
impl<'t> TimelineCreateGuard<'t> {
|
||||
pub(crate) fn new(
|
||||
owning_tenant: &'t Tenant,
|
||||
timeline_id: TimelineId,
|
||||
uninit_mark_path: Utf8PathBuf,
|
||||
timeline_path: Utf8PathBuf,
|
||||
) -> Result<Self, TimelineExclusionError> {
|
||||
// Lock order: this is the only place we take both locks. During drop() we only
|
||||
@@ -214,56 +202,14 @@ impl<'t> TimelineUninitMark<'t> {
|
||||
Ok(Self {
|
||||
owning_tenant,
|
||||
timeline_id,
|
||||
uninit_mark_deleted: false,
|
||||
uninit_mark_path,
|
||||
timeline_path,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
|
||||
if !self.uninit_mark_deleted {
|
||||
self.delete_mark_file_if_present()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
|
||||
let uninit_mark_file = &self.uninit_mark_path;
|
||||
let uninit_mark_parent = uninit_mark_file
|
||||
.parent()
|
||||
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
|
||||
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
|
||||
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
|
||||
})?;
|
||||
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
|
||||
self.uninit_mark_deleted = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TimelineUninitMark<'_> {
|
||||
impl Drop for TimelineCreateGuard<'_> {
|
||||
fn drop(&mut self) {
|
||||
if !self.uninit_mark_deleted {
|
||||
if self.timeline_path.exists() {
|
||||
error!(
|
||||
"Uninit mark {} is not removed, timeline {} stays uninitialized",
|
||||
self.uninit_mark_path, self.timeline_path
|
||||
)
|
||||
} else {
|
||||
// unblock later timeline creation attempts
|
||||
warn!(
|
||||
"Removing intermediate uninit mark file {}",
|
||||
self.uninit_mark_path
|
||||
);
|
||||
if let Err(e) = self.delete_mark_file_if_present() {
|
||||
error!("Failed to remove the uninit mark file: {e}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.owning_tenant
|
||||
.timelines_creating
|
||||
.lock()
|
||||
|
||||
@@ -28,12 +28,31 @@ use tokio::time::Instant;
|
||||
|
||||
pub use pageserver_api::models::virtual_file as api;
|
||||
pub(crate) mod io_engine;
|
||||
pub use io_engine::feature_test as io_engine_feature_test;
|
||||
pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult;
|
||||
mod metadata;
|
||||
mod open_options;
|
||||
pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use metadata::Metadata;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
|
||||
pub(crate) mod owned_buffers_io {
|
||||
//! Abstractions for IO with owned buffers.
|
||||
//!
|
||||
//! Not actually tied to [`crate::virtual_file`] specifically, but, it's the primary
|
||||
//! reason we need this abstraction.
|
||||
//!
|
||||
//! Over time, this could move into the `tokio-epoll-uring` crate, maybe `uring-common`,
|
||||
//! but for the time being we're proving out the primitives in the neon.git repo
|
||||
//! for faster iteration.
|
||||
|
||||
pub(crate) mod write;
|
||||
pub(crate) mod util {
|
||||
pub(crate) mod size_tracking_writer;
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
|
||||
@@ -6,6 +6,11 @@
|
||||
//! Initialize using [`init`].
|
||||
//!
|
||||
//! Then use [`get`] and [`super::OpenOptions`].
|
||||
//!
|
||||
//!
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub(super) mod tokio_epoll_uring_ext;
|
||||
|
||||
use tokio_epoll_uring::{IoBuf, Slice};
|
||||
use tracing::Instrument;
|
||||
@@ -145,7 +150,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.read(file_guard, offset, buf).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -160,7 +165,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.fsync(file_guard).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -178,7 +183,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.fdatasync(file_guard).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -197,7 +202,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.statx(file_guard).await;
|
||||
(
|
||||
resources,
|
||||
@@ -220,7 +225,7 @@ impl IoEngine {
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
IoEngine::TokioEpollUring => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = tokio_epoll_uring_ext::thread_local_system().await;
|
||||
let (resources, res) = system.write(file_guard, offset, buf).await;
|
||||
(resources, res.map_err(epoll_uring_error_to_std))
|
||||
}
|
||||
@@ -253,3 +258,82 @@ impl IoEngine {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum FeatureTestResult {
|
||||
PlatformPreferred(IoEngineKind),
|
||||
Worse {
|
||||
engine: IoEngineKind,
|
||||
remark: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl FeatureTestResult {
|
||||
#[cfg(target_os = "linux")]
|
||||
const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::TokioEpollUring;
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
const PLATFORM_PREFERRED: IoEngineKind = IoEngineKind::StdFs;
|
||||
}
|
||||
|
||||
impl From<FeatureTestResult> for IoEngineKind {
|
||||
fn from(val: FeatureTestResult) -> Self {
|
||||
match val {
|
||||
FeatureTestResult::PlatformPreferred(e) => e,
|
||||
FeatureTestResult::Worse { engine, .. } => engine,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Somewhat costly under the hood, do only once.
|
||||
/// Panics if we can't set up the feature test.
|
||||
pub fn feature_test() -> anyhow::Result<FeatureTestResult> {
|
||||
std::thread::spawn(|| {
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
{
|
||||
Ok(FeatureTestResult::PlatformPreferred(
|
||||
FeatureTestResult::PLATFORM_PREFERRED,
|
||||
))
|
||||
}
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
Ok(match rt.block_on(tokio_epoll_uring::System::launch()) {
|
||||
Ok(_) => FeatureTestResult::PlatformPreferred({
|
||||
assert!(matches!(
|
||||
IoEngineKind::TokioEpollUring,
|
||||
FeatureTestResult::PLATFORM_PREFERRED
|
||||
));
|
||||
FeatureTestResult::PLATFORM_PREFERRED
|
||||
}),
|
||||
Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) => {
|
||||
let remark = match e.raw_os_error() {
|
||||
Some(nix::libc::EPERM) => {
|
||||
// fall back
|
||||
"creating tokio-epoll-uring fails with EPERM, assuming it's admin-disabled "
|
||||
.to_string()
|
||||
}
|
||||
Some(nix::libc::EFAULT) => {
|
||||
// fail feature test
|
||||
anyhow::bail!(
|
||||
"creating tokio-epoll-uring fails with EFAULT, might have corrupted memory"
|
||||
);
|
||||
}
|
||||
Some(_) | None => {
|
||||
// fall back
|
||||
format!("creating tokio-epoll-uring fails with error: {e:#}")
|
||||
}
|
||||
};
|
||||
FeatureTestResult::Worse {
|
||||
engine: IoEngineKind::StdFs,
|
||||
remark,
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
})
|
||||
.join()
|
||||
.unwrap()
|
||||
}
|
||||
|
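For orientation, a hypothetical caller of the feature test above; `feature_test`, `FeatureTestResult`, and `IoEngineKind` are the names defined in this module, but the selection and logging policy shown here is illustrative only:

    // Assumes the io_engine module items above are in scope.
    fn choose_io_engine() -> anyhow::Result<IoEngineKind> {
        Ok(match feature_test()? {
            FeatureTestResult::PlatformPreferred(engine) => engine,
            FeatureTestResult::Worse { engine, remark } => {
                tracing::warn!(%remark, "io engine feature test fell back to a non-preferred engine");
                engine
            }
        })
    }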
||||
pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs (new file, 194 lines)
@@ -0,0 +1,194 @@
|
||||
//! Like [`::tokio_epoll_uring::thread_local_system()`], but with pageserver-specific
|
||||
//! handling in case the instance can't be launched.
|
||||
//!
|
||||
//! This is primarily necessary due to ENOMEM aka OutOfMemory errors during io_uring creation
|
||||
//! on older kernels, such as some (but not all) older kernels in the Linux 5.10 series.
|
||||
//! See <https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391> for more details.
|
||||
|
||||
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::backoff::{DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
|
||||
use tokio_epoll_uring::{System, SystemHandle};
|
||||
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
|
||||
use crate::metrics::tokio_epoll_uring as metrics;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ThreadLocalState(Arc<ThreadLocalStateInner>);
|
||||
|
||||
struct ThreadLocalStateInner {
|
||||
cell: tokio::sync::OnceCell<SystemHandle>,
|
||||
launch_attempts: AtomicU32,
|
||||
/// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`]
|
||||
thread_local_state_id: u64,
|
||||
}
|
||||
|
||||
impl ThreadLocalState {
|
||||
pub fn new() -> Self {
|
||||
Self(Arc::new(ThreadLocalStateInner {
|
||||
cell: tokio::sync::OnceCell::default(),
|
||||
launch_attempts: AtomicU32::new(0),
|
||||
thread_local_state_id: THREAD_LOCAL_STATE_ID.fetch_add(1, Ordering::Relaxed),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn make_id_string(&self) -> String {
|
||||
format!("{}", self.0.thread_local_state_id)
|
||||
}
|
||||
}
|
||||
|
||||
static THREAD_LOCAL_STATE_ID: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
thread_local! {
|
||||
static THREAD_LOCAL: ThreadLocalState = ThreadLocalState::new();
|
||||
}
|
||||
|
||||
/// Panics if we cannot [`System::launch`].
|
||||
pub async fn thread_local_system() -> Handle {
|
||||
let fake_cancel = CancellationToken::new();
|
||||
loop {
|
||||
let thread_local_state = THREAD_LOCAL.with(|arc| arc.clone());
|
||||
let inner = &thread_local_state.0;
|
||||
let get_or_init_res = inner
|
||||
.cell
|
||||
.get_or_try_init(|| async {
|
||||
let attempt_no = inner
|
||||
.launch_attempts
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
let span = info_span!("tokio_epoll_uring_ext::thread_local_system", thread_local=%thread_local_state.make_id_string(), %attempt_no);
|
||||
async {
|
||||
// Rate-limit retries per thread-local.
|
||||
// NB: doesn't yield to executor at attempt_no=0.
|
||||
utils::backoff::exponential_backoff(
|
||||
attempt_no,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
&fake_cancel,
|
||||
)
|
||||
.await;
|
||||
let res = System::launch()
|
||||
// this might move us to another executor thread => loop outside the get_or_try_init, not inside it
|
||||
.await;
|
||||
match res {
|
||||
Ok(system) => {
|
||||
info!("successfully launched system");
|
||||
metrics::THREAD_LOCAL_LAUNCH_SUCCESSES.inc();
|
||||
Ok(system)
|
||||
}
|
||||
Err(tokio_epoll_uring::LaunchResult::IoUringBuild(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
|
||||
warn!("not enough locked memory to tokio-epoll-uring, will retry");
|
||||
info_span!("stats").in_scope(|| {
|
||||
emit_launch_failure_process_stats();
|
||||
});
|
||||
metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc();
|
||||
Err(())
|
||||
}
|
||||
// abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere.
|
||||
// This is equivalent to a fatal IO error.
|
||||
Err(ref e @ tokio_epoll_uring::LaunchResult::IoUringBuild(ref inner)) => {
|
||||
error!(error=%e, "failed to launch thread-local tokio-epoll-uring, this should not happen, aborting process");
|
||||
info_span!("stats").in_scope(|| {
|
||||
emit_launch_failure_process_stats();
|
||||
});
|
||||
on_fatal_io_error(inner, "launch thread-local tokio-epoll-uring");
|
||||
},
|
||||
}
|
||||
}
|
||||
.instrument(span)
|
||||
.await
|
||||
})
|
||||
.await;
|
||||
if get_or_init_res.is_ok() {
|
||||
return Handle(thread_local_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn emit_launch_failure_process_stats() {
|
||||
// tokio-epoll-uring stats
|
||||
// vmlck + rlimit
|
||||
// number of threads
|
||||
// rss / system memory usage generally
|
||||
|
||||
let tokio_epoll_uring::metrics::Metrics {
|
||||
systems_created,
|
||||
systems_destroyed,
|
||||
} = tokio_epoll_uring::metrics::global();
|
||||
info!(systems_created, systems_destroyed, "tokio-epoll-uring");
|
||||
|
||||
match procfs::process::Process::myself() {
|
||||
Ok(myself) => {
|
||||
match myself.limits() {
|
||||
Ok(limits) => {
|
||||
info!(?limits.max_locked_memory, "/proc/self/limits");
|
||||
}
|
||||
Err(error) => {
|
||||
info!(%error, "no limit stats due to error");
|
||||
}
|
||||
}
|
||||
|
||||
match myself.status() {
|
||||
Ok(status) => {
|
||||
let procfs::process::Status {
|
||||
vmsize,
|
||||
vmlck,
|
||||
vmpin,
|
||||
vmrss,
|
||||
rssanon,
|
||||
rssfile,
|
||||
rssshmem,
|
||||
vmdata,
|
||||
vmstk,
|
||||
vmexe,
|
||||
vmlib,
|
||||
vmpte,
|
||||
threads,
|
||||
..
|
||||
} = status;
|
||||
info!(
|
||||
vmsize,
|
||||
vmlck,
|
||||
vmpin,
|
||||
vmrss,
|
||||
rssanon,
|
||||
rssfile,
|
||||
rssshmem,
|
||||
vmdata,
|
||||
vmstk,
|
||||
vmexe,
|
||||
vmlib,
|
||||
vmpte,
|
||||
threads,
|
||||
"/proc/self/status"
|
||||
);
|
||||
}
|
||||
Err(error) => {
|
||||
info!(%error, "no status status due to error");
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
info!(%error, "no process stats due to error");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Handle(ThreadLocalState);
|
||||
|
||||
impl std::ops::Deref for Handle {
|
||||
type Target = SystemHandle;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0
|
||||
.0
|
||||
.cell
|
||||
.get()
|
||||
.expect("must be already initialized when using this")
|
||||
}
|
||||
}
|
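The retry loop above is easier to see in isolation. A simplified, hypothetical stand-in for the retry-with-backoff around `OnceCell::get_or_try_init` (the real code uses `utils::backoff::exponential_backoff` and launches a tokio-epoll-uring system rather than producing a number):

    use std::sync::atomic::{AtomicU32, Ordering};
    use std::time::Duration;

    // Each failed init attempt backs off and retries; once get_or_try_init
    // succeeds, every later call returns the cached value immediately.
    async fn init_with_retry(cell: &tokio::sync::OnceCell<u32>, attempts: &AtomicU32) -> u32 {
        loop {
            let res = cell
                .get_or_try_init(|| async {
                    let attempt_no = attempts.fetch_add(1, Ordering::Relaxed);
                    // pretend the first two attempts fail, e.g. due to transient ENOMEM
                    if attempt_no < 2 { Err(()) } else { Ok(42u32) }
                })
                .await;
            match res {
                Ok(v) => return *v,
                Err(()) => tokio::time::sleep(Duration::from_millis(10)).await,
            }
        }
    }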
||||
@@ -98,7 +98,7 @@ impl OpenOptions {
|
||||
OpenOptions::StdFs(x) => x.open(path).map(|file| file.into()),
|
||||
#[cfg(target_os = "linux")]
|
||||
OpenOptions::TokioEpollUring(x) => {
|
||||
let system = tokio_epoll_uring::thread_local_system().await;
|
||||
let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await;
|
||||
system.open(path, x).await.map_err(|e| match e {
|
||||
tokio_epoll_uring::Error::Op(e) => e,
|
||||
tokio_epoll_uring::Error::System(system) => {
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf};
|
||||
|
||||
pub struct Writer {
|
||||
dst: VirtualFile,
|
||||
bytes_amount: u64,
|
||||
}
|
||||
|
||||
impl Writer {
|
||||
pub fn new(dst: VirtualFile) -> Self {
|
||||
Self {
|
||||
dst,
|
||||
bytes_amount: 0,
|
||||
}
|
||||
}
|
||||
/// Returns the wrapped `VirtualFile` object as well as the number
|
||||
/// of bytes that were written to it through this object.
|
||||
pub fn into_inner(self) -> (u64, VirtualFile) {
|
||||
(self.bytes_amount, self.dst)
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Writer {
|
||||
#[inline(always)]
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let (buf, res) = self.dst.write_all(buf).await;
|
||||
let nwritten = res?;
|
||||
self.bytes_amount += u64::try_from(nwritten).unwrap();
|
||||
Ok((nwritten, buf))
|
||||
}
|
||||
}
|
||||
pageserver/src/virtual_file/owned_buffers_io/write.rs (new file, 206 lines)
@@ -0,0 +1,206 @@
|
||||
use bytes::BytesMut;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
|
||||
|
||||
/// A trait for doing owned-buffer write IO.
|
||||
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
|
||||
pub trait OwnedAsyncWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)>;
|
||||
}
|
||||
|
||||
/// A wrapper around an [`OwnedAsyncWriter`] that batches smaller writes
|
||||
/// into `BUFFER_SIZE`-sized writes.
|
||||
///
|
||||
/// # Passthrough Of Large Writers
|
||||
///
|
||||
/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
|
||||
/// buffer to be flushed, even if it is not full yet. Then, the large
|
||||
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
|
||||
///
|
||||
/// This pass-through is generally beneficial for throughput, but if
|
||||
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
|
||||
/// unlimited large writes may cause latency or fairness issues.
|
||||
///
|
||||
/// In such cases, a different implementation that always buffers in memory
|
||||
/// may be preferable.
|
||||
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
|
||||
writer: W,
|
||||
// invariant: always remains Some(buf)
|
||||
// with buf.capacity() == BUFFER_SIZE except
|
||||
// - while IO is ongoing => goes back to Some() once the IO completed successfully
|
||||
// - after an IO error => stays `None` forever
|
||||
// In these exceptional cases, it's `None`.
|
||||
buf: Option<BytesMut>,
|
||||
}
|
||||
|
||||
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
|
||||
where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
pub fn new(writer: W) -> Self {
|
||||
Self {
|
||||
writer,
|
||||
buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
|
||||
self.flush().await?;
|
||||
let Self { buf, writer } = self;
|
||||
assert!(buf.is_some());
|
||||
Ok(writer)
|
||||
}
|
||||
|
||||
pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
|
||||
where
|
||||
B: IoBuf + Send,
|
||||
{
|
||||
// avoid memcpy for the middle of the chunk
|
||||
if chunk.len() >= BUFFER_SIZE {
|
||||
self.flush().await?;
|
||||
// do a big write, bypassing `buf`
|
||||
assert_eq!(
|
||||
self.buf
|
||||
.as_ref()
|
||||
.expect("must not use after an error")
|
||||
.len(),
|
||||
0
|
||||
);
|
||||
let chunk_len = chunk.len();
|
||||
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
|
||||
assert_eq!(nwritten, chunk_len);
|
||||
drop(chunk);
|
||||
return Ok(());
|
||||
}
|
||||
// in-memory copy the < BUFFER_SIZED tail of the chunk
|
||||
assert!(chunk.len() < BUFFER_SIZE);
|
||||
let mut chunk = &chunk[..];
|
||||
while !chunk.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
let need = BUFFER_SIZE - buf.len();
|
||||
let have = chunk.len();
|
||||
let n = std::cmp::min(need, have);
|
||||
buf.extend_from_slice(&chunk[..n]);
|
||||
chunk = &chunk[n..];
|
||||
if buf.len() >= BUFFER_SIZE {
|
||||
assert_eq!(buf.len(), BUFFER_SIZE);
|
||||
self.flush().await?;
|
||||
}
|
||||
}
|
||||
assert!(chunk.is_empty(), "by now we should have drained the chunk");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> std::io::Result<()> {
|
||||
let buf = self.buf.take().expect("must not use after an error");
|
||||
if buf.is_empty() {
|
||||
self.buf = Some(buf);
|
||||
return std::io::Result::Ok(());
|
||||
}
|
||||
let buf_len = buf.len();
|
||||
let (nwritten, mut buf) = self.writer.write_all(buf).await?;
|
||||
assert_eq!(nwritten, buf_len);
|
||||
buf.clear();
|
||||
self.buf = Some(buf);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Vec<u8> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
return Ok((0, Slice::into_inner(buf.slice_full())));
|
||||
}
|
||||
let buf = buf.slice(0..nbytes);
|
||||
self.extend_from_slice(&buf[..]);
|
||||
Ok((buf.len(), Slice::into_inner(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RecorderWriter {
|
||||
writes: Vec<Vec<u8>>,
|
||||
}
|
||||
impl OwnedAsyncWriter for RecorderWriter {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let nbytes = buf.bytes_init();
|
||||
if nbytes == 0 {
|
||||
self.writes.push(vec![]);
|
||||
return Ok((0, Slice::into_inner(buf.slice_full())));
|
||||
}
|
||||
let buf = buf.slice(0..nbytes);
|
||||
self.writes.push(Vec::from(&buf[..]));
|
||||
Ok((buf.len(), Slice::into_inner(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! write {
|
||||
($writer:ident, $data:literal) => {{
|
||||
$writer
|
||||
.write_buffered(::bytes::Bytes::from_static($data).slice_full())
|
||||
.await?;
|
||||
}};
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_buffered_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"b");
|
||||
write!(writer, b"c");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
write!(writer, b"abc");
|
||||
write!(writer, b"de");
|
||||
write!(writer, b"");
|
||||
write!(writer, b"fghijk");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"bc");
|
||||
write!(writer, b"d");
|
||||
write!(writer, b"e");
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
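A usage sketch (not part of the diff) that could sit alongside the tests above; it relies on the `OwnedAsyncWriter for Vec<u8>` impl and the `BoundedBuf::slice_full` import already used in this file:

    use tokio_epoll_uring::BoundedBuf; // for .slice_full()

    #[tokio::test]
    async fn buffered_writer_into_vec() -> std::io::Result<()> {
        // Vec<u8> works as the sink via the `OwnedAsyncWriter for Vec<u8>` impl above.
        // With an 8-byte buffer, "hello " and "world" are re-chunked into two flushes,
        // but the sink still ends up with the concatenated bytes.
        let mut writer = BufferedWriter::<8, Vec<u8>>::new(Vec::new());
        writer
            .write_buffered(::bytes::Bytes::from_static(b"hello ").slice_full())
            .await?;
        writer
            .write_buffered(::bytes::Bytes::from_static(b"world").slice_full())
            .await?;
        let sink = writer.flush_and_into_inner().await?;
        assert_eq!(sink, b"hello world".to_vec());
        Ok(())
    }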
poetry.lock (generated; 94 lines changed)
@@ -2182,7 +2182,6 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2529,6 +2528,87 @@ docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"]
|
||||
optional = ["python-socks", "wsaccel"]
|
||||
test = ["websockets"]
|
||||
|
||||
[[package]]
|
||||
name = "websockets"
|
||||
version = "12.0"
|
||||
description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "websockets-12.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d554236b2a2006e0ce16315c16eaa0d628dab009c33b63ea03f41c6107958374"},
|
||||
{file = "websockets-12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2d225bb6886591b1746b17c0573e29804619c8f755b5598d875bb4235ea639be"},
|
||||
{file = "websockets-12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:eb809e816916a3b210bed3c82fb88eaf16e8afcf9c115ebb2bacede1797d2547"},
|
||||
{file = "websockets-12.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c588f6abc13f78a67044c6b1273a99e1cf31038ad51815b3b016ce699f0d75c2"},
|
||||
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5aa9348186d79a5f232115ed3fa9020eab66d6c3437d72f9d2c8ac0c6858c558"},
|
||||
{file = "websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6350b14a40c95ddd53e775dbdbbbc59b124a5c8ecd6fbb09c2e52029f7a9f480"},
|
||||
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:70ec754cc2a769bcd218ed8d7209055667b30860ffecb8633a834dde27d6307c"},
|
||||
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6e96f5ed1b83a8ddb07909b45bd94833b0710f738115751cdaa9da1fb0cb66e8"},
|
||||
{file = "websockets-12.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4d87be612cbef86f994178d5186add3d94e9f31cc3cb499a0482b866ec477603"},
|
||||
{file = "websockets-12.0-cp310-cp310-win32.whl", hash = "sha256:befe90632d66caaf72e8b2ed4d7f02b348913813c8b0a32fae1cc5fe3730902f"},
|
||||
{file = "websockets-12.0-cp310-cp310-win_amd64.whl", hash = "sha256:363f57ca8bc8576195d0540c648aa58ac18cf85b76ad5202b9f976918f4219cf"},
|
||||
{file = "websockets-12.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5d873c7de42dea355d73f170be0f23788cf3fa9f7bed718fd2830eefedce01b4"},
|
||||
{file = "websockets-12.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3f61726cae9f65b872502ff3c1496abc93ffbe31b278455c418492016e2afc8f"},
|
||||
{file = "websockets-12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed2fcf7a07334c77fc8a230755c2209223a7cc44fc27597729b8ef5425aa61a3"},
|
||||
{file = "websockets-12.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e332c210b14b57904869ca9f9bf4ca32f5427a03eeb625da9b616c85a3a506c"},
|
||||
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5693ef74233122f8ebab026817b1b37fe25c411ecfca084b29bc7d6efc548f45"},
|
||||
{file = "websockets-12.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e9e7db18b4539a29cc5ad8c8b252738a30e2b13f033c2d6e9d0549b45841c04"},
|
||||
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6e2df67b8014767d0f785baa98393725739287684b9f8d8a1001eb2839031447"},
|
||||
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bea88d71630c5900690fcb03161ab18f8f244805c59e2e0dc4ffadae0a7ee0ca"},
|
||||
{file = "websockets-12.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dff6cdf35e31d1315790149fee351f9e52978130cef6c87c4b6c9b3baf78bc53"},
|
||||
{file = "websockets-12.0-cp311-cp311-win32.whl", hash = "sha256:3e3aa8c468af01d70332a382350ee95f6986db479ce7af14d5e81ec52aa2b402"},
|
||||
{file = "websockets-12.0-cp311-cp311-win_amd64.whl", hash = "sha256:25eb766c8ad27da0f79420b2af4b85d29914ba0edf69f547cc4f06ca6f1d403b"},
|
||||
{file = "websockets-12.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0e6e2711d5a8e6e482cacb927a49a3d432345dfe7dea8ace7b5790df5932e4df"},
|
||||
{file = "websockets-12.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:dbcf72a37f0b3316e993e13ecf32f10c0e1259c28ffd0a85cee26e8549595fbc"},
|
||||
{file = "websockets-12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:12743ab88ab2af1d17dd4acb4645677cb7063ef4db93abffbf164218a5d54c6b"},
|
||||
{file = "websockets-12.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b645f491f3c48d3f8a00d1fce07445fab7347fec54a3e65f0725d730d5b99cb"},
|
||||
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9893d1aa45a7f8b3bc4510f6ccf8db8c3b62120917af15e3de247f0780294b92"},
|
||||
{file = "websockets-12.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f38a7b376117ef7aff996e737583172bdf535932c9ca021746573bce40165ed"},
|
||||
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f764ba54e33daf20e167915edc443b6f88956f37fb606449b4a5b10ba42235a5"},
|
||||
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:1e4b3f8ea6a9cfa8be8484c9221ec0257508e3a1ec43c36acdefb2a9c3b00aa2"},
|
||||
{file = "websockets-12.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9fdf06fd06c32205a07e47328ab49c40fc1407cdec801d698a7c41167ea45113"},
|
||||
{file = "websockets-12.0-cp312-cp312-win32.whl", hash = "sha256:baa386875b70cbd81798fa9f71be689c1bf484f65fd6fb08d051a0ee4e79924d"},
|
||||
{file = "websockets-12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ae0a5da8f35a5be197f328d4727dbcfafa53d1824fac3d96cdd3a642fe09394f"},
|
||||
{file = "websockets-12.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5f6ffe2c6598f7f7207eef9a1228b6f5c818f9f4d53ee920aacd35cec8110438"},
|
||||
{file = "websockets-12.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9edf3fc590cc2ec20dc9d7a45108b5bbaf21c0d89f9fd3fd1685e223771dc0b2"},
|
||||
{file = "websockets-12.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8572132c7be52632201a35f5e08348137f658e5ffd21f51f94572ca6c05ea81d"},
|
||||
{file = "websockets-12.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:604428d1b87edbf02b233e2c207d7d528460fa978f9e391bd8aaf9c8311de137"},
|
||||
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a9d160fd080c6285e202327aba140fc9a0d910b09e423afff4ae5cbbf1c7205"},
|
||||
{file = "websockets-12.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87b4aafed34653e465eb77b7c93ef058516cb5acf3eb21e42f33928616172def"},
|
||||
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b2ee7288b85959797970114deae81ab41b731f19ebcd3bd499ae9ca0e3f1d2c8"},
|
||||
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:7fa3d25e81bfe6a89718e9791128398a50dec6d57faf23770787ff441d851967"},
|
||||
{file = "websockets-12.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a571f035a47212288e3b3519944f6bf4ac7bc7553243e41eac50dd48552b6df7"},
|
||||
{file = "websockets-12.0-cp38-cp38-win32.whl", hash = "sha256:3c6cc1360c10c17463aadd29dd3af332d4a1adaa8796f6b0e9f9df1fdb0bad62"},
|
||||
{file = "websockets-12.0-cp38-cp38-win_amd64.whl", hash = "sha256:1bf386089178ea69d720f8db6199a0504a406209a0fc23e603b27b300fdd6892"},
|
||||
{file = "websockets-12.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab3d732ad50a4fbd04a4490ef08acd0517b6ae6b77eb967251f4c263011a990d"},
|
||||
{file = "websockets-12.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1d9697f3337a89691e3bd8dc56dea45a6f6d975f92e7d5f773bc715c15dde28"},
|
||||
{file = "websockets-12.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1df2fbd2c8a98d38a66f5238484405b8d1d16f929bb7a33ed73e4801222a6f53"},
|
||||
{file = "websockets-12.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23509452b3bc38e3a057382c2e941d5ac2e01e251acce7adc74011d7d8de434c"},
|
||||
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2e5fc14ec6ea568200ea4ef46545073da81900a2b67b3e666f04adf53ad452ec"},
|
||||
{file = "websockets-12.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46e71dbbd12850224243f5d2aeec90f0aaa0f2dde5aeeb8fc8df21e04d99eff9"},
|
||||
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b81f90dcc6c85a9b7f29873beb56c94c85d6f0dac2ea8b60d995bd18bf3e2aae"},
|
||||
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a02413bc474feda2849c59ed2dfb2cddb4cd3d2f03a2fedec51d6e959d9b608b"},
|
||||
{file = "websockets-12.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bbe6013f9f791944ed31ca08b077e26249309639313fff132bfbf3ba105673b9"},
|
||||
{file = "websockets-12.0-cp39-cp39-win32.whl", hash = "sha256:cbe83a6bbdf207ff0541de01e11904827540aa069293696dd528a6640bd6a5f6"},
|
||||
{file = "websockets-12.0-cp39-cp39-win_amd64.whl", hash = "sha256:fc4e7fa5414512b481a2483775a8e8be7803a35b30ca805afa4998a84f9fd9e8"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:248d8e2446e13c1d4326e0a6a4e9629cb13a11195051a73acf414812700badbd"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f44069528d45a933997a6fef143030d8ca8042f0dfaad753e2906398290e2870"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c4e37d36f0d19f0a4413d3e18c0d03d0c268ada2061868c1e6f5ab1a6d575077"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d829f975fc2e527a3ef2f9c8f25e553eb7bc779c6665e8e1d52aa22800bb38b"},
|
||||
{file = "websockets-12.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2c71bd45a777433dd9113847af751aae36e448bc6b8c361a566cb043eda6ec30"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:0bee75f400895aef54157b36ed6d3b308fcab62e5260703add87f44cee9c82a6"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:423fc1ed29f7512fceb727e2d2aecb952c46aa34895e9ed96071821309951123"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27a5e9964ef509016759f2ef3f2c1e13f403725a5e6a1775555994966a66e931"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3181df4583c4d3994d31fb235dc681d2aaad744fbdbf94c4802485ececdecf2"},
|
||||
{file = "websockets-12.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b067cb952ce8bf40115f6c19f478dc71c5e719b7fbaa511359795dfd9d1a6468"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:00700340c6c7ab788f176d118775202aadea7602c5cc6be6ae127761c16d6b0b"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e469d01137942849cff40517c97a30a93ae79917752b34029f0ec72df6b46399"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffefa1374cd508d633646d51a8e9277763a9b78ae71324183693959cf94635a7"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba0cab91b3956dfa9f512147860783a1829a8d905ee218a9837c18f683239611"},
|
||||
{file = "websockets-12.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:2cb388a5bfb56df4d9a406783b7f9dbefb888c09b71629351cc6b036e9259370"},
|
||||
{file = "websockets-12.0-py3-none-any.whl", hash = "sha256:dc284bbc8d7c78a6c69e0c7325ab46ee5e40bb4d50e494d8131a07ef47500e9e"},
|
||||
{file = "websockets-12.0.tar.gz", hash = "sha256:81df9cbcbb6c260de1e007e58c011bfebe2dafc8435107b0537f393dd38c8b1b"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "werkzeug"
|
||||
version = "3.0.1"
|
||||
@@ -2572,16 +2652,6 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -2819,4 +2889,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "af9d5b45310c12411bfe67cb9677d2236808d0780ca1bd81525d2763a928f7f9"
|
||||
content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41"
|
||||
|
||||
@@ -25,13 +25,16 @@ pub async fn authenticate_cleartext(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause();
|
||||
let paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let auth_outcome = AuthFlow::new(client)
|
||||
let auth_flow = AuthFlow::new(client)
|
||||
.begin(auth::CleartextPassword(secret))
|
||||
.await?
|
||||
.authenticate()
|
||||
.await?;
|
||||
drop(paused);
|
||||
// cleartext auth is only allowed to the ws/http protocol.
|
||||
// If we're here, we already received the password in the first message.
|
||||
// Scram protocol will be executed on the proxy side.
|
||||
let auth_outcome = auth_flow.authenticate().await?;
|
||||
|
||||
let keys = match auth_outcome {
|
||||
sasl::Outcome::Success(key) => key,
|
||||
@@ -56,7 +59,7 @@ pub async fn password_hack_no_authentication(
|
||||
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause();
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
let payload = AuthFlow::new(client)
|
||||
.begin(auth::PasswordHack)
|
||||
|
||||
@@ -143,7 +143,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
let Scram(secret, ctx) = self.state;
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
let _paused = ctx.latency_timer.pause();
|
||||
let _paused = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
|
||||
// Initial client message contains the chosen auth method's name.
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
|
||||
@@ -6,7 +6,9 @@ use super::{
|
||||
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret,
|
||||
NodeInfo,
|
||||
};
|
||||
use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
|
||||
use crate::{
|
||||
auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
|
||||
};
|
||||
use crate::{
|
||||
cache::Cached,
|
||||
context::RequestMonitoring,
|
||||
@@ -72,7 +74,9 @@ impl Api {
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = match parse_body::<GetRoleSecret>(response).await {
|
||||
Ok(body) => body,
|
||||
@@ -132,7 +136,9 @@ impl Api {
|
||||
|
||||
info!(url = request.url().as_str(), "sending http request");
|
||||
let start = Instant::now();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Cplane);
|
||||
let response = self.endpoint.execute(request).await?;
|
||||
drop(pause);
|
||||
info!(duration = ?start.elapsed(), "received http response");
|
||||
let body = parse_body::<WakeCompute>(response).await?;
|
||||
|
||||
@@ -244,6 +250,8 @@ impl super::Api for Api {
|
||||
// which means that we might cache it to reduce the load and latency.
|
||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||
info!(key = &*key, "found cached compute node info");
|
||||
info!("cold_start_info=warm");
|
||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
||||
return Ok(cached);
|
||||
}
|
||||
|
||||
@@ -254,6 +262,8 @@ impl super::Api for Api {
|
||||
if permit.should_check_cache() {
|
||||
if let Some(cached) = self.caches.node_info.get(&key) {
|
||||
info!(key = &*key, "found cached compute node info");
|
||||
info!("cold_start_info=warm");
|
||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
||||
return Ok(cached);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,11 +15,12 @@ use crate::{
|
||||
BranchId, DbName, EndpointId, ProjectId, RoleName,
|
||||
};
|
||||
|
||||
use self::parquet::RequestData;
|
||||
|
||||
pub mod parquet;
|
||||
|
||||
static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
|
||||
static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Context data for a single request to connect to a database.
|
||||
///
|
||||
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
|
||||
@@ -46,7 +47,7 @@ pub struct RequestMonitoring {
|
||||
|
||||
// extra
|
||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||
sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
|
||||
sender: Option<mpsc::UnboundedSender<RequestData>>,
|
||||
pub latency_timer: LatencyTimer,
|
||||
}
|
||||
|
||||
@@ -111,6 +112,10 @@ impl RequestMonitoring {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
||||
self.cold_start_info = Some(info);
|
||||
}
|
||||
|
||||
pub fn set_project(&mut self, x: MetricsAuxInfo) {
|
||||
self.set_endpoint_id(x.endpoint_id);
|
||||
self.branch = Some(x.branch_id);
|
||||
@@ -168,7 +173,7 @@ impl RequestMonitoring {
|
||||
impl Drop for RequestMonitoring {
|
||||
fn drop(&mut self) {
|
||||
if let Some(tx) = self.sender.take() {
|
||||
let _: Result<(), _> = tx.send(self.clone());
|
||||
let _: Result<(), _> = tx.send(RequestData::from(&*self));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
// * after each rowgroup write, we check the length of the file and upload to s3 if large enough
|
||||
|
||||
#[derive(parquet_derive::ParquetRecordWriter)]
|
||||
struct RequestData {
|
||||
pub struct RequestData {
|
||||
region: &'static str,
|
||||
protocol: &'static str,
|
||||
/// Must be UTC. The derive macro doesn't like the timezones
|
||||
@@ -93,14 +93,14 @@ struct RequestData {
|
||||
/// Or if we make it to proxy_pass
|
||||
success: bool,
|
||||
/// Indicates if the cplane started the new compute node for this request.
|
||||
cold_start_info: Option<String>,
|
||||
cold_start_info: Option<&'static str>,
|
||||
/// Tracks time from session start (HTTP request/libpq TCP handshake)
|
||||
/// Through to success/failure
|
||||
duration_us: u64,
|
||||
}
|
||||
|
||||
impl From<RequestMonitoring> for RequestData {
|
||||
fn from(value: RequestMonitoring) -> Self {
|
||||
impl From<&RequestMonitoring> for RequestData {
|
||||
fn from(value: &RequestMonitoring) -> Self {
|
||||
Self {
|
||||
session_id: value.session_id,
|
||||
peer_addr: value.peer_addr.to_string(),
|
||||
@@ -121,10 +121,12 @@ impl From<RequestMonitoring> for RequestData {
|
||||
region: value.region,
|
||||
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
|
||||
success: value.success,
|
||||
cold_start_info: value
|
||||
.cold_start_info
|
||||
.as_ref()
|
||||
.map(|x| serde_json::to_string(x).unwrap_or_default()),
|
||||
cold_start_info: value.cold_start_info.as_ref().map(|x| match x {
|
||||
crate::console::messages::ColdStartInfo::Unknown => "unknown",
|
||||
crate::console::messages::ColdStartInfo::Warm => "warm",
|
||||
crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
|
||||
crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
|
||||
}),
|
||||
duration_us: SystemTime::from(value.first_packet)
|
||||
.elapsed()
|
||||
.unwrap_or_default()
|
||||
@@ -458,7 +460,7 @@ mod tests {
|
||||
region: "us-east-1",
|
||||
error: None,
|
||||
success: rng.gen(),
|
||||
cold_start_info: Some("no".into()),
|
||||
cold_start_info: Some("no"),
|
||||
duration_us: rng.gen_range(0..30_000_000),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use ::metrics::{
|
||||
use metrics::{register_int_counter, register_int_counter_pair, IntCounter, IntCounterPair};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::time;
|
||||
use tokio::time::{self, Instant};
|
||||
|
||||
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
register_int_counter_pair_vec!(
|
||||
@@ -46,9 +46,9 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"proxy_compute_connection_latency_seconds",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// http/ws/tcp, true/false, true/false, success/failure
|
||||
// 3 * 2 * 2 * 2 = 24 counters
|
||||
&["protocol", "cache_miss", "pool_miss", "outcome"],
|
||||
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
|
||||
// 3 * 2 * 2 * 2 * 2 = 48 counters
|
||||
&["protocol", "cache_miss", "pool_miss", "outcome", "excluded"],
|
||||
// largest bucket = 2^16 * 0.5ms = 32s
|
||||
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
||||
)
|
||||
@@ -161,12 +161,26 @@ pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum Waiting {
|
||||
Cplane,
|
||||
Client,
|
||||
Compute,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct Accumulated {
|
||||
cplane: time::Duration,
|
||||
client: time::Duration,
|
||||
compute: time::Duration,
|
||||
}
|
||||
|
||||
pub struct LatencyTimer {
|
||||
// time since the stopwatch was started
|
||||
start: Option<time::Instant>,
|
||||
start: time::Instant,
|
||||
// time since the stopwatch was stopped
|
||||
stop: Option<time::Instant>,
|
||||
// accumulated time on the stopwatch
|
||||
pub accumulated: std::time::Duration,
|
||||
accumulated: Accumulated,
|
||||
// label data
|
||||
protocol: &'static str,
|
||||
cache_miss: bool,
|
||||
@@ -176,13 +190,16 @@ pub struct LatencyTimer {
|
||||
|
||||
pub struct LatencyTimerPause<'a> {
|
||||
timer: &'a mut LatencyTimer,
|
||||
start: time::Instant,
|
||||
waiting_for: Waiting,
|
||||
}
|
||||
|
||||
impl LatencyTimer {
|
||||
pub fn new(protocol: &'static str) -> Self {
|
||||
Self {
|
||||
start: Some(time::Instant::now()),
|
||||
accumulated: std::time::Duration::ZERO,
|
||||
start: time::Instant::now(),
|
||||
stop: None,
|
||||
accumulated: Accumulated::default(),
|
||||
protocol,
|
||||
cache_miss: false,
|
||||
// by default we don't do pooling
|
||||
@@ -192,11 +209,12 @@ impl LatencyTimer {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pause(&mut self) -> LatencyTimerPause<'_> {
|
||||
// stop the stopwatch and record the time that we have accumulated
|
||||
let start = self.start.take().expect("latency timer should be started");
|
||||
self.accumulated += start.elapsed();
|
||||
LatencyTimerPause { timer: self }
|
||||
pub fn pause(&mut self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
|
||||
LatencyTimerPause {
|
||||
timer: self,
|
||||
start: Instant::now(),
|
||||
waiting_for,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cache_miss(&mut self) {
|
||||
@@ -209,9 +227,7 @@ impl LatencyTimer {
|
||||
|
||||
pub fn success(&mut self) {
|
||||
// stop the stopwatch and record the time that we have accumulated
|
||||
if let Some(start) = self.start.take() {
|
||||
self.accumulated += start.elapsed();
|
||||
}
|
||||
self.stop = Some(time::Instant::now());
|
||||
|
||||
// success
|
||||
self.outcome = "success";
|
||||
@@ -220,23 +236,42 @@ impl LatencyTimer {
|
||||
|
||||
impl Drop for LatencyTimerPause<'_> {
|
||||
fn drop(&mut self) {
|
||||
// start the stopwatch again
|
||||
self.timer.start = Some(time::Instant::now());
|
||||
let dur = self.start.elapsed();
|
||||
match self.waiting_for {
|
||||
Waiting::Cplane => self.timer.accumulated.cplane += dur,
|
||||
Waiting::Client => self.timer.accumulated.client += dur,
|
||||
Waiting::Compute => self.timer.accumulated.compute += dur,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for LatencyTimer {
|
||||
fn drop(&mut self) {
|
||||
let duration =
|
||||
self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
|
||||
let duration = self
|
||||
.stop
|
||||
.unwrap_or_else(time::Instant::now)
|
||||
.duration_since(self.start);
|
||||
// Excluding cplane communication from the accumulated time.
|
||||
COMPUTE_CONNECTION_LATENCY
|
||||
.with_label_values(&[
|
||||
self.protocol,
|
||||
bool_to_str(self.cache_miss),
|
||||
bool_to_str(self.pool_miss),
|
||||
self.outcome,
|
||||
"client",
|
||||
])
|
||||
.observe(duration.as_secs_f64())
|
||||
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
|
||||
// Exclude client and cplane communication from the accumulated time.
|
||||
let accumulated_total = self.accumulated.client + self.accumulated.cplane;
|
||||
COMPUTE_CONNECTION_LATENCY
|
||||
.with_label_values(&[
|
||||
self.protocol,
|
||||
bool_to_str(self.cache_miss),
|
||||
bool_to_str(self.pool_miss),
|
||||
self.outcome,
|
||||
"client_and_cplane",
|
||||
])
|
||||
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -341,7 +341,14 @@ impl Accept for ProxyProtocolAccept {
|
||||
cx: &mut Context<'_>,
|
||||
) -> Poll<Option<Result<Self::Conn, Self::Error>>> {
|
||||
let conn = ready!(Pin::new(&mut self.incoming).poll_accept(cx)?);
|
||||
tracing::info!(protocol = self.protocol, "accepted new TCP connection");
|
||||
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let span = tracing::info_span!("http_conn", ?conn_id);
|
||||
{
|
||||
let _enter = span.enter();
|
||||
tracing::info!("accepted new TCP connection");
|
||||
}
|
||||
|
||||
let Some(conn) = conn else {
|
||||
return Poll::Ready(None);
|
||||
};
|
||||
@@ -354,6 +361,7 @@ impl Accept for ProxyProtocolAccept {
|
||||
.with_label_values(&[self.protocol])
|
||||
.guard(),
|
||||
)),
|
||||
span,
|
||||
})))
|
||||
}
|
||||
}
|
||||
@@ -364,6 +372,14 @@ pin_project! {
|
||||
pub inner: T,
|
||||
pub connection_id: Uuid,
|
||||
pub gauge: Mutex<Option<IntCounterPairGuard>>,
|
||||
pub span: tracing::Span,
|
||||
}
|
||||
|
||||
impl<S> PinnedDrop for WithConnectionGuard<S> {
|
||||
fn drop(this: Pin<&mut Self>) {
|
||||
let _enter = this.span.enter();
|
||||
tracing::info!("HTTP connection closed")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -248,7 +248,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
|
||||
let pause = ctx.latency_timer.pause();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls));
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
|
||||
@@ -19,6 +19,7 @@ use rand::SeedableRng;
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
use tokio_util::task::TaskTracker;
|
||||
use tracing::instrument::Instrumented;
|
||||
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::protocol2::{ProxyProtocolAccept, WithClientIp, WithConnectionGuard};
|
||||
@@ -30,13 +31,12 @@ use hyper::{
|
||||
Body, Method, Request, Response,
|
||||
};
|
||||
|
||||
use std::convert::Infallible;
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use tls_listener::TlsListener;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tokio_util::sync::{CancellationToken, DropGuard};
|
||||
use tracing::{error, info, warn, Instrument};
|
||||
use utils::http::{error::ApiError, json::json_response};
|
||||
|
||||
@@ -100,12 +100,7 @@ pub async fn task_main(
|
||||
let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
|
||||
ws_connections.close(); // allows `ws_connections.wait` to complete
|
||||
|
||||
let tls_listener = TlsListener::new(
|
||||
tls_acceptor,
|
||||
addr_incoming,
|
||||
"http",
|
||||
config.handshake_timeout,
|
||||
);
|
||||
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming, config.handshake_timeout);
|
||||
|
||||
let make_svc = hyper::service::make_service_fn(
|
||||
|stream: &tokio_rustls::server::TlsStream<
|
||||
@@ -121,6 +116,11 @@ pub async fn task_main(
|
||||
.take()
|
||||
.expect("gauge should be set on connection start");
|
||||
|
||||
// Cancel all current inflight HTTP requests if the HTTP connection is closed.
|
||||
let http_cancellation_token = CancellationToken::new();
|
||||
let cancel_connection = http_cancellation_token.clone().drop_guard();
|
||||
|
||||
let span = conn.span.clone();
|
||||
let client_addr = conn.inner.client_addr();
|
||||
let remote_addr = conn.inner.inner.remote_addr();
|
||||
let backend = backend.clone();
|
||||
@@ -136,27 +136,43 @@ pub async fn task_main(
|
||||
Ok(MetricService::new(
|
||||
hyper::service::service_fn(move |req: Request<Body>| {
|
||||
let backend = backend.clone();
|
||||
let ws_connections = ws_connections.clone();
|
||||
let ws_connections2 = ws_connections.clone();
|
||||
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
|
||||
let cancellation_handler = cancellation_handler.clone();
|
||||
let http_cancellation_token = http_cancellation_token.child_token();
|
||||
|
||||
async move {
|
||||
Ok::<_, Infallible>(
|
||||
request_handler(
|
||||
// `request_handler` is not cancel safe. It expects to be cancelled only at specific times.
|
||||
// By spawning the future, we ensure it never gets cancelled until it decides to.
|
||||
ws_connections.spawn(
|
||||
async move {
|
||||
// Cancel the current inflight HTTP request if the request stream is closed.
|
||||
// This is slightly different to `_cancel_connection` in that
|
||||
// h2 can cancel individual requests with a `RST_STREAM`.
|
||||
let _cancel_session = http_cancellation_token.clone().drop_guard();
|
||||
|
||||
let res = request_handler(
|
||||
req,
|
||||
config,
|
||||
backend,
|
||||
ws_connections,
|
||||
ws_connections2,
|
||||
cancellation_handler,
|
||||
peer_addr.ip(),
|
||||
endpoint_rate_limiter,
|
||||
http_cancellation_token,
|
||||
)
|
||||
.await
|
||||
.map_or_else(|e| e.into_response(), |r| r),
|
||||
)
|
||||
}
|
||||
.map_or_else(|e| e.into_response(), |r| r);
|
||||
|
||||
_cancel_session.disarm();
|
||||
|
||||
res
|
||||
}
|
||||
.in_current_span(),
|
||||
)
|
||||
}),
|
||||
gauge,
|
||||
cancel_connection,
|
||||
span,
|
||||
))
|
||||
}
|
||||
},
|
||||
@@ -176,11 +192,23 @@ pub async fn task_main(
|
||||
struct MetricService<S> {
|
||||
inner: S,
|
||||
_gauge: IntCounterPairGuard,
|
||||
_cancel: DropGuard,
|
||||
span: tracing::Span,
|
||||
}
|
||||
|
||||
impl<S> MetricService<S> {
|
||||
fn new(inner: S, _gauge: IntCounterPairGuard) -> MetricService<S> {
|
||||
MetricService { inner, _gauge }
|
||||
fn new(
|
||||
inner: S,
|
||||
_gauge: IntCounterPairGuard,
|
||||
_cancel: DropGuard,
|
||||
span: tracing::Span,
|
||||
) -> MetricService<S> {
|
||||
MetricService {
|
||||
inner,
|
||||
_gauge,
|
||||
_cancel,
|
||||
span,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,14 +218,16 @@ where
|
||||
{
|
||||
type Response = S::Response;
|
||||
type Error = S::Error;
|
||||
type Future = S::Future;
|
||||
type Future = Instrumented<S::Future>;
|
||||
|
||||
fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), Self::Error>> {
|
||||
self.inner.poll_ready(cx)
|
||||
}
|
||||
|
||||
fn call(&mut self, req: Request<ReqBody>) -> Self::Future {
|
||||
self.inner.call(req)
|
||||
self.span
|
||||
.in_scope(|| self.inner.call(req))
|
||||
.instrument(self.span.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,6 +240,8 @@ async fn request_handler(
|
||||
cancellation_handler: Arc<CancellationHandler>,
|
||||
peer_addr: IpAddr,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
// used to cancel in-flight HTTP requests. not used to cancel websockets
|
||||
http_cancellation_token: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
|
||||
@@ -253,7 +285,7 @@ async fn request_handler(
|
||||
let ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
|
||||
let span = ctx.span.clone();
|
||||
|
||||
sql_over_http::handle(config, ctx, request, backend)
|
||||
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)
|
||||
.instrument(span)
|
||||
.await
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
|
||||
|
||||
@@ -9,6 +9,7 @@ use crate::{
|
||||
config::ProxyConfig,
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
messages::ColdStartInfo,
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
@@ -83,6 +84,8 @@ impl PoolingBackend {
|
||||
};
|
||||
|
||||
if let Some(client) = maybe_client {
|
||||
info!("cold_start_info=warm");
|
||||
ctx.set_cold_start_info(ColdStartInfo::Warm);
|
||||
return Ok(client);
|
||||
}
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
|
||||
@@ -217,8 +217,8 @@ pub async fn handle(
|
||||
mut ctx: RequestMonitoring,
|
||||
request: Request<Body>,
|
||||
backend: Arc<PoolingBackend>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let cancel = CancellationToken::new();
|
||||
let cancel2 = cancel.clone();
|
||||
let handle = tokio::spawn(async move {
|
||||
time::sleep(config.http_config.request_timeout).await;
|
||||
|
||||
@@ -13,7 +13,7 @@ use tokio::{
|
||||
time::timeout,
|
||||
};
|
||||
use tokio_rustls::{server::TlsStream, TlsAcceptor};
|
||||
use tracing::{info, warn};
|
||||
use tracing::{info, warn, Instrument};
|
||||
|
||||
use crate::{
|
||||
metrics::TLS_HANDSHAKE_FAILURES,
|
||||
@@ -29,24 +29,17 @@ pin_project! {
|
||||
tls: TlsAcceptor,
|
||||
waiting: JoinSet<Option<TlsStream<A::Conn>>>,
|
||||
timeout: Duration,
|
||||
protocol: &'static str,
|
||||
}
|
||||
}
|
||||
|
||||
impl<A: Accept> TlsListener<A> {
|
||||
/// Create a `TlsListener` with default options.
|
||||
pub(crate) fn new(
|
||||
tls: TlsAcceptor,
|
||||
listener: A,
|
||||
protocol: &'static str,
|
||||
timeout: Duration,
|
||||
) -> Self {
|
||||
pub(crate) fn new(tls: TlsAcceptor, listener: A, timeout: Duration) -> Self {
|
||||
TlsListener {
|
||||
listener,
|
||||
tls,
|
||||
waiting: JoinSet::new(),
|
||||
timeout,
|
||||
protocol,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -73,7 +66,7 @@ where
|
||||
Poll::Ready(Some(Ok(mut conn))) => {
|
||||
let t = *this.timeout;
|
||||
let tls = this.tls.clone();
|
||||
let protocol = *this.protocol;
|
||||
let span = conn.span.clone();
|
||||
this.waiting.spawn(async move {
|
||||
let peer_addr = match conn.inner.wait_for_addr().await {
|
||||
Ok(Some(addr)) => addr,
|
||||
@@ -86,21 +79,24 @@ where
|
||||
|
||||
let accept = tls.accept(conn);
|
||||
match timeout(t, accept).await {
|
||||
Ok(Ok(conn)) => Some(conn),
|
||||
Ok(Ok(conn)) => {
|
||||
info!(%peer_addr, "accepted new TLS connection");
|
||||
Some(conn)
|
||||
},
|
||||
// The handshake failed, try getting another connection from the queue
|
||||
Ok(Err(e)) => {
|
||||
TLS_HANDSHAKE_FAILURES.inc();
|
||||
warn!(%peer_addr, protocol, "failed to accept TLS connection: {e:?}");
|
||||
warn!(%peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
None
|
||||
}
|
||||
// The handshake timed out, try getting another connection from the queue
|
||||
Err(_) => {
|
||||
TLS_HANDSHAKE_FAILURES.inc();
|
||||
warn!(%peer_addr, protocol, "failed to accept TLS connection: timeout");
|
||||
warn!(%peer_addr, "failed to accept TLS connection: timeout");
|
||||
None
|
||||
}
|
||||
}
|
||||
});
|
||||
}.instrument(span));
|
||||
}
|
||||
Poll::Ready(Some(Err(e))) => {
|
||||
tracing::error!("error accepting TCP connection: {e}");
|
||||
@@ -112,10 +108,7 @@ where
|
||||
|
||||
loop {
|
||||
return match this.waiting.poll_join_next(cx) {
|
||||
Poll::Ready(Some(Ok(Some(conn)))) => {
|
||||
info!(protocol = this.protocol, "accepted new TLS connection");
|
||||
Poll::Ready(Some(Ok(conn)))
|
||||
}
|
||||
Poll::Ready(Some(Ok(Some(conn)))) => Poll::Ready(Some(Ok(conn))),
|
||||
// The handshake failed to complete, try getting another connection from the queue
|
||||
Poll::Ready(Some(Ok(None))) => continue,
|
||||
// The handshake panicked or was cancelled. ignore and get another connection
|
||||
|
||||
@@ -40,6 +40,7 @@ pytest-split = "^0.8.1"
|
||||
zstandard = "^0.21.0"
|
||||
httpx = {extras = ["http2"], version = "^0.26.0"}
|
||||
pytest-repeat = "^0.9.3"
|
||||
websockets = "^12.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
mypy = "==1.3.0"
|
||||
|
||||
@@ -40,7 +40,7 @@ To run your local neon.git build on the instance store volume,
|
||||
run the following commands from the top of the neon.git checkout
|
||||
|
||||
# raise file descriptor limit of your shell and its child processes
|
||||
sudo prlimit -p $$ --nofile=800000:800000
|
||||
sudo prlimit -p \$\$ --nofile=800000:800000
|
||||
|
||||
# test suite run
|
||||
export TEST_OUTPUT="$TEST_OUTPUT"
|
||||
|
||||
@@ -2,6 +2,7 @@ pytest_plugins = (
|
||||
"fixtures.pg_version",
|
||||
"fixtures.parametrize",
|
||||
"fixtures.httpserver",
|
||||
"fixtures.compute_reconfigure",
|
||||
"fixtures.neon_fixtures",
|
||||
"fixtures.benchmark_fixture",
|
||||
"fixtures.pg_stats",
|
||||
|
||||
test_runner/fixtures/compute_reconfigure.py (new file, +62 lines)
@@ -0,0 +1,62 @@
|
||||
import concurrent.futures
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
from werkzeug.wrappers.request import Request
|
||||
from werkzeug.wrappers.response import Response
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.types import TenantId
|
||||
|
||||
|
||||
class ComputeReconfigure:
|
||||
def __init__(self, server):
|
||||
self.server = server
|
||||
self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach"
|
||||
self.workloads = {}
|
||||
|
||||
def register_workload(self, workload):
|
||||
self.workloads[workload.tenant_id] = workload
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def compute_reconfigure_listener(make_httpserver):
|
||||
"""
|
||||
This fixture exposes an HTTP listener for the storage controller to submit
|
||||
compute notifications to us, instead of updating neon_local endpoints itself.
|
||||
|
||||
Although the storage controller can use neon_local directly, this causes problems when
the test is also concurrently modifying endpoints. Instead, configure the storage controller
to send notifications to this test code, which routes all endpoint updates
through Workload; Workload holds a mutex to make concurrent updates safe.
|
||||
"""
|
||||
server = make_httpserver
|
||||
|
||||
self = ComputeReconfigure(server)
|
||||
|
||||
# Do neon_local endpoint reconfiguration in the background so that we can
|
||||
# accept a healthy rate of calls into notify-attach.
|
||||
reconfigure_threads = concurrent.futures.ThreadPoolExecutor(max_workers=1)
|
||||
|
||||
def handler(request: Request):
|
||||
assert request.json is not None
|
||||
body: dict[str, Any] = request.json
|
||||
log.info(f"notify-attach request: {body}")
|
||||
|
||||
try:
|
||||
workload = self.workloads[TenantId(body["tenant_id"])]
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
# This causes the endpoint to query storage controller for its location, which
|
||||
# is redundant since we already have it here, but this avoids extending the
|
||||
# neon_local CLI to take full lists of locations
|
||||
reconfigure_threads.submit(lambda workload=workload: workload.reconfigure()) # type: ignore[no-any-return]
|
||||
|
||||
return Response(status=200)
|
||||
|
||||
self.server.expect_request("/notify-attach", method="PUT").respond_with_handler(handler)
|
||||
|
||||
yield self
|
||||
reconfigure_threads.shutdown()
|
||||
server.clear()
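
For orientation, a minimal sketch (not part of this diff) of how a test might consume the fixture: point the storage controller's compute hook at the test-local listener and register its Workload. The builder attribute name and the `fixtures.workload` import path are assumptions, not taken from this changeset.

from fixtures.workload import Workload  # assumed import path


def test_compute_hook_example(neon_env_builder, compute_reconfigure_listener):
    # Assumed builder knob: route notify-attach calls to the test-local listener.
    neon_env_builder.control_plane_compute_hook_api = (
        compute_reconfigure_listener.control_plane_compute_hook_api
    )
    env = neon_env_builder.init_start()

    workload = Workload(env, env.initial_tenant, env.initial_timeline)
    # Notifications for this tenant are now funneled through Workload.reconfigure(),
    # which serializes endpoint updates behind the fixture's mutex.
    compute_reconfigure_listener.register_workload(workload)

    workload.init()
    workload.write_rows(64)
    workload.validate()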
|
||||
@@ -1525,6 +1525,7 @@ class NeonCli(AbstractNeonCli):
|
||||
conf: Optional[Dict[str, Any]] = None,
|
||||
shard_count: Optional[int] = None,
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
placement_policy: Optional[str] = None,
|
||||
set_default: bool = False,
|
||||
) -> Tuple[TenantId, TimelineId]:
|
||||
"""
|
||||
@@ -1558,6 +1559,9 @@ class NeonCli(AbstractNeonCli):
|
||||
if shard_stripe_size is not None:
|
||||
args.extend(["--shard-stripe-size", str(shard_stripe_size)])
|
||||
|
||||
if placement_policy is not None:
|
||||
args.extend(["--placement-policy", str(placement_policy)])
|
||||
|
||||
res = self.raw_cli(args)
|
||||
res.check_returncode()
|
||||
return tenant_id, timeline_id
|
||||
@@ -2088,6 +2092,14 @@ class NeonStorageController(MetricsGetter):
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def tenant_list(self):
|
||||
response = self.request(
|
||||
"GET",
|
||||
f"{self.env.storage_controller_api}/debug/v1/tenant",
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
return response.json()
|
||||
|
||||
def node_configure(self, node_id, body: dict[str, Any]):
|
||||
log.info(f"node_configure({node_id}, {body})")
|
||||
body["node_id"] = node_id
|
||||
@@ -2177,6 +2189,23 @@ class NeonStorageController(MetricsGetter):
|
||||
)
|
||||
log.info("storage controller passed consistency check")
|
||||
|
||||
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
|
||||
if isinstance(config_strings, tuple):
|
||||
pairs = [config_strings]
|
||||
else:
|
||||
pairs = config_strings
|
||||
|
||||
log.info(f"Requesting config failpoints: {repr(pairs)}")
|
||||
|
||||
res = self.request(
|
||||
"PUT",
|
||||
f"{self.env.storage_controller_api}/debug/v1/failpoints",
|
||||
json=[{"name": name, "actions": actions} for name, actions in pairs],
|
||||
headers=self.headers(TokenScope.ADMIN),
|
||||
)
|
||||
log.info(f"Got failpoints request response code {res.status_code}")
|
||||
res.raise_for_status()
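
Because the method accepts either a single (name, actions) pair or a list of pairs, callers can toggle one failpoint or several in a single request; a brief usage sketch (failpoint names taken from tests later in this changeset):

# Enable a single failpoint...
env.storage_controller.configure_failpoints(("shard-split-post-begin", "return(1)"))
# ...or configure several at once, e.g. to switch them all off again.
env.storage_controller.configure_failpoints(
    [("shard-split-post-begin", "off"), ("shard-split-validation", "off")]
)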
|
||||
|
||||
def __enter__(self) -> "NeonStorageController":
|
||||
return self
|
||||
|
||||
@@ -2944,6 +2973,7 @@ class NeonProxy(PgProtocol):
|
||||
user = quote(kwargs["user"])
|
||||
password = quote(kwargs["password"])
|
||||
expected_code = kwargs.get("expected_code")
|
||||
timeout = kwargs.get("timeout")
|
||||
|
||||
log.info(f"Executing http query: {query}")
|
||||
|
||||
@@ -2957,6 +2987,7 @@ class NeonProxy(PgProtocol):
|
||||
"Neon-Pool-Opt-In": "true",
|
||||
},
|
||||
verify=str(self.test_output_dir / "proxy.crt"),
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
if expected_code is not None:
|
||||
|
||||
@@ -55,7 +55,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
|
||||
# FIXME: These need investigation
|
||||
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
|
||||
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
# Tenant::delete_timeline() can cause any of the four following errors.
|
||||
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
|
||||
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
|
||||
|
||||
@@ -34,7 +34,7 @@ class TimelineCreate406(PageserverApiException):
|
||||
class TimelineCreate409(PageserverApiException):
|
||||
def __init__(self, res: requests.Response):
|
||||
assert res.status_code == 409
|
||||
super().__init__("", res.status_code)
|
||||
super().__init__(res.json()["msg"], res.status_code)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -357,9 +357,15 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_secondary_download(self, tenant_id: Union[TenantId, TenantShardId]):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
|
||||
def tenant_secondary_download(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], wait_ms: Optional[int] = None
|
||||
) -> tuple[int, dict[Any, Any]]:
|
||||
url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download"
|
||||
if wait_ms is not None:
|
||||
url = url + f"?wait_ms={wait_ms}"
|
||||
res = self.post(url)
|
||||
self.verbose_error(res)
|
||||
return (res.status_code, res.json())
|
||||
|
||||
def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]):
|
||||
assert "tenant_id" not in config.keys()
|
||||
|
||||
@@ -28,7 +28,7 @@ def platform() -> Optional[str]:
|
||||
|
||||
@pytest.fixture(scope="function", autouse=True)
|
||||
def pageserver_virtual_file_io_engine() -> Optional[str]:
|
||||
return None
|
||||
return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE")
|
||||
|
||||
|
||||
def pytest_generate_tests(metafunc: Metafunc):
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import threading
|
||||
from typing import Optional
|
||||
|
||||
from fixtures.log_helper import log
|
||||
@@ -11,6 +12,10 @@ from fixtures.neon_fixtures import (
|
||||
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
|
||||
# neon_local doesn't handle creating/modifying endpoints concurrently, so we use a mutex
|
||||
# to ensure we don't do that: this enables running lots of Workloads in parallel safely.
|
||||
ENDPOINT_LOCK = threading.Lock()
|
||||
|
||||
|
||||
class Workload:
|
||||
"""
|
||||
@@ -41,17 +46,30 @@ class Workload:
|
||||
|
||||
self._endpoint: Optional[Endpoint] = None
|
||||
|
||||
def reconfigure(self):
|
||||
"""
|
||||
Request the endpoint to reconfigure based on location reported by storage controller
|
||||
"""
|
||||
if self._endpoint is not None:
|
||||
with ENDPOINT_LOCK:
|
||||
self._endpoint.reconfigure()
|
||||
|
||||
def endpoint(self, pageserver_id: Optional[int] = None) -> Endpoint:
|
||||
if self._endpoint is None:
|
||||
self._endpoint = self.env.endpoints.create(
|
||||
self.branch_name,
|
||||
tenant_id=self.tenant_id,
|
||||
pageserver_id=pageserver_id,
|
||||
endpoint_id="ep-workload",
|
||||
)
|
||||
self._endpoint.start(pageserver_id=pageserver_id)
|
||||
else:
|
||||
self._endpoint.reconfigure(pageserver_id=pageserver_id)
|
||||
# We may be running alongside other Workloads for different tenants. Full TTID is
|
||||
# obnoxiously long for use here, but a cut-down version is still unique enough for tests.
|
||||
endpoint_id = f"ep-workload-{str(self.tenant_id)[0:4]}-{str(self.timeline_id)[0:4]}"
|
||||
|
||||
with ENDPOINT_LOCK:
|
||||
if self._endpoint is None:
|
||||
self._endpoint = self.env.endpoints.create(
|
||||
self.branch_name,
|
||||
tenant_id=self.tenant_id,
|
||||
pageserver_id=pageserver_id,
|
||||
endpoint_id=endpoint_id,
|
||||
)
|
||||
self._endpoint.start(pageserver_id=pageserver_id)
|
||||
else:
|
||||
self._endpoint.reconfigure(pageserver_id=pageserver_id)
|
||||
|
||||
connstring = self._endpoint.safe_psql(
|
||||
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
|
||||
@@ -94,7 +112,7 @@ class Workload:
|
||||
else:
|
||||
return False
|
||||
|
||||
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True):
|
||||
def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest=True):
|
||||
assert self.expect_rows >= n
|
||||
|
||||
max_iters = 10
|
||||
@@ -132,22 +150,28 @@ class Workload:
|
||||
]
|
||||
)
|
||||
|
||||
for tenant_shard_id, pageserver in tenant_get_shards(
|
||||
self.env, self.tenant_id, pageserver_id
|
||||
):
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
|
||||
)
|
||||
ps_http = pageserver.http_client()
|
||||
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
|
||||
if ingest:
|
||||
# Wait for written data to be ingested by the pageserver
|
||||
for tenant_shard_id, pageserver in tenant_get_shards(
|
||||
self.env, self.tenant_id, pageserver_id
|
||||
):
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
self.env,
|
||||
endpoint,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
pageserver_id=pageserver_id,
|
||||
)
|
||||
ps_http = pageserver.http_client()
|
||||
wait_for_last_record_lsn(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
|
||||
|
||||
if upload:
|
||||
# force a checkpoint to trigger upload
|
||||
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
|
||||
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
|
||||
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
|
||||
else:
|
||||
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
|
||||
if upload:
|
||||
# Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload)
|
||||
ps_http.timeline_checkpoint(tenant_shard_id, self.timeline_id)
|
||||
wait_for_upload(ps_http, tenant_shard_id, self.timeline_id, last_flush_lsn)
|
||||
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
|
||||
else:
|
||||
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
|
||||
|
||||
def validate(self, pageserver_id: Optional[int] = None):
|
||||
endpoint = self.endpoint(pageserver_id)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
@@ -20,10 +19,6 @@ from performance.pageserver.util import (
|
||||
@pytest.mark.parametrize("n_tenants", [10])
|
||||
@pytest.mark.parametrize("get_vectored_impl", ["sequential", "vectored"])
|
||||
@pytest.mark.timeout(1000)
|
||||
@pytest.mark.skipif(
|
||||
os.getenv("CI", "false") == "true",
|
||||
reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/7006",
|
||||
)
|
||||
def test_basebackup_with_high_slru_count(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
|
||||
@@ -347,6 +347,64 @@ def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvB
|
||||
ps_http.timeline_detail(env.initial_tenant, branch_id)
|
||||
|
||||
|
||||
def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
env.pageserver.tenant_create(env.initial_tenant)
|
||||
|
||||
success_timeline = TimelineId.generate()
|
||||
log.info(f"Creating timeline {success_timeline}")
|
||||
ps_http = env.pageserver.http_client()
|
||||
success_result = ps_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, success_timeline, timeout=60
|
||||
)
|
||||
|
||||
ps_http.configure_failpoints(("timeline-creation-after-uninit", "pause"))
|
||||
|
||||
def start_creating_timeline():
|
||||
log.info(f"Creating (expect failure) timeline {env.initial_timeline}")
|
||||
with pytest.raises(RequestException):
|
||||
ps_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
|
||||
)
|
||||
|
||||
t = threading.Thread(target=start_creating_timeline)
|
||||
try:
|
||||
t.start()
|
||||
|
||||
wait_until_paused(env, "timeline-creation-after-uninit")
|
||||
|
||||
# While timeline creation is in progress, trying to create a timeline
|
||||
# again with the same ID should return 409
|
||||
with pytest.raises(
|
||||
PageserverApiException, match="creation of timeline with the given ID is in progress"
|
||||
):
|
||||
ps_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
|
||||
)
|
||||
|
||||
# Creation of a timeline already successfully created is idempotent, and is not impeded by some
|
||||
# other timeline creation with a different TimelineId being stuck.
|
||||
repeat_result = ps_http.timeline_create(
|
||||
env.pg_version, env.initial_tenant, success_timeline, timeout=60
|
||||
)
|
||||
assert repeat_result == success_result
|
||||
finally:
|
||||
env.pageserver.stop(immediate=True)
|
||||
t.join()
|
||||
|
||||
# now without a failpoint
|
||||
env.pageserver.start()
|
||||
|
||||
wait_until_tenant_active(ps_http, env.initial_tenant)
|
||||
|
||||
with pytest.raises(PageserverApiException, match="not found"):
|
||||
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
# The one successfully created timeline should still be there.
|
||||
assert len(ps_http.timeline_list(tenant_id=env.initial_tenant)) == 1
|
||||
|
||||
|
||||
def wait_until_paused(env: NeonEnv, failpoint: str):
|
||||
found = False
|
||||
msg = f"at failpoint {failpoint}"
|
||||
|
||||
@@ -204,7 +204,7 @@ def test_timeline_init_break_before_checkpoint_recreate(
|
||||
assert timeline_id == new_timeline_id
|
||||
|
||||
|
||||
def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilder):
|
||||
def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
@@ -214,9 +214,9 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
|
||||
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
|
||||
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
|
||||
|
||||
# Introduce failpoint when creating a new timeline uninit mark, before any other files were created
|
||||
pageserver_http.configure_failpoints(("after-timeline-uninit-mark-creation", "return"))
|
||||
with pytest.raises(Exception, match="after-timeline-uninit-mark-creation"):
|
||||
# Introduce failpoint when creating a new timeline, right after creating its directory
|
||||
pageserver_http.configure_failpoints(("after-timeline-dir-creation", "return"))
|
||||
with pytest.raises(Exception, match="after-timeline-dir-creation"):
|
||||
_ = pageserver_http.timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate())
|
||||
|
||||
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
||||
|
||||
@@ -90,7 +90,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
|
||||
[
|
||||
".*error importing base backup .*",
|
||||
".*Timeline got dropped without initializing, cleaning its files.*",
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
".*InternalServerError.*timeline not found.*",
|
||||
".*InternalServerError.*Tenant .* not found.*",
|
||||
".*InternalServerError.*Timeline .* not found.*",
|
||||
|
||||
@@ -37,23 +37,18 @@ def test_pageserver_init_node_id(
|
||||
assert (
|
||||
bad_init.returncode == 1
|
||||
), "pageserver should not be able to init new config without the node id"
|
||||
assert "missing id" in bad_init.stderr
|
||||
assert 'missing config value "id"' in bad_init.stderr
|
||||
assert not pageserver_config.exists(), "config file should not be created after init error"
|
||||
|
||||
completed_init = run_pageserver(
|
||||
["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
|
||||
)
|
||||
good_init_cmd = ["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
|
||||
completed_init = run_pageserver(good_init_cmd)
|
||||
assert (
|
||||
completed_init.returncode == 0
|
||||
), "pageserver should be able to create a new config with the node id given"
|
||||
assert pageserver_config.exists(), "config file should be created successfully"
|
||||
|
||||
bad_reinit = run_pageserver(
|
||||
["--init", "-c", "id = 12345", "-c", f'pg_distrib_dir="{pg_distrib_dir}"']
|
||||
)
|
||||
assert (
|
||||
bad_reinit.returncode == 1
|
||||
), "pageserver should not be able to init new config without the node id"
|
||||
bad_reinit = run_pageserver(good_init_cmd)
|
||||
assert bad_reinit.returncode == 1, "pageserver refuses to init if already exists"
|
||||
assert "already exists, cannot init it" in bad_reinit.stderr
|
||||
|
||||
bad_update = run_pageserver(["--update-config", "-c", "id = 3"])
|
||||
|
||||
@@ -209,10 +209,12 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
|
||||
env.storage_controller.node_register(env.pageserver)
|
||||
|
||||
env.pageserver.start(overrides=('--pageserver-config-override=control_plane_api=""',))
|
||||
env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"})
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline
|
||||
)
|
||||
|
||||
generate_uploads_and_deletions(env, pageserver=env.pageserver)
|
||||
|
||||
def parse_generation_suffix(key):
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
@@ -553,3 +554,103 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
|
||||
@pytest.mark.parametrize("via_controller", [True, False])
|
||||
def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):
|
||||
"""
|
||||
Test use of the secondary download API for slow downloads, where "slow" means either a healthy
|
||||
system with a large capacity shard, or some unhealthy remote storage.
|
||||
|
||||
The download API is meant to respect a client-supplied time limit, and return 200 or 202
|
||||
selectively based on whether the download completed.
|
||||
"""
|
||||
neon_env_builder.num_pageservers = 2
|
||||
neon_env_builder.enable_pageserver_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.MOCK_S3,
|
||||
)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
|
||||
|
||||
tenant_id = TenantId.generate()
|
||||
timeline_id = TimelineId.generate()
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Double":1}'
|
||||
)
|
||||
|
||||
attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"]
|
||||
ps_attached = env.get_pageserver(attached_to_id)
|
||||
ps_secondary = next(p for p in env.pageservers if p != ps_attached)
|
||||
|
||||
# Generate a bunch of small layers (we will apply a slowdown failpoint that works on a per-layer basis)
|
||||
workload = Workload(env, tenant_id, timeline_id)
|
||||
workload.init()
|
||||
workload.write_rows(128)
|
||||
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
|
||||
workload.write_rows(128)
|
||||
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
|
||||
workload.write_rows(128)
|
||||
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
|
||||
workload.write_rows(128)
|
||||
ps_attached.http_client().timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# Expect lots of layers
|
||||
assert len(list_layers(ps_attached, tenant_id, timeline_id)) > 10
|
||||
|
||||
# Simulate large data by making layer downloads artificially slow
|
||||
for ps in env.pageservers:
|
||||
ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
|
||||
|
||||
# Upload a heatmap, so that secondaries have something to download
|
||||
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
|
||||
|
||||
if via_controller:
|
||||
http_client = env.storage_controller.pageserver_api()
|
||||
http_client.tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Secondary",
|
||||
"secondary_conf": {"warm": True},
|
||||
"tenant_conf": {},
|
||||
"generation": None,
|
||||
},
|
||||
)
|
||||
else:
|
||||
http_client = ps_secondary.http_client()
|
||||
|
||||
# This has no chance to succeed: we have lots of layers and each one takes at least 1000ms
|
||||
(status, progress_1) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000)
|
||||
assert status == 202
|
||||
assert progress_1["heatmap_mtime"] is not None
|
||||
assert progress_1["layers_downloaded"] > 0
|
||||
assert progress_1["bytes_downloaded"] > 0
|
||||
assert progress_1["layers_total"] > progress_1["layers_downloaded"]
|
||||
assert progress_1["bytes_total"] > progress_1["bytes_downloaded"]
|
||||
|
||||
# Multiple polls should work: use a shorter wait period this time
|
||||
(status, progress_2) = http_client.tenant_secondary_download(tenant_id, wait_ms=1000)
|
||||
assert status == 202
|
||||
assert progress_2["heatmap_mtime"] is not None
|
||||
assert progress_2["layers_downloaded"] > 0
|
||||
assert progress_2["bytes_downloaded"] > 0
|
||||
assert progress_2["layers_total"] > progress_2["layers_downloaded"]
|
||||
assert progress_2["bytes_total"] > progress_2["bytes_downloaded"]
|
||||
|
||||
# Progress should be >= the first poll: this can only go backward if we see a new heatmap,
|
||||
# and the heatmap period on the attached node is much longer than the runtime of this test, so no
|
||||
# new heatmap should have been uploaded.
|
||||
assert progress_2["layers_downloaded"] >= progress_1["layers_downloaded"]
|
||||
assert progress_2["bytes_downloaded"] >= progress_1["bytes_downloaded"]
|
||||
assert progress_2["layers_total"] == progress_1["layers_total"]
|
||||
assert progress_2["bytes_total"] == progress_1["bytes_total"]
|
||||
|
||||
# Make downloads fast again: when the download completes within this last request, we
|
||||
# get a 200 instead of a 202
|
||||
for ps in env.pageservers:
|
||||
ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")])
|
||||
(status, progress_3) = http_client.tenant_secondary_download(tenant_id, wait_ms=20000)
|
||||
assert status == 200
|
||||
assert progress_3["heatmap_mtime"] is not None
|
||||
assert progress_3["layers_total"] == progress_3["layers_downloaded"]
|
||||
assert progress_3["bytes_total"] == progress_3["bytes_downloaded"]
|
||||
|
||||
@@ -596,3 +596,39 @@ def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy):
|
||||
assert (
|
||||
"duplicate key value violates unique constraint" in res["message"]
|
||||
), "HTTP query should conflict"
|
||||
|
||||
|
||||
def test_sql_over_http_connection_cancel(static_proxy: NeonProxy):
|
||||
static_proxy.safe_psql("create role http with login password 'http' superuser")
|
||||
|
||||
static_proxy.safe_psql("create table test_table ( id int primary key )")
|
||||
|
||||
# insert into a table, with a unique constraint, after sleeping for n seconds
|
||||
query = "WITH temp AS ( \
|
||||
SELECT pg_sleep($1) as sleep, $2::int as id \
|
||||
) INSERT INTO test_table (id) SELECT id FROM temp"
|
||||
|
||||
try:
|
||||
# The request should complete before the proxy HTTP timeout triggers.
|
||||
# Timeout and cancel the request on the client side before the query completes.
|
||||
static_proxy.http_query(
|
||||
query,
|
||||
[static_proxy.http_timeout_seconds - 1, 1],
|
||||
user="http",
|
||||
password="http",
|
||||
timeout=2,
|
||||
)
|
||||
except requests.exceptions.ReadTimeout:
|
||||
pass
|
||||
|
||||
# wait until the query _would_ have been complete
|
||||
time.sleep(static_proxy.http_timeout_seconds)
|
||||
|
||||
res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200)
|
||||
assert res["command"] == "INSERT", "HTTP query should insert"
|
||||
assert res["rowCount"] == 1, "HTTP query should insert"
|
||||
|
||||
res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400)
|
||||
assert (
|
||||
"duplicate key value violates unique constraint" in res["message"]
|
||||
), "HTTP query should conflict"
|
||||
|
||||
test_runner/regress/test_proxy_websockets.py (new file, +189 lines)
@@ -0,0 +1,189 @@
|
||||
import ssl
|
||||
|
||||
import pytest
|
||||
import websockets
|
||||
from fixtures.neon_fixtures import NeonProxy
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_websockets(static_proxy: NeonProxy):
|
||||
static_proxy.safe_psql("create user ws_auth with password 'ws' superuser")
|
||||
|
||||
user = "ws_auth"
|
||||
password = "ws"
|
||||
|
||||
version = b"\x00\x03\x00\x00"
|
||||
params = {
|
||||
"user": user,
|
||||
"database": "postgres",
|
||||
"client_encoding": "UTF8",
|
||||
}
|
||||
|
||||
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
||||
ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt"))
|
||||
|
||||
async with websockets.connect(
|
||||
f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
|
||||
ssl=ssl_context,
|
||||
) as websocket:
|
||||
startup_message = bytearray(version)
|
||||
for key, value in params.items():
|
||||
startup_message.extend(key.encode("ascii"))
|
||||
startup_message.extend(b"\0")
|
||||
startup_message.extend(value.encode("ascii"))
|
||||
startup_message.extend(b"\0")
|
||||
startup_message.extend(b"\0")
|
||||
length = (4 + len(startup_message)).to_bytes(4, byteorder="big")
|
||||
|
||||
await websocket.send([length, startup_message])
|
||||
|
||||
startup_response = await websocket.recv()
|
||||
assert isinstance(startup_response, bytes)
|
||||
assert startup_response[0:1] == b"R", "should be authentication message"
|
||||
assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
|
||||
assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext"
|
||||
|
||||
auth_message = password.encode("utf-8") + b"\0"
|
||||
length = (4 + len(auth_message)).to_bytes(4, byteorder="big")
|
||||
await websocket.send([b"p", length, auth_message])
|
||||
|
||||
auth_response = await websocket.recv()
|
||||
assert isinstance(auth_response, bytes)
|
||||
assert auth_response[0:1] == b"R", "should be authentication message"
|
||||
assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
|
||||
assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated"
|
||||
|
||||
query_message = "SELECT 1".encode("utf-8") + b"\0"
|
||||
length = (4 + len(query_message)).to_bytes(4, byteorder="big")
|
||||
await websocket.send([b"Q", length, query_message])
|
||||
|
||||
query_response = await websocket.recv()
|
||||
assert isinstance(query_response, bytes)
|
||||
# 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00'
|
||||
# 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011'
|
||||
# 'C\x00\x00\x00\rSELECT 1\x00'
|
||||
# 'Z\x00\x00\x00\x05I'
|
||||
|
||||
assert query_response[0:1] == b"T", "should be row description message"
|
||||
row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
|
||||
row_description, query_response = (
|
||||
query_response[:row_description_len],
|
||||
query_response[row_description_len:],
|
||||
)
|
||||
assert row_description[5:7] == b"\x00\x01", "should have 1 column"
|
||||
assert row_description[7:16] == b"?column?\0", "column should be named ?column?"
|
||||
assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4"
|
||||
|
||||
assert query_response[0:1] == b"D", "should be data row message"
|
||||
data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
|
||||
data_row, query_response = query_response[:data_row_len], query_response[data_row_len:]
|
||||
assert (
|
||||
data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011"
|
||||
), "should contain 1 column with text value 1"
|
||||
|
||||
assert query_response[0:1] == b"C", "should be command complete message"
|
||||
command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
|
||||
command_complete, query_response = (
|
||||
query_response[:command_complete_len],
|
||||
query_response[command_complete_len:],
|
||||
)
|
||||
assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0"
|
||||
|
||||
assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)"
|
||||
|
||||
# close
|
||||
await websocket.send(b"X\x00\x00\x00\x04")
|
||||
await websocket.wait_closed()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_websockets_pipelined(static_proxy: NeonProxy):
|
||||
"""
|
||||
Test whether we can send the startup + auth + query all in one go
|
||||
"""
|
||||
|
||||
static_proxy.safe_psql("create user ws_auth with password 'ws' superuser")
|
||||
|
||||
user = "ws_auth"
|
||||
password = "ws"
|
||||
|
||||
version = b"\x00\x03\x00\x00"
|
||||
params = {
|
||||
"user": user,
|
||||
"database": "postgres",
|
||||
"client_encoding": "UTF8",
|
||||
}
|
||||
|
||||
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
||||
ssl_context.load_verify_locations(str(static_proxy.test_output_dir / "proxy.crt"))
|
||||
|
||||
async with websockets.connect(
|
||||
f"wss://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
|
||||
ssl=ssl_context,
|
||||
) as websocket:
|
||||
startup_message = bytearray(version)
|
||||
for key, value in params.items():
|
||||
startup_message.extend(key.encode("ascii"))
|
||||
startup_message.extend(b"\0")
|
||||
startup_message.extend(value.encode("ascii"))
|
||||
startup_message.extend(b"\0")
|
||||
startup_message.extend(b"\0")
|
||||
length0 = (4 + len(startup_message)).to_bytes(4, byteorder="big")
|
||||
|
||||
auth_message = password.encode("utf-8") + b"\0"
|
||||
length1 = (4 + len(auth_message)).to_bytes(4, byteorder="big")
|
||||
query_message = "SELECT 1".encode("utf-8") + b"\0"
|
||||
length2 = (4 + len(query_message)).to_bytes(4, byteorder="big")
|
||||
await websocket.send(
|
||||
[length0, startup_message, b"p", length1, auth_message, b"Q", length2, query_message]
|
||||
)
|
||||
|
||||
startup_response = await websocket.recv()
|
||||
assert isinstance(startup_response, bytes)
|
||||
assert startup_response[0:1] == b"R", "should be authentication message"
|
||||
assert startup_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
|
||||
assert startup_response[5:9] == b"\x00\x00\x00\x03", "should be cleartext"
|
||||
|
||||
auth_response = await websocket.recv()
|
||||
assert isinstance(auth_response, bytes)
|
||||
assert auth_response[0:1] == b"R", "should be authentication message"
|
||||
assert auth_response[1:5] == b"\x00\x00\x00\x08", "should be 8 bytes long message"
|
||||
assert auth_response[5:9] == b"\x00\x00\x00\x00", "should be authenticated"
|
||||
|
||||
query_response = await websocket.recv()
|
||||
assert isinstance(query_response, bytes)
|
||||
# 'T\x00\x00\x00!\x00\x01?column?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17\x00\x04\xff\xff\xff\xff\x00\x00'
|
||||
# 'D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011'
|
||||
# 'C\x00\x00\x00\rSELECT 1\x00'
|
||||
# 'Z\x00\x00\x00\x05I'
|
||||
|
||||
assert query_response[0:1] == b"T", "should be row description message"
|
||||
row_description_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
|
||||
row_description, query_response = (
|
||||
query_response[:row_description_len],
|
||||
query_response[row_description_len:],
|
||||
)
|
||||
assert row_description[5:7] == b"\x00\x01", "should have 1 column"
|
||||
assert row_description[7:16] == b"?column?\0", "column should be named ?column?"
|
||||
assert row_description[22:26] == b"\x00\x00\x00\x17", "column should be an int4"
|
||||
|
||||
assert query_response[0:1] == b"D", "should be data row message"
|
||||
data_row_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
|
||||
data_row, query_response = query_response[:data_row_len], query_response[data_row_len:]
|
||||
assert (
|
||||
data_row == b"D\x00\x00\x00\x0b\x00\x01\x00\x00\x00\x011"
|
||||
), "should contain 1 column with text value 1"
|
||||
|
||||
assert query_response[0:1] == b"C", "should be command complete message"
|
||||
command_complete_len = int.from_bytes(query_response[1:5], byteorder="big") + 1
|
||||
command_complete, query_response = (
|
||||
query_response[:command_complete_len],
|
||||
query_response[command_complete_len:],
|
||||
)
|
||||
assert command_complete == b"C\x00\x00\x00\x0dSELECT 1\0"
|
||||
|
||||
assert query_response[0:6] == b"Z\x00\x00\x00\x05I", "should be ready for query (idle)"
|
||||
|
||||
# close
|
||||
await websocket.send(b"X\x00\x00\x00\x04")
|
||||
await websocket.wait_closed()
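
Both websocket tests hand-roll the same libpq StartupMessage framing; a small helper sketch (not part of the diff) that produces equivalent bytes could remove the duplication:

def build_startup_packet(params: dict, version: bytes = b"\x00\x03\x00\x00") -> bytes:
    # StartupMessage: int32 total length (including itself), int32 protocol version,
    # NUL-terminated key/value pairs, then one extra terminating NUL.
    body = bytearray(version)
    for key, value in params.items():
        body += key.encode("ascii") + b"\0"
        body += value.encode("ascii") + b"\0"
    body += b"\0"
    return (4 + len(body)).to_bytes(4, byteorder="big") + bytes(body)

# Example: await websocket.send(build_startup_packet({"user": "ws_auth", "database": "postgres"}))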
|
||||
@@ -1,10 +1,14 @@
|
||||
import os
|
||||
from typing import Dict, List, Union
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.compute_reconfigure import ComputeReconfigure
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
StorageControllerApiException,
|
||||
tenant_get_shards,
|
||||
)
|
||||
from fixtures.remote_storage import s3_storage
|
||||
@@ -495,3 +499,341 @@ def test_sharding_ingest(
|
||||
|
||||
# Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance.
|
||||
assert huge_layer_count <= shard_count
|
||||
|
||||
|
||||
class Failure:
|
||||
pageserver_id: Optional[int]
|
||||
|
||||
def apply(self, env: NeonEnv):
|
||||
raise NotImplementedError()
|
||||
|
||||
def clear(self, env: NeonEnv):
|
||||
"""
|
||||
Clear the failure, in a way that should enable the system to proceed
|
||||
to a totally clean state (all nodes online and reconciled)
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def expect_available(self):
|
||||
raise NotImplementedError()
|
||||
|
||||
def can_mitigate(self):
|
||||
"""Whether Self.mitigate is available for use"""
|
||||
return False
|
||||
|
||||
def mitigate(self, env: NeonEnv):
|
||||
"""
|
||||
Mitigate the failure in a way that should allow shard split to
|
||||
complete and service to resume, but does not guarantee to leave
|
||||
the whole world in a clean state (e.g. an Offline node might have
|
||||
junk LocationConfigs on it)
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def fails_forward(self, env: NeonEnv):
|
||||
"""
|
||||
If true, this failure results in a state that eventually completes the split.
|
||||
"""
|
||||
return False
|
||||
|
||||
def expect_exception(self):
|
||||
"""
|
||||
How do we expect a call to the split API to fail?
|
||||
"""
|
||||
return StorageControllerApiException
|
||||
|
||||
|
||||
class PageserverFailpoint(Failure):
|
||||
def __init__(self, failpoint, pageserver_id, mitigate):
|
||||
self.failpoint = failpoint
|
||||
self.pageserver_id = pageserver_id
|
||||
self._mitigate = mitigate
|
||||
|
||||
def apply(self, env: NeonEnv):
|
||||
pageserver = env.get_pageserver(self.pageserver_id)
|
||||
pageserver.allowed_errors.extend(
|
||||
[".*failpoint.*", ".*Resetting.*after shard split failure.*"]
|
||||
)
|
||||
pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))
|
||||
|
||||
def clear(self, env: NeonEnv):
|
||||
pageserver = env.get_pageserver(self.pageserver_id)
|
||||
pageserver.http_client().configure_failpoints((self.failpoint, "off"))
|
||||
if self._mitigate:
|
||||
env.storage_controller.node_configure(self.pageserver_id, {"availability": "Active"})
|
||||
|
||||
def expect_available(self):
|
||||
return True
|
||||
|
||||
def can_mitigate(self):
|
||||
return self._mitigate
|
||||
|
||||
def mitigate(self, env):
|
||||
env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
|
||||
|
||||
|
||||
class StorageControllerFailpoint(Failure):
|
||||
def __init__(self, failpoint, action):
|
||||
self.failpoint = failpoint
|
||||
self.pageserver_id = None
|
||||
self.action = action
|
||||
|
||||
def apply(self, env: NeonEnv):
|
||||
env.storage_controller.configure_failpoints((self.failpoint, self.action))
|
||||
|
||||
def clear(self, env: NeonEnv):
|
||||
if "panic" in self.action:
|
||||
log.info("Restarting storage controller after panic")
|
||||
env.storage_controller.stop()
|
||||
env.storage_controller.start()
|
||||
else:
|
||||
env.storage_controller.configure_failpoints((self.failpoint, "off"))
|
||||
|
||||
def expect_available(self):
|
||||
# Controller panics _do_ leave pageservers available, but our test code relies
|
||||
# on using the locate API to update configurations in Workload, so we must skip
|
||||
# these actions when the controller has been panicked.
|
||||
return "panic" not in self.action
|
||||
|
||||
def can_mitigate(self):
|
||||
return False
|
||||
|
||||
def fails_forward(self, env):
|
||||
# Edge case: the very last failpoint that simulates a DB connection error, where
|
||||
# the abort path will fail-forward and result in a complete split.
|
||||
fail_forward = self.failpoint == "shard-split-post-complete"
|
||||
|
||||
# If the failure was a panic, then if we expect split to eventually (after restart)
|
||||
# complete, we must restart before checking that.
|
||||
if fail_forward and "panic" in self.action:
|
||||
log.info("Restarting storage controller after panic")
|
||||
env.storage_controller.stop()
|
||||
env.storage_controller.start()
|
||||
|
||||
return fail_forward
|
||||
|
||||
def expect_exception(self):
|
||||
if "panic" in self.action:
|
||||
return requests.exceptions.ConnectionError
|
||||
else:
|
||||
return StorageControllerApiException
|
||||
|
||||
|
||||
class NodeKill(Failure):
|
||||
def __init__(self, pageserver_id, mitigate):
|
||||
self.pageserver_id = pageserver_id
|
||||
self._mitigate = mitigate
|
||||
|
||||
def apply(self, env: NeonEnv):
|
||||
pageserver = env.get_pageserver(self.pageserver_id)
|
||||
pageserver.stop(immediate=True)
|
||||
|
||||
def clear(self, env: NeonEnv):
|
||||
pageserver = env.get_pageserver(self.pageserver_id)
|
||||
pageserver.start()
|
||||
|
||||
def expect_available(self):
|
||||
return False
|
||||
|
||||
def mitigate(self, env):
|
||||
env.storage_controller.node_configure(self.pageserver_id, {"availability": "Offline"})
|
||||
|
||||
|
||||
class CompositeFailure(Failure):
|
||||
"""
|
||||
Wrapper for failures in multiple components (e.g. a failpoint in the storage controller, *and*
|
||||
stop a pageserver to interfere with rollback)
|
||||
"""
|
||||
|
||||
def __init__(self, failures: list[Failure]):
|
||||
self.failures = failures
|
||||
|
||||
self.pageserver_id = None
|
||||
for f in failures:
|
||||
if f.pageserver_id is not None:
|
||||
self.pageserver_id = f.pageserver_id
|
||||
break
|
||||
|
||||
def apply(self, env: NeonEnv):
|
||||
for f in self.failures:
|
||||
f.apply(env)
|
||||
|
||||
def clear(self, env):
|
||||
for f in self.failures:
|
||||
f.clear(env)
|
||||
|
||||
def expect_available(self):
|
||||
return all(f.expect_available() for f in self.failures)
|
||||
|
||||
def mitigate(self, env):
|
||||
for f in self.failures:
|
||||
f.mitigate(env)
|
||||
|
||||
def expect_exception(self):
|
||||
expect = set(f.expect_exception() for f in self.failures)
|
||||
|
||||
# We can't give a sensible response if our failures have different expectations
|
||||
assert len(expect) == 1
|
||||
|
||||
return list(expect)[0]
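
# A minimal orientation sketch (assumptions: `env` is the NeonEnv and `failure` is one of the
# Failure implementations above; this mirrors, but is not part of, the test below):
#
#   failure.apply(env)                      # inject the fault
#   with pytest.raises(failure.expect_exception()):
#       env.storage_controller.tenant_shard_split(...)
#   if failure.fails_forward(env):          # split still completes on its own
#       ...
#   elif failure.can_mitigate():            # e.g. mark the broken node Offline, then retry
#       failure.mitigate(env)
#   else:                                   # restore health and expect a clean rollback
#       failure.clear(env)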


@pytest.mark.parametrize(
    "failure",
    [
        PageserverFailpoint("api-500", 1, False),
        NodeKill(1, False),
        PageserverFailpoint("api-500", 1, True),
        NodeKill(1, True),
        PageserverFailpoint("shard-split-pre-prepare", 1, False),
        PageserverFailpoint("shard-split-post-prepare", 1, False),
        PageserverFailpoint("shard-split-pre-hardlink", 1, False),
        PageserverFailpoint("shard-split-post-hardlink", 1, False),
        PageserverFailpoint("shard-split-post-child-conf", 1, False),
        PageserverFailpoint("shard-split-lsn-wait", 1, False),
        PageserverFailpoint("shard-split-pre-finish", 1, False),
        StorageControllerFailpoint("shard-split-validation", "return(1)"),
        StorageControllerFailpoint("shard-split-post-begin", "return(1)"),
        StorageControllerFailpoint("shard-split-post-remote", "return(1)"),
        StorageControllerFailpoint("shard-split-post-complete", "return(1)"),
        StorageControllerFailpoint("shard-split-validation", "panic(failpoint)"),
        StorageControllerFailpoint("shard-split-post-begin", "panic(failpoint)"),
        StorageControllerFailpoint("shard-split-post-remote", "panic(failpoint)"),
        StorageControllerFailpoint("shard-split-post-complete", "panic(failpoint)"),
        CompositeFailure(
            [NodeKill(1, True), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
        ),
        CompositeFailure(
            [NodeKill(1, False), StorageControllerFailpoint("shard-split-post-begin", "return(1)")]
        ),
    ],
)
def test_sharding_split_failures(
    neon_env_builder: NeonEnvBuilder,
    compute_reconfigure_listener: ComputeReconfigure,
    failure: Failure,
):
    neon_env_builder.num_pageservers = 4
    neon_env_builder.control_plane_compute_hook_api = (
        compute_reconfigure_listener.control_plane_compute_hook_api
    )
    initial_shard_count = 2
    split_shard_count = 4

    env = neon_env_builder.init_start(initial_tenant_shard_count=initial_shard_count)
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    for ps in env.pageservers:
        # When we do node failures and abandon a shard, it will de facto have an old generation and
        # thereby be unable to publish remote consistent LSN updates
        ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*")

        # If we're using a failure that will panic the storage controller, all background
        # upcalls from the pageserver can fail
        ps.allowed_errors.append(".*calling control plane generation validation API failed.*")

    # Make sure the node we're failing has a shard on it, otherwise the test isn't testing anything
    assert (
        failure.pageserver_id is None
        or len(
            env.get_pageserver(failure.pageserver_id)
            .http_client()
            .tenant_list_locations()["tenant_shards"]
        )
        > 0
    )

    workload = Workload(env, tenant_id, timeline_id)
    workload.init()
    workload.write_rows(100)

    # Put the environment into a failing state (exact meaning depends on `failure`)
    failure.apply(env)

    with pytest.raises(failure.expect_exception()):
        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)

    # We expect that the overall operation will fail, but some split requests
    # will have succeeded: the net result should be to return to a clean state, including
    # detaching any child shards.
    def assert_rolled_back(exclude_ps_id=None) -> None:
        count = 0
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue

            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
            for loc in locations:
                tenant_shard_id = TenantShardId.parse(loc[0])
                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
                assert tenant_shard_id.shard_count == initial_shard_count
                count += 1
        assert count == initial_shard_count

    def assert_split_done(exclude_ps_id=None) -> None:
        count = 0
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue

            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
            for loc in locations:
                tenant_shard_id = TenantShardId.parse(loc[0])
                log.info(f"Shard {tenant_shard_id} seen on node {ps.id}")
                assert tenant_shard_id.shard_count == split_shard_count
                count += 1
        assert count == split_shard_count

    def finish_split():
        # Having failed+rolled back, we should be able to split again
        # No failures this time; it will succeed
        env.storage_controller.tenant_shard_split(tenant_id, shard_count=split_shard_count)

        workload.churn_rows(10)
        workload.validate()

    if failure.expect_available():
        # Even though the split failed partway through, this should not have interrupted
        # clients. Disable waiting for pageservers in the workload helper, because our
        # failpoints may prevent API access.
        # This only applies for failure modes that leave the pageserver page_service API available.
        workload.churn_rows(10, upload=False, ingest=False)
        workload.validate()

    if failure.fails_forward(env):
        log.info("Fail-forward failure, checking split eventually completes...")
        # A failure type which results in eventual completion of the split
        wait_until(30, 1, assert_split_done)
    elif failure.can_mitigate():
        log.info("Mitigating failure...")
        # Mitigation phase: we expect to be able to proceed with a successful shard split
        failure.mitigate(env)

        # The split should appear to be rolled back from the point of view of all pageservers
        # apart from the one that is offline
        wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))

        finish_split()
        wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id))

        # Having cleared the failure, everything should converge to a pristine state
        failure.clear(env)
        wait_until(30, 1, assert_split_done)
    else:
        # Once we restore the faulty pageserver's API to good health, rollback should
        # eventually complete.
        log.info("Clearing failure...")
        failure.clear(env)

        wait_until(30, 1, assert_rolled_back)

        # Having rolled back, the tenant should be working
        workload.churn_rows(10)
        workload.validate()

        # Splitting again should work, since we cleared the failure
        finish_split()
        assert_split_done()

    env.storage_controller.consistency_check()


@@ -769,3 +769,172 @@ def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
    assert "pitr_interval" not in readback_ps.tenant_specific_overrides

    env.storage_controller.consistency_check()


class Failure:
    pageserver_id: int

    def apply(self, env: NeonEnv):
        raise NotImplementedError()

    def clear(self, env: NeonEnv):
        raise NotImplementedError()


class NodeStop(Failure):
    def __init__(self, pageserver_id, immediate):
        self.pageserver_id = pageserver_id
        self.immediate = immediate

    def apply(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.stop(immediate=self.immediate)

    def clear(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.start()


class PageserverFailpoint(Failure):
    def __init__(self, failpoint, pageserver_id):
        self.failpoint = failpoint
        self.pageserver_id = pageserver_id

    def apply(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.http_client().configure_failpoints((self.failpoint, "return(1)"))

    def clear(self, env: NeonEnv):
        pageserver = env.get_pageserver(self.pageserver_id)
        pageserver.http_client().configure_failpoints((self.failpoint, "off"))


def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]:
    tenants = env.storage_controller.tenant_list()

    node_to_tenants: dict[int, list[TenantId]] = {}
    for t in tenants:
        for node_id, loc_state in t["observed"]["locations"].items():
            if (
                loc_state is not None
                and "conf" in loc_state
                and loc_state["conf"] is not None
                and loc_state["conf"]["mode"] == "AttachedSingle"
            ):
                crnt = node_to_tenants.get(int(node_id), [])
                crnt.append(TenantId(t["tenant_shard_id"]))
                node_to_tenants[int(node_id)] = crnt

    return node_to_tenants
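
# For illustration only (hypothetical IDs): with two single-shard tenants attached to two
# different pageservers, the returned map looks roughly like
#   {1: [TenantId("3b0e...")], 2: [TenantId("9f4c...")]}
# keyed by pageserver node id, listing the tenants observed there as AttachedSingle.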


@pytest.mark.parametrize(
    "failure",
    [
        NodeStop(pageserver_id=1, immediate=False),
        NodeStop(pageserver_id=1, immediate=True),
        PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"),
    ],
)
def test_sharding_service_heartbeats(
    neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure
):
    neon_env_builder.num_pageservers = 2
    env = neon_env_builder.init_configs()
    env.start()

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2
    assert all([n["availability"] == "Active" for n in nodes])

    # ... then we create two tenants and write some data into them
    def create_tenant(tid: TenantId):
        env.storage_controller.tenant_create(tid)

        branch_name = "main"
        env.neon_cli.create_timeline(
            branch_name,
            tenant_id=tid,
        )

        with env.endpoints.create_start("main", tenant_id=tid) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            endpoint.safe_psql("CREATE TABLE created_foo(id integer);")

    tenant_ids = [TenantId.generate(), TenantId.generate()]
    for tid in tenant_ids:
        create_tenant(tid)

    # ... expecting that each tenant will be placed on a different node
    def tenants_placed():
        node_to_tenants = build_node_to_tenants_map(env)
        log.info(f"{node_to_tenants=}")

        # Check that all the tenants have been attached
        assert sum((len(ts) for ts in node_to_tenants.values())) == len(tenant_ids)
        # Check that each node got one tenant
        assert all((len(ts) == 1 for ts in node_to_tenants.values()))

    wait_until(10, 1, tenants_placed)

    # ... then we apply the failure
    offline_node_id = failure.pageserver_id
    online_node_id = (set(range(1, len(env.pageservers) + 1)) - {offline_node_id}).pop()
    env.get_pageserver(offline_node_id).allowed_errors.append(
        # In the case of the failpoint failure, the impacted pageserver
        # still believes it has the tenant attached since location
        # config calls into it will fail due to being marked offline.
        ".*Dropped remote consistent LSN updates.*",
    )

    failure.apply(env)

    # ... expecting the heartbeats to mark it offline
    def node_offline():
        nodes = env.storage_controller.node_list()
        log.info(f"{nodes=}")
        target = next(n for n in nodes if n["id"] == offline_node_id)
        assert target["availability"] == "Offline"

    # A node is considered offline if the last successful heartbeat
    # was more than 10 seconds ago (hardcoded in the storage controller).
    wait_until(20, 1, node_offline)
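
    # (20 retries at a 1-second interval here comfortably covers that ~10-second
    # threshold; the extra margin is a tuning choice, not a hard requirement.)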

    # ... expecting the tenant on the offline node to be migrated
    def tenant_migrated():
        node_to_tenants = build_node_to_tenants_map(env)
        log.info(f"{node_to_tenants=}")
        assert set(node_to_tenants[online_node_id]) == set(tenant_ids)

    wait_until(10, 1, tenant_migrated)

    # ... then we clear the failure
    failure.clear(env)

    # ... expecting the offline node to become active again
    def node_online():
        nodes = env.storage_controller.node_list()
        target = next(n for n in nodes if n["id"] == offline_node_id)
        assert target["availability"] == "Active"

    wait_until(10, 1, node_online)

    time.sleep(5)

    # ... then we create a new tenant
    tid = TenantId.generate()
    env.storage_controller.tenant_create(tid)

    # ... expecting it to be placed on the node that just came back online
    tenants = env.storage_controller.tenant_list()
    newest_tenant = next(t for t in tenants if t["tenant_shard_id"] == str(tid))
    locations = list(newest_tenant["observed"]["locations"].keys())
    locations = [int(node_id) for node_id in locations]
    assert locations == [offline_node_id]
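    # (Presumably because, after the migration above, the returning node has no attached
    # tenants and is therefore the least-loaded choice for the scheduler; the test only
    # asserts the placement, not the reason.)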

    # ... expecting the storage controller to reach a consistent state
    def storage_controller_consistent():
        env.storage_controller.consistency_check()

    wait_until(10, 1, storage_controller_consistent)


@@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # We may leave some upload tasks in the queue. They're likely deletes.
            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
            # So by ignoring these instead of waiting for empty upload queue
@@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach(
            # From deletion polling
            f".*NotFound: tenant {env.initial_tenant}.*",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',

@@ -1,3 +1,4 @@
import os
from pathlib import Path
from typing import List, Tuple

@@ -326,7 +327,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa
    size_debug_file.write(size_debug)


@pytest.mark.xfail
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_single_branch_get_tenant_size_grows(
    neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion
):
@@ -349,10 +350,21 @@ def test_single_branch_get_tenant_size_grows(
    # adjust the gc_horizon accordingly.
    if pg_version == PgVersion.V14:
        gc_horizon = 0x4A000
    elif pg_version == PgVersion.V15:
        gc_horizon = 0x3BA00
    elif pg_version == PgVersion.V16:
        gc_horizon = 210000
    else:
        raise NotImplementedError(pg_version)
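    # (For reference: 0x4A000 is 303104 bytes and 0x3BA00 is 244224 bytes; the v16
    # threshold above is simply written in decimal.)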

    neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
    tenant_config = {
        "compaction_period": "0s",
        "gc_period": "0s",
        "pitr_interval": "0s",
        "gc_horizon": gc_horizon,
    }

    env = neon_env_builder.init_start()
    env = neon_env_builder.init_start(initial_tenant_conf=tenant_config)

    tenant_id = env.initial_tenant
    branch_name, timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
@@ -405,6 +417,7 @@ def test_single_branch_get_tenant_size_grows(
        current_lsn = after_lsn
        size_debug_file.write(size_debug)
        assert size > 0
        log.info(f"size: {size} at lsn {current_lsn}")
        return (current_lsn, size)

    with env.endpoints.create_start(
@@ -492,24 +505,41 @@ def test_single_branch_get_tenant_size_grows(

    collected_responses.append(("DELETE", current_lsn, size))

    size_before_drop = get_current_consistent_size(
        env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
    )[1]

    with endpoint.cursor() as cur:
        cur.execute("DROP TABLE t0")

    # Without setting a PITR interval, dropping the table doesn't reclaim any space
    # from the user's point of view, because the DROP transaction is too small
    # to fall out of gc_horizon.
    (current_lsn, size) = get_current_consistent_size(
        env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
    )
    prev_size = collected_responses[-1][2]
    check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)

    # Set a tiny PITR interval to allow the DROP to impact the synthetic size
    # Because synthetic size calculation uses pitr interval when available,
    # when our tenant is configured with a tiny pitr interval, dropping a table should
    # cause synthetic size to go down immediately
    tenant_config["pitr_interval"] = "1ms"
    env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
    (current_lsn, size) = get_current_consistent_size(
        env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
    )
    assert size < size_before_drop

    # The size of the tenant should still be as large as before we dropped
    # the table, because the drop operation can still be undone in the PITR
    # defined by gc_horizon.
    (current_lsn, size) = get_current_consistent_size(
        env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
    )

    prev_size = collected_responses[-1][2]

    check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)

    collected_responses.append(("DROP", current_lsn, size))

    # Should have gone past gc_horizon, otherwise gc_horizon is too large
    assert current_lsn - initdb_lsn > gc_horizon
    bytes_written = current_lsn - initdb_lsn
    assert bytes_written > gc_horizon

    # this isn't too many lines to forget for a while. observed while
    # developing these tests that locally the value is a bit more than what we

@@ -204,7 -204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        [
            f".*{timeline_id}.*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion and then pageserver is stopped
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # This happens when we fail before scheduling background operation.
            # Timeline is left in stopping state and retry tries to stop it again.
            ".*Ignoring new state, equal to the existing one: Stopping",
@@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
            ".*failpoint: timeline-delete-before-rm",
            ".*Ignoring new state, equal to the existing one: Stopping",
            # this happens, because the stuck timeline is visible to shutdown
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
        ]
    )

@@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach(
            # allow errors caused by failpoints
            f".*failpoint: {failpoint}",
            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
            ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            ".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
            # error from http response is also logged
            ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
            # Polling after attach may fail with this