mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 22:50:38 +00:00
Compare commits
220 Commits
jcsp/secon
...
release-41
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3710c32aae | ||
|
|
be83bee49d | ||
|
|
cdcaa329bf | ||
|
|
27bdbf5e36 | ||
|
|
4c7fa12a2a | ||
|
|
367971a0e9 | ||
|
|
51570114ea | ||
|
|
098d3111a5 | ||
|
|
3737fe3a4b | ||
|
|
5650138532 | ||
|
|
2dca4c03fc | ||
|
|
0b790b6d00 | ||
|
|
e82d1ad6b8 | ||
|
|
4f0a8e92ad | ||
|
|
5952f350cb | ||
|
|
726c8e6730 | ||
|
|
f7067a38b7 | ||
|
|
896347f307 | ||
|
|
e5c81fef86 | ||
|
|
7ebe9ca1ac | ||
|
|
1588601503 | ||
|
|
cf28e5922a | ||
|
|
7d384d6953 | ||
|
|
4b3b37b912 | ||
|
|
1d8d200f4d | ||
|
|
0d80d6ce18 | ||
|
|
f653ee039f | ||
|
|
e614a95853 | ||
|
|
850db4cc13 | ||
|
|
8a316b1277 | ||
|
|
4d13bae449 | ||
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -2,7 +2,7 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 7 * * 2'
|
||||
- cron: '0 7 * * 5'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
|
||||
10
Cargo.lock
generated
10
Cargo.lock
generated
@@ -3550,7 +3550,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3563,7 +3563,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3574,7 +3574,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3592,7 +3592,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -5396,7 +5396,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
|
||||
12
Cargo.toml
12
Cargo.toml
@@ -161,11 +161,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -202,7 +202,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -278,32 +278,26 @@ fn main() -> Result<()> {
|
||||
if #[cfg(target_os = "linux")] {
|
||||
use std::env;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
|
||||
let vm_monitor_addr = matches
|
||||
.get_one::<String>("vm-monitor-addr")
|
||||
.expect("--vm-monitor-addr should always be set because it has a default arg");
|
||||
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
|
||||
let cgroup = matches.get_one::<String>("cgroup");
|
||||
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
|
||||
|
||||
// Only make a runtime if we need to.
|
||||
// Note: it seems like you can make a runtime in an inner scope and
|
||||
// if you start a task in it it won't be dropped. However, make it
|
||||
// in the outermost scope just to be safe.
|
||||
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
|
||||
(None, None) => None,
|
||||
(None, Some(_)) => {
|
||||
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
|
||||
None
|
||||
}
|
||||
(Some(_), None) => {
|
||||
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
|
||||
}
|
||||
(Some(_), Some(_)) => Some(
|
||||
let rt = if env::var_os("AUTOSCALING").is_some() {
|
||||
Some(
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.worker_threads(4)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create tokio runtime for monitor"),
|
||||
),
|
||||
.expect("failed to create tokio runtime for monitor")
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
@@ -314,8 +308,7 @@ fn main() -> Result<()> {
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: cgroup.cloned(),
|
||||
pgconnstr: file_cache_connstr.cloned(),
|
||||
addr: vm_monitor_addr.cloned().unwrap(),
|
||||
file_cache_on_disk,
|
||||
addr: vm_monitor_addr.clone(),
|
||||
})),
|
||||
token.clone(),
|
||||
))
|
||||
@@ -487,6 +480,8 @@ fn cli() -> clap::Command {
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
// DEPRECATED, NO LONGER DOES ANYTHING.
|
||||
// See https://github.com/neondatabase/cloud/issues/7516
|
||||
Arg::new("file-cache-on-disk")
|
||||
.long("file-cache-on-disk")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
|
||||
@@ -24,7 +24,7 @@ fn do_control_plane_request(
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", jwt)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
.send()
|
||||
.map_err(|e| {
|
||||
(
|
||||
|
||||
@@ -12,6 +12,7 @@ use hyper::{Body, Request, Response};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
@@ -221,8 +222,25 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if attach_req.node_id.is_some() {
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
tenant_state.generation += 1;
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
ps_id = %attaching_pageserver,
|
||||
generation = %tenant_state.generation,
|
||||
"issuing",
|
||||
);
|
||||
} else if let Some(ps_id) = tenant_state.pageserver {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
%ps_id,
|
||||
generation = %tenant_state.generation,
|
||||
"dropping",
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
"no-op: tenant already has no pageserver");
|
||||
}
|
||||
tenant_state.pageserver = attach_req.node_id;
|
||||
let generation = tenant_state.generation;
|
||||
@@ -240,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", handle_re_attach)
|
||||
.post("/validate", handle_validate)
|
||||
.post("/attach-hook", handle_attach_hook)
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
|
||||
108
docs/updating-postgres.md
Normal file
108
docs/updating-postgres.md
Normal file
@@ -0,0 +1,108 @@
|
||||
# Updating Postgres
|
||||
|
||||
## Minor Versions
|
||||
|
||||
When upgrading to a new minor version of Postgres, please follow these steps:
|
||||
|
||||
_Example: 15.4 is the new minor version to upgrade to from 15.3._
|
||||
|
||||
1. Clone the Neon Postgres repository if you have not done so already.
|
||||
|
||||
```shell
|
||||
git clone git@github.com:neondatabase/postgres.git
|
||||
```
|
||||
|
||||
1. Add the Postgres upstream remote.
|
||||
|
||||
```shell
|
||||
git remote add upstream https://git.postgresql.org/git/postgresql.git
|
||||
```
|
||||
|
||||
1. Create a new branch based on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git checkout -b my-branch REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
1. Tag the last commit on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git tag REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Push the new tag to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||
|
||||
```shell
|
||||
git fetch upstream REL_15_4
|
||||
git rebase REL_15_4
|
||||
```
|
||||
|
||||
1. Run the Postgres test suite to make sure our commits have not affected
|
||||
Postgres in a negative way.
|
||||
|
||||
```shell
|
||||
make check
|
||||
# OR
|
||||
meson test -C builddir
|
||||
```
|
||||
|
||||
1. Push your branch to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch
|
||||
```
|
||||
|
||||
1. Clone the Neon repository if you have not done so already.
|
||||
|
||||
```shell
|
||||
git clone git@github.com:neondatabase/neon.git
|
||||
```
|
||||
|
||||
1. Create a new branch.
|
||||
|
||||
1. Change the `revisions.json` file to point at the HEAD of your Postgres
|
||||
branch.
|
||||
|
||||
1. Update the Git submodule.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||
git submodule update --remote vendor/postgres-v15
|
||||
```
|
||||
|
||||
1. Run the Neon test suite to make sure that Neon is still good to go on this
|
||||
minor Postgres release.
|
||||
|
||||
```shell
|
||||
./scripts/poetry -k pg15
|
||||
```
|
||||
|
||||
1. Commit your changes.
|
||||
|
||||
1. Create a pull request, and wait for CI to go green.
|
||||
|
||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push --force origin my-branch:REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
It may require disabling various branch protections.
|
||||
|
||||
1. Update your Neon PR to point at the branches.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
|
||||
git commit --amend --no-edit
|
||||
git push --force origin
|
||||
```
|
||||
|
||||
1. Merge the pull request after getting approval(s) and CI completion.
|
||||
@@ -1,4 +1,7 @@
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex, MutexGuard,
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
@@ -10,6 +13,7 @@ use tokio::sync::Semaphore;
|
||||
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
||||
pub struct OnceCell<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
initializers: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<T> Default for OnceCell<T> {
|
||||
@@ -17,6 +21,7 @@ impl<T> Default for OnceCell<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -49,6 +54,7 @@ impl<T> OnceCell<T> {
|
||||
init_semaphore: Arc::new(sem),
|
||||
value: Some(value),
|
||||
}),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,8 +66,8 @@ impl<T> OnceCell<T> {
|
||||
/// Initialization is panic-safe and cancellation-safe.
|
||||
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
|
||||
where
|
||||
F: FnOnce() -> Fut,
|
||||
Fut: std::future::Future<Output = Result<T, E>>,
|
||||
F: FnOnce(InitPermit) -> Fut,
|
||||
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
||||
{
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
@@ -71,29 +77,61 @@ impl<T> OnceCell<T> {
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
let permit = sem.acquire_owned().await;
|
||||
if permit.is_err() {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
} else {
|
||||
// now we try
|
||||
let value = factory().await?;
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire_owned().await
|
||||
};
|
||||
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_none(),
|
||||
"we won permit, must not be initialized"
|
||||
);
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Ok(Guard(guard))
|
||||
match permit {
|
||||
Ok(permit) => {
|
||||
let permit = InitPermit(permit);
|
||||
let (value, _permit) = factory(permit).await?;
|
||||
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
Ok(Self::set0(value, guard))
|
||||
}
|
||||
Err(_closed) => {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the inner has already been initialized.
|
||||
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
|
||||
// give more permits right now.
|
||||
if guard.init_semaphore.try_acquire().is_ok() {
|
||||
drop(guard);
|
||||
panic!("permit is of wrong origin");
|
||||
}
|
||||
|
||||
Self::set0(value, guard)
|
||||
}
|
||||
|
||||
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
|
||||
if guard.value.is_some() {
|
||||
drop(guard);
|
||||
unreachable!("we won permit, must not be initialized");
|
||||
}
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Guard(guard)
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, if any.
|
||||
pub fn get(&self) -> Option<Guard<'_, T>> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
@@ -103,6 +141,28 @@ impl<T> OnceCell<T> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||
pub fn initializer_count(&self) -> usize {
|
||||
self.initializers.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
|
||||
/// initializing task for example at the end of initialization.
|
||||
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
|
||||
|
||||
impl<'a, T> CountWaitingInitializers<'a, T> {
|
||||
fn start(target: &'a OnceCell<T>) -> Self {
|
||||
target.initializers.fetch_add(1, Ordering::Relaxed);
|
||||
CountWaitingInitializers(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
||||
@@ -135,7 +195,7 @@ impl<'a, T> Guard<'a, T> {
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let permit = swapped
|
||||
.init_semaphore
|
||||
@@ -145,11 +205,14 @@ impl<'a, T> Guard<'a, T> {
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, permit))
|
||||
.map(|v| (v, InitPermit(permit)))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
/// Type held by OnceCell (de)initializing task.
|
||||
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -185,11 +248,11 @@ mod tests {
|
||||
barrier.wait().await;
|
||||
let won = {
|
||||
let g = cell
|
||||
.get_or_init(|| {
|
||||
.get_or_init(|permit| {
|
||||
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
|
||||
async {
|
||||
counters.future_polled.fetch_add(1, Ordering::Relaxed);
|
||||
Ok::<_, Infallible>(i)
|
||||
Ok::<_, Infallible>((i, permit))
|
||||
}
|
||||
})
|
||||
.await
|
||||
@@ -243,7 +306,7 @@ mod tests {
|
||||
deinitialization_started.wait().await;
|
||||
|
||||
let started_at = tokio::time::Instant::now();
|
||||
cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
|
||||
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
@@ -258,18 +321,32 @@ mod tests {
|
||||
assert_eq!(*cell.get().unwrap(), reinit);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reinit_with_deinit_permit() {
|
||||
let cell = Arc::new(OnceCell::new(42));
|
||||
|
||||
let (mol, permit) = cell.get().unwrap().take_and_deinit();
|
||||
cell.set(5, permit);
|
||||
assert_eq!(*cell.get().unwrap(), 5);
|
||||
|
||||
let (five, permit) = cell.get().unwrap().take_and_deinit();
|
||||
assert_eq!(5, five);
|
||||
cell.set(mol, permit);
|
||||
assert_eq!(*cell.get().unwrap(), 42);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn initialization_attemptable_until_ok() {
|
||||
let cell = OnceCell::default();
|
||||
|
||||
for _ in 0..10 {
|
||||
cell.get_or_init(|| async { Err("whatever error") })
|
||||
cell.get_or_init(|_permit| async { Err("whatever error") })
|
||||
.await
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|| async { Ok::<_, Infallible>("finally success") })
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "finally success");
|
||||
@@ -281,11 +358,11 @@ mod tests {
|
||||
|
||||
let barrier = tokio::sync::Barrier::new(2);
|
||||
|
||||
let initializer = cell.get_or_init(|| async {
|
||||
let initializer = cell.get_or_init(|permit| async {
|
||||
barrier.wait().await;
|
||||
futures::future::pending::<()>().await;
|
||||
|
||||
Ok::<_, Infallible>("never reached")
|
||||
Ok::<_, Infallible>(("never reached", permit))
|
||||
});
|
||||
|
||||
tokio::select! {
|
||||
@@ -298,7 +375,7 @@ mod tests {
|
||||
assert!(cell.get().is_none());
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "now initialized");
|
||||
|
||||
@@ -21,11 +21,6 @@ pub struct FileCacheState {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheConfig {
|
||||
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
|
||||
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
|
||||
/// memory available for the cgroup.
|
||||
pub(crate) in_memory: bool,
|
||||
|
||||
/// The size of the file cache, in terms of the size of the resource it consumes
|
||||
/// (currently: only memory)
|
||||
///
|
||||
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
|
||||
spread_factor: f64,
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
pub fn default_in_memory() -> Self {
|
||||
impl Default for FileCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
in_memory: true,
|
||||
// 75 %
|
||||
resource_multiplier: 0.75,
|
||||
// 640 MiB; (512 + 128)
|
||||
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
|
||||
// ensure any increase in file cache size is split 90-10 with 10% to other memory
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn default_on_disk() -> Self {
|
||||
Self {
|
||||
in_memory: false,
|
||||
resource_multiplier: 0.75,
|
||||
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
|
||||
// memory, the kernel will just evict from its page cache, rather than e.g. killing
|
||||
@@ -83,7 +65,9 @@ impl FileCacheConfig {
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
/// Make sure fields of the config are consistent.
|
||||
pub fn validate(&self) -> anyhow::Result<()> {
|
||||
// Single field validity
|
||||
|
||||
@@ -39,16 +39,6 @@ pub struct Args {
|
||||
#[arg(short, long)]
|
||||
pub pgconnstr: Option<String>,
|
||||
|
||||
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
|
||||
/// kernel's page cache), and therefore should not count against available memory.
|
||||
//
|
||||
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
|
||||
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
|
||||
// during the switch away from an in-memory file cache, we had to default to the previous
|
||||
// behavior.
|
||||
#[arg(long)]
|
||||
pub file_cache_on_disk: bool,
|
||||
|
||||
/// The address we should listen on for connection requests. For the
|
||||
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
|
||||
#[arg(short, long)]
|
||||
|
||||
@@ -156,10 +156,7 @@ impl Runner {
|
||||
// memory limits.
|
||||
if let Some(connstr) = &args.pgconnstr {
|
||||
info!("initializing file cache");
|
||||
let config = match args.file_cache_on_disk {
|
||||
true => FileCacheConfig::default_on_disk(),
|
||||
false => FileCacheConfig::default_in_memory(),
|
||||
};
|
||||
let config = FileCacheConfig::default();
|
||||
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
|
||||
.await
|
||||
@@ -187,10 +184,7 @@ impl Runner {
|
||||
info!("file cache size actually got set to {actual_size}")
|
||||
}
|
||||
|
||||
if args.file_cache_on_disk {
|
||||
file_cache_disk_size = actual_size;
|
||||
}
|
||||
|
||||
file_cache_disk_size = actual_size;
|
||||
state.filecache = Some(file_cache);
|
||||
}
|
||||
|
||||
@@ -239,17 +233,11 @@ impl Runner {
|
||||
|
||||
let requested_mem = target.mem;
|
||||
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
let (expected_file_cache_size, expected_file_cache_disk_size) = self
|
||||
let expected_file_cache_size = self
|
||||
.filecache
|
||||
.as_ref()
|
||||
.map(|file_cache| {
|
||||
let size = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
match file_cache.config.in_memory {
|
||||
true => (size, 0),
|
||||
false => (size, size),
|
||||
}
|
||||
})
|
||||
.unwrap_or((0, 0));
|
||||
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
|
||||
.unwrap_or(0);
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let (last_time, last_history) = *cgroup.watcher.borrow();
|
||||
|
||||
@@ -273,7 +261,7 @@ impl Runner {
|
||||
|
||||
let new_threshold = self
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
|
||||
.cgroup_threshold(usable_system_memory, expected_file_cache_size);
|
||||
|
||||
let current = last_history.avg_non_reclaimable;
|
||||
|
||||
@@ -300,13 +288,10 @@ impl Runner {
|
||||
.set_file_cache_size(expected_file_cache_size)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if !file_cache.config.in_memory {
|
||||
file_cache_disk_size = actual_usage;
|
||||
}
|
||||
file_cache_disk_size = actual_usage;
|
||||
let message = format!(
|
||||
"set file cache size to {} MiB (in memory = {})",
|
||||
"set file cache size to {} MiB",
|
||||
bytes_to_mebibytes(actual_usage),
|
||||
file_cache.config.in_memory,
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
@@ -357,9 +342,7 @@ impl Runner {
|
||||
.set_file_cache_size(expected_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if !file_cache.config.in_memory {
|
||||
file_cache_disk_size = actual_usage;
|
||||
}
|
||||
file_cache_disk_size = actual_usage;
|
||||
|
||||
if actual_usage != expected_usage {
|
||||
warn!(
|
||||
|
||||
@@ -57,7 +57,10 @@ impl ControlPlaneClient {
|
||||
|
||||
if let Some(jwt) = &conf.control_plane_api_token {
|
||||
let mut headers = hyper::HeaderMap::new();
|
||||
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
|
||||
headers.insert(
|
||||
"Authorization",
|
||||
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
|
||||
);
|
||||
client = client.default_headers(headers);
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
@@ -271,7 +272,9 @@ impl DeletionHeader {
|
||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
|
||||
.await
|
||||
.map_err(Into::into)
|
||||
.maybe_fatal_err("save deletion header")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -360,6 +363,7 @@ impl DeletionList {
|
||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion list")
|
||||
.map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
// The number of keys in a DeletionList before we will proactively persist it
|
||||
// (without reaching a flush deadline). This aims to deliver objects of the order
|
||||
@@ -195,7 +197,7 @@ impl ListWriter {
|
||||
debug!("Deletion header {header_path} not found, first start?");
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(anyhow::anyhow!(e))
|
||||
on_fatal_io_error(&e, "reading deletion header");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,16 +218,9 @@ impl ListWriter {
|
||||
self.pending.sequence = validated_sequence + 1;
|
||||
|
||||
let deletion_directory = self.conf.deletion_prefix();
|
||||
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
|
||||
|
||||
// Give up: if we can't read the deletion list directory, we probably can't
|
||||
// write lists into it later, so the queue won't work.
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
let mut dir = tokio::fs::read_dir(&deletion_directory)
|
||||
.await
|
||||
.fatal_err("read deletion directory");
|
||||
|
||||
let list_name_pattern =
|
||||
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
|
||||
@@ -233,7 +228,7 @@ impl ListWriter {
|
||||
let temp_extension = format!(".{TEMP_SUFFIX}");
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
let mut seqs: Vec<u64> = Vec::new();
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
|
||||
let file_name = dentry.file_name();
|
||||
let dentry_str = file_name.to_string_lossy();
|
||||
|
||||
@@ -246,11 +241,9 @@ impl ListWriter {
|
||||
info!("Cleaning up temporary file {dentry_str}");
|
||||
let absolute_path =
|
||||
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
|
||||
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
|
||||
// Non-fatal error: we will just leave the file behind but not
|
||||
// try and load it.
|
||||
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
|
||||
}
|
||||
tokio::fs::remove_file(&absolute_path)
|
||||
.await
|
||||
.fatal_err("delete temp file");
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -290,7 +283,9 @@ impl ListWriter {
|
||||
for s in seqs {
|
||||
let list_path = self.conf.deletion_list_path(s);
|
||||
|
||||
let list_bytes = tokio::fs::read(&list_path).await?;
|
||||
let list_bytes = tokio::fs::read(&list_path)
|
||||
.await
|
||||
.fatal_err("read deletion list");
|
||||
|
||||
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
|
||||
Ok(l) => l,
|
||||
|
||||
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::RetryForeverError;
|
||||
use crate::metrics;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
use super::deleter::DeleterMessage;
|
||||
use super::DeletionHeader;
|
||||
@@ -287,16 +288,9 @@ where
|
||||
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
|
||||
for list_path in list_paths {
|
||||
debug!("Removing deletion list {list_path}");
|
||||
|
||||
if let Err(e) = tokio::fs::remove_file(&list_path).await {
|
||||
// Unexpected: we should have permissions and nothing else should
|
||||
// be touching these files. We will leave the file behind. Subsequent
|
||||
// pageservers will try and load it again: hopefully whatever storage
|
||||
// issue (probably permissions) has been fixed by then.
|
||||
tracing::error!("Failed to delete {list_path}: {e:#}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
break;
|
||||
}
|
||||
tokio::fs::remove_file(&list_path)
|
||||
.await
|
||||
.fatal_err("remove deletion list");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -569,7 +569,17 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Tenant download is already in progress
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
and hence this `/attach` call has been rejected.
|
||||
|
||||
Some examples of how this can happen:
|
||||
- tenant was created on this pageserver
|
||||
- tenant attachment was started by an earlier call to `/attach`.
|
||||
|
||||
Callers should poll the tenant status's `attachment_status` field,
|
||||
like for status 202. See the longer description for `POST /attach`
|
||||
for details.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
|
||||
@@ -125,6 +125,7 @@ impl Layer {
|
||||
let inner = Arc::new(DownloadedLayer {
|
||||
owner: owner.clone(),
|
||||
kind: tokio::sync::OnceCell::default(),
|
||||
version: 0,
|
||||
});
|
||||
resident = Some(inner.clone());
|
||||
|
||||
@@ -163,6 +164,7 @@ impl Layer {
|
||||
let inner = Arc::new(DownloadedLayer {
|
||||
owner: owner.clone(),
|
||||
kind: tokio::sync::OnceCell::default(),
|
||||
version: 0,
|
||||
});
|
||||
resident = Some(inner.clone());
|
||||
let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
|
||||
@@ -328,42 +330,46 @@ impl Layer {
|
||||
/// read with [`Layer::get_value_reconstruct_data`].
|
||||
///
|
||||
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
|
||||
#[derive(Debug)]
|
||||
enum ResidentOrWantedEvicted {
|
||||
Resident(Arc<DownloadedLayer>),
|
||||
WantedEvicted(Weak<DownloadedLayer>),
|
||||
WantedEvicted(Weak<DownloadedLayer>, usize),
|
||||
}
|
||||
|
||||
impl ResidentOrWantedEvicted {
|
||||
fn get(&self) -> Option<Arc<DownloadedLayer>> {
|
||||
fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
|
||||
match self {
|
||||
ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
|
||||
ResidentOrWantedEvicted::WantedEvicted(weak) => match weak.upgrade() {
|
||||
ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
|
||||
ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
|
||||
Some(strong) => {
|
||||
LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
|
||||
Some(strong)
|
||||
|
||||
*self = ResidentOrWantedEvicted::Resident(strong.clone());
|
||||
|
||||
Some((strong, true))
|
||||
}
|
||||
None => None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// When eviction is first requested, drop down to holding a [`Weak`].
|
||||
///
|
||||
/// Returns `true` if this was the first time eviction was requested.
|
||||
fn downgrade(&mut self) -> &Weak<DownloadedLayer> {
|
||||
let _was_first = match self {
|
||||
/// Returns `Some` if this was the first time eviction was requested. Care should be taken to
|
||||
/// drop the possibly last strong reference outside of the mutex of
|
||||
/// heavier_once_cell::OnceCell.
|
||||
fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
|
||||
match self {
|
||||
ResidentOrWantedEvicted::Resident(strong) => {
|
||||
let weak = Arc::downgrade(strong);
|
||||
*self = ResidentOrWantedEvicted::WantedEvicted(weak);
|
||||
// returning the weak is not useful, because the drop could had already ran with
|
||||
// the replacement above, and that will take care of cleaning the Option we are in
|
||||
true
|
||||
let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
|
||||
std::mem::swap(self, &mut temp);
|
||||
match temp {
|
||||
ResidentOrWantedEvicted::Resident(strong) => Some(strong),
|
||||
ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
|
||||
}
|
||||
}
|
||||
ResidentOrWantedEvicted::WantedEvicted(_) => false,
|
||||
};
|
||||
|
||||
match self {
|
||||
ResidentOrWantedEvicted::WantedEvicted(ref weak) => weak,
|
||||
_ => unreachable!("just wrote wanted evicted"),
|
||||
ResidentOrWantedEvicted::WantedEvicted(..) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -398,11 +404,17 @@ struct LayerInner {
|
||||
/// [`LayerInner::on_downloaded_layer_drop`].
|
||||
wanted_evicted: AtomicBool,
|
||||
|
||||
/// Version is to make sure we will in fact only evict a file if no new download has been
|
||||
/// started.
|
||||
/// Version is to make sure we will only evict a specific download of a file.
|
||||
///
|
||||
/// Incremented for each download, stored in `DownloadedLayer::version` or
|
||||
/// `ResidentOrWantedEvicted::WantedEvicted`.
|
||||
version: AtomicUsize,
|
||||
|
||||
/// Allow subscribing to when the layer actually gets evicted.
|
||||
///
|
||||
/// If in future we need to implement "wait until layer instances are gone and done", carrying
|
||||
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
|
||||
/// method for "wait_gc" which will wait to this being closed.
|
||||
status: tokio::sync::broadcast::Sender<Status>,
|
||||
|
||||
/// Counter for exponential backoff with the download
|
||||
@@ -515,6 +527,14 @@ impl LayerInner {
|
||||
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
|
||||
.join(desc.filename().to_string());
|
||||
|
||||
let (inner, version) = if let Some(inner) = downloaded {
|
||||
let version = inner.version;
|
||||
let resident = ResidentOrWantedEvicted::Resident(inner);
|
||||
(heavier_once_cell::OnceCell::new(resident), version)
|
||||
} else {
|
||||
(heavier_once_cell::OnceCell::default(), 0)
|
||||
};
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
path,
|
||||
@@ -524,12 +544,8 @@ impl LayerInner {
|
||||
access_stats,
|
||||
wanted_garbage_collected: AtomicBool::new(false),
|
||||
wanted_evicted: AtomicBool::new(false),
|
||||
inner: if let Some(inner) = downloaded {
|
||||
heavier_once_cell::OnceCell::new(ResidentOrWantedEvicted::Resident(inner))
|
||||
} else {
|
||||
heavier_once_cell::OnceCell::default()
|
||||
},
|
||||
version: AtomicUsize::new(0),
|
||||
inner,
|
||||
version: AtomicUsize::new(version),
|
||||
status: tokio::sync::broadcast::channel(1).0,
|
||||
consecutive_failures: AtomicUsize::new(0),
|
||||
generation,
|
||||
@@ -549,6 +565,8 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancellation safe, however dropping the future and calling this method again might result
|
||||
/// in a new attempt to evict OR join the previously started attempt.
|
||||
pub(crate) async fn evict_and_wait(
|
||||
&self,
|
||||
_: &RemoteTimelineClient,
|
||||
@@ -559,20 +577,22 @@ impl LayerInner {
|
||||
|
||||
let mut rx = self.status.subscribe();
|
||||
|
||||
let res =
|
||||
self.wanted_evicted
|
||||
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
||||
let strong = {
|
||||
match self.inner.get() {
|
||||
Some(mut either) => {
|
||||
self.wanted_evicted.store(true, Ordering::Relaxed);
|
||||
either.downgrade()
|
||||
}
|
||||
None => return Err(EvictionError::NotFound),
|
||||
}
|
||||
};
|
||||
|
||||
if res.is_ok() {
|
||||
if strong.is_some() {
|
||||
// drop the DownloadedLayer outside of the holding the guard
|
||||
drop(strong);
|
||||
LAYER_IMPL_METRICS.inc_started_evictions();
|
||||
}
|
||||
|
||||
if self.get().is_none() {
|
||||
// it was not evictable in the first place
|
||||
// our store to the wanted_evicted does not matter; it will be reset by next download
|
||||
return Err(EvictionError::NotFound);
|
||||
}
|
||||
|
||||
match rx.recv().await {
|
||||
Ok(Status::Evicted) => Ok(()),
|
||||
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
|
||||
@@ -586,7 +606,8 @@ impl LayerInner {
|
||||
//
|
||||
// use however late (compared to the initial expressing of wanted) as the
|
||||
// "outcome" now
|
||||
match self.get() {
|
||||
LAYER_IMPL_METRICS.inc_broadcast_lagged();
|
||||
match self.inner.get() {
|
||||
Some(_) => Err(EvictionError::Downloaded),
|
||||
None => Ok(()),
|
||||
}
|
||||
@@ -594,17 +615,19 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Should be cancellation safe, but cancellation is troublesome together with the spawned
|
||||
/// download.
|
||||
/// Cancellation safe.
|
||||
#[tracing::instrument(skip_all, fields(layer=%self))]
|
||||
async fn get_or_maybe_download(
|
||||
self: &Arc<Self>,
|
||||
allow_download: bool,
|
||||
ctx: Option<&RequestContext>,
|
||||
) -> Result<Arc<DownloadedLayer>, DownloadError> {
|
||||
let mut init_permit = None;
|
||||
|
||||
loop {
|
||||
let download = move || async move {
|
||||
let download = move |permit| async move {
|
||||
// disable any scheduled but not yet running eviction deletions for this
|
||||
self.version.fetch_add(1, Ordering::Relaxed);
|
||||
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// no need to make the evict_and_wait wait for the actual download to complete
|
||||
drop(self.status.send(Status::Downloaded));
|
||||
@@ -623,7 +646,11 @@ impl LayerInner {
|
||||
.await
|
||||
.map_err(DownloadError::PreStatFailed)?;
|
||||
|
||||
if let Some(reason) = needs_download {
|
||||
let permit = if let Some(reason) = needs_download {
|
||||
if let NeedsDownload::NotFile(ft) = reason {
|
||||
return Err(DownloadError::NotFile(ft));
|
||||
}
|
||||
|
||||
// only reset this after we've decided we really need to download. otherwise it'd
|
||||
// be impossible to mark cancelled downloads for eviction, like one could imagine
|
||||
// we would like to do for prefetching which was not needed.
|
||||
@@ -633,8 +660,6 @@ impl LayerInner {
|
||||
return Err(DownloadError::NoRemoteStorage);
|
||||
}
|
||||
|
||||
tracing::debug!(%reason, "downloading layer");
|
||||
|
||||
if let Some(ctx) = ctx {
|
||||
self.check_expected_download(ctx)?;
|
||||
}
|
||||
@@ -645,16 +670,21 @@ impl LayerInner {
|
||||
return Err(DownloadError::DownloadRequired);
|
||||
}
|
||||
|
||||
self.spawn_download_and_wait(timeline).await?;
|
||||
tracing::info!(%reason, "downloading on-demand");
|
||||
|
||||
self.spawn_download_and_wait(timeline, permit).await?
|
||||
} else {
|
||||
// the file is present locally, probably by a previous but cancelled call to
|
||||
// get_or_maybe_download. alternatively we might be running without remote storage.
|
||||
LAYER_IMPL_METRICS.inc_init_needed_no_download();
|
||||
}
|
||||
|
||||
permit
|
||||
};
|
||||
|
||||
let res = Arc::new(DownloadedLayer {
|
||||
owner: Arc::downgrade(self),
|
||||
kind: tokio::sync::OnceCell::default(),
|
||||
version: next_version,
|
||||
});
|
||||
|
||||
self.access_stats.record_residence_event(
|
||||
@@ -662,19 +692,60 @@ impl LayerInner {
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
);
|
||||
|
||||
Ok(ResidentOrWantedEvicted::Resident(res))
|
||||
let waiters = self.inner.initializer_count();
|
||||
if waiters > 0 {
|
||||
tracing::info!(waiters, "completing the on-demand download for other tasks");
|
||||
}
|
||||
|
||||
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
||||
};
|
||||
|
||||
let locked = self.inner.get_or_init(download).await?;
|
||||
|
||||
if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
|
||||
{
|
||||
if let Some(init_permit) = init_permit.take() {
|
||||
// use the already held initialization permit because it is impossible to hit the
|
||||
// below paths anymore essentially limiting the max loop iterations to 2.
|
||||
let (value, init_permit) = download(init_permit).await?;
|
||||
let mut guard = self.inner.set(value, init_permit);
|
||||
let (strong, _upgraded) = guard
|
||||
.get_and_upgrade()
|
||||
.expect("init creates strong reference, we held the init permit");
|
||||
return Ok(strong);
|
||||
}
|
||||
|
||||
// the situation in which we might need to retry is that our init was ready
|
||||
// immediatedly, but the DownloadedLayer had been dropped BUT failed to complete
|
||||
// Self::evict_blocking
|
||||
let (weak, permit) = {
|
||||
let mut locked = self.inner.get_or_init(download).await?;
|
||||
|
||||
if let Some((strong, upgraded)) = locked.get_and_upgrade() {
|
||||
if upgraded {
|
||||
// when upgraded back, the Arc<DownloadedLayer> is still available, but
|
||||
// previously a `evict_and_wait` was received.
|
||||
self.wanted_evicted.store(false, Ordering::Relaxed);
|
||||
|
||||
// error out any `evict_and_wait`
|
||||
drop(self.status.send(Status::Downloaded));
|
||||
LAYER_IMPL_METRICS
|
||||
.inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
|
||||
}
|
||||
|
||||
return Ok(strong);
|
||||
} else {
|
||||
// path to here: the evict_blocking is stuck on spawn_blocking queue.
|
||||
//
|
||||
// reset the contents, deactivating the eviction and causing a
|
||||
// EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
|
||||
locked.take_and_deinit()
|
||||
}
|
||||
};
|
||||
|
||||
// unlock first, then drop the weak, but because upgrade failed, we
|
||||
// know it cannot be a problem.
|
||||
|
||||
assert!(
|
||||
matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
|
||||
"unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
|
||||
);
|
||||
|
||||
init_permit = Some(permit);
|
||||
|
||||
LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
|
||||
}
|
||||
}
|
||||
@@ -686,8 +757,8 @@ impl LayerInner {
|
||||
match b {
|
||||
Download => Ok(()),
|
||||
Warn | Error => {
|
||||
tracing::warn!(
|
||||
"unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
|
||||
tracing::info!(
|
||||
"unexpectedly on-demand downloading for task kind {:?}",
|
||||
ctx.task_kind()
|
||||
);
|
||||
crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
|
||||
@@ -709,14 +780,17 @@ impl LayerInner {
|
||||
async fn spawn_download_and_wait(
|
||||
self: &Arc<Self>,
|
||||
timeline: Arc<Timeline>,
|
||||
) -> Result<(), DownloadError> {
|
||||
permit: heavier_once_cell::InitPermit,
|
||||
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
|
||||
let task_name = format!("download layer {}", self);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
|
||||
// block tenant::mgr::remove_tenant_from_memory.
|
||||
|
||||
let this: Arc<Self> = self.clone();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||
@@ -725,6 +799,7 @@ impl LayerInner {
|
||||
&task_name,
|
||||
false,
|
||||
async move {
|
||||
|
||||
let client = timeline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
@@ -746,9 +821,9 @@ impl LayerInner {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(res) = tx.send(result) {
|
||||
if let Err(res) = tx.send((result, permit)) {
|
||||
match res {
|
||||
Ok(()) => {
|
||||
(Ok(()), _) => {
|
||||
// our caller is cancellation safe so this is fine; if someone
|
||||
// else requests the layer, they'll find it already downloaded
|
||||
// or redownload.
|
||||
@@ -759,7 +834,7 @@ impl LayerInner {
|
||||
tracing::info!("layer file download completed after requester had cancelled");
|
||||
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
|
||||
},
|
||||
Err(e) => {
|
||||
(Err(e), _) => {
|
||||
// our caller is cancellation safe, but we might be racing with
|
||||
// another attempt to initialize. before we have cancellation
|
||||
// token support: these attempts should converge regardless of
|
||||
@@ -775,7 +850,7 @@ impl LayerInner {
|
||||
.in_current_span(),
|
||||
);
|
||||
match rx.await {
|
||||
Ok(Ok(())) => {
|
||||
Ok((Ok(()), permit)) => {
|
||||
if let Some(reason) = self
|
||||
.needs_download()
|
||||
.await
|
||||
@@ -786,10 +861,12 @@ impl LayerInner {
|
||||
}
|
||||
|
||||
self.consecutive_failures.store(0, Ordering::Relaxed);
|
||||
tracing::info!("on-demand download successful");
|
||||
|
||||
Ok(())
|
||||
Ok(permit)
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
Ok((Err(e), _permit)) => {
|
||||
// FIXME: this should be with the spawned task and be cancellation sensitive
|
||||
let consecutive_failures =
|
||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
||||
@@ -807,33 +884,6 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// Access the current state without waiting for the file to be downloaded.
|
||||
///
|
||||
/// Requires that we've initialized to state which is respective to the
|
||||
/// actual residency state.
|
||||
fn get(&self) -> Option<Arc<DownloadedLayer>> {
|
||||
let locked = self.inner.get();
|
||||
Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
|
||||
}
|
||||
|
||||
fn get_or_apply_evictedness(
|
||||
guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
|
||||
wanted_evicted: &AtomicBool,
|
||||
) -> Option<Arc<DownloadedLayer>> {
|
||||
if let Some(mut x) = guard {
|
||||
if let Some(won) = x.get() {
|
||||
// there are no guarantees that we will always get to observe a concurrent call
|
||||
// to evict
|
||||
if wanted_evicted.load(Ordering::Acquire) {
|
||||
x.downgrade();
|
||||
}
|
||||
return Some(won);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
|
||||
match tokio::fs::metadata(&self.path).await {
|
||||
Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
|
||||
@@ -853,7 +903,7 @@ impl LayerInner {
|
||||
fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
|
||||
// in future, this should include sha2-256 validation of the file.
|
||||
if !m.is_file() {
|
||||
Err(NeedsDownload::NotFile)
|
||||
Err(NeedsDownload::NotFile(m.file_type()))
|
||||
} else if m.len() != self.desc.file_size {
|
||||
Err(NeedsDownload::WrongSize {
|
||||
actual: m.len(),
|
||||
@@ -867,7 +917,9 @@ impl LayerInner {
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.desc.filename().file_name();
|
||||
|
||||
let remote = self.get().is_none();
|
||||
// this is not accurate: we could have the file locally but there was a cancellation
|
||||
// and now we are not in sync, or we are currently downloading it.
|
||||
let remote = self.inner.get().is_none();
|
||||
|
||||
let access_stats = self.access_stats.as_api_model(reset);
|
||||
|
||||
@@ -896,7 +948,7 @@ impl LayerInner {
|
||||
}
|
||||
|
||||
/// `DownloadedLayer` is being dropped, so it calls this method.
|
||||
fn on_downloaded_layer_drop(self: Arc<LayerInner>) {
|
||||
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
|
||||
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
|
||||
let evict = self.wanted_evicted.load(Ordering::Acquire);
|
||||
let can_evict = self.have_remote_client;
|
||||
@@ -904,15 +956,16 @@ impl LayerInner {
|
||||
if gc {
|
||||
// do nothing now, only in LayerInner::drop
|
||||
} else if can_evict && evict {
|
||||
let version = self.version.load(Ordering::Relaxed);
|
||||
|
||||
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self);
|
||||
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
||||
|
||||
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
||||
// hold it alive.
|
||||
let this = Arc::downgrade(&self);
|
||||
drop(self);
|
||||
|
||||
// NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
|
||||
// drop while the `self.inner` is being locked, leading to a deadlock.
|
||||
|
||||
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||
let _g = span.entered();
|
||||
|
||||
@@ -922,19 +975,15 @@ impl LayerInner {
|
||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||
return;
|
||||
};
|
||||
this.evict_blocking(version);
|
||||
match this.evict_blocking(version) {
|
||||
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
|
||||
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn evict_blocking(&self, version: usize) {
|
||||
match self.evict_blocking0(version) {
|
||||
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
|
||||
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
|
||||
}
|
||||
}
|
||||
|
||||
fn evict_blocking0(&self, version: usize) -> Result<(), EvictionCancelled> {
|
||||
fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
|
||||
// deleted or detached timeline, don't do anything.
|
||||
let Some(timeline) = self.timeline.upgrade() else {
|
||||
return Err(EvictionCancelled::TimelineGone);
|
||||
@@ -945,32 +994,34 @@ impl LayerInner {
|
||||
let _permit = {
|
||||
let maybe_downloaded = self.inner.get();
|
||||
|
||||
if version != self.version.load(Ordering::Relaxed) {
|
||||
// downloadness-state has advanced, we might no longer be the latest eviction
|
||||
// work; don't do anything.
|
||||
//
|
||||
// this is possible to get to by having:
|
||||
//
|
||||
// 1. wanted_evicted.store(true)
|
||||
// 2. ResidentOrWantedEvicted::downgrade
|
||||
// 3. DownloadedLayer::drop
|
||||
// 4. LayerInner::get_or_maybe_download
|
||||
// 5. LayerInner::evict_blocking
|
||||
return Err(EvictionCancelled::VersionCheckFailed);
|
||||
}
|
||||
|
||||
// free the DownloadedLayer allocation
|
||||
match maybe_downloaded.map(|mut g| g.take_and_deinit()) {
|
||||
Some((taken, permit)) => {
|
||||
assert!(matches!(taken, ResidentOrWantedEvicted::WantedEvicted(_)));
|
||||
permit
|
||||
let (_weak, permit) = match maybe_downloaded {
|
||||
Some(mut guard) => {
|
||||
if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
|
||||
if *version == only_version {
|
||||
guard.take_and_deinit()
|
||||
} else {
|
||||
// this was not for us; maybe there's another eviction job
|
||||
// TODO: does it make any sense to stall here? unique versions do not
|
||||
// matter, we only want to make sure not to evict a resident, which we
|
||||
// are not doing.
|
||||
return Err(EvictionCancelled::VersionCheckFailed);
|
||||
}
|
||||
} else {
|
||||
return Err(EvictionCancelled::AlreadyReinitialized);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
unreachable!("we do the version checking for this exact reason")
|
||||
// already deinitialized, perhaps get_or_maybe_download did this and is
|
||||
// currently waiting to reinitialize it
|
||||
return Err(EvictionCancelled::LostToDownload);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
permit
|
||||
};
|
||||
|
||||
// now accesses to inner.get_or_init wait on the semaphore or the `_permit`
|
||||
|
||||
self.access_stats.record_residence_event(
|
||||
LayerResidenceStatus::Evicted,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
@@ -1003,11 +1054,14 @@ impl LayerInner {
|
||||
Ok(())
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
tracing::info!("failed to evict file from disk, it was already gone");
|
||||
tracing::error!(
|
||||
layer_size = %self.desc.file_size,
|
||||
"failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
|
||||
);
|
||||
Err(EvictionCancelled::FileNotFound)
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to evict file from disk: {e:#}");
|
||||
tracing::error!("failed to evict file from disk: {e:#}");
|
||||
Err(EvictionCancelled::RemoveFailed)
|
||||
}
|
||||
};
|
||||
@@ -1051,6 +1105,8 @@ enum DownloadError {
|
||||
ContextAndConfigReallyDeniesDownloads,
|
||||
#[error("downloading is really required but not allowed by this method")]
|
||||
DownloadRequired,
|
||||
#[error("layer path exists, but it is not a file: {0:?}")]
|
||||
NotFile(std::fs::FileType),
|
||||
/// Why no error here? Because it will be reported by page_service. We should had also done
|
||||
/// retries already.
|
||||
#[error("downloading evicted layer file failed")]
|
||||
@@ -1066,7 +1122,7 @@ enum DownloadError {
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) enum NeedsDownload {
|
||||
NotFound,
|
||||
NotFile,
|
||||
NotFile(std::fs::FileType),
|
||||
WrongSize { actual: u64, expected: u64 },
|
||||
}
|
||||
|
||||
@@ -1074,7 +1130,7 @@ impl std::fmt::Display for NeedsDownload {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
NeedsDownload::NotFound => write!(f, "file was not found"),
|
||||
NeedsDownload::NotFile => write!(f, "path is not a file"),
|
||||
NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
|
||||
NeedsDownload::WrongSize { actual, expected } => {
|
||||
write!(f, "file size mismatch {actual} vs. {expected}")
|
||||
}
|
||||
@@ -1085,7 +1141,10 @@ impl std::fmt::Display for NeedsDownload {
|
||||
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
|
||||
pub(crate) struct DownloadedLayer {
|
||||
owner: Weak<LayerInner>,
|
||||
// Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
|
||||
// DownloadedLayer
|
||||
kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
|
||||
version: usize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DownloadedLayer {
|
||||
@@ -1093,6 +1152,7 @@ impl std::fmt::Debug for DownloadedLayer {
|
||||
f.debug_struct("DownloadedLayer")
|
||||
// owner omitted because it is always "Weak"
|
||||
.field("kind", &self.kind)
|
||||
.field("version", &self.version)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -1100,7 +1160,7 @@ impl std::fmt::Debug for DownloadedLayer {
|
||||
impl Drop for DownloadedLayer {
|
||||
fn drop(&mut self) {
|
||||
if let Some(owner) = self.owner.upgrade() {
|
||||
owner.on_downloaded_layer_drop();
|
||||
owner.on_downloaded_layer_drop(self.version);
|
||||
} else {
|
||||
// no need to do anything, we are shutting down
|
||||
}
|
||||
@@ -1126,7 +1186,6 @@ impl DownloadedLayer {
|
||||
"these are the same, just avoiding the upgrade"
|
||||
);
|
||||
|
||||
// there is nothing async here, but it should be async
|
||||
let res = if owner.desc.is_delta {
|
||||
let summary = Some(delta_layer::Summary::expected(
|
||||
owner.desc.tenant_id,
|
||||
@@ -1225,6 +1284,8 @@ impl std::fmt::Debug for ResidentLayer {
|
||||
|
||||
impl ResidentLayer {
|
||||
/// Release the eviction guard, converting back into a plain [`Layer`].
|
||||
///
|
||||
/// You can access the [`Layer`] also by using `as_ref`.
|
||||
pub(crate) fn drop_eviction_guard(self) -> Layer {
|
||||
self.into()
|
||||
}
|
||||
@@ -1280,7 +1341,7 @@ impl AsRef<Layer> for ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
|
||||
/// Drop the eviction guard.
|
||||
impl From<ResidentLayer> for Layer {
|
||||
fn from(value: ResidentLayer) -> Self {
|
||||
value.owner
|
||||
@@ -1450,6 +1511,13 @@ impl LayerImplMetrics {
|
||||
.unwrap()
|
||||
.inc();
|
||||
}
|
||||
|
||||
fn inc_broadcast_lagged(&self) {
|
||||
self.rare_counters
|
||||
.get_metric_with_label_values(&["broadcast_lagged"])
|
||||
.unwrap()
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
enum EvictionCancelled {
|
||||
@@ -1458,6 +1526,11 @@ enum EvictionCancelled {
|
||||
VersionCheckFailed,
|
||||
FileNotFound,
|
||||
RemoveFailed,
|
||||
AlreadyReinitialized,
|
||||
/// Not evicted because of a pending reinitialization
|
||||
LostToDownload,
|
||||
/// After eviction, there was a new layer access which cancelled the eviction.
|
||||
UpgradedBackOnAccess,
|
||||
}
|
||||
|
||||
impl EvictionCancelled {
|
||||
@@ -1468,6 +1541,9 @@ impl EvictionCancelled {
|
||||
EvictionCancelled::VersionCheckFailed => "version_check_fail",
|
||||
EvictionCancelled::FileNotFound => "file_not_found",
|
||||
EvictionCancelled::RemoveFailed => "remove_failed",
|
||||
EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
|
||||
EvictionCancelled::LostToDownload => "lost_to_download",
|
||||
EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
use utils::fs_ext;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
@@ -173,37 +174,78 @@ impl OpenFiles {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CrashsafeOverwriteError {
|
||||
#[error("final path has no parent dir")]
|
||||
FinalPathHasNoParentDir,
|
||||
#[error("remove tempfile")]
|
||||
RemovePreviousTempfile(#[source] std::io::Error),
|
||||
#[error("create tempfile")]
|
||||
CreateTempfile(#[source] std::io::Error),
|
||||
#[error("write tempfile")]
|
||||
WriteContents(#[source] std::io::Error),
|
||||
#[error("sync tempfile")]
|
||||
SyncTempfile(#[source] std::io::Error),
|
||||
#[error("rename tempfile to final path")]
|
||||
RenameTempfileToFinalPath(#[source] std::io::Error),
|
||||
#[error("open final path parent dir")]
|
||||
OpenFinalPathParentDir(#[source] std::io::Error),
|
||||
#[error("sync final path parent dir")]
|
||||
SyncFinalPathParentDir(#[source] std::io::Error),
|
||||
/// Identify error types that should alwways terminate the process. Other
|
||||
/// error types may be elegible for retry.
|
||||
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
|
||||
use nix::errno::Errno::*;
|
||||
match e.raw_os_error().map(nix::errno::from_i32) {
|
||||
Some(EIO) => {
|
||||
// Terminate on EIO because we no longer trust the device to store
|
||||
// data safely, or to uphold persistence guarantees on fsync.
|
||||
true
|
||||
}
|
||||
Some(EROFS) => {
|
||||
// Terminate on EROFS because a filesystem is usually remounted
|
||||
// readonly when it has experienced some critical issue, so the same
|
||||
// logic as EIO applies.
|
||||
true
|
||||
}
|
||||
Some(EACCES) => {
|
||||
// Terminate on EACCESS because we should always have permissions
|
||||
// for our own data dir: if we don't, then we can't do our job and
|
||||
// need administrative intervention to fix permissions. Terminating
|
||||
// is the best way to make sure we stop cleanly rather than going
|
||||
// into infinite retry loops, and will make it clear to the outside
|
||||
// world that we need help.
|
||||
true
|
||||
}
|
||||
_ => {
|
||||
// Treat all other local file I/O errors are retryable. This includes:
|
||||
// - ENOSPC: we stay up and wait for eviction to free some space
|
||||
// - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
|
||||
// - WriteZero, Interrupted: these are used internally VirtualFile
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
impl CrashsafeOverwriteError {
|
||||
/// Returns true iff the new contents are durably stored.
|
||||
pub fn are_new_contents_durable(&self) -> bool {
|
||||
|
||||
/// Call this when the local filesystem gives us an error with an external
|
||||
/// cause: this includes EIO, EROFS, and EACCESS: all these indicate either
|
||||
/// bad storage or bad configuration, and we can't fix that from inside
|
||||
/// a running process.
|
||||
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
|
||||
tracing::error!("Fatal I/O error: {e}: {context})");
|
||||
std::process::abort();
|
||||
}
|
||||
|
||||
pub(crate) trait MaybeFatalIo<T> {
|
||||
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
|
||||
fn fatal_err(self, context: &str) -> T;
|
||||
}
|
||||
|
||||
impl<T> MaybeFatalIo<T> for std::io::Result<T> {
|
||||
/// Terminate the process if the result is an error of a fatal type, else pass it through
|
||||
///
|
||||
/// This is appropriate for writes, where we typically want to die on EIO/ACCES etc, but
|
||||
/// not on ENOSPC.
|
||||
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
|
||||
if let Err(e) = &self {
|
||||
if is_fatal_io_error(e) {
|
||||
on_fatal_io_error(e, context);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Terminate the process on any I/O error.
|
||||
///
|
||||
/// This is appropriate for reads on files that we know exist: they should always work.
|
||||
fn fatal_err(self, context: &str) -> T {
|
||||
match self {
|
||||
Self::FinalPathHasNoParentDir => false,
|
||||
Self::RemovePreviousTempfile(_) => false,
|
||||
Self::CreateTempfile(_) => false,
|
||||
Self::WriteContents(_) => false,
|
||||
Self::SyncTempfile(_) => false,
|
||||
Self::RenameTempfileToFinalPath(_) => false,
|
||||
Self::OpenFinalPathParentDir(_) => false,
|
||||
Self::SyncFinalPathParentDir(_) => true,
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
on_fatal_io_error(&e, context);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -284,15 +326,13 @@ impl VirtualFile {
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
) -> Result<(), CrashsafeOverwriteError> {
|
||||
) -> std::io::Result<()> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
nix::errno::Errno::EINVAL as i32,
|
||||
));
|
||||
};
|
||||
match std::fs::remove_file(tmp_path) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
|
||||
}
|
||||
std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
|
||||
let mut file = Self::open_with_options(
|
||||
tmp_path,
|
||||
OpenOptions::new()
|
||||
@@ -301,31 +341,20 @@ impl VirtualFile {
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true),
|
||||
)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
|
||||
file.write_all(content)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::WriteContents)?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
|
||||
.await?;
|
||||
file.write_all(content).await?;
|
||||
file.sync_all().await?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
std::fs::rename(tmp_path, final_path)
|
||||
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
|
||||
std::fs::rename(tmp_path, final_path)?;
|
||||
// Only open final path parent dirfd now, so that this operation only
|
||||
// ever holds one VirtualFile fd at a time. That's important because
|
||||
// the current `find_victim_slot` impl might pick the same slot for both
|
||||
// VirtualFile., and it eventually does a blocking write lock instead of
|
||||
// try_lock.
|
||||
let final_parent_dirfd =
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
|
||||
final_parent_dirfd
|
||||
.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
|
||||
final_parent_dirfd.sync_all().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -857,7 +857,8 @@ impl WalRedoProcess {
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
} else if in_revents.contains(PollFlags::POLLHUP) {
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
@@ -907,7 +908,8 @@ impl WalRedoProcess {
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
} else if out_revents.contains(PollFlags::POLLHUP) {
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ pub struct ConsoleError {
|
||||
#[derive(Deserialize)]
|
||||
pub struct GetRoleSecret {
|
||||
pub role_secret: Box<str>,
|
||||
pub allowed_ips: Option<Vec<Box<str>>>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -187,4 +188,31 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_wake_compute() -> anyhow::Result<()> {
|
||||
let json = json!({
|
||||
"address": "0.0.0.0",
|
||||
"aux": dummy_aux(),
|
||||
});
|
||||
let _: WakeCompute = serde_json::from_str(&json.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_get_role_secret() -> anyhow::Result<()> {
|
||||
// Empty `allowed_ips` field.
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
});
|
||||
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
||||
// Empty `allowed_ips` field.
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
"allowed_ips": ["8.8.8.8"],
|
||||
});
|
||||
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -49,7 +49,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", &self.jwt)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
@@ -94,7 +94,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", &self.jwt)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
|
||||
@@ -470,30 +470,26 @@ async fn query_to_json<T: GenericClient>(
|
||||
}
|
||||
.and_then(|s| s.parse::<i64>().ok());
|
||||
|
||||
let fields = if !rows.is_empty() {
|
||||
rows[0]
|
||||
.columns()
|
||||
.iter()
|
||||
.map(|c| {
|
||||
json!({
|
||||
"name": Value::String(c.name().to_owned()),
|
||||
"dataTypeID": Value::Number(c.type_().oid().into()),
|
||||
"tableID": c.table_oid(),
|
||||
"columnID": c.column_id(),
|
||||
"dataTypeSize": c.type_size(),
|
||||
"dataTypeModifier": c.type_modifier(),
|
||||
"format": "text",
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
let mut fields = vec![];
|
||||
let mut columns = vec![];
|
||||
|
||||
for c in row_stream.columns() {
|
||||
fields.push(json!({
|
||||
"name": Value::String(c.name().to_owned()),
|
||||
"dataTypeID": Value::Number(c.type_().oid().into()),
|
||||
"tableID": c.table_oid(),
|
||||
"columnID": c.column_id(),
|
||||
"dataTypeSize": c.type_size(),
|
||||
"dataTypeModifier": c.type_modifier(),
|
||||
"format": "text",
|
||||
}));
|
||||
columns.push(client.get_type(c.type_oid()).await?);
|
||||
}
|
||||
|
||||
// convert rows to JSON
|
||||
let rows = rows
|
||||
.iter()
|
||||
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
|
||||
.map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
@@ -514,22 +510,28 @@ async fn query_to_json<T: GenericClient>(
|
||||
//
|
||||
pub fn pg_text_row_to_json(
|
||||
row: &Row,
|
||||
columns: &[Type],
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> Result<Value, anyhow::Error> {
|
||||
let iter = row.columns().iter().enumerate().map(|(i, column)| {
|
||||
let name = column.name();
|
||||
let pg_value = row.as_text(i)?;
|
||||
let json_value = if raw_output {
|
||||
match pg_value {
|
||||
Some(v) => Value::String(v.to_string()),
|
||||
None => Value::Null,
|
||||
}
|
||||
} else {
|
||||
pg_text_to_json(pg_value, column.type_())?
|
||||
};
|
||||
Ok((name.to_string(), json_value))
|
||||
});
|
||||
let iter = row
|
||||
.columns()
|
||||
.iter()
|
||||
.zip(columns)
|
||||
.enumerate()
|
||||
.map(|(i, (column, typ))| {
|
||||
let name = column.name();
|
||||
let pg_value = row.as_text(i)?;
|
||||
let json_value = if raw_output {
|
||||
match pg_value {
|
||||
Some(v) => Value::String(v.to_string()),
|
||||
None => Value::Null,
|
||||
}
|
||||
} else {
|
||||
pg_text_to_json(pg_value, typ)?
|
||||
};
|
||||
Ok((name.to_string(), json_value))
|
||||
});
|
||||
|
||||
if array_mode {
|
||||
// drop keys and aggregate into array
|
||||
|
||||
@@ -81,7 +81,6 @@ FALLBACK_DURATION = {
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
|
||||
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
|
||||
"test_runner/performance/test_startup.py::test_startup": 890.114,
|
||||
"test_runner/performance/test_startup.py::test_startup_simple": 2.51,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
|
||||
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
|
||||
|
||||
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
|
||||
# this has been seen in the wild by tests with the below contradicting logging
|
||||
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
|
||||
# this seems like a mock_s3 issue
|
||||
log.warn(
|
||||
log.warning(
|
||||
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
|
||||
)
|
||||
keys = 0
|
||||
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
|
||||
# this has been seen in one case with mock_s3:
|
||||
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
|
||||
# looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
|
||||
log.warn(
|
||||
log.warning(
|
||||
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
|
||||
)
|
||||
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
from fixtures.compare_fixtures import NeonCompare, PgCompare
|
||||
from fixtures.pageserver.utils import wait_tenant_status_404
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.types import Lsn
|
||||
|
||||
|
||||
#
|
||||
@@ -18,6 +20,8 @@ from fixtures.pg_version import PgVersion
|
||||
def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
env = neon_with_baseline
|
||||
|
||||
start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
|
||||
|
||||
with closing(env.pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("create table huge (i int, j int);")
|
||||
@@ -31,6 +35,13 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
|
||||
env.report_peak_memory_use()
|
||||
env.report_size()
|
||||
|
||||
# Report amount of wal written. Useful for comparing vanilla wal format vs
|
||||
# neon wal format, measuring neon write amplification, etc.
|
||||
end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
|
||||
wal_written_bytes = end_lsn - start_lsn
|
||||
wal_written_mb = round(wal_written_bytes / (1024 * 1024))
|
||||
env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
|
||||
|
||||
# When testing neon, also check how long it takes the pageserver to reingest the
|
||||
# wal from safekeepers. If this number is close to total runtime, then the pageserver
|
||||
# is the bottleneck.
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
from contextlib import closing
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
@@ -81,49 +78,3 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
|
||||
# Imitate optimizations that console would do for the second start
|
||||
endpoint.respec(skip_pg_catalog_updates=True)
|
||||
|
||||
|
||||
# This test sometimes runs for longer than the global 5 minute timeout.
|
||||
@pytest.mark.timeout(900)
|
||||
def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Start
|
||||
env.neon_cli.create_branch("test_startup")
|
||||
with zenbenchmark.record_duration("startup_time"):
|
||||
endpoint = env.endpoints.create_start("test_startup")
|
||||
endpoint.safe_psql("select 1;")
|
||||
|
||||
# Restart
|
||||
endpoint.stop_and_destroy()
|
||||
with zenbenchmark.record_duration("restart_time"):
|
||||
endpoint.create_start("test_startup")
|
||||
endpoint.safe_psql("select 1;")
|
||||
|
||||
# Fill up
|
||||
num_rows = 1000000 # 30 MB
|
||||
num_tables = 100
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
for i in range(num_tables):
|
||||
cur.execute(f"create table t_{i} (i integer);")
|
||||
cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
|
||||
|
||||
# Read
|
||||
with zenbenchmark.record_duration("read_time"):
|
||||
endpoint.safe_psql("select * from t_0;")
|
||||
|
||||
# Read again
|
||||
with zenbenchmark.record_duration("second_read_time"):
|
||||
endpoint.safe_psql("select * from t_0;")
|
||||
|
||||
# Restart
|
||||
endpoint.stop_and_destroy()
|
||||
with zenbenchmark.record_duration("restart_with_data"):
|
||||
endpoint.create_start("test_startup")
|
||||
endpoint.safe_psql("select 1;")
|
||||
|
||||
# Read
|
||||
with zenbenchmark.record_duration("read_after_restart"):
|
||||
endpoint.safe_psql("select * from t_0;")
|
||||
|
||||
@@ -432,3 +432,47 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
|
||||
query(200, "BEGIN")
|
||||
pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
|
||||
assert pid1 != pid2
|
||||
|
||||
|
||||
@pytest.mark.timeout(60)
|
||||
def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
|
||||
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
|
||||
|
||||
static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
|
||||
|
||||
def query(status: int, query: str) -> Any:
|
||||
return static_proxy.http_query(
|
||||
query,
|
||||
[],
|
||||
user="http_auth",
|
||||
password="http",
|
||||
expected_code=status,
|
||||
)
|
||||
|
||||
# query generates a million rows - should hit the 10MB reponse limit quickly
|
||||
response = query(
|
||||
400,
|
||||
"select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
|
||||
)
|
||||
assert "response is too large (max is 10485760 bytes)" in response["message"]
|
||||
|
||||
|
||||
def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
|
||||
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
|
||||
|
||||
static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
|
||||
|
||||
def query(status: int, query: str) -> Any:
|
||||
return static_proxy.http_query(
|
||||
query,
|
||||
[],
|
||||
user="http_auth",
|
||||
password="http",
|
||||
expected_code=status,
|
||||
)
|
||||
|
||||
response = query(
|
||||
200,
|
||||
"select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
|
||||
)
|
||||
assert response["rows"][0]["data"] == ["foo", "bar", "baz"]
|
||||
|
||||
Reference in New Issue
Block a user