mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 05:00:38 +00:00
Compare commits
6 Commits
problame/b
...
jcsp/relat
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d004675e9b | ||
|
|
5774578fa7 | ||
|
|
d4d4741485 | ||
|
|
171663be86 | ||
|
|
858e93d91a | ||
|
|
275471a2ba |
5
.github/workflows/build_and_test.yml
vendored
5
.github/workflows/build_and_test.yml
vendored
@@ -852,7 +852,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.19.0
|
||||
VM_BUILDER_VERSION: v0.18.5
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -874,7 +874,8 @@ jobs:
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder \
|
||||
-spec=vm-image-spec.yaml \
|
||||
-enable-file-cache \
|
||||
-cgroup-uid=postgres \
|
||||
-src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
|
||||
-dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
|
||||
@@ -9,24 +9,6 @@ refactoring, additional comments, and so forth. Let's try to raise the
|
||||
bar, and clean things up as we go. Try to leave code in a better shape
|
||||
than it was before.
|
||||
|
||||
## Pre-commit hook
|
||||
|
||||
We have a sample pre-commit hook in `pre-commit.py`.
|
||||
To set it up, run:
|
||||
|
||||
```bash
|
||||
ln -s ../../pre-commit.py .git/hooks/pre-commit
|
||||
```
|
||||
|
||||
This will run following checks on staged files before each commit:
|
||||
- `rustfmt`
|
||||
- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
|
||||
|
||||
There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
|
||||
and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
|
||||
|
||||
If you want to skip the hook, run `git commit` with `--no-verify` option.
|
||||
|
||||
## Submitting changes
|
||||
|
||||
1. Get at least one +1 on your PR before you push.
|
||||
|
||||
76
Cargo.lock
generated
76
Cargo.lock
generated
@@ -1955,20 +1955,6 @@ dependencies = [
|
||||
"hashbrown 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hdrhistogram"
|
||||
version = "7.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
|
||||
dependencies = [
|
||||
"base64 0.21.1",
|
||||
"byteorder",
|
||||
"crossbeam-channel",
|
||||
"flate2",
|
||||
"nom",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.8.0"
|
||||
@@ -2648,16 +2634,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.3"
|
||||
@@ -2918,32 +2894,6 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"futures",
|
||||
"hdrhistogram",
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"pageserver",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pagectl"
|
||||
version = "0.1.0"
|
||||
@@ -3029,13 +2979,10 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-chrome",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
@@ -3274,7 +3221,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3287,7 +3234,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3298,7 +3245,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3316,7 +3263,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4986,7 +4933,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#988d0ddb4184c408fa7fc1bd0ecca7993c02978f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -5271,17 +5218,6 @@ dependencies = [
|
||||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-chrome"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "496b3cd5447f7ff527bbbf19b071ad542a000adf297d4127078b4dfdb931f41a"
|
||||
dependencies = [
|
||||
"serde_json",
|
||||
"tracing-core",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.31"
|
||||
@@ -5354,7 +5290,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -5569,7 +5504,6 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-chrome",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
|
||||
14
Cargo.toml
14
Cargo.toml
@@ -5,7 +5,6 @@ members = [
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"pageserver/ctl",
|
||||
"pageserver/pagebench",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
@@ -80,7 +79,6 @@ futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hashlink = "0.8.1"
|
||||
hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
@@ -167,11 +165,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -208,7 +206,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -479,6 +479,13 @@ fn cli() -> clap::Command {
|
||||
)
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
// DEPRECATED, NO LONGER DOES ANYTHING.
|
||||
// See https://github.com/neondatabase/cloud/issues/7516
|
||||
Arg::new("file-cache-on-disk")
|
||||
.long("file-cache-on-disk")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -86,10 +86,7 @@ where
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args);
|
||||
|
||||
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
|
||||
fill_rust_env_vars(background_command),
|
||||
));
|
||||
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
|
||||
filled_cmd.envs(envs);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
@@ -256,15 +253,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
cmd
|
||||
}
|
||||
|
||||
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
|
||||
for (var, val) in std::env::vars() {
|
||||
if var.starts_with("NEON_") {
|
||||
cmd = cmd.env(var, val);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
|
||||
/// 1. Claims a pidfile with a fcntl lock on it and
|
||||
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
|
||||
|
||||
@@ -283,10 +283,9 @@ fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body,
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let _guard = logging::init(
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
let args = Cli::parse();
|
||||
|
||||
@@ -140,35 +140,3 @@ impl Key {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
Self::from_hex(s)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::key::Key;
|
||||
|
||||
#[test]
|
||||
fn display_fromstr_bijection() {
|
||||
let mut rng = rand::thread_rng();
|
||||
use rand::Rng;
|
||||
|
||||
let key = Key {
|
||||
field1: rng.gen(),
|
||||
field2: rng.gen(),
|
||||
field3: rng.gen(),
|
||||
field4: rng.gen(),
|
||||
field5: rng.gen(),
|
||||
field6: rng.gen(),
|
||||
};
|
||||
|
||||
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use utils::{
|
||||
|
||||
use crate::{reltag::RelTag, shard::TenantShardId};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
/// The state of a tenant in this pageserver.
|
||||
///
|
||||
@@ -371,8 +371,6 @@ pub struct TenantInfo {
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
|
||||
pub attachment_status: TenantAttachmentStatus,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
@@ -517,8 +515,6 @@ pub enum HistoricLayerInfo {
|
||||
lsn_end: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
remote_path: Option<String>,
|
||||
},
|
||||
Image {
|
||||
layer_file_name: String,
|
||||
@@ -527,8 +523,6 @@ pub enum HistoricLayerInfo {
|
||||
lsn_start: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
remote_path: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -773,36 +767,6 @@ impl PagestreamBeMessage {
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
|
||||
let mut buf = buf.reader();
|
||||
let msg_tag = buf.read_u8()?;
|
||||
match msg_tag {
|
||||
100 => todo!(),
|
||||
101 => todo!(),
|
||||
102 => {
|
||||
let buf = buf.get_ref();
|
||||
/* TODO use constant */
|
||||
if buf.len() == 8192 {
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page: buf.clone(),
|
||||
}))
|
||||
} else {
|
||||
anyhow::bail!("invalid page size: {}", buf.len());
|
||||
}
|
||||
}
|
||||
103 => {
|
||||
let buf = buf.get_ref();
|
||||
let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
|
||||
let rust_str = cstr.to_str()?;
|
||||
Ok(PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: rust_str.to_owned(),
|
||||
}))
|
||||
}
|
||||
104 => todo!(),
|
||||
_ => bail!("unknown tag: {:?}", msg_tag),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -868,7 +832,6 @@ mod tests {
|
||||
state: TenantState::Active,
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
};
|
||||
let expected_active = json!({
|
||||
"id": original_active.id.to_string(),
|
||||
@@ -889,7 +852,6 @@ mod tests {
|
||||
},
|
||||
current_physical_size: Some(42),
|
||||
attachment_status: TenantAttachmentStatus::Attached,
|
||||
generation: None,
|
||||
};
|
||||
let expected_broken = json!({
|
||||
"id": original_broken.id.to_string(),
|
||||
|
||||
@@ -81,12 +81,6 @@ impl std::fmt::Display for RemotePath {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RemotePath> for String {
|
||||
fn from(val: RemotePath) -> Self {
|
||||
val.0.into()
|
||||
}
|
||||
}
|
||||
|
||||
impl RemotePath {
|
||||
pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
|
||||
anyhow::ensure!(
|
||||
@@ -108,7 +102,7 @@ impl RemotePath {
|
||||
self.0.file_name()
|
||||
}
|
||||
|
||||
pub fn join<P: AsRef<Utf8Path>>(&self, segment: P) -> Self {
|
||||
pub fn join(&self, segment: &Utf8Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
}
|
||||
|
||||
|
||||
@@ -278,10 +278,9 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
let _ = utils::logging::init(
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
|
||||
@@ -207,10 +207,9 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
let _ = utils::logging::init(
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
|
||||
@@ -49,7 +49,6 @@ const_format.workspace = true
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
tracing-chrome = "0.7.1"
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
|
||||
@@ -66,24 +66,10 @@ pub enum TracingErrorLayerEnablement {
|
||||
EnableWithRustLogFilter,
|
||||
}
|
||||
|
||||
/// Where the logging should output to.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum Output {
|
||||
Stdout,
|
||||
Stderr,
|
||||
}
|
||||
|
||||
/// Keep alive and drop it before the program terminates.
|
||||
#[must_use]
|
||||
pub struct FlushGuard {
|
||||
_tracing_chrome_layer: Option<tracing_chrome::FlushGuard>,
|
||||
}
|
||||
|
||||
pub fn init(
|
||||
log_format: LogFormat,
|
||||
tracing_error_layer_enablement: TracingErrorLayerEnablement,
|
||||
output: Output,
|
||||
) -> anyhow::Result<FlushGuard> {
|
||||
) -> anyhow::Result<()> {
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
// the RUST_LOG environment variable is not set.
|
||||
let rust_log_env_filter = || {
|
||||
@@ -91,50 +77,15 @@ pub fn init(
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
|
||||
};
|
||||
|
||||
// WIP: lift it up as an argument
|
||||
let enable_tracing_chrome = match std::env::var("NEON_PAGESERVER_ENABLE_TRACING_CHROME") {
|
||||
Ok(s) if s != "0" => true,
|
||||
Ok(_s) => false,
|
||||
Err(std::env::VarError::NotPresent) => false,
|
||||
Err(std::env::VarError::NotUnicode(_)) => {
|
||||
panic!("env var NEON_PAGESERVER_ENABLE_TRACING_CHROME not unicode")
|
||||
}
|
||||
};
|
||||
|
||||
// NB: the order of the with() calls does not matter.
|
||||
// See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
// https://users.rust-lang.org/t/how-can-i-init-tracing-registry-dynamically-with-multiple-outputs/94307/6
|
||||
#[derive(Default)]
|
||||
struct LayerStack {
|
||||
layers:
|
||||
Option<Box<dyn tracing_subscriber::Layer<tracing_subscriber::Registry> + Sync + Send>>,
|
||||
}
|
||||
impl LayerStack {
|
||||
fn add_layer<L>(&mut self, new_layer: L)
|
||||
where
|
||||
L: tracing_subscriber::Layer<tracing_subscriber::Registry> + Send + Sync,
|
||||
{
|
||||
let new = match self.layers.take() {
|
||||
Some(layers) => Some(layers.and_then(new_layer).boxed()),
|
||||
None => Some(new_layer.boxed()),
|
||||
};
|
||||
self.layers = new;
|
||||
}
|
||||
}
|
||||
let mut layers = LayerStack::default();
|
||||
|
||||
layers.add_layer({
|
||||
let r = tracing_subscriber::registry();
|
||||
let r = r.with({
|
||||
let log_layer = tracing_subscriber::fmt::layer()
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_writer(move || -> Box<dyn std::io::Write> {
|
||||
match output {
|
||||
Output::Stdout => Box::new(std::io::stdout()),
|
||||
Output::Stderr => Box::new(std::io::stderr()),
|
||||
}
|
||||
});
|
||||
.with_writer(std::io::stdout);
|
||||
let log_layer = match log_format {
|
||||
LogFormat::Json => log_layer.json().boxed(),
|
||||
LogFormat::Plain => log_layer.boxed(),
|
||||
@@ -142,31 +93,15 @@ pub fn init(
|
||||
};
|
||||
log_layer.with_filter(rust_log_env_filter())
|
||||
});
|
||||
|
||||
layers
|
||||
.add_layer(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
|
||||
|
||||
let tracing_chrome_layer_flush_guard = if enable_tracing_chrome {
|
||||
let (layer, guard) = tracing_chrome::ChromeLayerBuilder::new().build();
|
||||
layers.add_layer(layer);
|
||||
Some(guard)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
|
||||
match tracing_error_layer_enablement {
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter => layers
|
||||
.add_layer(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter())),
|
||||
TracingErrorLayerEnablement::Disabled => (),
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter => r
|
||||
.with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
|
||||
.init(),
|
||||
TracingErrorLayerEnablement::Disabled => r.init(),
|
||||
}
|
||||
|
||||
let r = tracing_subscriber::registry();
|
||||
r.with(layers.layers.expect("we add at least one layer"))
|
||||
.init();
|
||||
|
||||
Ok(FlushGuard {
|
||||
_tracing_chrome_layer: tracing_chrome_layer_flush_guard,
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Disable the default rust panic hook by using `set_hook`.
|
||||
|
||||
@@ -366,47 +366,6 @@ impl MonotonicCounter<Lsn> for RecordLsn {
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements [`rand::distributions::uniform::UniformSampler`] so we can sample [`Lsn`]s.
|
||||
pub struct LsnSampler(<u64 as rand::distributions::uniform::SampleUniform>::Sampler);
|
||||
|
||||
impl rand::distributions::uniform::SampleUniform for Lsn {
|
||||
type Sampler = LsnSampler;
|
||||
}
|
||||
|
||||
impl rand::distributions::uniform::UniformSampler for LsnSampler {
|
||||
type X = Lsn;
|
||||
|
||||
fn new<B1, B2>(low: B1, high: B2) -> Self
|
||||
where
|
||||
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
{
|
||||
Self(
|
||||
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new(
|
||||
low.borrow().0,
|
||||
high.borrow().0,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn new_inclusive<B1, B2>(low: B1, high: B2) -> Self
|
||||
where
|
||||
B1: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
B2: rand::distributions::uniform::SampleBorrow<Self::X> + Sized,
|
||||
{
|
||||
Self(
|
||||
<u64 as rand::distributions::uniform::SampleUniform>::Sampler::new_inclusive(
|
||||
low.borrow().0,
|
||||
high.borrow().0,
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn sample<R: rand::prelude::Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
|
||||
Lsn(self.0.sample(rng))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
@@ -82,9 +82,6 @@ enum-map.workspace = true
|
||||
enumset.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
|
||||
tracing-chrome = "0.7.1"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
|
||||
@@ -1,15 +1,13 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino::Utf8Path;
|
||||
use clap::Subcommand;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
use pageserver::tenant::disk_btree::DiskBtreeReader;
|
||||
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
|
||||
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
|
||||
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use pageserver::{page_cache, virtual_file};
|
||||
use pageserver::{
|
||||
@@ -22,7 +20,6 @@ use pageserver::{
|
||||
};
|
||||
use std::fs;
|
||||
use utils::bin_ser::BeSer;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use crate::layer_map_analyzer::parse_filename;
|
||||
|
||||
@@ -48,13 +45,6 @@ pub(crate) enum LayerCmd {
|
||||
/// The id from list-layer command
|
||||
id: usize,
|
||||
},
|
||||
RewriteSummary {
|
||||
layer_file_path: Utf8PathBuf,
|
||||
#[clap(long)]
|
||||
new_tenant_id: Option<TenantId>,
|
||||
#[clap(long)]
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
},
|
||||
}
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
@@ -110,7 +100,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
println!("- timeline {}", timeline.file_name().to_string_lossy());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::ListLayer {
|
||||
path,
|
||||
@@ -139,7 +128,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::DumpLayer {
|
||||
path,
|
||||
@@ -180,63 +168,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
LayerCmd::RewriteSummary {
|
||||
layer_file_path,
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
} => {
|
||||
pageserver::virtual_file::init(10);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
macro_rules! rewrite_closure {
|
||||
($($summary_ty:tt)*) => {{
|
||||
|summary| $($summary_ty)* {
|
||||
tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
|
||||
timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
|
||||
..summary
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
let res = ImageLayer::rewrite_summary(
|
||||
layer_file_path,
|
||||
rewrite_closure!(image_layer::Summary),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(()) => {
|
||||
println!("Successfully rewrote summary of image layer {layer_file_path}");
|
||||
return Ok(());
|
||||
}
|
||||
Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
|
||||
Err(image_layer::RewriteSummaryError::Other(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
let res = DeltaLayer::rewrite_summary(
|
||||
layer_file_path,
|
||||
rewrite_closure!(delta_layer::Summary),
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(()) => {
|
||||
println!("Successfully rewrote summary of delta layer {layer_file_path}");
|
||||
return Ok(());
|
||||
}
|
||||
Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
|
||||
Err(delta_layer::RewriteSummaryError::Other(e)) => {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
anyhow::bail!("not an image or delta layer: {layer_file_path}");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
[package]
|
||||
name = "pagebench"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
futures.workspace = true
|
||||
hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
pageserver = { path = ".." }
|
||||
utils = { path = "../../libs/utils/" }
|
||||
@@ -1,402 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use pageserver::client::page_service::BasebackupRequest;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, info, instrument};
|
||||
use utils::id::TenantId;
|
||||
use utils::logging;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::util::tenant_timeline_id::TenantTimelineId;
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "localhost:64000")]
|
||||
page_service_host_port: String,
|
||||
#[clap(long)]
|
||||
pageserver_jwt: Option<String>,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long, default_value = "1.0")]
|
||||
gzip_probability: f64,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: PerTaskOutput,
|
||||
}
|
||||
|
||||
const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
|
||||
|
||||
struct LatencyPercentiles {
|
||||
latency_percentiles: [Duration; 4],
|
||||
}
|
||||
|
||||
impl serde::Serialize for LatencyPercentiles {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
|
||||
for p in LATENCY_PERCENTILES {
|
||||
ser.serialize_entry(
|
||||
&format!("p{p}"),
|
||||
&format!(
|
||||
"{}",
|
||||
&humantime::format_duration(self.latency_percentiles[0])
|
||||
),
|
||||
)?;
|
||||
}
|
||||
ser.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct PerTaskOutput {
|
||||
request_count: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
latency_mean: Duration,
|
||||
latency_percentiles: LatencyPercentiles,
|
||||
}
|
||||
|
||||
struct ThreadLocalStats {
|
||||
latency_histo: hdrhistogram::Histogram<u64>,
|
||||
}
|
||||
|
||||
impl ThreadLocalStats {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
// Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
|
||||
// which would skew the benchmark results.
|
||||
latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
|
||||
}
|
||||
}
|
||||
fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
|
||||
let micros: u64 = latency
|
||||
.as_micros()
|
||||
.try_into()
|
||||
.context("latency greater than u64")?;
|
||||
self.latency_histo
|
||||
.record(micros)
|
||||
.context("add to histogram")?;
|
||||
Ok(())
|
||||
}
|
||||
fn output(&self) -> PerTaskOutput {
|
||||
let latency_percentiles = std::array::from_fn(|idx| {
|
||||
let micros = self
|
||||
.latency_histo
|
||||
.value_at_percentile(LATENCY_PERCENTILES[idx]);
|
||||
Duration::from_micros(micros)
|
||||
});
|
||||
PerTaskOutput {
|
||||
request_count: self.latency_histo.len(),
|
||||
latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
|
||||
latency_percentiles: LatencyPercentiles {
|
||||
latency_percentiles,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, other: &Self) {
|
||||
let Self {
|
||||
ref mut latency_histo,
|
||||
} = self;
|
||||
latency_histo.add(&other.latency_histo).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
pub static STATS: RefCell<Arc<Mutex<ThreadLocalStats>>> = std::cell::RefCell::new(
|
||||
Arc::new(Mutex::new(ThreadLocalStats::new()))
|
||||
);
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let _guard = logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let thread_local_stats = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.on_thread_start({
|
||||
let thread_local_stats = Arc::clone(&thread_local_stats);
|
||||
move || {
|
||||
// pre-initialize the histograms
|
||||
STATS.with(|stats| {
|
||||
let stats: Arc<_> = Arc::clone(&*stats.borrow());
|
||||
thread_local_stats.lock().unwrap().push(stats);
|
||||
});
|
||||
}
|
||||
})
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args, thread_local_stats));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
struct Target {
|
||||
timeline: TenantTimelineId,
|
||||
lsn_range: Option<Range<Lsn>>,
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
thread_local_stats: Arc<Mutex<Vec<Arc<Mutex<ThreadLocalStats>>>>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver::client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let mut timelines: Vec<TenantTimelineId> = Vec::new();
|
||||
if args.targets.is_some() {
|
||||
timelines = args.targets.clone().unwrap();
|
||||
} else {
|
||||
let tenants: Vec<TenantId> = mgmt_api_client
|
||||
.list_tenants()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|ti| ti.id)
|
||||
.collect();
|
||||
let mut js = JoinSet::new();
|
||||
for tenant_id in tenants {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
async move {
|
||||
(
|
||||
tenant_id,
|
||||
mgmt_api_client.list_timelines(tenant_id).await.unwrap(),
|
||||
)
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let (tenant_id, tl_infos) = res.unwrap();
|
||||
for tl in tl_infos {
|
||||
timelines.push(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id: tl.timeline_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("timelines:\n{:?}", timelines);
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let timeline = *timeline;
|
||||
let info = mgmt_api_client
|
||||
.timeline_info(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
async move {
|
||||
anyhow::Ok(Target {
|
||||
timeline,
|
||||
lsn_range: Some(info.last_record_lsn..(info.last_record_lsn + 1)),
|
||||
})
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_targets: Vec<Target> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_targets.push(res.unwrap().unwrap());
|
||||
}
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(1); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let (timeline, work) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let target = all_targets.choose(&mut rng).unwrap();
|
||||
let lsn = target.lsn_range.clone().map(|r| rng.gen_range(r));
|
||||
(
|
||||
target.timeline,
|
||||
Work {
|
||||
lsn,
|
||||
gzip: rng.gen_bool(args.gzip_probability),
|
||||
},
|
||||
)
|
||||
};
|
||||
let sender = work_senders.get(&timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send(work).await.ok().unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = ThreadLocalStats::new();
|
||||
for stats in thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&*stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone)]
|
||||
struct Work {
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<Work>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client =
|
||||
pageserver::client::page_service::Client::new(crate::util::connstring::connstring(
|
||||
&args.page_service_host_port,
|
||||
args.pageserver_jwt.as_deref(),
|
||||
))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some(Work { lsn, gzip }) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
let copy_out_stream = client
|
||||
.basebackup(&BasebackupRequest {
|
||||
tenant_id: timeline.tenant_id,
|
||||
timeline_id: timeline.timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("start basebackup for {timeline}"))
|
||||
.unwrap();
|
||||
|
||||
use futures::StreamExt;
|
||||
let size = Arc::new(AtomicUsize::new(0));
|
||||
copy_out_stream
|
||||
.for_each({
|
||||
|r| {
|
||||
let size = Arc::clone(&size);
|
||||
async move {
|
||||
let size = Arc::clone(&size);
|
||||
size.fetch_add(r.unwrap().len(), Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
})
|
||||
.await;
|
||||
debug!("basebackup size is {} bytes", size.load(Ordering::Relaxed));
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
@@ -1,404 +0,0 @@
|
||||
use anyhow::Context;
|
||||
use pageserver::client::page_service::RelTagBlockNo;
|
||||
use pageserver::pgdatadir_mapping::{is_rel_block_key, key_to_rel_block};
|
||||
use pageserver::repository;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use rand::prelude::*;
|
||||
use tokio::sync::Barrier;
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{info, instrument};
|
||||
use utils::id::TenantId;
|
||||
use utils::logging;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use crate::util::tenant_timeline_id::TenantTimelineId;
|
||||
|
||||
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
|
||||
#[derive(clap::Parser)]
|
||||
pub(crate) struct Args {
|
||||
#[clap(long, default_value = "http://localhost:9898")]
|
||||
mgmt_api_endpoint: String,
|
||||
#[clap(long, default_value = "postgres://postgres@localhost:64000")]
|
||||
page_service_connstring: String,
|
||||
#[clap(long, default_value = "1")]
|
||||
num_clients: NonZeroUsize,
|
||||
#[clap(long)]
|
||||
runtime: Option<humantime::Duration>,
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct LiveStats {
|
||||
completed_requests: AtomicU64,
|
||||
}
|
||||
|
||||
impl LiveStats {
|
||||
fn inc(&self) {
|
||||
self.completed_requests.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct Output {
|
||||
total: PerTaskOutput,
|
||||
}
|
||||
|
||||
const LATENCY_PERCENTILES: [f64; 4] = [95.0, 99.00, 99.90, 99.99];
|
||||
|
||||
struct LatencyPercentiles {
|
||||
latency_percentiles: [Duration; 4],
|
||||
}
|
||||
|
||||
impl serde::Serialize for LatencyPercentiles {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut ser = serializer.serialize_map(Some(LATENCY_PERCENTILES.len()))?;
|
||||
for p in LATENCY_PERCENTILES {
|
||||
ser.serialize_entry(
|
||||
&format!("p{p}"),
|
||||
&format!(
|
||||
"{}",
|
||||
&humantime::format_duration(self.latency_percentiles[0])
|
||||
),
|
||||
)?;
|
||||
}
|
||||
ser.end()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct PerTaskOutput {
|
||||
request_count: u64,
|
||||
#[serde(with = "humantime_serde")]
|
||||
latency_mean: Duration,
|
||||
latency_percentiles: LatencyPercentiles,
|
||||
}
|
||||
|
||||
struct ThreadLocalStats {
|
||||
latency_histo: hdrhistogram::Histogram<u64>,
|
||||
}
|
||||
|
||||
impl ThreadLocalStats {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
// Initialize with fixed bounds so that we panic at runtime instead of resizing the histogram,
|
||||
// which would skew the benchmark results.
|
||||
latency_histo: hdrhistogram::Histogram::new_with_bounds(1, 1_000_000_000, 3).unwrap(),
|
||||
}
|
||||
}
|
||||
fn observe(&mut self, latency: Duration) -> anyhow::Result<()> {
|
||||
let micros: u64 = latency
|
||||
.as_micros()
|
||||
.try_into()
|
||||
.context("latency greater than u64")?;
|
||||
self.latency_histo
|
||||
.record(micros)
|
||||
.context("add to histogram")?;
|
||||
Ok(())
|
||||
}
|
||||
fn output(&self) -> PerTaskOutput {
|
||||
let latency_percentiles = std::array::from_fn(|idx| {
|
||||
let micros = self
|
||||
.latency_histo
|
||||
.value_at_percentile(LATENCY_PERCENTILES[idx]);
|
||||
Duration::from_micros(micros)
|
||||
});
|
||||
PerTaskOutput {
|
||||
request_count: self.latency_histo.len(),
|
||||
latency_mean: Duration::from_micros(self.latency_histo.mean() as u64),
|
||||
latency_percentiles: LatencyPercentiles {
|
||||
latency_percentiles,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, other: &Self) {
|
||||
let Self {
|
||||
ref mut latency_histo,
|
||||
} = self;
|
||||
latency_histo.add(&other.latency_histo).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
pub static STATS: RefCell<Arc<Mutex<ThreadLocalStats>>> = std::cell::RefCell::new(
|
||||
Arc::new(Mutex::new(ThreadLocalStats::new()))
|
||||
);
|
||||
}
|
||||
|
||||
pub(crate) fn main(args: Args) -> anyhow::Result<()> {
|
||||
let _guard = logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let thread_local_stats = Arc::new(Mutex::new(Vec::new()));
|
||||
|
||||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||||
.on_thread_start({
|
||||
let thread_local_stats = Arc::clone(&thread_local_stats);
|
||||
move || {
|
||||
// pre-initialize the histograms
|
||||
STATS.with(|stats| {
|
||||
let stats: Arc<_> = Arc::clone(&*stats.borrow());
|
||||
thread_local_stats.lock().unwrap().push(stats);
|
||||
});
|
||||
}
|
||||
})
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let main_task = rt.spawn(main_impl(args, thread_local_stats));
|
||||
rt.block_on(main_task).unwrap()
|
||||
}
|
||||
|
||||
struct KeyRange {
|
||||
timeline: TenantTimelineId,
|
||||
timeline_lsn: Lsn,
|
||||
start: i128,
|
||||
end: i128,
|
||||
}
|
||||
|
||||
impl KeyRange {
|
||||
fn len(&self) -> i128 {
|
||||
self.end - self.start
|
||||
}
|
||||
}
|
||||
|
||||
async fn main_impl(
|
||||
args: Args,
|
||||
thread_local_stats: Arc<Mutex<Vec<Arc<Mutex<ThreadLocalStats>>>>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let args: &'static Args = Box::leak(Box::new(args));
|
||||
|
||||
let mgmt_api_client = Arc::new(pageserver::client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
None, // TODO: support jwt in args
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let mut timelines: Vec<TenantTimelineId> = Vec::new();
|
||||
if args.targets.is_some() {
|
||||
timelines = args.targets.clone().unwrap();
|
||||
} else {
|
||||
let tenants: Vec<TenantId> = mgmt_api_client
|
||||
.list_tenants()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|ti| ti.id)
|
||||
.collect();
|
||||
let mut js = JoinSet::new();
|
||||
for tenant_id in tenants {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
async move {
|
||||
(
|
||||
tenant_id,
|
||||
mgmt_api_client.list_timelines(tenant_id).await.unwrap(),
|
||||
)
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(res) = js.join_next().await {
|
||||
let (tenant_id, tl_infos) = res.unwrap();
|
||||
for tl in tl_infos {
|
||||
timelines.push(TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id: tl.timeline_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
info!("timelines:\n{:?}", timelines);
|
||||
|
||||
let mut js = JoinSet::new();
|
||||
for timeline in &timelines {
|
||||
js.spawn({
|
||||
let mgmt_api_client = Arc::clone(&mgmt_api_client);
|
||||
let timeline = *timeline;
|
||||
async move {
|
||||
let partitioning = mgmt_api_client
|
||||
.keyspace(timeline.tenant_id, timeline.timeline_id)
|
||||
.await?;
|
||||
let lsn = partitioning.at_lsn;
|
||||
|
||||
let ranges = partitioning
|
||||
.keys
|
||||
.ranges
|
||||
.iter()
|
||||
.filter_map(|r| {
|
||||
let start = r.start;
|
||||
let end = r.end;
|
||||
// filter out non-relblock keys
|
||||
match (is_rel_block_key(start), is_rel_block_key(end)) {
|
||||
(true, true) => Some(KeyRange {
|
||||
timeline,
|
||||
timeline_lsn: lsn,
|
||||
start: start.to_i128(),
|
||||
end: end.to_i128(),
|
||||
}),
|
||||
(true, false) | (false, true) => {
|
||||
unimplemented!("split up range")
|
||||
}
|
||||
(false, false) => None,
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
anyhow::Ok(ranges)
|
||||
}
|
||||
});
|
||||
}
|
||||
let mut all_ranges: Vec<KeyRange> = Vec::new();
|
||||
while let Some(res) = js.join_next().await {
|
||||
all_ranges.extend(res.unwrap().unwrap());
|
||||
}
|
||||
let weights =
|
||||
rand::distributions::weighted::WeightedIndex::new(all_ranges.iter().map(|v| v.len()))
|
||||
.unwrap();
|
||||
|
||||
let live_stats = Arc::new(LiveStats::default());
|
||||
|
||||
let num_client_tasks = timelines.len();
|
||||
let num_live_stats_dump = 1;
|
||||
let num_work_sender_tasks = 1;
|
||||
|
||||
let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
|
||||
num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
|
||||
));
|
||||
let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));
|
||||
|
||||
tokio::spawn({
|
||||
let stats = Arc::clone(&live_stats);
|
||||
let start_work_barrier = Arc::clone(&start_work_barrier);
|
||||
async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let start = std::time::Instant::now();
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
|
||||
let elapsed = start.elapsed();
|
||||
info!(
|
||||
"RPS: {:.0}",
|
||||
completed_requests as f64 / elapsed.as_secs_f64()
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
let mut work_senders = HashMap::new();
|
||||
let mut tasks = Vec::new();
|
||||
for tl in &timelines {
|
||||
let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
|
||||
work_senders.insert(tl, sender);
|
||||
tasks.push(tokio::spawn(client(
|
||||
args,
|
||||
*tl,
|
||||
Arc::clone(&start_work_barrier),
|
||||
receiver,
|
||||
Arc::clone(&all_work_done_barrier),
|
||||
Arc::clone(&live_stats),
|
||||
)));
|
||||
}
|
||||
|
||||
let work_sender = async move {
|
||||
start_work_barrier.wait().await;
|
||||
loop {
|
||||
let (range, key) = {
|
||||
let mut rng = rand::thread_rng();
|
||||
let r = &all_ranges[weights.sample(&mut rng)];
|
||||
let key: i128 = rng.gen_range(r.start..r.end);
|
||||
let key = repository::Key::from_i128(key);
|
||||
let (rel_tag, block_no) =
|
||||
key_to_rel_block(key).expect("we filter non-rel-block keys out above");
|
||||
(r, RelTagBlockNo { rel_tag, block_no })
|
||||
};
|
||||
let sender = work_senders.get(&range.timeline).unwrap();
|
||||
// TODO: what if this blocks?
|
||||
sender.send((key, range.timeline_lsn)).await.ok().unwrap();
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(runtime) = args.runtime {
|
||||
match tokio::time::timeout(runtime.into(), work_sender).await {
|
||||
Ok(()) => unreachable!("work sender never terminates"),
|
||||
Err(_timeout) => {
|
||||
// this implicitly drops the work_senders, making all the clients exit
|
||||
}
|
||||
}
|
||||
} else {
|
||||
work_sender.await;
|
||||
unreachable!("work sender never terminates");
|
||||
}
|
||||
|
||||
for t in tasks {
|
||||
t.await.unwrap();
|
||||
}
|
||||
|
||||
let output = Output {
|
||||
total: {
|
||||
let mut agg_stats = ThreadLocalStats::new();
|
||||
for stats in thread_local_stats.lock().unwrap().iter() {
|
||||
let stats = stats.lock().unwrap();
|
||||
agg_stats.add(&*stats);
|
||||
}
|
||||
agg_stats.output()
|
||||
},
|
||||
};
|
||||
|
||||
let output = serde_json::to_string_pretty(&output).unwrap();
|
||||
println!("{output}");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn client(
|
||||
args: &'static Args,
|
||||
timeline: TenantTimelineId,
|
||||
start_work_barrier: Arc<Barrier>,
|
||||
mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
|
||||
all_work_done_barrier: Arc<Barrier>,
|
||||
live_stats: Arc<LiveStats>,
|
||||
) {
|
||||
start_work_barrier.wait().await;
|
||||
|
||||
let client =
|
||||
pageserver::client::page_service::Client::new(args.page_service_connstring.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
let mut client = client
|
||||
.pagestream(timeline.tenant_id, timeline.timeline_id)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
while let Some((key, lsn)) = work.recv().await {
|
||||
let start = Instant::now();
|
||||
client
|
||||
.getpage(key, lsn)
|
||||
.await
|
||||
.with_context(|| format!("getpage for {timeline}"))
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed();
|
||||
live_stats.inc();
|
||||
STATS.with(|stats| {
|
||||
stats.borrow().lock().unwrap().observe(elapsed).unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
all_work_done_barrier.wait().await;
|
||||
}
|
||||
@@ -1,22 +0,0 @@
|
||||
use clap::Parser;
|
||||
|
||||
pub(crate) mod util;
|
||||
|
||||
mod basebackup;
|
||||
mod getpage_latest_lsn;
|
||||
|
||||
/// Component-level performance test for pageserver.
|
||||
#[derive(clap::Parser)]
|
||||
enum Args {
|
||||
GetPageLatestLsn(getpage_latest_lsn::Args),
|
||||
Basebackup(basebackup::Args),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args = Args::parse();
|
||||
match args {
|
||||
Args::GetPageLatestLsn(args) => getpage_latest_lsn::main(args),
|
||||
Args::Basebackup(args) => basebackup::main(args),
|
||||
}
|
||||
.unwrap()
|
||||
}
|
||||
@@ -1,2 +0,0 @@
|
||||
pub(crate) mod tenant_timeline_id;
|
||||
pub(crate) mod connstring;
|
||||
@@ -1,8 +0,0 @@
|
||||
pub(crate) fn connstring(host_port: &str, jwt: Option<&str>) -> String {
|
||||
let colon_and_jwt = if let Some(jwt) = jwt {
|
||||
format!(":{jwt}") // TODO: urlescape
|
||||
} else {
|
||||
format!("")
|
||||
};
|
||||
format!("postgres://postgres{colon_and_jwt}@{host_port}")
|
||||
}
|
||||
@@ -1,36 +0,0 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
|
||||
pub(crate) struct TenantTimelineId {
|
||||
pub(crate) tenant_id: TenantId,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
impl FromStr for TenantTimelineId {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let (tenant_id, timeline_id) = s
|
||||
.split_once("/")
|
||||
.context("tenant and timeline id must be separated by `/`")?;
|
||||
let tenant_id = TenantId::from_str(&tenant_id)
|
||||
.with_context(|| format!("invalid tenant id: {tenant_id:?}"))?;
|
||||
let timeline_id = TimelineId::from_str(&timeline_id)
|
||||
.with_context(|| format!("invalid timeline id: {timeline_id:?}"))?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for TenantTimelineId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}/{}", self.tenant_id, self.timeline_id)
|
||||
}
|
||||
}
|
||||
@@ -166,111 +166,71 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
debug!("Gather non-relational files from object storage pages");
|
||||
// Gather non-relational files from object storage pages.
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
async {
|
||||
debug!("list slru segments");
|
||||
for segno in self
|
||||
.timeline
|
||||
.list_slru_segments(kind, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
async {
|
||||
debug!("add slru segment");
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
.instrument(debug_span!("slru segment", ?segno))
|
||||
.await?;
|
||||
}
|
||||
anyhow::Ok(())
|
||||
for segno in self
|
||||
.timeline
|
||||
.list_slru_segments(kind, self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
}
|
||||
.instrument(debug_span!("non-rel file", ?kind))
|
||||
.await?;
|
||||
}
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
debug!("Create tablespace directories");
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
{
|
||||
async {
|
||||
debug!("iter");
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
// If full backup is requested, include all relation files.
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
debug!("list rels");
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
async {
|
||||
debug!("iter");
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
// contents of UNLOGGED relations. Postgres copies it in
|
||||
// `reinit.c` during recovery.
|
||||
if rel.forknum == INIT_FORKNUM {
|
||||
// I doubt we need _init fork itself, but having it at least
|
||||
// serves as a marker relation is unlogged.
|
||||
self.add_rel(rel, rel).await?;
|
||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if self.full_backup {
|
||||
if rel.forknum == MAIN_FORKNUM
|
||||
&& rels.contains(&rel.with_forknum(INIT_FORKNUM))
|
||||
{
|
||||
// skip this, will include it when we reach the init fork
|
||||
return Ok(());
|
||||
}
|
||||
self.add_rel(rel, rel).await?;
|
||||
}
|
||||
anyhow::Ok(())
|
||||
}
|
||||
.instrument(debug_span!("process rel", ?rel))
|
||||
.await?;
|
||||
// If full backup is requested, include all relation files.
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
// contents of UNLOGGED relations. Postgres copies it in
|
||||
// `reinit.c` during recovery.
|
||||
if rel.forknum == INIT_FORKNUM {
|
||||
// I doubt we need _init fork itself, but having it at least
|
||||
// serves as a marker relation is unlogged.
|
||||
self.add_rel(rel, rel).await?;
|
||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
||||
continue;
|
||||
}
|
||||
|
||||
debug!("list aux files");
|
||||
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
|
||||
async {
|
||||
debug!("iter");
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
anyhow::Ok(())
|
||||
if self.full_backup {
|
||||
if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM))
|
||||
{
|
||||
// skip this, will include it when we reach the init fork
|
||||
continue;
|
||||
}
|
||||
.instrument(debug_span!("process aux file", ?path))
|
||||
.await?;
|
||||
self.add_rel(rel, rel).await?;
|
||||
}
|
||||
|
||||
debug!("done");
|
||||
|
||||
anyhow::Ok(())
|
||||
}
|
||||
.instrument(debug_span!(
|
||||
"process tablespace directory",
|
||||
?spcnode,
|
||||
?dbnode
|
||||
))
|
||||
.await?;
|
||||
|
||||
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
}
|
||||
}
|
||||
if min_restart_lsn != Lsn::MAX {
|
||||
info!(
|
||||
@@ -284,25 +244,19 @@ where
|
||||
.await
|
||||
.context("could not add restart.lsn file to basebackup tarball")?;
|
||||
}
|
||||
debug!("list twophase files");
|
||||
for xid in self
|
||||
.timeline
|
||||
.list_twophase_files(self.lsn, self.ctx)
|
||||
.await?
|
||||
{
|
||||
async {
|
||||
self.add_twophase_file(xid).await?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
.instrument(debug_span!("process twophase file", ?xid))
|
||||
.await?;
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
|
||||
fail_point!("basebackup-before-control-file", |_| {
|
||||
bail!("failpoint basebackup-before-control-file")
|
||||
});
|
||||
|
||||
debug!("Generate pg_control and bootstrap WAL segment.");
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file().await?;
|
||||
self.ar.finish().await?;
|
||||
debug!("all tarred up!");
|
||||
|
||||
@@ -103,11 +103,7 @@ fn main() -> anyhow::Result<()> {
|
||||
} else {
|
||||
TracingErrorLayerEnablement::Disabled
|
||||
};
|
||||
let _guard = logging::init(
|
||||
conf.log_format,
|
||||
tracing_error_layer_enablement,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
logging::init(conf.log_format, tracing_error_layer_enablement)?;
|
||||
|
||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||
// disarming this hook on pageserver, because we never tear down tracing.
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
pub mod mgmt_api;
|
||||
pub mod page_service;
|
||||
@@ -1,89 +0,0 @@
|
||||
use anyhow::Context;
|
||||
|
||||
use hyper::{client::HttpConnector, Uri};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
pub struct Client {
|
||||
mgmt_api_endpoint: String,
|
||||
authorization_header: Option<String>,
|
||||
client: hyper::Client<HttpConnector, hyper::Body>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||
Self {
|
||||
mgmt_api_endpoint,
|
||||
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
|
||||
client: hyper::client::Client::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn list_tenants(&self) -> anyhow::Result<Vec<pageserver_api::models::TenantInfo>> {
|
||||
let uri = Uri::try_from(format!("{}/v1/tenant", self.mgmt_api_endpoint))?;
|
||||
let resp = self.get(uri).await?;
|
||||
if !resp.status().is_success() {
|
||||
anyhow::bail!("status error");
|
||||
}
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
Ok(serde_json::from_slice(&body)?)
|
||||
}
|
||||
|
||||
pub async fn list_timelines(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<pageserver_api::models::TimelineInfo>> {
|
||||
let uri = Uri::try_from(format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline",
|
||||
self.mgmt_api_endpoint
|
||||
))?;
|
||||
let resp = self.get(uri).await?;
|
||||
if !resp.status().is_success() {
|
||||
anyhow::bail!("status error");
|
||||
}
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
Ok(serde_json::from_slice(&body)?)
|
||||
}
|
||||
|
||||
pub async fn timeline_info(
|
||||
&self, tenant_id: TenantId, timeline_id: TimelineId,
|
||||
) -> anyhow::Result<pageserver_api::models::TimelineInfo> {
|
||||
let uri = Uri::try_from(format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
))?;
|
||||
let resp = self.get(uri).await?;
|
||||
if !resp.status().is_success() {
|
||||
anyhow::bail!("status error");
|
||||
}
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
Ok(serde_json::from_slice(&body)?)
|
||||
}
|
||||
|
||||
pub async fn keyspace(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<crate::http::models::partitioning::Partitioning> {
|
||||
let uri = Uri::try_from(format!(
|
||||
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace?check_serialization_roundtrip=true",
|
||||
self.mgmt_api_endpoint
|
||||
))?;
|
||||
let resp = self.get(uri).await?;
|
||||
if !resp.status().is_success() {
|
||||
anyhow::bail!("status error");
|
||||
}
|
||||
let body = hyper::body::to_bytes(resp).await?;
|
||||
Ok(serde_json::from_slice(&body).context("deserialize")?)
|
||||
}
|
||||
|
||||
async fn get(&self, uri: Uri) -> hyper::Result<hyper::Response<hyper::Body>> {
|
||||
let req = hyper::Request::builder().uri(uri).method("GET");
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header("Authorization", value)
|
||||
} else {
|
||||
req
|
||||
};
|
||||
let req = req.body(hyper::Body::default());
|
||||
self.client.request(req.unwrap()).await
|
||||
}
|
||||
}
|
||||
@@ -1,145 +0,0 @@
|
||||
use std::pin::Pin;
|
||||
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::{
|
||||
models::{
|
||||
PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
|
||||
PagestreamGetPageResponse,
|
||||
},
|
||||
reltag::RelTag,
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_postgres::CopyOutStream;
|
||||
use tokio_stream::StreamExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
pub struct Client {
|
||||
client: tokio_postgres::Client,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
|
||||
pub struct BasebackupRequest {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>,
|
||||
pub gzip: bool,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub async fn new(connstring: String) -> anyhow::Result<Self> {
|
||||
let (client, connection) = tokio_postgres::connect(&connstring, postgres::NoTls).await?;
|
||||
|
||||
let conn_task_cancel = CancellationToken::new();
|
||||
let conn_task = tokio::spawn({
|
||||
let conn_task_cancel = conn_task_cancel.clone();
|
||||
async move {
|
||||
tokio::select! {
|
||||
_ = conn_task_cancel.cancelled() => { }
|
||||
res = connection => {
|
||||
res.unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
Ok(Self {
|
||||
cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
|
||||
conn_task,
|
||||
client,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn pagestream(
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<PagestreamClient> {
|
||||
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
|
||||
.client
|
||||
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
|
||||
.await?;
|
||||
let Client {
|
||||
cancel_on_client_drop,
|
||||
conn_task,
|
||||
client: _,
|
||||
} = self;
|
||||
Ok(PagestreamClient {
|
||||
copy_both: Box::pin(copy_both),
|
||||
conn_task,
|
||||
cancel_on_client_drop,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn basebackup(&self, req: &BasebackupRequest) -> anyhow::Result<CopyOutStream> {
|
||||
let BasebackupRequest {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
} = req;
|
||||
let mut args = Vec::with_capacity(5);
|
||||
args.push("basebackup".to_string());
|
||||
args.push(format!("{tenant_id}"));
|
||||
args.push(format!("{timeline_id}"));
|
||||
if let Some(lsn) = lsn {
|
||||
args.push(format!("{lsn}"));
|
||||
}
|
||||
if *gzip {
|
||||
args.push(format!("--gzip"))
|
||||
}
|
||||
Ok(self.client.copy_out(&args.join(" ")).await?)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create using [`Client::pagestream`].
|
||||
pub struct PagestreamClient {
|
||||
copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
|
||||
cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
|
||||
conn_task: JoinHandle<()>,
|
||||
}
|
||||
|
||||
pub struct RelTagBlockNo {
|
||||
pub rel_tag: RelTag,
|
||||
pub block_no: u32,
|
||||
}
|
||||
|
||||
impl PagestreamClient {
|
||||
pub async fn shutdown(mut self) {
|
||||
let _ = self.cancel_on_client_drop.take();
|
||||
self.conn_task.await.unwrap();
|
||||
}
|
||||
|
||||
pub async fn getpage(
|
||||
&mut self,
|
||||
key: RelTagBlockNo,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<PagestreamGetPageResponse> {
|
||||
let req = PagestreamGetPageRequest {
|
||||
latest: false,
|
||||
rel: key.rel_tag,
|
||||
blkno: key.block_no,
|
||||
lsn,
|
||||
};
|
||||
let req = PagestreamFeMessage::GetPage(req);
|
||||
let req: bytes::Bytes = req.serialize();
|
||||
// let mut req = tokio_util::io::ReaderStream::new(&req);
|
||||
let mut req = tokio_stream::once(Ok(req));
|
||||
|
||||
self.copy_both.send_all(&mut req).await?;
|
||||
|
||||
let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
|
||||
let next = next.unwrap().unwrap();
|
||||
|
||||
match PagestreamBeMessage::deserialize(next)? {
|
||||
PagestreamBeMessage::Exists(_) => todo!(),
|
||||
PagestreamBeMessage::Nblocks(_) => todo!(),
|
||||
PagestreamBeMessage::GetPage(p) => Ok(p),
|
||||
PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
|
||||
PagestreamBeMessage::DbSize(_) => todo!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -261,7 +261,7 @@ async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
};
|
||||
|
||||
for (tenant_id, tenant_state, _gen) in tenants {
|
||||
for (tenant_id, tenant_state) in tenants {
|
||||
if tenant_state != TenantState::Active {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -197,7 +197,7 @@ pub(super) async fn collect_all_metrics(
|
||||
}
|
||||
};
|
||||
|
||||
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
|
||||
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
|
||||
if state != TenantState::Active {
|
||||
None
|
||||
} else {
|
||||
|
||||
@@ -345,7 +345,7 @@ impl DeletionList {
|
||||
result.extend(
|
||||
timeline_layers
|
||||
.into_iter()
|
||||
.map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
|
||||
.map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,7 +541,7 @@ async fn collect_eviction_candidates(
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
|
||||
for (tenant_id, _state, _gen) in &tenants {
|
||||
for (tenant_id, _state) in &tenants {
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
pub mod routes;
|
||||
pub use routes::make_router;
|
||||
|
||||
pub mod models;
|
||||
pub use pageserver_api::models;
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
//! If possible, use `::pageserver_api::models` instead.
|
||||
|
||||
pub mod partitioning;
|
||||
@@ -1,112 +0,0 @@
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
pub struct Partitioning {
|
||||
pub keys: crate::keyspace::KeySpace,
|
||||
|
||||
pub at_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Partitioning {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
|
||||
|
||||
impl<'a> serde::Serialize for KeySpace<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
|
||||
for kr in &self.0.ranges {
|
||||
seq.serialize_element(&KeyRange(kr))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
use serde::ser::SerializeMap;
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WithDisplay<'a, T>(&'a T);
|
||||
|
||||
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
|
||||
|
||||
impl<'a> serde::Serialize for KeyRange<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeTuple;
|
||||
let mut t = serializer.serialize_tuple(2)?;
|
||||
t.serialize_element(&WithDisplay(&self.0.start))?;
|
||||
t.serialize_element(&WithDisplay(&self.0.end))?;
|
||||
t.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> serde::Deserialize<'a> for Partitioning {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'a>,
|
||||
{
|
||||
pub struct KeySpace(crate::keyspace::KeySpace);
|
||||
|
||||
impl<'de> serde::Deserialize<'de> for KeySpace {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(transparent)]
|
||||
struct Key(#[serde_as(as = "serde_with::DisplayFromStr")] crate::repository::Key);
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
struct Range(Key, Key);
|
||||
|
||||
let ranges: Vec<Range> = serde::Deserialize::deserialize(deserializer)?;
|
||||
Ok(Self(crate::keyspace::KeySpace {
|
||||
ranges: ranges
|
||||
.into_iter()
|
||||
.map(|Range(start, end)| (start.0..end.0))
|
||||
.collect(),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Deserialize)]
|
||||
struct De {
|
||||
keys: KeySpace,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
|
||||
let de: De = serde::Deserialize::deserialize(deserializer)?;
|
||||
Ok(Self {
|
||||
at_lsn: de.at_lsn,
|
||||
keys: de.keys.0,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -26,6 +26,10 @@ use utils::http::endpoint::request_span;
|
||||
use utils::http::json::json_request_or_empty_body;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
|
||||
@@ -42,10 +46,6 @@ use crate::tenant::timeline::Timeline;
|
||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use crate::{disk_usage_eviction_task, tenant};
|
||||
use pageserver_api::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
|
||||
};
|
||||
use utils::{
|
||||
auth::SwappableJwtAuth,
|
||||
generation::Generation,
|
||||
@@ -61,7 +61,7 @@ use utils::{
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
use pageserver_api::models::ConfigureFailpointsRequest;
|
||||
use super::models::ConfigureFailpointsRequest;
|
||||
|
||||
pub struct State {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -764,12 +764,11 @@ async fn tenant_list_handler(
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
||||
})?
|
||||
.iter()
|
||||
.map(|(id, state, gen)| TenantInfo {
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: state.clone(),
|
||||
current_physical_size: None,
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: (*gen).into(),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
@@ -798,7 +797,6 @@ async fn tenant_status(
|
||||
state: state.clone(),
|
||||
current_physical_size: Some(current_physical_size),
|
||||
attachment_status: state.attachment_status(),
|
||||
generation: tenant.generation().into(),
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", %tenant_id))
|
||||
@@ -1424,10 +1422,70 @@ async fn timeline_collect_keyspace(
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
struct Partitioning {
|
||||
keys: crate::keyspace::KeySpace,
|
||||
|
||||
let check_serialization_roundtrip: bool =
|
||||
parse_query_param(&request, "check_serialization_roundtrip")?.unwrap_or(false);
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Partitioning {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct WithDisplay<'a, T>(&'a T);
|
||||
|
||||
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
|
||||
|
||||
impl<'a> serde::Serialize for KeySpace<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
|
||||
for kr in &self.0.ranges {
|
||||
seq.serialize_element(&KeyRange(kr))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
|
||||
|
||||
impl<'a> serde::Serialize for KeyRange<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeTuple;
|
||||
let mut t = serializer.serialize_tuple(2)?;
|
||||
t.serialize_element(&WithDisplay(&self.0.start))?;
|
||||
t.serialize_element(&WithDisplay(&self.0.end))?;
|
||||
t.end()
|
||||
}
|
||||
}
|
||||
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -1438,20 +1496,7 @@ async fn timeline_collect_keyspace(
|
||||
.await
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let res = crate::http::models::partitioning::Partitioning { keys, at_lsn };
|
||||
if check_serialization_roundtrip {
|
||||
(|| {
|
||||
let ser = serde_json::ser::to_vec(&res).context("serialize")?;
|
||||
let de: crate::http::models::partitioning::Partitioning =
|
||||
serde_json::from_slice(&ser).context("deserialize")?;
|
||||
anyhow::ensure!(de == res, "not equal");
|
||||
info!("passed serialization rountrip check");
|
||||
Ok(())
|
||||
})()
|
||||
.context("serialization rountrip")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
json_response(StatusCode::OK, res)
|
||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||
}
|
||||
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
|
||||
.await
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
use crate::repository::{key_range_size, singleton_range, Key};
|
||||
use crate::{
|
||||
pgdatadir_mapping::{BASEBACKUP_CUT, METADATA_CUT},
|
||||
repository::{key_range_size, singleton_range, Key},
|
||||
};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::ops::Range;
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
///
|
||||
#[derive(Clone, Debug, Default, PartialEq, Eq)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeySpace {
|
||||
/// Contiguous ranges of keys that belong to the key space. In key order,
|
||||
/// and with no overlap.
|
||||
@@ -22,13 +25,22 @@ impl KeySpace {
|
||||
let target_nblocks = (target_size / BLCKSZ as u64) as usize;
|
||||
|
||||
let mut parts = Vec::new();
|
||||
let mut current_part = Vec::new();
|
||||
let mut current_part: Vec<Range<Key>> = Vec::new();
|
||||
let mut current_part_size: usize = 0;
|
||||
for range in &self.ranges {
|
||||
let last = current_part
|
||||
.last()
|
||||
.map(|r| r.end)
|
||||
.unwrap_or(Key::from_i128(0));
|
||||
let cut_here = (range.start >= METADATA_CUT && last < METADATA_CUT)
|
||||
|| (range.start >= BASEBACKUP_CUT && last < BASEBACKUP_CUT);
|
||||
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, start a new partition.
|
||||
let this_size = key_range_size(range) as usize;
|
||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
||||
if cut_here
|
||||
|| current_part_size + this_size > target_nblocks && !current_part.is_empty()
|
||||
{
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
|
||||
@@ -25,7 +25,6 @@ pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
pub mod client;
|
||||
pub mod failpoint_support;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
@@ -40,9 +40,6 @@ pub enum StorageTimeOperation {
|
||||
#[strum(serialize = "logical size")]
|
||||
LogicalSize,
|
||||
|
||||
#[strum(serialize = "imitate logical size")]
|
||||
ImitateLogicalSize,
|
||||
|
||||
#[strum(serialize = "load layer map")]
|
||||
LoadLayerMap,
|
||||
|
||||
@@ -638,7 +635,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
///
|
||||
/// Operations:
|
||||
/// - open ([`std::fs::OpenOptions::open`])
|
||||
/// - close (dropping [`crate::virtual_file::VirtualFile`])
|
||||
/// - close (dropping [`std::fs::File`])
|
||||
/// - close-by-replace (close by replacement algorithm)
|
||||
/// - read (`read_at`)
|
||||
/// - write (`write_at`)
|
||||
@@ -1364,7 +1361,6 @@ pub struct TimelineMetrics {
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
pub logical_size_histo: StorageTimeMetrics,
|
||||
pub imitate_logical_size_histo: StorageTimeMetrics,
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
@@ -1393,11 +1389,6 @@ impl TimelineMetrics {
|
||||
StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
|
||||
let logical_size_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
|
||||
let imitate_logical_size_histo = StorageTimeMetrics::new(
|
||||
StorageTimeOperation::ImitateLogicalSize,
|
||||
&tenant_id,
|
||||
&timeline_id,
|
||||
);
|
||||
let load_layer_map_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
|
||||
let garbage_collect_histo =
|
||||
@@ -1430,7 +1421,6 @@ impl TimelineMetrics {
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
logical_size_histo,
|
||||
imitate_logical_size_histo,
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
|
||||
@@ -662,20 +662,21 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<KeySpace, CollectKeySpaceError> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
// This function is responsible for appending keys in order, using implicit
|
||||
// knowledge of how keys are defined.
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
// The dbdir metadata always exists
|
||||
result.add_key(DBDIR_KEY);
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let mut metadata_keys = Vec::new();
|
||||
|
||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||
dbs.sort_unstable();
|
||||
for (spcnode, dbnode) in dbs {
|
||||
result.add_key(relmap_file_key(spcnode, dbnode));
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
metadata_keys.push(relmap_file_key(spcnode, dbnode));
|
||||
metadata_keys.push(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, lsn, ctx)
|
||||
@@ -689,7 +690,7 @@ impl Timeline {
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
|
||||
result.add_key(relsize_key);
|
||||
metadata_keys.push(relsize_key);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -732,6 +733,13 @@ impl Timeline {
|
||||
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
|
||||
// The dbdir metadata always exists
|
||||
result.add_key(DBDIR_KEY);
|
||||
for key in metadata_keys {
|
||||
result.add_key(key);
|
||||
}
|
||||
|
||||
Ok(result.to_keyspace())
|
||||
}
|
||||
|
||||
@@ -1474,21 +1482,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
// DbDir:
|
||||
// 00 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Filenodemap:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000000
|
||||
//
|
||||
// RelDir:
|
||||
// 00 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
|
||||
|
||||
//
|
||||
// RelBlock:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK BLKNUM
|
||||
//
|
||||
// RelSize:
|
||||
// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF
|
||||
//
|
||||
// SlruDir:
|
||||
// 01 kind 00000000 00000000 00 00000000
|
||||
//
|
||||
@@ -1513,11 +1511,31 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
// AuxFiles:
|
||||
// 03 00000000 00000000 00000000 00 00000002
|
||||
//
|
||||
// DbDir:
|
||||
// 04 00000000 00000000 00000000 00 00000000
|
||||
//
|
||||
// Filenodemap:
|
||||
// 04 SPCNODE DBNODE 00000000 00 00000000
|
||||
//
|
||||
// RelDir:
|
||||
// 04 SPCNODE DBNODE 00000000 00 00000001 (Postgres never uses relfilenode 0)
|
||||
//
|
||||
// RelSize:
|
||||
// 04 SPCNODE DBNODE RELNODE FORK FFFFFFFF
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
/// Keys above this Key are required to serve a basebackup request
|
||||
pub(crate) const BASEBACKUP_CUT: Key = slru_dir_to_key(SlruKind::Clog);
|
||||
|
||||
/// Keys aboe this Key are needed to make a logical size calculation
|
||||
///
|
||||
/// Ensuring that such keys are stored above the main range of user relation
|
||||
/// blocks enables much more efficient space management.
|
||||
pub(crate) const METADATA_CUT: Key = CONTROLFILE_KEY;
|
||||
|
||||
const DBDIR_KEY: Key = Key {
|
||||
field1: 0x00,
|
||||
field1: 0x04,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
@@ -1527,14 +1545,14 @@ const DBDIR_KEY: Key = Key {
|
||||
|
||||
fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field1: 0x04,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x00,
|
||||
field1: 0x04,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0xffffffff,
|
||||
@@ -1545,7 +1563,7 @@ fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
|
||||
|
||||
fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field1: 0x04,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
@@ -1556,7 +1574,7 @@ fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
|
||||
fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field1: 0x04,
|
||||
field2: spcnode,
|
||||
field3: dbnode,
|
||||
field4: 0,
|
||||
@@ -1578,7 +1596,7 @@ fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
|
||||
fn rel_size_to_key(rel: RelTag) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field1: 0x04,
|
||||
field2: rel.spcnode,
|
||||
field3: rel.dbnode,
|
||||
field4: rel.relnode,
|
||||
@@ -1607,7 +1625,7 @@ fn rel_key_range(rel: RelTag) -> Range<Key> {
|
||||
|
||||
//-- Section 02: SLRUs
|
||||
|
||||
fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
const fn slru_dir_to_key(kind: SlruKind) -> Key {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2: match kind {
|
||||
@@ -1749,7 +1767,6 @@ const AUX_FILES_KEY: Key = Key {
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
|
||||
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x00 => (
|
||||
@@ -1765,8 +1782,7 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
})
|
||||
}
|
||||
|
||||
/// See [[key_to_rel_block]].
|
||||
pub fn is_rel_block_key(key: Key) -> bool {
|
||||
fn is_rel_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
|
||||
@@ -61,7 +61,6 @@ use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use self::timeline::TimelineResources;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -252,8 +251,6 @@ pub struct Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
|
||||
// Cancellation token fires when we have entered shutdown(). This is a parent of
|
||||
@@ -291,16 +288,6 @@ impl From<harness::TestRedoManager> for WalRedoManager {
|
||||
}
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: crate::repository::Key,
|
||||
@@ -1659,16 +1646,22 @@ impl Tenant {
|
||||
/// This function is periodically called by compactor task.
|
||||
/// Also it can be explicitly requested per timeline through page server
|
||||
/// api's 'compact' command.
|
||||
async fn compaction_iteration(
|
||||
pub async fn compaction_iteration(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(), timeline::CompactionError> {
|
||||
// Don't start doing work during shutdown, or when broken, we do not need those in the logs
|
||||
if !self.is_active() {
|
||||
) -> anyhow::Result<()> {
|
||||
// Don't start doing work during shutdown
|
||||
if let TenantState::Stopping { .. } = self.current_state() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We should only be called once the tenant has activated.
|
||||
anyhow::ensure!(
|
||||
self.is_active(),
|
||||
"Cannot run compaction iteration on inactive tenant"
|
||||
);
|
||||
|
||||
{
|
||||
let conf = self.tenant_conf.read().unwrap();
|
||||
if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
|
||||
@@ -1715,10 +1708,6 @@ impl Tenant {
|
||||
self.current_state() == TenantState::Active
|
||||
}
|
||||
|
||||
pub fn generation(&self) -> Generation {
|
||||
self.generation
|
||||
}
|
||||
|
||||
/// Changes tenant status to active, unless shutdown was already requested.
|
||||
///
|
||||
/// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
|
||||
@@ -2375,7 +2364,6 @@ impl Tenant {
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
cancel: CancellationToken::default(),
|
||||
gate: Gate::new(format!("Tenant<{tenant_id}>")),
|
||||
@@ -3508,7 +3496,6 @@ pub(crate) mod harness {
|
||||
// enable it in case the tests exercise code paths that use
|
||||
// debug_assert_current_span_has_tenant_and_timeline_id
|
||||
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
logging::Output::Stdout,
|
||||
)
|
||||
.expect("Failed to init test logging")
|
||||
});
|
||||
|
||||
@@ -1397,8 +1397,7 @@ pub(crate) enum TenantMapListError {
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub(crate) async fn list_tenants(
|
||||
) -> Result<Vec<(TenantId, TenantState, Generation)>, TenantMapListError> {
|
||||
pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
|
||||
let tenants = TENANTS.read().unwrap();
|
||||
let m = match &*tenants {
|
||||
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
|
||||
@@ -1406,12 +1405,12 @@ pub(crate) async fn list_tenants(
|
||||
};
|
||||
Ok(m.iter()
|
||||
.filter_map(|(id, tenant)| match tenant {
|
||||
TenantSlot::Attached(tenant) => Some((id, tenant.current_state(), tenant.generation())),
|
||||
TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
|
||||
TenantSlot::Secondary => None,
|
||||
TenantSlot::InProgress(_) => None,
|
||||
})
|
||||
// TODO(sharding): make callers of this function shard-aware
|
||||
.map(|(a, b, c)| (a.tenant_id, b, c))
|
||||
.map(|(k, v)| (k.tenant_id, v))
|
||||
.collect())
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
pub mod delta_layer;
|
||||
mod filename;
|
||||
pub mod image_layer;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod layer;
|
||||
mod layer_desc;
|
||||
|
||||
@@ -69,13 +69,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct Summary {
|
||||
/// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
|
||||
pub magic: u16,
|
||||
pub format_version: u16,
|
||||
magic: u16,
|
||||
format_version: u16,
|
||||
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
/// Block number where the 'index' part of the file begins.
|
||||
pub index_start_blk: u32,
|
||||
@@ -289,9 +289,7 @@ impl DeltaLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = DeltaLayerInner::load(&path, None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
|
||||
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
@@ -611,84 +609,19 @@ impl Drop for DeltaLayerWriter {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum RewriteSummaryError {
|
||||
#[error("magic mismatch")]
|
||||
MagicMismatch,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for RewriteSummaryError {
|
||||
fn from(e: std::io::Error) -> Self {
|
||||
Self::Other(anyhow::anyhow!(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub async fn rewrite_summary<F>(
|
||||
path: &Utf8Path,
|
||||
rewrite: F,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), RewriteSummaryError>
|
||||
where
|
||||
F: Fn(Summary) -> Summary,
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
path,
|
||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
||||
let mut file = file.file;
|
||||
if actual_summary.magic != DELTA_FILE_MAGIC {
|
||||
return Err(RewriteSummaryError::MagicMismatch);
|
||||
}
|
||||
|
||||
let new_summary = rewrite(actual_summary);
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
if buf.spilled() {
|
||||
// The code in DeltaLayerWriterInner just warn!()s for this.
|
||||
// It should probably error out as well.
|
||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||
/// - inner has the success or transient failure
|
||||
/// - outer has the permanent failure
|
||||
pub(super) async fn load(
|
||||
path: &Utf8Path,
|
||||
summary: Option<Summary>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{path}'"))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
let summary_blk = match file.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
|
||||
// TODO: this should be an assertion instead; see ImageLayerInner::load
|
||||
let actual_summary =
|
||||
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
// production code path
|
||||
@@ -703,11 +636,11 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Ok(DeltaLayerInner {
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
|
||||
@@ -67,20 +67,20 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
|
||||
/// the 'index' starts at the block indicated by 'index_start_blk'
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct Summary {
|
||||
pub(super) struct Summary {
|
||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||
pub magic: u16,
|
||||
pub format_version: u16,
|
||||
magic: u16,
|
||||
format_version: u16,
|
||||
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
/// Block number where the 'index' part of the file begins.
|
||||
pub index_start_blk: u32,
|
||||
index_start_blk: u32,
|
||||
/// Block within the 'index', where the B-tree root page is stored
|
||||
pub index_root_blk: u32,
|
||||
index_root_blk: u32,
|
||||
// the 'values' part starts after the summary header, on block 1.
|
||||
}
|
||||
|
||||
@@ -249,9 +249,7 @@ impl ImageLayer {
|
||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
|
||||
.await
|
||||
.and_then(|res| res)?;
|
||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
|
||||
|
||||
// not production code
|
||||
let actual_filename = path.file_name().unwrap().to_owned();
|
||||
@@ -296,87 +294,19 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum RewriteSummaryError {
|
||||
#[error("magic mismatch")]
|
||||
MagicMismatch,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for RewriteSummaryError {
|
||||
fn from(e: std::io::Error) -> Self {
|
||||
Self::Other(anyhow::anyhow!(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub async fn rewrite_summary<F>(
|
||||
path: &Utf8Path,
|
||||
rewrite: F,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), RewriteSummaryError>
|
||||
where
|
||||
F: Fn(Summary) -> Summary,
|
||||
{
|
||||
let file = VirtualFile::open_with_options(
|
||||
path,
|
||||
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
||||
let mut file = file.file;
|
||||
if actual_summary.magic != IMAGE_FILE_MAGIC {
|
||||
return Err(RewriteSummaryError::MagicMismatch);
|
||||
}
|
||||
|
||||
let new_summary = rewrite(actual_summary);
|
||||
|
||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||
if buf.spilled() {
|
||||
// The code in ImageLayerWriterInner just warn!()s for this.
|
||||
// It should probably error out as well.
|
||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||
"Used more than one page size for summary buffer: {}",
|
||||
buf.len()
|
||||
)));
|
||||
}
|
||||
file.seek(SeekFrom::Start(0)).await?;
|
||||
file.write_all(&buf).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl ImageLayerInner {
|
||||
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||
/// - inner has the success or transient failure
|
||||
/// - outer has the permanent failure
|
||||
pub(super) async fn load(
|
||||
path: &Utf8Path,
|
||||
lsn: Lsn,
|
||||
summary: Option<Summary>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||
let file = match VirtualFile::open(path).await {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||
};
|
||||
) -> anyhow::Result<Self> {
|
||||
let file = VirtualFile::open(path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = match file.read_blk(0, ctx).await {
|
||||
Ok(blk) => blk,
|
||||
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||
};
|
||||
|
||||
// length is the only way how this could fail, so it's not actually likely at all unless
|
||||
// read_blk returns wrong sized block.
|
||||
//
|
||||
// TODO: confirm and make this into assertion
|
||||
let actual_summary =
|
||||
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
|
||||
let summary_blk = file.read_blk(0, ctx).await?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
if let Some(mut expected_summary) = summary {
|
||||
// production code path
|
||||
@@ -392,12 +322,12 @@ impl ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Ok(ImageLayerInner {
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
lsn,
|
||||
file,
|
||||
}))
|
||||
})
|
||||
}
|
||||
|
||||
pub(super) async fn get_value_reconstruct_data(
|
||||
|
||||
@@ -3,7 +3,6 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::models::{
|
||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||
};
|
||||
use remote_storage::RemotePath;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Weak};
|
||||
@@ -306,12 +305,6 @@ impl Layer {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
/// This can return None even though it should return Some in some edge cases.
|
||||
#[allow(unused)]
|
||||
pub(crate) fn remote_path(&self) -> Option<RemotePath> {
|
||||
self.0.remote_path()
|
||||
}
|
||||
|
||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||
self.0.metadata()
|
||||
}
|
||||
@@ -875,9 +868,6 @@ impl LayerInner {
|
||||
}
|
||||
Ok((Err(e), _permit)) => {
|
||||
// FIXME: this should be with the spawned task and be cancellation sensitive
|
||||
//
|
||||
// while we should not need this, this backoff has turned out to be useful with
|
||||
// a bug of unexpectedly deleted remote layer file (#5787).
|
||||
let consecutive_failures =
|
||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
||||
@@ -925,17 +915,6 @@ impl LayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
/// This can return None even though it should return Some in some edge cases.
|
||||
fn remote_path(&self) -> Option<RemotePath> {
|
||||
let tl = self.timeline.upgrade()?; // TODO: should distinguish this case, but, accuracy doesn't matter for this field.
|
||||
Some(crate::tenant::remote_timeline_client::remote_layer_path(
|
||||
&tl.tenant_id,
|
||||
&tl.timeline_id,
|
||||
&self.desc.filename(),
|
||||
self.generation,
|
||||
))
|
||||
}
|
||||
|
||||
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||
let layer_file_name = self.desc.filename().file_name();
|
||||
|
||||
@@ -955,7 +934,6 @@ impl LayerInner {
|
||||
lsn_end: lsn_range.end,
|
||||
remote,
|
||||
access_stats,
|
||||
remote_path: self.remote_path().map(|p| p.into()),
|
||||
}
|
||||
} else {
|
||||
let lsn = self.desc.image_layer_lsn();
|
||||
@@ -966,7 +944,6 @@ impl LayerInner {
|
||||
lsn_start: lsn,
|
||||
remote,
|
||||
access_stats,
|
||||
remote_path: self.remote_path().map(|p| p.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1219,7 +1196,7 @@ impl DownloadedLayer {
|
||||
));
|
||||
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Delta))
|
||||
.map(LayerKind::Delta)
|
||||
} else {
|
||||
let lsn = owner.desc.image_layer_lsn();
|
||||
let summary = Some(image_layer::Summary::expected(
|
||||
@@ -1230,32 +1207,23 @@ impl DownloadedLayer {
|
||||
));
|
||||
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
|
||||
.await
|
||||
.map(|res| res.map(LayerKind::Image))
|
||||
};
|
||||
|
||||
match res {
|
||||
Ok(Ok(layer)) => Ok(Ok(layer)),
|
||||
Ok(Err(transient)) => Err(transient),
|
||||
Err(permanent) => {
|
||||
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
|
||||
// TODO(#5815): we are not logging all errors, so temporarily log them **once**
|
||||
// here as well
|
||||
let permanent = permanent.context("load layer");
|
||||
tracing::error!("layer loading failed permanently: {permanent:#}");
|
||||
Ok(Err(permanent))
|
||||
}
|
||||
.map(LayerKind::Image)
|
||||
}
|
||||
// this will be a permanent failure
|
||||
.context("load layer");
|
||||
|
||||
if let Err(e) = res.as_ref() {
|
||||
LAYER_IMPL_METRICS.inc_permanent_loading_failures();
|
||||
// TODO(#5815): we are not logging all errors, so temporarily log them here as well
|
||||
tracing::error!("layer loading failed permanently: {e:#}");
|
||||
}
|
||||
res
|
||||
};
|
||||
self.kind
|
||||
.get_or_try_init(init)
|
||||
// return transient errors using `?`
|
||||
.await?
|
||||
.as_ref()
|
||||
.map_err(|e| {
|
||||
// errors are not clonabled, cannot but stringify
|
||||
// test_broken_timeline matches this string
|
||||
anyhow::anyhow!("layer loading failed: {e:#}")
|
||||
})
|
||||
self.kind.get_or_init(init).await.as_ref().map_err(|e| {
|
||||
// errors are not clonabled, cannot but stringify
|
||||
// test_broken_timeline matches this string
|
||||
anyhow::anyhow!("layer loading failed: {e:#}")
|
||||
})
|
||||
}
|
||||
|
||||
async fn get_value_reconstruct_data(
|
||||
|
||||
@@ -5,7 +5,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::repository::Key;
|
||||
use crate::{pgdatadir_mapping::METADATA_CUT, repository::Key};
|
||||
|
||||
use super::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
|
||||
@@ -49,6 +49,20 @@ impl PersistentLayerDesc {
|
||||
}
|
||||
}
|
||||
|
||||
/// Does this layer consist exclusively of metadata
|
||||
/// content such as dbdir & relation sizes? This is a
|
||||
/// hint that the layer is likely to be small and should
|
||||
/// not be a candidate for eviction under normal circumstances.
|
||||
pub fn is_metadata_pages(&self) -> bool {
|
||||
self.key_range.start >= METADATA_CUT
|
||||
}
|
||||
|
||||
/// Does this layer consist exclusively of content
|
||||
/// required to serve a basebackup request?
|
||||
pub fn is_basebackup_pages(&self) -> bool {
|
||||
self.key_range.start >= METADATA_CUT
|
||||
}
|
||||
|
||||
pub fn short_id(&self) -> impl Display {
|
||||
self.filename()
|
||||
}
|
||||
|
||||
@@ -180,16 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
error_run_count,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
error!(
|
||||
"Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
"Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
|
||||
wait_duration
|
||||
);
|
||||
wait_duration
|
||||
Duration::from_secs_f64(wait_duration)
|
||||
} else {
|
||||
error_run_count = 0;
|
||||
period
|
||||
@@ -198,10 +198,6 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
|
||||
|
||||
// Perhaps we did no work and the walredo process has been idle for some time:
|
||||
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
|
||||
tenant.walredo_mgr.maybe_quiesce(period * 10);
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
@@ -265,16 +261,16 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
.await;
|
||||
if let Err(e) = res {
|
||||
let wait_duration = backoff::exponential_backoff_duration_seconds(
|
||||
error_run_count + 1,
|
||||
error_run_count,
|
||||
1.0,
|
||||
MAX_BACKOFF_SECS,
|
||||
);
|
||||
error_run_count += 1;
|
||||
let wait_duration = Duration::from_secs_f64(wait_duration);
|
||||
error!(
|
||||
"Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
|
||||
"Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
|
||||
wait_duration
|
||||
);
|
||||
wait_duration
|
||||
Duration::from_secs_f64(wait_duration)
|
||||
} else {
|
||||
error_run_count = 0;
|
||||
period
|
||||
|
||||
@@ -81,8 +81,6 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::ZERO_PAGE;
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
@@ -298,8 +296,6 @@ pub struct Timeline {
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
|
||||
/// Barrier to wait before doing initial logical size calculation. Used only during startup.
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
|
||||
@@ -433,7 +429,6 @@ impl std::fmt::Display for PageReconstructError {
|
||||
pub enum LogicalSizeCalculationCause {
|
||||
Initial,
|
||||
ConsumptionMetricsSyntheticSize,
|
||||
EvictionTaskImitation,
|
||||
TenantSizeHandler,
|
||||
}
|
||||
|
||||
@@ -1442,9 +1437,6 @@ impl Timeline {
|
||||
|
||||
state,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
|
||||
|
||||
initial_logical_size_can_start,
|
||||
@@ -1967,9 +1959,6 @@ impl Timeline {
|
||||
LogicalSizeCalculationCause::Initial
|
||||
| LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
|
||||
| LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
|
||||
LogicalSizeCalculationCause::EvictionTaskImitation => {
|
||||
&self.metrics.imitate_logical_size_histo
|
||||
}
|
||||
};
|
||||
let timer = storage_time_metrics.start_timer();
|
||||
let logical_size = self
|
||||
@@ -2746,18 +2735,18 @@ impl Timeline {
|
||||
partition_size: u64,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(KeyPartitioning, Lsn)> {
|
||||
{
|
||||
let partitioning_guard = self.partitioning.lock().unwrap();
|
||||
let distance = lsn.0 - partitioning_guard.1 .0;
|
||||
if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
|
||||
debug!(
|
||||
distance,
|
||||
threshold = self.repartition_threshold,
|
||||
"no repartitioning needed"
|
||||
);
|
||||
return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
||||
}
|
||||
}
|
||||
// {
|
||||
// let partitioning_guard = self.partitioning.lock().unwrap();
|
||||
// let distance = lsn.0 - partitioning_guard.1 .0;
|
||||
// if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
|
||||
// debug!(
|
||||
// distance,
|
||||
// threshold = self.repartition_threshold,
|
||||
// "no repartitioning needed"
|
||||
// );
|
||||
// return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
|
||||
// }
|
||||
// }
|
||||
let keyspace = self.collect_keyspace(lsn, ctx).await?;
|
||||
let partitioning = keyspace.partition(partition_size);
|
||||
|
||||
@@ -3497,22 +3486,21 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
|
||||
let layer_paths: Vec<Utf8PathBuf> = new_layers
|
||||
let mut layer_paths: Vec<Utf8PathBuf> = new_layers
|
||||
.iter()
|
||||
.map(|l| l.local_path().to_owned())
|
||||
.collect();
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
par_fsync::par_fsync_async(&layer_paths)
|
||||
.await
|
||||
.context("fsync all new layers")?;
|
||||
//
|
||||
// FIXME: spawn_blocking above for this
|
||||
par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
|
||||
|
||||
let timeline_dir = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
|
||||
par_fsync::par_fsync_async(&[timeline_dir])
|
||||
.await
|
||||
par_fsync::par_fsync(&[self.conf.timeline_path(&self.tenant_id, &self.timeline_id)])
|
||||
.context("fsync of timeline dir")?;
|
||||
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
|
||||
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
|
||||
@@ -4286,6 +4274,11 @@ impl Timeline {
|
||||
let file_size = l.file_size();
|
||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||
|
||||
// Don't evict small layers required to serve a basebackup
|
||||
if l.is_basebackup_pages() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let l = guard.get_from_desc(&l);
|
||||
|
||||
let l = match l.keep_resident().await {
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
//!
|
||||
//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -22,17 +21,15 @@ use std::{
|
||||
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
|
||||
use crate::{
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
pgdatadir_mapping::CollectKeySpaceError,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
tasks::{BackgroundLoopKind, RateLimitError},
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -40,16 +37,6 @@ use utils::completion;
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTimelineState {
|
||||
last_layer_access_imitation: Option<tokio::time::Instant>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTenantState {
|
||||
last_layer_access_imitation: Option<Instant>,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub(super) fn launch_eviction_task(
|
||||
self: &Arc<Self>,
|
||||
@@ -178,7 +165,6 @@ impl Timeline {
|
||||
// that were accessed to compute the value in the first place.
|
||||
// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
|
||||
// get re-computed from layers, thereby counting towards layer access stats.
|
||||
// 4. Make the eviction task imitate the layer accesses that typically hit caches.
|
||||
//
|
||||
// We follow approach (4) here because in Neon prod deployment:
|
||||
// - page cache is quite small => high churn => low hit rate
|
||||
@@ -190,10 +176,6 @@ impl Timeline {
|
||||
//
|
||||
// We should probably move to persistent caches in the future, or avoid
|
||||
// having inactive tenants attached to pageserver in the first place.
|
||||
match self.imitate_layer_accesses(p, cancel, ctx).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
@@ -215,6 +197,11 @@ impl Timeline {
|
||||
let layers = guard.layer_map();
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
// Don't evict the small layers needed to serve a basebackup request.
|
||||
if hist_layer.is_basebackup_pages() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let hist_layer = guard.get_from_desc(&hist_layer);
|
||||
|
||||
// guard against eviction while we inspect it; it might be that eviction_task and
|
||||
@@ -310,170 +297,4 @@ impl Timeline {
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_layer_accesses(
|
||||
&self,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
|
||||
// Only do the imitate_layer accesses approximately as often as the threshold. A little
|
||||
// more frequently, to avoid this period racing with the threshold/period-th eviction iteration.
|
||||
let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold);
|
||||
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_timeline_cached_layer_accesses(ctx).await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
// This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
|
||||
Ok(t) => t,
|
||||
Err(_) => {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
/// Recompute the values which would cause on-demand downloads during restart.
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
|
||||
let lsn = self.get_last_record_lsn();
|
||||
|
||||
// imitiate on-restart initial logical size
|
||||
let size = self
|
||||
.calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
|
||||
.instrument(info_span!("calculate_logical_size"))
|
||||
.await;
|
||||
|
||||
match &size {
|
||||
Ok(_size) => {
|
||||
// good, don't log it to avoid confusion
|
||||
}
|
||||
Err(_) => {
|
||||
// we have known issues for which we already log this on consumption metrics,
|
||||
// gc, and compaction. leave logging out for now.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/2539
|
||||
}
|
||||
}
|
||||
|
||||
// imitiate repartiting on first compactation
|
||||
if let Err(e) = self
|
||||
.collect_keyspace(lsn, ctx)
|
||||
.instrument(info_span!("collect_keyspace"))
|
||||
.await
|
||||
{
|
||||
// if this failed, we probably failed logical size because these use the same keys
|
||||
if size.is_err() {
|
||||
// ignore, see above comment
|
||||
} else {
|
||||
match e {
|
||||
CollectKeySpaceError::Cancelled => {
|
||||
// Shutting down, ignore
|
||||
}
|
||||
err => {
|
||||
warn!(
|
||||
"failed to collect keyspace but succeeded in calculating logical size: {err:#}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Imitate the synthetic size calculation done by the consumption_metrics module.
|
||||
#[instrument(skip_all)]
|
||||
async fn imitate_synthetic_size_calculation_worker(
|
||||
&self,
|
||||
tenant: &Arc<Tenant>,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
) {
|
||||
if self.conf.metric_collection_endpoint.is_none() {
|
||||
// We don't start the consumption metrics task if this is not set in the config.
|
||||
// So, no need to imitate the accesses in that case.
|
||||
return;
|
||||
}
|
||||
|
||||
// The consumption metrics are collected on a per-tenant basis, by a single
|
||||
// global background loop.
|
||||
// It limits the number of synthetic size calculations using the global
|
||||
// `concurrent_tenant_size_logical_size_queries` semaphore to not overload
|
||||
// the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs).
|
||||
//
|
||||
// If we used that same semaphore here, then we'd compete for the
|
||||
// same permits, which may impact timeliness of consumption metrics.
|
||||
// That is a no-go, as consumption metrics are much more important
|
||||
// than what we do here.
|
||||
//
|
||||
// So, we have a separate semaphore, initialized to the same
|
||||
// number of permits as the `concurrent_tenant_size_logical_size_queries`.
|
||||
// In the worst, we would have twice the amount of concurrenct size calculations.
|
||||
// But in practice, the `p.threshold` >> `consumption metric interval`, and
|
||||
// we spread out the eviction task using `random_init_delay`.
|
||||
// So, the chance of the worst case is quite low in practice.
|
||||
// It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
|
||||
// So, we must coordinate with other with other eviction tasks of this tenant.
|
||||
let limit = self
|
||||
.conf
|
||||
.eviction_task_immitated_concurrent_logical_size_queries
|
||||
.inner();
|
||||
|
||||
let mut throwaway_cache = HashMap::new();
|
||||
let gather = crate::tenant::size::gather_inputs(
|
||||
tenant,
|
||||
limit,
|
||||
None,
|
||||
&mut throwaway_cache,
|
||||
LogicalSizeCalculationCause::EvictionTaskImitation,
|
||||
ctx,
|
||||
)
|
||||
.instrument(info_span!("gather_inputs"));
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {}
|
||||
gather_result = gather => {
|
||||
match gather_result {
|
||||
Ok(_) => {},
|
||||
Err(e) => {
|
||||
// We don't care about the result, but, if it failed, we should log it,
|
||||
// since consumption metric might be hitting the cached value and
|
||||
// thus not encountering this error.
|
||||
warn!("failed to imitate synthetic size calculation accesses: {e:#}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,7 +91,6 @@ struct ProcessOutput {
|
||||
pub struct PostgresRedoManager {
|
||||
tenant_id: TenantId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
|
||||
}
|
||||
|
||||
@@ -188,26 +187,10 @@ impl PostgresRedoManager {
|
||||
PostgresRedoManager {
|
||||
tenant_id,
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: RwLock::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// This type doesn't have its own background task to check for idleness: we
|
||||
/// rely on our owner calling this function periodically in its own housekeeping
|
||||
/// loops.
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
if let Ok(g) = self.last_redo_at.try_lock() {
|
||||
if let Some(last_redo_at) = *g {
|
||||
if last_redo_at.elapsed() >= idle_timeout {
|
||||
drop(g);
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
*guard = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
@@ -222,8 +205,6 @@ impl PostgresRedoManager {
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
@@ -367,13 +348,12 @@ impl PostgresRedoManager {
|
||||
self.apply_record_neon(key, &mut page, *record_lsn, record)?;
|
||||
}
|
||||
// Success!
|
||||
let duration = start_time.elapsed();
|
||||
// FIXME: using the same metric here creates a bimodal distribution by default, and because
|
||||
// there could be multiple batch sizes this would be N+1 modal.
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(start_time);
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
|
||||
debug!(
|
||||
"neon applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
"neon applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
|
||||
@@ -475,12 +475,6 @@ NeonXactCallback(XactEvent event, void *arg)
|
||||
Assert(CurrentDdlTable == &RootTable);
|
||||
}
|
||||
|
||||
static bool
|
||||
RoleIsNeonSuperuser(const char *role_name)
|
||||
{
|
||||
return strcmp(role_name, "neon_superuser") == 0;
|
||||
}
|
||||
|
||||
static void
|
||||
HandleCreateDb(CreatedbStmt *stmt)
|
||||
{
|
||||
@@ -507,16 +501,9 @@ HandleCreateDb(CreatedbStmt *stmt)
|
||||
|
||||
entry->type = Op_Set;
|
||||
if (downer && downer->arg)
|
||||
{
|
||||
const char *owner_name = defGetString(downer);
|
||||
if (RoleIsNeonSuperuser(owner_name))
|
||||
elog(ERROR, "can't create a database with owner neon_superuser");
|
||||
entry->owner = get_role_oid(owner_name, false);
|
||||
}
|
||||
entry->owner = get_role_oid(defGetString(downer), false);
|
||||
else
|
||||
{
|
||||
entry->owner = GetUserId();
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -535,10 +522,8 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
|
||||
|
||||
if (!found)
|
||||
memset(entry->old_name, 0, sizeof(entry->old_name));
|
||||
const char *new_owner = get_rolespec_name(stmt->newowner);
|
||||
if (RoleIsNeonSuperuser(new_owner))
|
||||
elog(ERROR, "can't alter owner to neon_superuser");
|
||||
entry->owner = get_role_oid(new_owner, false);
|
||||
|
||||
entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false);
|
||||
entry->type = Op_Set;
|
||||
}
|
||||
|
||||
@@ -632,9 +617,6 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
InitRoleTableIfNeeded();
|
||||
DefElem *dpass = NULL;
|
||||
ListCell *option;
|
||||
const char *role_name = stmt->role->rolename;
|
||||
if (RoleIsNeonSuperuser(role_name))
|
||||
elog(ERROR, "can't ALTER neon_superuser");
|
||||
|
||||
foreach(option, stmt->options)
|
||||
{
|
||||
@@ -649,7 +631,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
|
||||
bool found = false;
|
||||
RoleEntry *entry = hash_search(
|
||||
CurrentDdlTable->role_table,
|
||||
role_name,
|
||||
stmt->role->rolename,
|
||||
HASH_ENTER,
|
||||
&found);
|
||||
|
||||
|
||||
@@ -1687,9 +1687,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (current_size >= ((uint64) max_cluster_size) * 1024 * 1024)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DISK_FULL),
|
||||
errmsg("could not extend file because project size limit (%d MB) has been exceeded",
|
||||
errmsg("could not extend file because cluster size limit (%d MB) has been exceeded",
|
||||
max_cluster_size),
|
||||
errhint("This limit is defined externally by the project size limit, and internally by neon.max_cluster_size GUC")));
|
||||
errhint("This limit is defined by neon.max_cluster_size GUC")));
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
//! User credentials used in authentication.
|
||||
|
||||
use crate::{
|
||||
auth::password_hack::parse_endpoint_param,
|
||||
error::UserFacingError,
|
||||
proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
|
||||
auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
@@ -126,22 +124,6 @@ impl<'a> ClientCredentials<'a> {
|
||||
.transpose()?;
|
||||
|
||||
info!(user, project = project.as_deref(), "credentials");
|
||||
if sni.is_some() {
|
||||
info!("Connection with sni");
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["sni"])
|
||||
.inc();
|
||||
} else if project.is_some() {
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["no_sni"])
|
||||
.inc();
|
||||
info!("Connection without sni");
|
||||
} else {
|
||||
NUM_CONNECTION_ACCEPTED_BY_SNI
|
||||
.with_label_values(&["password_hack"])
|
||||
.inc();
|
||||
info!("Connection with password hack");
|
||||
}
|
||||
|
||||
let cache_key = format!(
|
||||
"{}{}",
|
||||
|
||||
@@ -248,7 +248,6 @@ impl ConnCfg {
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let (client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
|
||||
let stream = connection.stream.into_inner();
|
||||
|
||||
info!(
|
||||
|
||||
@@ -129,15 +129,6 @@ pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_accepted_connections_by_sni",
|
||||
"Number of connections (per sni).",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub struct LatencyTimer {
|
||||
// time since the stopwatch was started
|
||||
start: Option<Instant>,
|
||||
@@ -514,7 +505,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
|
||||
}
|
||||
|
||||
/// Try to connect to the compute node once.
|
||||
#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
|
||||
#[tracing::instrument(name = "connect_once", skip_all)]
|
||||
async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
|
||||
@@ -208,13 +208,14 @@ impl GlobalConnPool {
|
||||
} else {
|
||||
info!("pool: reusing connection '{conn_info}'");
|
||||
client.session.send(session_id)?;
|
||||
tracing::Span::current().record(
|
||||
"pid",
|
||||
&tracing::field::display(client.inner.get_process_id()),
|
||||
);
|
||||
latency_timer.pool_hit();
|
||||
latency_timer.success();
|
||||
return Ok(Client::new(client, pool).await);
|
||||
return Ok(Client {
|
||||
conn_id: client.conn_id,
|
||||
inner: Some(client),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
@@ -228,12 +229,6 @@ impl GlobalConnPool {
|
||||
)
|
||||
.await
|
||||
};
|
||||
if let Ok(client) = &new_client {
|
||||
tracing::Span::current().record(
|
||||
"pid",
|
||||
&tracing::field::display(client.inner.get_process_id()),
|
||||
);
|
||||
}
|
||||
|
||||
match &new_client {
|
||||
// clear the hash. it's no longer valid
|
||||
@@ -267,8 +262,13 @@ impl GlobalConnPool {
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
let new_client = new_client?;
|
||||
Ok(Client::new(new_client, pool).await)
|
||||
|
||||
new_client.map(|inner| Client {
|
||||
conn_id: inner.conn_id,
|
||||
inner: Some(inner),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
})
|
||||
}
|
||||
|
||||
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
|
||||
@@ -394,7 +394,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
// Wake up the destination if needed. Code here is a bit involved because
|
||||
// we reuse the code from the usual proxy and we need to prepare few structures
|
||||
// that this code expects.
|
||||
#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn connect_to_compute(
|
||||
config: &config::ProxyConfig,
|
||||
conn_info: &ConnInfo,
|
||||
@@ -461,7 +461,6 @@ async fn connect_to_compute_once(
|
||||
.connect_timeout(timeout)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await?;
|
||||
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
|
||||
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session);
|
||||
|
||||
@@ -548,17 +547,6 @@ pub struct Discard<'a> {
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub(self) async fn new(
|
||||
inner: ClientInner,
|
||||
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
) -> Self {
|
||||
Self {
|
||||
conn_id: inner.conn_id,
|
||||
inner: Some(inner),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
}
|
||||
}
|
||||
pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
|
||||
let Self {
|
||||
inner,
|
||||
|
||||
@@ -250,7 +250,7 @@ pub async fn handle(
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
|
||||
#[instrument(name = "sql-over-http", skip_all)]
|
||||
async fn handle_inner(
|
||||
request: Request<Body>,
|
||||
sni_hostname: Option<String>,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.74.0"
|
||||
channel = "1.73.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -199,10 +199,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
// 1. init logging
|
||||
// 2. tracing panic hook
|
||||
// 3. sentry
|
||||
let _guard = logging::init(
|
||||
logging::init(
|
||||
LogFormat::from_config(&args.log_format)?,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
info!("version: {GIT_VERSION}");
|
||||
|
||||
@@ -1,41 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
if [ "$(cat /sys/class/block/nvme1n1/device/model)" != "Amazon EC2 NVMe Instance Storage " ]; then
|
||||
echo "nvme1n1 is not Amazon EC2 NVMe Instance Storage: '$(cat /sys/class/block/nvme1n1/device/model)'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rmdir bench_repo_dir || true
|
||||
|
||||
sudo mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/nvme1n1
|
||||
|
||||
sudo mount /dev/nvme1n1 /mnt
|
||||
sudo chown -R "$(id -u)":"$(id -g)" /mnt
|
||||
|
||||
mkdir /mnt/bench_repo_dir
|
||||
mkdir bench_repo_dir
|
||||
sudo mount --bind /mnt/bench_repo_dir bench_repo_dir
|
||||
|
||||
mkdir /mnt/test_output
|
||||
|
||||
mkdir /mnt/many_tenants
|
||||
|
||||
echo run the following commands
|
||||
|
||||
cat <<EOF
|
||||
# test suite run
|
||||
export TEST_OUTPUT="/mnt/test_output"
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest test_runner/performance/test_pageserver.py
|
||||
|
||||
# for interactive use
|
||||
export NEON_REPO_DIR="$(readlink -f ./bench_repo_dir)/repo"
|
||||
cargo build_testing --release
|
||||
./target/release/neon_local init
|
||||
# ... create tenant, seed it using pgbench
|
||||
# then duplicate the tenant using
|
||||
# poetry run python3 ./test_runner/duplicate_tenant.py TENANT_ID 200 8
|
||||
EOF
|
||||
|
||||
|
||||
@@ -431,10 +431,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// 1. init logging
|
||||
// 2. tracing panic hook
|
||||
// 3. sentry
|
||||
let _guard = logging::init(
|
||||
logging::init(
|
||||
LogFormat::from_config(&args.log_format)?,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
# Usage from top of repo:
|
||||
# poetry run python3 ./test_runner/duplicate_tenant.py c66e2e233057f7f05563caff664ecb14 .neon/remote_storage_local_fs
|
||||
import argparse
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.types import TenantId
|
||||
|
||||
parser = argparse.ArgumentParser(description="Duplicate tenant script.")
|
||||
parser.add_argument("initial_tenant", type=str, help="Initial tenant")
|
||||
parser.add_argument("remote_storage_local_fs_root", type=Path, help="Remote storage local fs root")
|
||||
parser.add_argument("--ncopies", type=int, help="Number of copies")
|
||||
parser.add_argument("--numthreads", type=int, default=1, help="Number of threads")
|
||||
parser.add_argument("--port", type=int, default=9898, help="Pageserver management api port")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
initial_tenant = args.initial_tenant
|
||||
remote_storage_local_fs_root: Path = args.remote_storage_local_fs_root
|
||||
ncopies = args.ncopies
|
||||
numthreads = args.numthreads
|
||||
|
||||
new_tenant = TenantId.generate()
|
||||
print(f"New tenant: {new_tenant}")
|
||||
|
||||
client = PageserverHttpClient(args.port, lambda: None)
|
||||
|
||||
src_tenant_gen = int(client.tenant_status(initial_tenant)["generation"])
|
||||
|
||||
assert remote_storage_local_fs_root.is_dir(), f"{remote_storage_local_fs_root} is not a directory"
|
||||
|
||||
src_timelines_dir: Path = remote_storage_local_fs_root / "tenants" / initial_tenant / "timelines"
|
||||
assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
|
||||
|
||||
dst_timelines_dir: Path = remote_storage_local_fs_root / "tenants" / str(new_tenant) / "timelines"
|
||||
dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
|
||||
dst_timelines_dir.mkdir(parents=False, exist_ok=False)
|
||||
|
||||
for tl in src_timelines_dir.iterdir():
|
||||
src_tl_dir = src_timelines_dir / tl.name
|
||||
assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
|
||||
dst_tl_dir = dst_timelines_dir / tl.name
|
||||
dst_tl_dir.mkdir(parents=False, exist_ok=False)
|
||||
for file in tl.iterdir():
|
||||
shutil.copy2(file, dst_tl_dir)
|
||||
if "__" in file.name:
|
||||
cmd = [
|
||||
"./target/debug/pagectl", # TODO: abstract this like the other binaries
|
||||
"layer",
|
||||
"rewrite-summary",
|
||||
str(dst_tl_dir / file.name),
|
||||
"--new-tenant-id",
|
||||
str(new_tenant),
|
||||
]
|
||||
subprocess.run(cmd, check=True)
|
||||
|
||||
client.tenant_attach(new_tenant, generation=src_tenant_gen)
|
||||
|
||||
while True:
|
||||
status = client.tenant_status(new_tenant)
|
||||
if status["state"]["slug"] == "Active":
|
||||
break
|
||||
print("Waiting for tenant to be active..., is: " + status["state"]["slug"])
|
||||
time.sleep(1)
|
||||
|
||||
print("Tenant is active: " + str(new_tenant))
|
||||
@@ -724,10 +724,13 @@ class NeonEnv:
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
self.control_plane_api: Optional[str] = None
|
||||
self.attachment_service: Optional[NeonAttachmentService] = None
|
||||
if config.enable_generations:
|
||||
self.enable_generations()
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
self.control_plane_api: Optional[str] = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service: Optional[NeonAttachmentService] = NeonAttachmentService(self)
|
||||
else:
|
||||
self.control_plane_api = None
|
||||
self.attachment_service = None
|
||||
|
||||
# Create a config file corresponding to the options
|
||||
toml = textwrap.dedent(
|
||||
@@ -816,18 +819,6 @@ class NeonEnv:
|
||||
log.info(f"Config: {toml}")
|
||||
self.neon_cli.init(toml)
|
||||
|
||||
def enable_generations(self, start=False):
|
||||
if not start:
|
||||
# TODO: assert that we haven't `self.start()`ed yet
|
||||
pass
|
||||
assert self.control_plane_api is None
|
||||
assert self.attachment_service is None
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
self.control_plane_api = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service = NeonAttachmentService(self)
|
||||
if start:
|
||||
self.attachment_service.start()
|
||||
|
||||
def start(self):
|
||||
# Start up broker, pageserver and all safekeepers
|
||||
self.broker.try_start()
|
||||
@@ -1567,17 +1558,6 @@ class ComputeCtl(AbstractNeonCli):
|
||||
COMMAND = "compute_ctl"
|
||||
|
||||
|
||||
# class GetpageBenchLibpq(AbstractNeonCli):
|
||||
# """
|
||||
# A typed wrapper around the `getpage_bench_libpq` CLI.
|
||||
# """
|
||||
#
|
||||
# COMMAND = "getpage_bench_libpq"
|
||||
#
|
||||
# def run(self):
|
||||
# pass
|
||||
|
||||
|
||||
class NeonAttachmentService:
|
||||
def __init__(self, env: NeonEnv):
|
||||
self.env = env
|
||||
@@ -1683,7 +1663,7 @@ class NeonPageserver(PgProtocol):
|
||||
# these can happen anytime we do compactions from background task and shutdown pageserver
|
||||
r".*ERROR.*ancestor timeline \S+ is being stopped",
|
||||
# this is expected given our collaborative shutdown approach for the UploadQueue
|
||||
".*Compaction failed.*, retrying in .*: Other\\(queue is in state Stopped.*",
|
||||
".*Compaction failed.*, retrying in .*: queue is in state Stopped.*",
|
||||
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
|
||||
".*Error processing HTTP request: NotFound: Timeline .* was not found",
|
||||
".*took more than expected to complete.*",
|
||||
|
||||
@@ -58,7 +58,6 @@ class HistoricLayerInfo:
|
||||
lsn_start: str
|
||||
lsn_end: Optional[str]
|
||||
remote: bool
|
||||
remote_path: Optional[str] = None
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo:
|
||||
@@ -69,7 +68,6 @@ class HistoricLayerInfo:
|
||||
lsn_start=d["lsn_start"],
|
||||
lsn_end=d.get("lsn_end"),
|
||||
remote=d["remote"],
|
||||
remote_path=d.get("remote_path"),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -1,122 +0,0 @@
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from fixtures.benchmark_fixture import NeonBenchmarker
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, last_flush_lsn_upload
|
||||
from fixtures.pageserver.utils import wait_until_tenant_active
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||
from fixtures.types import TenantId
|
||||
|
||||
|
||||
def test_getpage_throughput(
|
||||
neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin
|
||||
):
|
||||
neon_env_builder.enable_generations = True
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
remote_storage = env.pageserver_remote_storage
|
||||
assert isinstance(remote_storage, LocalFsStorage)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# clean up the useless default tenant
|
||||
ps_http.tenant_delete(env.initial_tenant)
|
||||
|
||||
# create our template tenant
|
||||
tenant_config_mgmt_api = {
|
||||
"gc_period": "0s",
|
||||
"checkpoint_timeout": "3650 day",
|
||||
"compaction_period": "20 s",
|
||||
"compaction_threshold": 10,
|
||||
"compaction_target_size": 134217728,
|
||||
"checkpoint_distance": 268435456,
|
||||
"image_creation_threshold": 3,
|
||||
}
|
||||
tenant_config_cli = {k: str(v) for k, v in tenant_config_mgmt_api.items()}
|
||||
|
||||
template_tenant, template_timeline = env.neon_cli.create_tenant(conf=tenant_config_cli)
|
||||
template_tenant_gen = int(ps_http.tenant_status(template_tenant)["generation"])
|
||||
with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s50", ep.connstr()])
|
||||
last_flush_lsn_upload(env, ep, template_tenant, template_timeline)
|
||||
ps_http.tenant_detach(template_tenant)
|
||||
|
||||
# stop PS just for good measure
|
||||
env.pageserver.stop()
|
||||
|
||||
# duplicate the tenant in remote storage
|
||||
src_timelines_dir: Path = remote_storage.tenant_path(template_tenant) / "timelines"
|
||||
assert src_timelines_dir.is_dir(), f"{src_timelines_dir} is not a directory"
|
||||
tenants = [template_tenant]
|
||||
for i in range(0, 200):
|
||||
new_tenant = TenantId.generate()
|
||||
tenants.append(new_tenant)
|
||||
log.info("Duplicating tenant #%s: %s", i, new_tenant)
|
||||
|
||||
dst_timelines_dir: Path = remote_storage.tenant_path(new_tenant) / "timelines"
|
||||
dst_timelines_dir.parent.mkdir(parents=False, exist_ok=False)
|
||||
dst_timelines_dir.mkdir(parents=False, exist_ok=False)
|
||||
|
||||
for tl in src_timelines_dir.iterdir():
|
||||
src_tl_dir = src_timelines_dir / tl.name
|
||||
assert src_tl_dir.is_dir(), f"{src_tl_dir} is not a directory"
|
||||
dst_tl_dir = dst_timelines_dir / tl.name
|
||||
dst_tl_dir.mkdir(parents=False, exist_ok=False)
|
||||
for file in tl.iterdir():
|
||||
shutil.copy2(file, dst_tl_dir)
|
||||
if "__" in file.name:
|
||||
cmd: List[str] = [
|
||||
str(
|
||||
env.neon_binpath / "pagectl"
|
||||
), # TODO: abstract this like the other binaries
|
||||
"layer",
|
||||
"rewrite-summary",
|
||||
str(dst_tl_dir / file.name),
|
||||
"--new-tenant-id",
|
||||
str(new_tenant),
|
||||
]
|
||||
subprocess.run(cmd, check=True)
|
||||
else:
|
||||
# index_part etc need no patching
|
||||
pass
|
||||
|
||||
env.pageserver.start()
|
||||
assert ps_http.tenant_list() == []
|
||||
for tenant in tenants:
|
||||
ps_http.tenant_attach(
|
||||
tenant, config=tenant_config_mgmt_api, generation=template_tenant_gen + 1
|
||||
)
|
||||
for tenant in tenants:
|
||||
wait_until_tenant_active(ps_http, tenant)
|
||||
|
||||
# ensure all layers are resident for predictiable performance
|
||||
# TODO: ensure all kinds of eviction are disabled (per-tenant, disk-usage-based)
|
||||
for tenant in tenants:
|
||||
ps_http.download_all_layers(tenant, template_timeline)
|
||||
|
||||
# run the benchmark with one client per timeline, each doing 10k requests to random keys.
|
||||
cmd = [
|
||||
str(env.neon_binpath / "pagebench"),
|
||||
"get-page-latest-lsn",
|
||||
"--mgmt-api-endpoint",
|
||||
ps_http.base_url,
|
||||
"--page-service-connstring",
|
||||
env.pageserver.connstr(password=None),
|
||||
"--runtime",
|
||||
"10s",
|
||||
*[f"{tenant}/{template_timeline}" for tenant in tenants],
|
||||
]
|
||||
log.info(f"command: {' '.join(cmd)}")
|
||||
basepath = pg_bin.run_capture(cmd)
|
||||
results_path = Path(basepath + ".stdout")
|
||||
log.info(f"Benchmark results at: {results_path}")
|
||||
|
||||
with open(results_path, "r") as f:
|
||||
results = json.load(f)
|
||||
|
||||
log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")
|
||||
@@ -46,10 +46,7 @@ from fixtures.utils import query_scalar
|
||||
# Because the delta layer D covering lsn1 is corrupted, creating a branch
|
||||
# starting from lsn1 should return an error as follows:
|
||||
# could not find data for key ... at LSN ..., for request at LSN ...
|
||||
def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
|
||||
if build_type == "debug":
|
||||
pytest.skip("times out in debug builds")
|
||||
|
||||
def test_branch_and_gc(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
pageserver_http_client = env.pageserver.http_client()
|
||||
|
||||
|
||||
@@ -245,19 +245,6 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
|
||||
raise AssertionError("Could not count databases")
|
||||
assert result[0] == 0, "Database 'failure' still exists after drop"
|
||||
|
||||
# We don't have compute_ctl, so here, so create neon_superuser here manually
|
||||
cur.execute("CREATE ROLE neon_superuser NOLOGIN CREATEDB CREATEROLE")
|
||||
|
||||
with pytest.raises(psycopg2.InternalError):
|
||||
cur.execute("ALTER ROLE neon_superuser LOGIN")
|
||||
|
||||
with pytest.raises(psycopg2.InternalError):
|
||||
cur.execute("CREATE DATABASE trololobus WITH OWNER neon_superuser")
|
||||
|
||||
cur.execute("CREATE DATABASE trololobus")
|
||||
with pytest.raises(psycopg2.InternalError):
|
||||
cur.execute("ALTER DATABASE trololobus OWNER TO neon_superuser")
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
@@ -16,11 +15,7 @@ from fixtures.utils import query_scalar
|
||||
# and then download them back.
|
||||
def test_basic_eviction(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
build_type: str,
|
||||
):
|
||||
if build_type == "debug":
|
||||
pytest.skip("times out in debug builds")
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
|
||||
@@ -144,10 +144,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
|
||||
# Test that repeatedly kills and restarts the page server, while the
|
||||
# safekeeper and compute node keep running.
|
||||
@pytest.mark.timeout(540)
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, build_type: str):
|
||||
if build_type == "debug":
|
||||
pytest.skip("times out in debug builds")
|
||||
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
|
||||
|
||||
@@ -307,7 +307,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
|
||||
gc_thread.start()
|
||||
time.sleep(5)
|
||||
time.sleep(1)
|
||||
# By now the gc task is spawned but in sleep for another second due to the failpoint.
|
||||
|
||||
log.info("detaching tenant")
|
||||
|
||||
@@ -602,10 +602,7 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
|
||||
# The test takes more than default 5 minutes on Postgres 16,
|
||||
# see https://github.com/neondatabase/neon/issues/5305
|
||||
@pytest.mark.timeout(600)
|
||||
def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str):
|
||||
if build_type == "debug":
|
||||
pytest.skip("times out in debug builds")
|
||||
|
||||
def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path):
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
|
||||
---
|
||||
commands:
|
||||
- name: cgconfigparser
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
|
||||
- name: pgbouncer
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: pgbouncer.ini
|
||||
content: |
|
||||
[databases]
|
||||
*=host=localhost port=5432 auth_user=cloud_admin
|
||||
[pgbouncer]
|
||||
listen_port=6432
|
||||
listen_addr=0.0.0.0
|
||||
auth_type=scram-sha-256
|
||||
auth_user=cloud_admin
|
||||
auth_dbname=postgres
|
||||
client_tls_sslmode=disable
|
||||
server_tls_sslmode=disable
|
||||
pool_mode=transaction
|
||||
max_client_conn=10000
|
||||
default_pool_size=16
|
||||
max_prepared_statements=0
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
group neon-postgres {
|
||||
perm {
|
||||
admin {
|
||||
uid = postgres;
|
||||
}
|
||||
task {
|
||||
gid = users;
|
||||
}
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
|
||||
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
|
||||
# requires cgroup v2, so we'll build cgroup-tools ourselves.
|
||||
FROM debian:bullseye-slim as libcgroup-builder
|
||||
ENV LIBCGROUP_VERSION v2.0.3
|
||||
|
||||
RUN set -exu \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
git \
|
||||
ca-certificates \
|
||||
automake \
|
||||
cmake \
|
||||
make \
|
||||
gcc \
|
||||
byacc \
|
||||
flex \
|
||||
libtool \
|
||||
libpam0g-dev \
|
||||
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
|
||||
&& INSTALL_DIR="/libcgroup-install" \
|
||||
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
|
||||
&& cd libcgroup \
|
||||
# extracted from bootstrap.sh, with modified flags:
|
||||
&& (test -d m4 || mkdir m4) \
|
||||
&& autoreconf -fi \
|
||||
&& rm -rf autom4te.cache \
|
||||
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
|
||||
# actually build the thing...
|
||||
&& make install
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter
|
||||
|
||||
# Build pgbouncer
|
||||
#
|
||||
FROM debian:bullseye-slim AS pgbouncer
|
||||
RUN set -e \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y \
|
||||
curl \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
libevent-dev \
|
||||
libssl-dev
|
||||
|
||||
ENV PGBOUNCER_VERSION 1.21.0
|
||||
ENV PGBOUNCER_GITPATH 1_21_0
|
||||
RUN set -e \
|
||||
&& curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
|
||||
&& tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \
|
||||
&& cd pgbouncer-${PGBOUNCER_VERSION} \
|
||||
&& LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \
|
||||
&& make -j $(nproc) \
|
||||
&& make install
|
||||
merge: |
|
||||
# tweak nofile limits
|
||||
RUN set -e \
|
||||
&& echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
|
||||
&& test ! -e /etc/security || ( \
|
||||
echo '* - nofile 1048576' >>/etc/security/limits.conf \
|
||||
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
|
||||
)
|
||||
|
||||
COPY cgconfig.conf /etc/cgconfig.conf
|
||||
COPY pgbouncer.ini /etc/pgbouncer.ini
|
||||
RUN set -e \
|
||||
&& chown postgres:postgres /etc/pgbouncer.ini \
|
||||
&& chmod 0644 /etc/pgbouncer.ini \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
|
||||
Reference in New Issue
Block a user