Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-13 23:50:36 +00:00

Compare commits: RemoteExte... → release-45 (243 commits)
aa72a22661 9f13277729 54aa319805 4a227484bf 2f83f85291 d6cfcb0d93 392843ad2a bd4dae8f4a
b05fe53cfd c13a2f0df1 39be366fc5 6eda0a3158 306c7a1813 80be423a58 5dcfef82f2 e67b8f69c0
e546872ab4 322ea1cf7c 3633742de9 079d3a37ba a46e77b476 a92702b01e 8ff3253f20 04b82c92a7
e5bf423e68 60af392e45 661fc41e71 702c488f32 45c5122754 558394f710 73b0898608 e65be4c2dc
40087b8164 c762b59483 5d71601ca9 a113c3e433 e81fc598f4 48b845fa76 27096858dc 4430d0ae7d
6e183aa0de fd6d0b7635 3710c32aae be83bee49d cf28e5922a 7d384d6953 4b3b37b912 1d8d200f4d
0d80d6ce18 f653ee039f e614a95853 850db4cc13 8a316b1277 4d13bae449 49377abd98 a6b2f4e54e
face60d50b 9768aa27f2 96b2e575e1 7222777784 5469fdede0 72aa6b9fdd ae0634b7be 70711f32fa
52a88af0aa b7a43bf817 dce91b33a4 23ee4f3050 46857e8282 368ab0ce54 a5987eebfd 6686ede30f
373c7057cc 7d6ec16166 0e6fdc8a58 521438a5c6 07d7874bc8 1804111a02 cd0178efed 333574be57
79a799a143 9da06af6c9 ce1753d036 67db8432b4 4e2e44e524 ed786104f3 84b74f2bd1 fec2ad6283
98eebd4682 2f74287c9b aee1bf95e3 b9de9d75ff 7943b709e6 d7d066d493 e78ac22107 76a8f2bb44
8d59a8581f b1ddd01289 6eae4fc9aa 765455bca2 4204960942 67345d66ea 2266ee5971 b58445d855
36050e7f3d 33360ed96d 39a28d1108 efa6aa134f 2c724e56e2 feff887c6f 353d915fcf 2e38098cbc
a6fe5ea1ac 05b0aed0c1 cd1705357d 6bc7561290 fbd3ac14b5 e437787c8f 3460dbf90b 6b89d99677
6cc8ea86e4 e62a492d6f a475cdf642 7002c79a47 ee6cf357b4 e5c2086b5f 5f1208296a 88e8e473cd
b0a77844f6 1baf464307 e9b8e81cea 85d6194aa4 333a7a68ef 6aa4e41bee 840183e51f cbccc94b03
fce227df22 bd787e800f 4a7704b4a3 ff1119da66 4c3ba1627b 1407174fb2 ec9dcb1889 d11d781afc
4e44565b71 4ed51ad33b 1c1ebe5537 c19cb7f386 4b97d31b16 923ade3dd7 b04e711975 afd0a6b39a
99752286d8 15df93363c bc0ab741af 51d9dfeaa3 f63cb18155 0de603d88e 240913912a 91a4ea0de2
8608704f49 efef68ce99 8daefd24da 46cc8b7982 38cd90dd0c a51b269f15 43bf6d0a0f 15273a9b66
78aca668d0 acbf4148ea 6508540561 a41b5244a8 2b3189be95 248563c595 14cd6ca933 eb36403e71
3c6f779698 f67f0c1c11 edb02d3299 664a69e65b 478322ebf9 802f174072 47f9890bae 262265daad
300da5b872 7b22b5c433 ffca97bc1e cb356f3259 c85374295f 4992160677 bd535b3371 d90c5a03af
2d02cc9079 49ad94b99f 948a217398 125381eae7 cd01bbc715 d8b5e3b88d 06d25f2186 f759b561f3
ece0555600 73ea0a0b01 d8f6d6fd6f d24de169a7 0816168296 277b44d57a 68c2c3880e 49da498f65
2c76ba3dd7 dbe3dc69ad 8e5bb3ed49 ab0be7b8da b4c55f5d24 ede70d833c 70c3d18bb0 7a491f52c4
323c4ecb4f 3d2466607e ed478b39f4 91585a558d 93467eae1f f3aac81d19 979ad60c19 9316cb1b1f
e7939a527a 36d26665e1 873347f977 e814ac16f9 ad3055d386 94e03eb452 380f26ef79 3c5b7f59d7
fee89f80b5 41cce8eaf1 f88fe0218d cc856eca85 cf350c6002 0ce6b6a0a3 73f247d537 960be82183
806e5a6c19 8d5df07cce df7a9d1407
.github/workflows/build_and_test.yml (vendored, 8 lines changed)
@@ -105,11 +105,11 @@ jobs:
       - name: Install Python deps
         run: ./scripts/pysync
 
-      - name: Run ruff to ensure code format
-        run: poetry run ruff .
+      - name: Run `ruff check` to ensure code format
+        run: poetry run ruff check .
 
-      - name: Run black to ensure code format
-        run: poetry run black --diff --check .
+      - name: Run `ruff format` to ensure code format
+        run: poetry run ruff format --check .
 
       - name: Run mypy to check types
         run: poetry run mypy .
Cargo.lock (generated, 7 lines changed)
@@ -1161,6 +1161,7 @@ dependencies = [
  "flate2",
  "futures",
  "hyper",
+ "nix 0.26.2",
  "notify",
  "num_cpus",
  "opentelemetry",
@@ -1171,6 +1172,7 @@ dependencies = [
  "rust-ini",
  "serde",
  "serde_json",
+ "signal-hook",
  "tar",
  "tokio",
  "tokio-postgres",
@@ -4403,12 +4405,14 @@ dependencies = [
  "async-stream",
  "aws-config",
  "aws-sdk-s3",
+ "aws-smithy-async",
  "bincode",
  "bytes",
  "chrono",
  "clap",
  "crc32c",
  "either",
+ "futures",
  "futures-util",
  "hex",
  "histogram",
@@ -4447,6 +4451,7 @@ dependencies = [
  "clap",
  "const_format",
  "crc32c",
+ "fail",
  "fs2",
  "futures",
  "git-version",
@@ -4470,6 +4475,7 @@ dependencies = [
  "serde",
  "serde_json",
  "serde_with",
+ "sha2",
  "signal-hook",
  "storage_broker",
  "thiserror",
@@ -5878,6 +5884,7 @@ dependencies = [
  "chrono",
  "const_format",
  "criterion",
+ "fail",
  "futures",
  "heapless",
  "hex",
@@ -13,6 +13,7 @@ clap.workspace = true
 flate2.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
+nix.workspace = true
 notify.workspace = true
 num_cpus.workspace = true
 opentelemetry.workspace = true
@@ -20,6 +21,7 @@ postgres.workspace = true
 regex.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+signal-hook.workspace = true
 tar.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
@@ -40,18 +40,22 @@ use std::collections::HashMap;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
+use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
 use std::{thread, time::Duration};
 
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
+use nix::sys::signal::{kill, Signal};
+use signal_hook::consts::{SIGQUIT, SIGTERM};
+use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
 use url::Url;
 
 use compute_api::responses::ComputeStatus;
 
-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
+use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -67,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest";
 fn main() -> Result<()> {
     init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
 
+    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
+    thread::spawn(move || {
+        for sig in signals.forever() {
+            handle_exit_signal(sig);
+        }
+    });
+
     let build_tag = option_env!("BUILD_TAG")
         .unwrap_or(BUILD_TAG_DEFAULT)
         .to_string();
@@ -346,6 +357,7 @@ fn main() -> Result<()> {
         let ecode = pg
             .wait()
             .expect("failed to start waiting on Postgres process");
+        PG_PID.store(0, Ordering::SeqCst);
         info!("Postgres exited with code {}, shutting down", ecode);
         exit_code = ecode.code()
     }
@@ -519,6 +531,24 @@ fn cli() -> clap::Command {
         )
 }
 
+/// When compute_ctl is killed, send also termination signal to sync-safekeepers
+/// to prevent leakage. TODO: it is better to convert compute_ctl to async and
+/// wait for termination which would be easy then.
+fn handle_exit_signal(sig: i32) {
+    info!("received {sig} termination signal");
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 {
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok();
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 {
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        kill(pg_pid, Signal::SIGTERM).ok();
+    }
+    exit(1);
+}
+
 #[test]
 fn verify_cli() {
     cli().debug_assert()
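For reference, the signal-forwarding pattern this change introduces reduces to a few lines. The sketch below uses the same `signal_hook` and `nix` APIs as the diff; the single `CHILD_PID` slot is a hypothetical stand-in for the real `PG_PID`/`SYNC_SAFEKEEPERS_PID` pair.

```rust
use std::sync::atomic::{AtomicU32, Ordering};

use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;

// Hypothetical child-PID slot; the real code keeps one per child process kind.
static CHILD_PID: AtomicU32 = AtomicU32::new(0);

fn main() -> std::io::Result<()> {
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    std::thread::spawn(move || {
        // forever() blocks on the signal fd and yields each delivered signal.
        for _sig in signals.forever() {
            let pid = CHILD_PID.load(Ordering::SeqCst);
            if pid != 0 {
                // Forward the termination request; the child may already be
                // gone, so ignore errors.
                kill(nix::unistd::Pid::from_raw(pid as i32), Signal::SIGTERM).ok();
            }
            std::process::exit(1);
        }
    });
    // ... spawn the child, store its id() into CHILD_PID, do the real work ...
    Ok(())
}
```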
@@ -6,6 +6,8 @@ use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
+use std::sync::atomic::AtomicU32;
+use std::sync::atomic::Ordering;
 use std::sync::{Condvar, Mutex, RwLock};
 use std::thread;
 use std::time::Instant;
@@ -34,6 +36,9 @@ use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server};
 
+pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
+pub static PG_PID: AtomicU32 = AtomicU32::new(0);
+
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
     // Url type maintains proper escaping
@@ -501,6 +506,7 @@ impl ComputeNode {
             .stdout(Stdio::piped())
             .spawn()
             .expect("postgres --sync-safekeepers failed to start");
+        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);
 
         // `postgres --sync-safekeepers` will print all log output to stderr and
         // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
@@ -508,6 +514,7 @@ impl ComputeNode {
         let sync_output = sync_handle
             .wait_with_output()
             .expect("postgres --sync-safekeepers failed");
+        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);
 
         if !sync_output.status.success() {
             anyhow::bail!(
@@ -662,6 +669,7 @@ impl ComputeNode {
             })
             .spawn()
             .expect("cannot start postgres process");
+        PG_PID.store(pg.id(), Ordering::SeqCst);
 
         wait_for_postgres(&mut pg, pgdata_path)?;
 
@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
 
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 
 use crate::compute::ComputeNode;
 
@@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) {
             }
         }
 
+        // If there are existing (logical) walsenders, do not suspend.
+        //
+        // walproposer doesn't currently show up in pg_stat_replication,
+        // but protect if it will be
+        let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
+        match cli.query_one(ws_count_query, &[]) {
+            Ok(r) => match r.try_get::<&str, i64>("count") {
+                Ok(num_ws) => {
+                    if num_ws > 0 {
+                        last_active = Some(Utc::now());
+                    }
+                }
+                Err(e) => {
+                    warn!("failed to parse ws count: {:?}", e);
+                    continue;
+                }
+            },
+            Err(e) => {
+                warn!("failed to get list of walsenders: {:?}", e);
+                continue;
+            }
+        }
+
         // Update the last activity in the shared state if we got a more recent one.
         let mut state = compute.state.lock().unwrap();
         // NB: `Some(<DateTime>)` is always greater than `None`.
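As a standalone illustration of the walsender check above, a minimal program with the `postgres` crate might look like this. The connection string is a hypothetical placeholder; the real monitor reuses its existing client.

```rust
use postgres::{Client, NoTls};

/// Count walsenders other than walproposer, mirroring the query added above.
fn active_walsenders(cli: &mut Client) -> Result<i64, postgres::Error> {
    // walproposer is excluded: it is an internal replication connection, not a
    // consumer that should keep the compute from suspending.
    let row = cli.query_one(
        "select count(*) from pg_stat_replication where application_name != 'walproposer';",
        &[],
    )?;
    row.try_get("count")
}

fn main() -> Result<(), postgres::Error> {
    // Hypothetical DSN for the sketch.
    let mut cli = Client::connect("host=localhost user=postgres", NoTls)?;
    if active_walsenders(&mut cli)? > 0 {
        println!("logical walsenders present; keeping compute active");
    }
    Ok(())
}
```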
@@ -46,6 +46,8 @@ use std::time::Duration;
 
 use anyhow::{anyhow, bail, Context, Result};
 use compute_api::spec::RemoteExtSpec;
+use nix::sys::signal::kill;
+use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -439,11 +441,14 @@ impl Endpoint {
         Ok(())
     }
 
-    fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
+    fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> {
         // TODO use background_process::stop_process instead
         let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
         let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
         let pid = nix::unistd::Pid::from_raw(pid as i32);
+        if send_sigterm {
+            kill(pid, Signal::SIGTERM).ok();
+        }
         crate::background_process::wait_until_stopped("compute_ctl", pid)?;
         Ok(())
     }
@@ -733,10 +738,15 @@ impl Endpoint {
             &None,
         )?;
 
-        // Also wait for the compute_ctl process to die. It might have some cleanup
-        // work to do after postgres stops, like syncing safekeepers, etc.
+        // Also wait for the compute_ctl process to die. It might have some
+        // cleanup work to do after postgres stops, like syncing safekeepers,
+        // etc.
         //
-        self.wait_for_compute_ctl_to_exit()?;
+        // If destroying, send it SIGTERM before waiting. Sometimes we do *not*
+        // want this cleanup: tests intentionally do stop when majority of
+        // safekeepers is down, so sync-safekeepers would hang otherwise. This
+        // could be a separate flag though.
+        self.wait_for_compute_ctl_to_exit(destroy)?;
         if destroy {
             println!(
                 "Destroying postgres data directory '{}'",
@@ -485,6 +485,13 @@ impl PageServerNode {
         Ok(self.http_client.list_timelines(*tenant_id).await?)
     }
 
+    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
+        Ok(self
+            .http_client
+            .tenant_secondary_download(*tenant_id)
+            .await?)
+    }
+
     pub async fn timeline_create(
         &self,
         tenant_id: TenantId,
@@ -11,6 +11,7 @@ use crate::{
 use pageserver_api::models::{
     LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -40,9 +41,9 @@ async fn await_lsn(
     loop {
         let latest = match get_lsns(tenant_id, pageserver).await {
             Ok(l) => l,
-            Err(e) => {
+            Err(_e) => {
                 println!(
-                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                    "🕑 Waiting for pageserver {} to activate...",
                     pageserver.conf.id
                 );
                 std::thread::sleep(Duration::from_millis(500));
@@ -89,7 +90,7 @@ pub async fn migrate_tenant(
     tenant_id: TenantId,
     dest_ps: PageServerNode,
 ) -> anyhow::Result<()> {
-    // Get a new generation
+    println!("🤔 Checking existing status...");
     let attachment_service = AttachmentService::from_env(env);
 
     fn build_location_config(
@@ -135,6 +136,20 @@ pub async fn migrate_tenant(
         baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
     }
 
+    println!(
+        "🔁 Downloading latest layers to destination pageserver {}",
+        dest_ps.conf.id
+    );
+    match dest_ps
+        .tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
+        .await
+    {
+        Ok(()) => {}
+        Err(_) => {
+            println!(" (skipping, destination wasn't in secondary mode)")
+        }
+    }
+
     let gen = attachment_service
         .attach_hook(tenant_id, dest_ps.conf.id)
         .await?;
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
 
 ### Obligatory checks
-We force code formatting via `black`, `ruff`, and type hints via `mypy`.
+We force code formatting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):
 
 ```bash
-poetry run black . # All code is reformatted
-poetry run ruff . # Python linter
+poetry run ruff format . # All code is reformatted
+poetry run ruff check . # Python linter
 poetry run mypy . # Ensure there are no typing errors
 ```
 
 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
@@ -142,7 +142,7 @@ impl Key {
 }
 
 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
+    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
 }
 
 impl std::str::FromStr for Key {
@@ -124,6 +124,9 @@ impl KeySpaceAccum {
         if range.start == accum.end {
             accum.end = range.end;
         } else {
+            // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
+            // a new range here if the skipped region was all keys that don't belong on this shard.
+            // (https://github.com/neondatabase/neon/issues/6247)
             assert!(range.start > accum.end);
             self.ranges.push(accum.clone());
             *accum = range;
@@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState {
     ShutDown,
 }
 
-pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
-
-/// Information for configuring a single fail point
-#[derive(Debug, Serialize, Deserialize)]
-pub struct FailpointConfig {
-    /// Name of the fail point
-    pub name: String,
-    /// List of actions to take, using the format described in `fail::cfg`
-    ///
-    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
-    pub actions: String,
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 pub struct TimelineGcRequest {
     pub gc_horizon: Option<u64>,
@@ -422,6 +422,21 @@ impl ShardIdentity {
         }
     }
 
+    /// Return true if the key should be discarded if found in this shard's
+    /// data store, e.g. during compaction after a split
+    pub fn is_key_disposable(&self, key: &Key) -> bool {
+        if key_is_shard0(key) {
+            // Q: Why can't we dispose of shard0 content if we're not shard 0?
+            // A: because the WAL ingestion logic currently ingests some shard 0
+            // content on all shards, even though it's only read on shard 0. If we
+            // dropped it, then subsequent WAL ingest to these keys would encounter
+            // an error.
+            false
+        } else {
+            !self.is_key_local(key)
+        }
+    }
+
     pub fn shard_slug(&self) -> String {
         if self.count > ShardCount(0) {
             format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -515,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool {
     // relation pages are distributed to shards other than shard zero. Everything else gets
     // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
     // requests, and any request other than those for particular blocks in relations.
-    //
-    // In this condition:
-    //  - is_rel_block_key includes only relations, i.e. excludes SLRU data and
-    //    all metadata.
-    //  - field6 is set to -1 for relation size pages.
-    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+    !is_rel_block_key(key)
 }
 
 /// Provide the same result as the function in postgres `hashfn.h` with the same name
@@ -35,6 +35,12 @@ pub enum QueryError {
     /// We were instructed to shutdown while processing the query
     #[error("Shutting down")]
     Shutdown,
+    /// Query handler indicated that client should reconnect
+    #[error("Server requested reconnect")]
+    Reconnect,
+    /// Query named an entity that was not found
+    #[error("Not found: {0}")]
+    NotFound(std::borrow::Cow<'static, str>),
     /// Authentication failure
     #[error("Unauthorized: {0}")]
     Unauthorized(std::borrow::Cow<'static, str>),
@@ -54,9 +60,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
     pub fn pg_error_code(&self) -> &'static [u8; 5] {
         match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
             Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
             Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
         }
     }
@@ -425,6 +431,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                 info!("Stopped due to shutdown");
                 Ok(())
             }
+            Err(QueryError::Reconnect) => {
+                // Dropping out of this loop implicitly disconnects
+                info!("Stopped due to handler reconnect request");
+                Ok(())
+            }
             Err(QueryError::Disconnected(e)) => {
                 info!("Disconnected ({e:#})");
                 // Disconnection is not an error: we just use it that way internally to drop
@@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO>
 pub fn short_error(e: &QueryError) -> String {
     match e {
         QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Reconnect => "reconnect".to_string(),
         QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::NotFound(_) => "not found".to_string(),
         QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
         QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
         QueryError::Other(e) => format!("{e:#}"),
@@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) {
         QueryError::SimulatedConnectionError => {
             error!("query handler for query '{query}' failed due to a simulated connection error")
         }
+        QueryError::Reconnect => {
+            info!("query handler for '{query}' requested client to reconnect")
+        }
         QueryError::Shutdown => {
             info!("query handler for '{query}' cancelled during tenant shutdown")
         }
+        QueryError::NotFound(reason) => {
+            info!("query handler for '{query}' entity not found: {reason}")
+        }
         QueryError::Unauthorized(e) => {
             warn!("query handler for '{query}' failed with authentication error: {e}");
         }
@@ -322,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage {
         }
         Ok(())
     }
+
+    async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
+        Err(anyhow::anyhow!(
+            "copy for azure blob storage is not implemented"
+        ))
+    }
 }
 
 pin_project_lite::pin_project! {
@@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
 
     async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+
+    /// Copy a remote object inside a bucket from one path to another.
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
 }
 
 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -374,6 +377,15 @@ impl GenericRemoteStorage {
             Self::Unreliable(s) => s.delete_objects(paths).await,
         }
     }
+
+    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.copy(from, to).await,
+            Self::AwsS3(s) => s.copy(from, to).await,
+            Self::AzureBlob(s) => s.copy(from, to).await,
+            Self::Unreliable(s) => s.copy(from, to).await,
+        }
+    }
 }
 
 impl GenericRemoteStorage {
@@ -660,6 +672,7 @@ impl ConcurrencyLimiter {
             RequestKind::Put => &self.write,
             RequestKind::List => &self.read,
             RequestKind::Delete => &self.write,
+            RequestKind::Copy => &self.write,
         }
     }
 
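A hedged sketch of how a caller might use the new `copy_object` entry point. It assumes a `GenericRemoteStorage` already built from configuration, and that `RemotePath::new` takes a relative `Utf8Path` (as elsewhere in this crate); the object names are placeholders.

```rust
use anyhow::Result;
use camino::Utf8Path;
use remote_storage::{GenericRemoteStorage, RemotePath};

// Duplicate one object under a new key, whatever backend is configured.
async fn duplicate_object(storage: &GenericRemoteStorage) -> Result<()> {
    let from = RemotePath::new(Utf8Path::new("some/remote/object"))?;
    let to = RemotePath::new(Utf8Path::new("some/remote/object.copy"))?;
    // Dispatches to the backend-specific copy; per the diff above, Azure
    // currently returns a "not implemented" error.
    storage.copy_object(&from, &to).await
}
```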
@@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs {
         }
         Ok(())
     }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        let from_path = from.with_base(&self.storage_root);
+        let to_path = to.with_base(&self.storage_root);
+        create_target_directory(&to_path).await?;
+        fs::copy(&from_path, &to_path).await.with_context(|| {
+            format!(
+                "Failed to copy file from '{from_path}' to '{to_path}'",
+                from_path = from_path,
+                to_path = to_path
+            )
+        })?;
+        Ok(())
+    }
 }
 
 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
@@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket {
         Ok(())
     }
 
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        let kind = RequestKind::Copy;
+        let _guard = self.permit(kind).await;
+
+        let started_at = start_measuring_requests(kind);
+
+        // we need to specify bucket_name as a prefix
+        let copy_source = format!(
+            "{}/{}",
+            self.bucket_name,
+            self.relative_path_to_s3_object(from)
+        );
+
+        let res = self
+            .client
+            .copy_object()
+            .bucket(self.bucket_name.clone())
+            .key(self.relative_path_to_s3_object(to))
+            .copy_source(copy_source)
+            .send()
+            .await;
+
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
+        res?;
+
+        Ok(())
+    }
+
     async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
         // if prefix is not none then download file `prefix/from`
         // if prefix is none then download file `from`
@@ -11,6 +11,7 @@ pub(crate) enum RequestKind {
     Put = 1,
     Delete = 2,
     List = 3,
+    Copy = 4,
 }
 
 use RequestKind::*;
@@ -22,6 +23,7 @@ impl RequestKind {
             Put => "put_object",
             Delete => "delete_object",
             List => "list_objects",
+            Copy => "copy_object",
         }
     }
     const fn as_index(&self) -> usize {
@@ -29,7 +31,7 @@ impl RequestKind {
     }
 }
 
-pub(super) struct RequestTyped<C>([C; 4]);
+pub(super) struct RequestTyped<C>([C; 5]);
 
 impl<C> RequestTyped<C> {
     pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -38,8 +40,8 @@ impl<C> RequestTyped<C> {
 
     fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
         use RequestKind::*;
-        let mut it = [Get, Put, Delete, List].into_iter();
-        let arr = std::array::from_fn::<C, 4, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy].into_iter();
+        let arr = std::array::from_fn::<C, 5, _>(|index| {
             let next = it.next().unwrap();
             assert_eq!(index, next.as_index());
             f(next)
@@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper {
         }
         Ok(())
     }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        // copy is equivalent to download + upload
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.attempt(RemoteOp::Upload(to.clone()))?;
+        self.inner.copy_object(from, to).await
+    }
 }
@@ -51,3 +51,9 @@ pub struct SkTimelineInfo {
     #[serde(default)]
     pub http_connstr: Option<String>,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineCopyRequest {
+    pub target_timeline_id: TimelineId,
+    pub until_lsn: Lsn,
+}
@@ -4,6 +4,12 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+default = []
+# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
+# which adds some runtime cost to run tests on outage conditions
+testing = ["fail/failpoints"]
+
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
@@ -16,6 +22,7 @@ chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
+fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
 nix.workspace = true
@@ -1,3 +1,14 @@
+//! Failpoint support code shared between pageserver and safekeepers.
+
+use crate::http::{
+    error::ApiError,
+    json::{json_request, json_response},
+};
+use hyper::{Body, Request, Response, StatusCode};
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use tracing::*;
+
 /// use with fail::cfg("$name", "return(2000)")
 ///
 /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
@@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async;
 // Helper function used by the macro. (A function has nicer scoping so we
 // don't need to decorate everything with "::")
 #[doc(hidden)]
-pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
     let millis = duration_str.parse::<u64>().unwrap();
     let d = std::time::Duration::from_millis(millis);
 
@@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> {
     scenario
 }
 
-pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
+pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> {
     if actions == "exit" {
         fail::cfg_callback(name, exit_failpoint)
     } else {
@@ -84,3 +95,45 @@ fn exit_failpoint() {
     tracing::info!("Exit requested by failpoint");
     std::process::exit(1);
 }
+
+pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
+
+/// Information for configuring a single fail point
+#[derive(Debug, Serialize, Deserialize)]
+pub struct FailpointConfig {
+    /// Name of the fail point
+    pub name: String,
+    /// List of actions to take, using the format described in `fail::cfg`
+    ///
+    /// We also support `actions = "exit"` to cause the fail point to immediately exit.
+    pub actions: String,
+}
+
+/// Configure failpoints through http.
+pub async fn failpoints_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    if !fail::has_failpoints() {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "Cannot manage failpoints because storage was compiled without failpoints support"
+        )));
+    }
+
+    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
+    for fp in failpoints {
+        info!("cfg failpoint: {} {}", fp.name, fp.actions);
+
+        // We recognize one extra "action" that's not natively recognized
+        // by the failpoints crate: exit, to immediately kill the process
+        let cfg_result = apply_failpoint(&fp.name, &fp.actions);
+
+        if let Err(err_msg) = cfg_result {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "Failed to configure failpoints: {err_msg}"
+            )));
+        }
+    }
+
+    json_response(StatusCode::OK, ())
+}
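Outside the HTTP handler, the same machinery can be driven directly through the `fail` crate. A minimal sketch follows; the failpoint name `flaky-step` is made up for illustration, and the `failpoints` feature (the new `testing` feature here) must be enabled.

```rust
fn flaky_step() -> Result<(), String> {
    // Inert unless failpoints are compiled in and this point is configured.
    fail::fail_point!("flaky-step", |_| Err("injected failure".to_string()));
    Ok(())
}

fn main() {
    let scenario = fail::FailScenario::setup();
    // Same "actions" format that FailpointConfig::actions carries over HTTP.
    fail::cfg("flaky-step", "return").unwrap();
    assert!(flaky_step().is_err());
    scenario.teardown();
}
```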
@@ -31,6 +31,9 @@ pub enum ApiError {
     #[error("Shutting down")]
     ShuttingDown,
 
+    #[error("Timeout")]
+    Timeout(Cow<'static, str>),
+
     #[error(transparent)]
     InternalServerError(anyhow::Error),
 }
@@ -67,6 +70,10 @@ impl ApiError {
                 err.to_string(),
                 StatusCode::SERVICE_UNAVAILABLE,
             ),
+            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::REQUEST_TIMEOUT,
+            ),
             ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                 err.to_string(),
                 StatusCode::INTERNAL_SERVER_ERROR,
@@ -83,6 +83,10 @@ pub mod timeout;
 
 pub mod sync;
 
+pub mod failpoint_support;
+
+pub mod yielding_loop;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
@@ -15,6 +15,12 @@ pub struct Gate {
     name: String,
 }
 
+impl std::fmt::Debug for Gate {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Gate<{}>", self.name)
+    }
+}
+
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
libs/utils/src/yielding_loop.rs (new file, 35 lines)
@@ -0,0 +1,35 @@
+use tokio_util::sync::CancellationToken;
+
+#[derive(thiserror::Error, Debug)]
+pub enum YieldingLoopError {
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
+/// yields to avoid blocking the executor, and after resuming checks the provided
+/// cancellation token to drop out promptly on shutdown.
+#[inline(always)]
+pub async fn yielding_loop<I, T, F>(
+    interval: usize,
+    cancel: &CancellationToken,
+    iter: I,
+    mut visitor: F,
+) -> Result<(), YieldingLoopError>
+where
+    I: Iterator<Item = T>,
+    F: FnMut(T),
+{
+    for (i, item) in iter.enumerate() {
+        visitor(item);
+
+        if (i + 1) % interval == 0 {
+            tokio::task::yield_now().await;
+            if cancel.is_cancelled() {
+                return Err(YieldingLoopError::Cancelled);
+            }
+        }
+    }
+
+    Ok(())
+}
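(Note the parenthesized `(i + 1) % interval`: without the parentheses, operator precedence would make the condition `i + (1 % interval)`, which never triggers a yield.) A small usage sketch matching the signature above; the ID list is a stand-in for any large in-memory iterator:

```rust
use tokio_util::sync::CancellationToken;
use utils::yielding_loop::{yielding_loop, YieldingLoopError};

async fn sum_ids(ids: Vec<u64>, cancel: &CancellationToken) -> Result<u64, YieldingLoopError> {
    let mut sum = 0;
    // Yield to the executor every 1000 items so a long synchronous walk does
    // not starve other tasks, and bail out promptly on shutdown.
    yielding_loop(1000, cancel, ids.into_iter(), |id| sum += id).await?;
    Ok(sum)
}
```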
@@ -425,7 +425,7 @@ mod tests {
     }
 
     fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-        println!("walprop_log[{}] {}", level, msg);
+        println!("wp_log[{}] {}", level, msg);
     }
 
     fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
 use pageserver::{
     config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
+use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};
 
 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {
 
     let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
     let conf = Box::leak(Box::new(conf));
-    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
 
-    let manager = PostgresRedoManager::new(conf, tenant_id);
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
 
     let manager = Arc::new(manager);
 
@@ -1,4 +1,4 @@
-use pageserver_api::models::*;
+use pageserver_api::{models::*, shard::TenantShardId};
 use reqwest::{IntoUrl, Method};
 use utils::{
     http::error::HttpErrorBody,
@@ -164,6 +164,18 @@ impl Client {
         Ok(())
     }
 
+    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/secondary/download",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, ())
+            .await?
+            .error_for_status()
+            .map(|_| ())
+            .map_err(|e| Error::ApiError(format!("{}", e)))
+    }
+
     pub async fn location_config(
         &self,
         tenant_id: TenantId,
@@ -115,15 +115,8 @@ impl PagestreamClient {
 
     pub async fn getpage(
         &mut self,
-        key: RelTagBlockNo,
-        lsn: Lsn,
+        req: PagestreamGetPageRequest,
     ) -> anyhow::Result<PagestreamGetPageResponse> {
-        let req = PagestreamGetPageRequest {
-            latest: false,
-            rel: key.rel_tag,
-            blkno: key.block_no,
-            lsn,
-        };
         let req = PagestreamFeMessage::GetPage(req);
         let req: bytes::Bytes = req.serialize();
         // let mut req = tokio_util::io::ReaderStream::new(&req);
@@ -3,7 +3,7 @@ use futures::future::join_all;
|
|||||||
use pageserver::pgdatadir_mapping::key_to_rel_block;
|
use pageserver::pgdatadir_mapping::key_to_rel_block;
|
||||||
use pageserver::repository;
|
use pageserver::repository;
|
||||||
use pageserver_api::key::is_rel_block_key;
|
use pageserver_api::key::is_rel_block_key;
|
||||||
use pageserver_client::page_service::RelTagBlockNo;
|
use pageserver_api::models::PagestreamGetPageRequest;
|
||||||
|
|
||||||
use utils::id::TenantTimelineId;
|
use utils::id::TenantTimelineId;
|
||||||
use utils::lsn::Lsn;
|
use utils::lsn::Lsn;
|
||||||
@@ -39,6 +39,9 @@ pub(crate) struct Args {
|
|||||||
runtime: Option<humantime::Duration>,
|
runtime: Option<humantime::Duration>,
|
||||||
#[clap(long)]
|
#[clap(long)]
|
||||||
per_target_rate_limit: Option<usize>,
|
     per_target_rate_limit: Option<usize>,
+    /// Probability for sending `latest=true` in the request (uniform distribution).
+    #[clap(long, default_value = "1")]
+    req_latest_probability: f64,
     #[clap(long)]
     limit_to_first_n_targets: Option<usize>,
     targets: Option<Vec<TenantTimelineId>>,
@@ -200,18 +203,26 @@ async fn main_impl(
             start_work_barrier.wait().await;

             loop {
-                let (range, key) = {
+                let (timeline, req) = {
                     let mut rng = rand::thread_rng();
                     let r = &all_ranges[weights.sample(&mut rng)];
                     let key: i128 = rng.gen_range(r.start..r.end);
                     let key = repository::Key::from_i128(key);
                     let (rel_tag, block_no) =
                         key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                    (r, RelTagBlockNo { rel_tag, block_no })
+                    (
+                        r.timeline,
+                        PagestreamGetPageRequest {
+                            latest: rng.gen_bool(args.req_latest_probability),
+                            lsn: r.timeline_lsn,
+                            rel: rel_tag,
+                            blkno: block_no,
+                        },
+                    )
                 };
-                let sender = work_senders.get(&range.timeline).unwrap();
+                let sender = work_senders.get(&timeline).unwrap();
                 // TODO: what if this blocks?
-                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+                sender.send(req).await.ok().unwrap();
             }
         }),
         Some(rps_limit) => Box::pin(async move {
@@ -240,16 +251,21 @@ async fn main_impl(
             );
             loop {
                 ticker.tick().await;
-                let (range, key) = {
+                let req = {
                     let mut rng = rand::thread_rng();
                     let r = &ranges[weights.sample(&mut rng)];
                     let key: i128 = rng.gen_range(r.start..r.end);
                     let key = repository::Key::from_i128(key);
                     let (rel_tag, block_no) = key_to_rel_block(key)
                         .expect("we filter non-rel-block keys out above");
-                    (r, RelTagBlockNo { rel_tag, block_no })
+                    PagestreamGetPageRequest {
+                        latest: rng.gen_bool(args.req_latest_probability),
+                        lsn: r.timeline_lsn,
+                        rel: rel_tag,
+                        blkno: block_no,
+                    }
                 };
-                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+                sender.send(req).await.ok().unwrap();
             }
         })
     };
@@ -303,7 +319,7 @@ async fn client(
     args: &'static Args,
     timeline: TenantTimelineId,
     start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
+    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
     all_work_done_barrier: Arc<Barrier>,
     live_stats: Arc<LiveStats>,
 ) {
@@ -317,10 +333,10 @@ async fn client(
         .await
         .unwrap();

-    while let Some((key, lsn)) = work.recv().await {
+    while let Some(req) = work.recv().await {
         let start = Instant::now();
         client
-            .getpage(key, lsn)
+            .getpage(req)
             .await
             .with_context(|| format!("getpage for {timeline}"))
             .unwrap();
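The hunks above change the benchmark's work channel from `(RelTagBlockNo, Lsn)` tuples to fully-formed `PagestreamGetPageRequest`s, so the producer decides per request whether to set `latest` and the client task just forwards what it receives. A minimal, self-contained sketch of that producer/consumer shape (the field layout mirrors the diff; the stand-in types and channel wiring are assumptions):

use tokio::sync::mpsc;

// Field layout copied from the request construction in the diff above.
#[derive(Debug)]
struct PagestreamGetPageRequest {
    latest: bool,
    lsn: u64, // stand-in for the Lsn type
    rel: u32, // stand-in for RelTag
    blkno: u32,
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<PagestreamGetPageRequest>(100);

    // Producer: builds the whole request, including the `latest` flag.
    tokio::spawn(async move {
        let req = PagestreamGetPageRequest { latest: true, lsn: 0x16B_9188, rel: 1, blkno: 0 };
        tx.send(req).await.unwrap();
    });

    // Consumer (the per-timeline client task): no request assembly left here.
    while let Some(req) = rx.recv().await {
        println!("would send getpage: {req:?}");
    }
}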
@@ -23,6 +23,7 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
+use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

@@ -174,7 +175,7 @@ where
         ] {
             for segno in self
                 .timeline
-                .list_slru_segments(kind, self.lsn, self.ctx)
+                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
                 .await?
             {
                 self.add_slru_segment(kind, segno).await?;
@@ -192,7 +193,7 @@ where
         // Otherwise only include init forks of unlogged relations.
         let rels = self
             .timeline
-            .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+            .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
             .await?;
         for &rel in rels.iter() {
             // Send init fork as main fork to provide well formed empty
@@ -267,7 +268,7 @@ where
     async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_rel_size(src, self.lsn, false, self.ctx)
+            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
             .await?;

         // If the relation is empty, create an empty file
@@ -288,7 +289,7 @@ where
         for blknum in startblk..endblk {
             let img = self
                 .timeline
-                .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
+                .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
                 .await?;
             segment_data.extend_from_slice(&img[..]);
         }
@@ -310,7 +311,7 @@ where
     async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
         let nblocks = self
             .timeline
-            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
+            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
             .await?;

         let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -352,7 +353,7 @@ where
         let relmap_img = if has_relmap_file {
             let img = self
                 .timeline
-                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
+                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                 .await?;

             ensure!(
@@ -399,7 +400,7 @@ where
         if !has_relmap_file
             && self
                 .timeline
-                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                 .await?
                 .is_empty()
         {
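Every read-path call in this file now takes `Version::Lsn(self.lsn)` instead of a bare LSN. A self-contained sketch of the idea (the variant set is an assumption; the real definition lives in `pgdatadir_mapping`): wrapping the LSN in an enum leaves the read APIs room to accept other "points in time" than a committed LSN, such as an in-progress modification.

#[derive(Clone, Copy, Debug)]
struct Lsn(u64);

// Assumed shape, for illustration only: a second variant for reads against
// uncommitted state would sit alongside Lsn in a real implementation.
enum Version {
    Lsn(Lsn),
}

fn get_rel_size(version: Version) -> u32 {
    match version {
        Version::Lsn(Lsn(lsn)) => {
            // look up the relation size as of this LSN
            println!("reading rel size at LSN {lsn:#x}");
            0
        }
    }
}

fn main() {
    let nblocks = get_rel_size(Version::Lsn(Lsn(0x16B_9188)));
    println!("nblocks = {nblocks}");
}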
@@ -31,6 +31,7 @@ use pageserver::{
     virtual_file,
 };
 use postgres_backend::AuthType;
+use utils::failpoint_support;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
@@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> {
     }

     // Initialize up failpoints support
-    let scenario = pageserver::failpoint_support::init();
+    let scenario = failpoint_support::init();

     // Basic initialization of things that don't change after startup
     virtual_file::init(conf.max_file_descriptors);
@@ -37,8 +37,8 @@ use crate::tenant::{
     TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
-    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,6 +75,9 @@ pub mod defaults {
     pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

     pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

     ///
     /// Default built-in configuration file.
@@ -88,6 +91,7 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'

+#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}

 # initial superuser role name to use when creating a new tenant
@@ -108,6 +112,8 @@ pub mod defaults {

 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'

+#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -125,6 +131,7 @@ pub mod defaults {
 #gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
+#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

 [remote_storage]

@@ -233,6 +240,13 @@ pub struct PageServerConf {
     /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
     /// heatmap uploads vs. other remote storage operations.
     pub heatmap_upload_concurrency: usize,

+    /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
+    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
+    pub secondary_download_concurrency: usize,
+
+    /// Maximum number of WAL records to be ingested and committed at the same time
+    pub ingest_batch_size: u64,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -314,6 +328,9 @@ struct PageServerConfigBuilder {
     control_plane_emergency_mode: BuilderValue<bool>,

     heatmap_upload_concurrency: BuilderValue<usize>,
+    secondary_download_concurrency: BuilderValue<usize>,
+
+    ingest_batch_size: BuilderValue<u64>,
 }

 impl Default for PageServerConfigBuilder {
@@ -386,6 +403,9 @@ impl Default for PageServerConfigBuilder {
             control_plane_emergency_mode: Set(false),

             heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
+            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
+
+            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
         }
     }
 }
@@ -534,6 +554,14 @@ impl PageServerConfigBuilder {
         self.heatmap_upload_concurrency = BuilderValue::Set(value)
     }

+    pub fn secondary_download_concurrency(&mut self, value: usize) {
+        self.secondary_download_concurrency = BuilderValue::Set(value)
+    }
+
+    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
+        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_warmup = self
             .concurrent_tenant_warmup
@@ -632,10 +660,15 @@ impl PageServerConfigBuilder {
             control_plane_emergency_mode: self
                 .control_plane_emergency_mode
                 .ok_or(anyhow!("missing control_plane_emergency_mode"))?,

             heatmap_upload_concurrency: self
                 .heatmap_upload_concurrency
                 .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
+            secondary_download_concurrency: self
+                .secondary_download_concurrency
+                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
+            ingest_batch_size: self
+                .ingest_batch_size
+                .ok_or(anyhow!("missing ingest_batch_size"))?,
         })
     }
 }
@@ -693,6 +726,11 @@ impl PageServerConf {
             .join(TENANT_LOCATION_CONFIG_NAME)
     }

+    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(TENANT_HEATMAP_BASENAME)
+    }
+
     pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
         self.tenant_path(tenant_shard_id)
             .join(TIMELINES_SEGMENT_NAME)
@@ -878,6 +916,10 @@ impl PageServerConf {
                 "heatmap_upload_concurrency" => {
                     builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                 },
+                "secondary_download_concurrency" => {
+                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
+                },
+                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -949,6 +991,8 @@ impl PageServerConf {
             control_plane_api_token: None,
             control_plane_emergency_mode: false,
             heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
+            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
         }
     }
 }
@@ -1177,7 +1221,9 @@ background_task_maximum_delay = '334 s'
                 control_plane_api: None,
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
+                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1238,7 +1284,9 @@ background_task_maximum_delay = '334 s'
                 control_plane_api: None,
                 control_plane_api_token: None,
                 control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
+                ingest_batch_size: 100,
             },
             "Should be able to parse all basic config values correctly"
         );
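The config hunks follow one pattern for both new knobs: a default constant, a builder field pre-set to that default, a setter, a TOML key, and a `build()` check. A self-contained analogue of that flow (simplified; plain `Option` stands in for the repo's `BuilderValue`):

#[derive(Debug)]
struct Conf {
    secondary_download_concurrency: usize,
    ingest_batch_size: u64,
}

#[derive(Default)]
struct Builder {
    secondary_download_concurrency: Option<usize>,
    ingest_batch_size: Option<u64>,
}

impl Builder {
    fn new_with_defaults() -> Self {
        Self {
            secondary_download_concurrency: Some(1), // DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY
            ingest_batch_size: Some(100),            // DEFAULT_INGEST_BATCH_SIZE
        }
    }
    // A parsed TOML key would call a setter like this before build().
    fn ingest_batch_size(&mut self, v: u64) {
        self.ingest_batch_size = Some(v);
    }
    fn build(self) -> Result<Conf, &'static str> {
        Ok(Conf {
            secondary_download_concurrency: self
                .secondary_download_concurrency
                .ok_or("missing secondary_download_concurrency")?,
            ingest_batch_size: self.ingest_batch_size.ok_or("missing ingest_batch_size")?,
        })
    }
}

fn main() {
    let mut b = Builder::new_with_defaults();
    b.ingest_batch_size(500); // override from TOML
    println!("{:?}", b.build().unwrap());
}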
@@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::auth::JwtAuth;
+use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -66,9 +67,6 @@ use utils::{
     lsn::Lsn,
 };

-// Imports only used for testing APIs
-use pageserver_api::models::ConfigureFailpointsRequest;
-
 // For APIs that require an Active tenant, how long should we block waiting for that state?
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
@@ -154,6 +152,7 @@ impl From<PageReconstructError> for ApiError {
             PageReconstructError::AncestorStopping(_) => {
                 ApiError::ResourceUnavailable(format!("{pre}").into())
             }
+            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
             PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
         }
     }
@@ -1275,6 +1274,23 @@ async fn put_tenant_location_config_handler(
         // which is not a 400 but a 409.
         .map_err(ApiError::BadRequest)?;

+    if let Some(_flush_ms) = flush {
+        match state
+            .secondary_controller
+            .upload_tenant(tenant_shard_id)
+            .await
+        {
+            Ok(()) => {
+                tracing::info!("Uploaded heatmap during flush");
+            }
+            Err(e) => {
+                tracing::warn!("Failed to flush heatmap: {e}");
+            }
+        }
+    } else {
+        tracing::info!("No flush requested when configuring");
+    }
+
     json_response(StatusCode::OK, ())
 }

@@ -1293,34 +1309,6 @@ async fn handle_tenant_break(
     json_response(StatusCode::OK, ())
 }

-async fn failpoints_handler(
-    mut request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    if !fail::has_failpoints() {
-        return Err(ApiError::BadRequest(anyhow!(
-            "Cannot manage failpoints because pageserver was compiled without failpoints support"
-        )));
-    }
-
-    let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
-    for fp in failpoints {
-        info!("cfg failpoint: {} {}", fp.name, fp.actions);
-
-        // We recognize one extra "action" that's not natively recognized
-        // by the failpoints crate: exit, to immediately kill the process
-        let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions);
-
-        if let Err(err_msg) = cfg_result {
-            return Err(ApiError::BadRequest(anyhow!(
-                "Failed to configure failpoints: {err_msg}"
-            )));
-        }
-    }
-
-    json_response(StatusCode::OK, ())
-}
-
 // Run GC immediately on given timeline.
 async fn timeline_gc_handler(
     mut request: Request<Body>,
@@ -1640,6 +1628,21 @@ async fn secondary_upload_handler(
     json_response(StatusCode::OK, ())
 }

+async fn secondary_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    state
+        .secondary_controller
+        .download_tenant(tenant_shard_id)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -1908,6 +1911,9 @@ pub fn make_router(
         .put("/v1/deletion_queue/flush", |r| {
             api_handler(r, deletion_queue_flush)
         })
+        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
+            api_handler(r, secondary_download_handler)
+        })
         .put("/v1/tenant/:tenant_shard_id/break", |r| {
             testing_api_handler("set tenant state to broken", r, handle_tenant_break)
         })
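With the route registered, a secondary-mode download can be triggered over the management API. A hypothetical invocation (listen address, port, and shard id are assumptions; only the path and method come from the diff):

// Requires the reqwest and tokio crates.
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let tenant_shard_id = "3aa8fcc61f6d357410b7de754b1d9001-0102"; // hypothetical
    let url =
        format!("http://127.0.0.1:9898/v1/tenant/{tenant_shard_id}/secondary/download");
    let status = reqwest::Client::new().post(url).send().await?.status();
    println!("secondary download: {status}");
    Ok(())
}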
@@ -21,6 +21,7 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -312,13 +313,16 @@ async fn import_wal(
         waldecoder.feed_bytes(&buf);

         let mut nrecords = 0;
-        let mut modification = tline.begin_modification(endpoint);
+        let mut modification = tline.begin_modification(last_lsn);
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= endpoint {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                     .await?;
+                WAL_INGEST.records_committed.inc();
+
+                modification.commit(ctx).await?;
                 last_lsn = lsn;

                 nrecords += 1;
@@ -448,13 +452,14 @@ pub async fn import_wal_from_tar(

         waldecoder.feed_bytes(&bytes[offset..]);

-        let mut modification = tline.begin_modification(end_lsn);
+        let mut modification = tline.begin_modification(last_lsn);
         let mut decoded = DecodedWALRecord::default();
         while last_lsn <= end_lsn {
             if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                 walingest
                     .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                     .await?;
+                modification.commit(ctx).await?;
                 last_lsn = lsn;

                 debug!("imported record at {} (end {})", lsn, end_lsn);
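Both import loops now open the modification at `last_lsn` and commit after every ingested record instead of once at the end LSN. A self-contained sketch of that loop shape (stub types; the real code commits through the timeline and bumps the WAL_INGEST counter):

struct Modification {
    committed: usize,
}

impl Modification {
    fn ingest(&mut self, _rec: &[u8]) {
        // apply one decoded WAL record to the in-memory modification
    }
    fn commit(&mut self) {
        // make the record durable; last_lsn only advances past committed records
        self.committed += 1;
    }
}

fn main() {
    let records: Vec<Vec<u8>> = vec![vec![0u8; 8]; 3];
    let mut modification = Modification { committed: 0 };
    for rec in &records {
        modification.ingest(rec);
        modification.commit(); // per-record commit, as in the diff above
    }
    assert_eq!(modification.committed, records.len());
}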
@@ -25,8 +25,6 @@ pub mod walingest;
 pub mod walrecord;
 pub mod walredo;

-pub mod failpoint_support;
-
 use crate::task_mgr::TaskKind;
 use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
@@ -119,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

+/// Per-tenant copy of their remote heatmap, downloaded into the local
+/// tenant path while in secondary mode.
+pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
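Together with `tenant_heatmap_path()` from the config hunk earlier, the new constant places the heatmap at `tenants/<tenant_shard_id>/heatmap-v1.json` under the pageserver workdir. A self-contained sketch of the path construction (the workdir and shard id values are assumptions):

use std::path::PathBuf;

const TENANTS_SEGMENT_NAME: &str = "tenants";
const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";

fn tenant_heatmap_path(workdir: &str, tenant_shard_id: &str) -> PathBuf {
    PathBuf::from(workdir)
        .join(TENANTS_SEGMENT_NAME)
        .join(tenant_shard_id)
        .join(TENANT_HEATMAP_BASENAME)
}

fn main() {
    let p = tenant_heatmap_path("/data/pageserver", "3aa8fcc61f6d357410b7de754b1d9001-0102");
    println!("{}", p.display()); // .../tenants/<shard>/heatmap-v1.json
}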
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub enum StorageTimeOperation {
+pub(crate) enum StorageTimeOperation {
     #[strum(serialize = "layer flush")]
     LayerFlush,

@@ -55,7 +55,7 @@ pub enum StorageTimeOperation {
     CreateTenant,
 }

-pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
     register_counter_vec!(
         "pageserver_storage_operations_seconds_sum",
         "Total time spent on storage operations with operation, tenant and timeline dimensions",
@@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_storage_operations_seconds_count",
         "Count of storage operations with operation, tenant and timeline dimensions",
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub struct PageCacheMetricsForTaskKind {
+pub(crate) struct PageCacheMetricsForTaskKind {
     pub read_accesses_materialized_page: IntCounter,
     pub read_accesses_immutable: IntCounter,

@@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind {
     pub read_hits_materialized_page_older_lsn: IntCounter,
 }

-pub struct PageCacheMetrics {
+pub(crate) struct PageCacheMetrics {
     map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }

@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
     map: EnumMap::from_array(std::array::from_fn(|task_kind| {
         let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
         let task_kind: &'static str = task_kind.into();
@@ -243,10 +243,9 @@ impl PageCacheMetrics {
     }
 }

-pub struct PageCacheSizeMetrics {
+pub(crate) struct PageCacheSizeMetrics {
     pub max_bytes: UIntGauge,

-    pub current_bytes_ephemeral: UIntGauge,
     pub current_bytes_immutable: UIntGauge,
     pub current_bytes_materialized_page: UIntGauge,
 }
@@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

-pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
-    max_bytes: {
-        register_uint_gauge!(
-            "pageserver_page_cache_size_max_bytes",
-            "Maximum size of the page cache in bytes"
-        )
-        .expect("failed to define a metric")
-    },
-    current_bytes_ephemeral: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-    current_bytes_immutable: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-    current_bytes_materialized_page: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-});
+pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
+    Lazy::new(|| PageCacheSizeMetrics {
+        max_bytes: {
+            register_uint_gauge!(
+                "pageserver_page_cache_size_max_bytes",
+                "Maximum size of the page cache in bytes"
+            )
+            .expect("failed to define a metric")
+        },
+        current_bytes_immutable: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["immutable"])
+                .unwrap()
+        },
+        current_bytes_materialized_page: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["materialized_page"])
+                .unwrap()
+        },
+    });

 pub(crate) mod page_cache_eviction_metrics {
     use std::num::NonZeroUsize;
@@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {

 /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub struct EvictionsWithLowResidenceDuration {
+pub(crate) struct EvictionsWithLowResidenceDuration {
     data_source: &'static str,
     threshold: Duration,
     counter: Option<IntCounter>,
 }

-pub struct EvictionsWithLowResidenceDurationBuilder {
+pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
     data_source: &'static str,
     threshold: Duration,
 }
@@ -1009,7 +1003,7 @@ pub enum SmgrQueryType {
 }

 #[derive(Debug)]
-pub struct SmgrQueryTimePerTimeline {
+pub(crate) struct SmgrQueryTimePerTimeline {
     metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

@@ -1181,8 +1175,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
     .map(|ms| (ms as f64) / 1000.0)
 });

-pub struct BasebackupQueryTime(HistogramVec);
-pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub(crate) struct BasebackupQueryTime(HistogramVec);
+pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
     BasebackupQueryTime({
         register_histogram_vec!(
             "pageserver_basebackup_query_seconds",
@@ -1202,7 +1196,7 @@ impl DurationResultObserver for BasebackupQueryTime {
     }
 }

-pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_live_connections",
         "Number of live network connections",
@@ -1369,6 +1363,8 @@ pub(crate) struct SecondaryModeMetrics {
     pub(crate) upload_heatmap: IntCounter,
     pub(crate) upload_heatmap_errors: IntCounter,
     pub(crate) upload_heatmap_duration: Histogram,
+    pub(crate) download_heatmap: IntCounter,
+    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
     upload_heatmap: register_int_counter!(
@@ -1386,6 +1382,16 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
         "Time to build and upload a heatmap, including any waiting inside the S3 client"
     )
     .expect("failed to define a metric"),
+    download_heatmap: register_int_counter!(
+        "pageserver_secondary_download_heatmap",
+        "Number of downloads of heatmaps by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
+    download_layer: register_int_counter!(
+        "pageserver_secondary_download_layer",
+        "Number of downloads of layers by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
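The two new secondary-mode counters follow the same register-then-inc pattern as the existing ones. A self-contained sketch using the upstream prometheus and once_cell crates in place of the repo's metrics wrappers (the call site is an assumption based on the metric name):

use once_cell::sync::Lazy;
use prometheus::{register_int_counter, IntCounter};

static DOWNLOAD_HEATMAP: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_secondary_download_heatmap",
        "Number of downloads of heatmaps by secondary mode locations"
    )
    .expect("failed to define a metric")
});

fn on_heatmap_downloaded() {
    // bumped once per successful heatmap download in the secondary code path
    DOWNLOAD_HEATMAP.inc();
}

fn main() {
    on_heatmap_downloaded();
    println!("downloads so far: {}", DOWNLOAD_HEATMAP.get());
}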
@@ -1655,7 +1661,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
|||||||
Lazy::new(WalRedoProcessCounters::default);
|
Lazy::new(WalRedoProcessCounters::default);
|
||||||
|
|
||||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||||
pub struct StorageTimeMetricsTimer {
|
pub(crate) struct StorageTimeMetricsTimer {
|
||||||
metrics: StorageTimeMetrics,
|
metrics: StorageTimeMetrics,
|
||||||
start: Instant,
|
start: Instant,
|
||||||
}
|
}
|
||||||
@@ -1680,7 +1686,7 @@ impl StorageTimeMetricsTimer {
|
|||||||
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
|
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
|
||||||
/// timeline total sum and count.
|
/// timeline total sum and count.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct StorageTimeMetrics {
|
pub(crate) struct StorageTimeMetrics {
|
||||||
/// Sum of f64 seconds, per operation, tenant_id and timeline_id
|
/// Sum of f64 seconds, per operation, tenant_id and timeline_id
|
||||||
timeline_sum: Counter,
|
timeline_sum: Counter,
|
||||||
/// Number of oeprations, per operation, tenant_id and timeline_id
|
/// Number of oeprations, per operation, tenant_id and timeline_id
|
||||||
@@ -1719,7 +1725,7 @@ impl StorageTimeMetrics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct TimelineMetrics {
|
pub(crate) struct TimelineMetrics {
|
||||||
tenant_id: String,
|
tenant_id: String,
|
||||||
shard_id: String,
|
shard_id: String,
|
||||||
timeline_id: String,
|
timeline_id: String,
|
||||||
@@ -1927,7 +1933,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct RemoteTimelineClientMetrics {
|
pub(crate) struct RemoteTimelineClientMetrics {
|
||||||
tenant_id: String,
|
tenant_id: String,
|
||||||
timeline_id: String,
|
timeline_id: String,
|
||||||
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
|
||||||
@@ -2225,7 +2231,7 @@ impl Drop for RemoteTimelineClientMetrics {
|
|||||||
|
|
||||||
/// Wrapper future that measures the time spent by a remote storage operation,
|
/// Wrapper future that measures the time spent by a remote storage operation,
|
||||||
/// and records the time and success/failure as a prometheus metric.
|
/// and records the time and success/failure as a prometheus metric.
|
||||||
pub trait MeasureRemoteOp: Sized {
|
pub(crate) trait MeasureRemoteOp: Sized {
|
||||||
fn measure_remote_op(
|
fn measure_remote_op(
|
||||||
self,
|
self,
|
||||||
tenant_id: TenantId,
|
tenant_id: TenantId,
|
||||||
@@ -2250,7 +2256,7 @@ pub trait MeasureRemoteOp: Sized {
|
|||||||
impl<T: Sized> MeasureRemoteOp for T {}
|
impl<T: Sized> MeasureRemoteOp for T {}
|
||||||
|
|
||||||
pin_project! {
|
pin_project! {
|
||||||
pub struct MeasuredRemoteOp<F>
|
pub(crate) struct MeasuredRemoteOp<F>
|
||||||
{
|
{
|
||||||
#[pin]
|
#[pin]
|
||||||
inner: F,
|
inner: F,
|
||||||
@@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
+use std::borrow::Cow;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -53,7 +54,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::rel_block_to_key;
+use crate::pgdatadir_mapping::{rel_block_to_key, Version};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -61,6 +62,9 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
+use crate::tenant::timeline::WaitLsnError;
+use crate::tenant::GetTimelineError;
+use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -283,6 +287,64 @@ struct PageServerHandler {
     connection_ctx: RequestContext,
 }

+#[derive(thiserror::Error, Debug)]
+enum PageStreamError {
+    /// We encountered an error that should prompt the client to reconnect:
+    /// in practice this means we drop the connection without sending a response.
+    #[error("Reconnect required: {0}")]
+    Reconnect(Cow<'static, str>),
+
+    /// We were instructed to shutdown while processing the query
+    #[error("Shutting down")]
+    Shutdown,
+
+    /// Something went wrong reading a page: this likely indicates a pageserver bug
+    #[error("Read error: {0}")]
+    Read(PageReconstructError),
+
+    /// Ran out of time waiting for an LSN
+    #[error("LSN timeout: {0}")]
+    LsnTimeout(WaitLsnError),
+
+    /// The entity required to serve the request (tenant or timeline) is not found,
+    /// or is not found in a suitable state to serve a request.
+    #[error("Not found: {0}")]
+    NotFound(std::borrow::Cow<'static, str>),
+
+    /// Request asked for something that doesn't make sense, like an invalid LSN
+    #[error("Bad request: {0}")]
+    BadRequest(std::borrow::Cow<'static, str>),
+}
+
+impl From<PageReconstructError> for PageStreamError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => Self::Shutdown,
+            e => Self::Read(e),
+        }
+    }
+}
+
+impl From<GetActiveTimelineError> for PageStreamError {
+    fn from(value: GetActiveTimelineError) -> Self {
+        match value {
+            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
+            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
+        }
+    }
+}
+
+impl From<WaitLsnError> for PageStreamError {
+    fn from(value: WaitLsnError) -> Self {
+        match value {
+            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
+            WaitLsnError::Shutdown => Self::Shutdown,
+            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
+        }
+    }
+}
+
 impl PageServerHandler {
     pub fn new(
         conf: &'static PageServerConf,
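The value of the From impls above is that `?` inside the pagestream handlers now maps lower-level errors onto the right `PageStreamError` variant automatically. A self-contained sketch of that chain with stand-in types (requires the thiserror crate):

use std::borrow::Cow;

#[derive(thiserror::Error, Debug)]
enum WaitLsnError {
    #[error("timed out waiting for {0}")]
    Timeout(String),
    #[error("shutting down")]
    Shutdown,
}

#[derive(thiserror::Error, Debug)]
enum PageStreamError {
    #[error("Reconnect required: {0}")]
    Reconnect(Cow<'static, str>),
    #[error("Shutting down")]
    Shutdown,
    #[error("LSN timeout: {0}")]
    LsnTimeout(WaitLsnError),
}

impl From<WaitLsnError> for PageStreamError {
    fn from(value: WaitLsnError) -> Self {
        match value {
            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
            WaitLsnError::Shutdown => Self::Shutdown,
        }
    }
}

fn wait_lsn(ok: bool) -> Result<(), WaitLsnError> {
    if ok { Ok(()) } else { Err(WaitLsnError::Timeout("0/16B9188".into())) }
}

fn handler() -> Result<(), PageStreamError> {
    wait_lsn(false)?; // WaitLsnError converts via From, no manual match needed
    Ok(())
}

fn main() {
    println!("{:?}", handler());
}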
@@ -428,7 +490,7 @@ impl PageServerHandler {
         // Check that the timeline exists
         let timeline = tenant
             .get_timeline(timeline_id, true)
-            .map_err(|e| anyhow::anyhow!(e))?;
+            .map_err(|e| QueryError::NotFound(format!("{e}").into()))?;

         // Avoid starting new requests if the timeline has already started shutting down,
         // and block timeline shutdown until this request is complete, or drops out due
@@ -520,32 +582,44 @@ impl PageServerHandler {
                 }
             };

-            if let Err(e) = &response {
-                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                // because wait_lsn etc will drop out
-                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                // is_canceled(): [`Timeline::shutdown`]` has entered
-                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
+            match response {
+                Err(PageStreamError::Shutdown) => {
                     // If we fail to fulfil a request during shutdown, which may be _because_ of
                     // shutdown, then do not send the error to the client. Instead just drop the
                     // connection.
-                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
+                    span.in_scope(|| info!("dropping connection due to shutdown"));
                     return Err(QueryError::Shutdown);
                 }
+                Err(PageStreamError::Reconnect(reason)) => {
+                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    return Err(QueryError::Reconnect);
+                }
+                Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
+                    // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
+                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
+                    //
+                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                    // because wait_lsn etc will drop out
+                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                    // is_canceled(): [`Timeline::shutdown`]` has entered
+                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
+                    return Err(QueryError::Shutdown);
+                }
+                r => {
+                    let response_msg = r.unwrap_or_else(|e| {
+                        // print the all details to the log with {:#}, but for the client the
+                        // error message is enough. Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            message: e.to_string(),
+                        })
+                    });
+
+                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+                    self.flush_cancellable(pgb, &timeline.cancel).await?;
+                }
             }
-
-            let response = response.unwrap_or_else(|e| {
-                // print the all details to the log with {:#}, but for the client the
-                // error message is enough. Do not log if shutting down, as the anyhow::Error
-                // here includes cancellation which is not an error.
-                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
-                PagestreamBeMessage::Error(PagestreamErrorResponse {
-                    message: e.to_string(),
-                })
-            });
-
-            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb, &timeline.cancel).await?;
         }
         Ok(())
     }
@@ -692,7 +766,7 @@ impl PageServerHandler {
         latest: bool,
         latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Lsn> {
+    ) -> Result<Lsn, PageStreamError> {
         if latest {
             // Latest page version was requested. If LSN is given, it is a hint
             // to the page server that there have been no modifications to the
@@ -723,15 +797,19 @@ impl PageServerHandler {
             }
         } else {
             if lsn == Lsn(0) {
-                anyhow::bail!("invalid LSN(0) in request");
+                return Err(PageStreamError::BadRequest(
+                    "invalid LSN(0) in request".into(),
+                ));
             }
             timeline.wait_lsn(lsn, ctx).await?;
         }
-        anyhow::ensure!(
-            lsn >= **latest_gc_cutoff_lsn,
-            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-            lsn, **latest_gc_cutoff_lsn
-        );
+
+        if lsn < **latest_gc_cutoff_lsn {
+            return Err(PageStreamError::BadRequest(format!(
+                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+                lsn, **latest_gc_cutoff_lsn
+            ).into()));
+        }
         Ok(lsn)
     }

@@ -740,14 +818,14 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamExistsRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                 .await?;

         let exists = timeline
-            .get_rel_exists(req.rel, lsn, req.latest, ctx)
+            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
             .await?;

         Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -760,13 +838,15 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamNblocksRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                 .await?;

-        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
+        let n_blocks = timeline
+            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
+            .await?;

         Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
             n_blocks,
@@ -778,14 +858,20 @@ impl PageServerHandler {
         timeline: &Timeline,
         req: &PagestreamDbSizeRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         let lsn =
             Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                 .await?;

         let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::Lsn(lsn),
+                req.latest,
+                ctx,
+            )
             .await?;
         let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -794,30 +880,35 @@ impl PageServerHandler {
         }))
     }

+    async fn do_handle_get_page_at_lsn_request(
+        &self,
+        timeline: &Timeline,
+        req: &PagestreamGetPageRequest,
+        ctx: &RequestContext,
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let lsn =
+            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
+                .await?;
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
+            .await?;
+
+        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+            page,
+        }))
+    }
+
     async fn handle_get_page_at_lsn_request(
         &self,
         timeline: &Timeline,
         req: &PagestreamGetPageRequest,
         ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
-        let lsn =
-            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
-                .await?;
-        /*
-        // Add a 1s delay to some requests. The delay helps the requests to
-        // hit the race condition from github issue #1047 more easily.
-        use rand::Rng;
-        if rand::thread_rng().gen::<u8>() < 5 {
-            std::thread::sleep(std::time::Duration::from_millis(1000));
-        }
-        */
-
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
         let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
+        if timeline.get_shard_identity().is_key_local(&key) {
+            self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
+                .await
         } else {
             // The Tenant shard we looked up at connection start does not hold this particular
             // key: look for other shards in this tenant. This scenario occurs if a pageserver
@@ -836,30 +927,30 @@ impl PageServerHandler {
                 Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
                     // We already know this tenant exists in general, because we resolved it at
                     // start of connection. Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node.
-                    // TODO: this should be some kind of structured error that the client will understand,
-                    // so that it can block until its config is updated: this error is expected in the case
-                    // that the Tenant's shards' placements are being updated and the client hasn't been
-                    // informed yet.
-                    //
-                    // https://github.com/neondatabase/neon/issues/6038
-                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                    // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                    // mapping is out of date.
+                    tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
+                        timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
+                    // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
+                    // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                    // and talk to a different pageserver.
+                    return Err(PageStreamError::Reconnect(
+                        "getpage@lsn request routed to wrong shard".into(),
+                    ));
                 }
                 Err(e) => return Err(e.into()),
             };

             // Take a GateGuard for the duration of this request. If we were using our main Timeline object,
             // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        };
-
-        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
-            page,
-        }))
+            let _timeline_guard = timeline
+                .gate
+                .enter()
+                .map_err(|_| PageStreamError::Shutdown)?;
+
+            self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
+                .await
+        }
     }

     #[allow(clippy::too_many_arguments)]
@@ -1000,9 +1091,7 @@ impl PageServerHandler {
|
|||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.map_err(GetActiveTimelineError::Tenant)?;
|
.map_err(GetActiveTimelineError::Tenant)?;
|
||||||
let timeline = tenant
|
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||||
.get_timeline(timeline_id, true)
|
|
||||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
|
||||||
Ok(timeline)
|
Ok(timeline)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1424,14 +1513,15 @@ enum GetActiveTimelineError {
|
|||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Tenant(GetActiveTenantError),
|
Tenant(GetActiveTenantError),
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Timeline(anyhow::Error),
|
Timeline(#[from] GetTimelineError),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<GetActiveTimelineError> for QueryError {
|
impl From<GetActiveTimelineError> for QueryError {
|
||||||
fn from(e: GetActiveTimelineError) -> Self {
|
fn from(e: GetActiveTimelineError) -> Self {
|
||||||
match e {
|
match e {
|
||||||
|
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
|
||||||
GetActiveTimelineError::Tenant(e) => e.into(),
|
GetActiveTimelineError::Tenant(e) => e.into(),
|
||||||
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
|
GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
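Aside: the wrong-shard handling above is the interesting part of this hunk. Below is a minimal, self-contained model of that routing decision; ShardIdentity, the u64 key, the modulo routing, and this PageStreamError are all simplified stand-ins for illustration, not the pageserver's real definitions.

#[derive(Debug)]
enum PageStreamError {
    /// Ask the client to drop the connection and re-resolve its shard map.
    Reconnect(String),
}

struct ShardIdentity {
    number: u32,
    count: u32,
}

impl ShardIdentity {
    /// Stand-in routing: the real pageserver hashes the page key.
    fn get_shard_number(&self, key: u64) -> u32 {
        (key % self.count as u64) as u32
    }

    fn is_key_local(&self, key: u64) -> bool {
        self.get_shard_number(key) == self.number
    }
}

fn handle_get_page(shard: &ShardIdentity, key: u64) -> Result<String, PageStreamError> {
    if shard.is_key_local(key) {
        Ok(format!("serving key {key} from shard {}", shard.number))
    } else {
        // Mirrors the diff: close the connection so the client's reconnect
        // backoff rate-limits retries and prompts it to reload its
        // shard->pageserver mapping.
        Err(PageStreamError::Reconnect(
            "getpage@lsn request routed to wrong shard".into(),
        ))
    }
}

fn main() {
    let shard = ShardIdentity { number: 1, count: 4 };
    println!("{:?}", handle_get_page(&shard, 5)); // 5 % 4 == 1: local, served
    println!("{:?}", handle_get_page(&shard, 6)); // belongs to shard 2: Reconnect
}

The design choice worth noting is that the error is structured rather than opaque: the page service can translate Reconnect into a clean connection close instead of logging an anyhow error for every misrouted request.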
@@ -11,7 +11,7 @@ use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Context;
+use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
 use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,6 +147,7 @@ impl Timeline {
     {
         DatadirModification {
             tline: self,
+            pending_lsns: Vec::new(),
             pending_updates: HashMap::new(),
             pending_deletions: Vec::new(),
             pending_nblocks: 0,
@@ -159,11 +160,11 @@ impl Timeline {
     //------------------------------------------------------------------------------

     /// Look up given page version.
-    pub async fn get_rel_page_at_lsn(
+    pub(crate) async fn get_rel_page_at_lsn(
         &self,
         tag: RelTag,
         blknum: BlockNumber,
-        lsn: Lsn,
+        version: Version<'_>,
         latest: bool,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
@@ -173,44 +174,47 @@ impl Timeline {
             ));
         }

-        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
+        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
         if blknum >= nblocks {
             debug!(
                 "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag, blknum, lsn, nblocks
+                tag,
+                blknum,
+                version.get_lsn(),
+                nblocks
             );
             return Ok(ZERO_PAGE.clone());
         }

         let key = rel_block_to_key(tag, blknum);
-        self.get(key, lsn, ctx).await
+        version.get(self, key, ctx).await
     }

     // Get size of a database in blocks
-    pub async fn get_db_size(
+    pub(crate) async fn get_db_size(
         &self,
         spcnode: Oid,
         dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
         latest: bool,
         ctx: &RequestContext,
     ) -> Result<usize, PageReconstructError> {
         let mut total_blocks = 0;

-        let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
+        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

         for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
             total_blocks += n_blocks as usize;
         }
         Ok(total_blocks)
     }

     /// Get size of a relation file
-    pub async fn get_rel_size(
+    pub(crate) async fn get_rel_size(
         &self,
         tag: RelTag,
-        lsn: Lsn,
+        version: Version<'_>,
         latest: bool,
         ctx: &RequestContext,
     ) -> Result<BlockNumber, PageReconstructError> {
@@ -220,12 +224,12 @@ impl Timeline {
             ));
         }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
             return Ok(nblocks);
         }

         if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn, latest, ctx).await?
+            && !self.get_rel_exists(tag, version, latest, ctx).await?
         {
             // FIXME: Postgres sometimes calls smgrcreate() to create
             // FSM, and smgrnblocks() on it immediately afterwards,
@@ -235,7 +239,7 @@ impl Timeline {
         }

         let key = rel_size_to_key(tag);
-        let mut buf = self.get(key, lsn, ctx).await?;
+        let mut buf = version.get(self, key, ctx).await?;
         let nblocks = buf.get_u32_le();

         if latest {
@@ -246,16 +250,16 @@ impl Timeline {
             // latest=true, then it can not cause cache corruption, because with latest=true
             // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
             // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, lsn, nblocks);
+            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
         }
         Ok(nblocks)
     }

     /// Does relation exist?
-    pub async fn get_rel_exists(
+    pub(crate) async fn get_rel_exists(
         &self,
         tag: RelTag,
-        lsn: Lsn,
+        version: Version<'_>,
         _latest: bool,
         ctx: &RequestContext,
     ) -> Result<bool, PageReconstructError> {
@@ -266,12 +270,12 @@ impl Timeline {
         }

         // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
             return Ok(true);
         }
         // fetch directory listing
         let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;

         match RelDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => {
@@ -287,16 +291,16 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn list_rels(
+    pub(crate) async fn list_rels(
         &self,
         spcnode: Oid,
         dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<HashSet<RelTag>, PageReconstructError> {
         // fetch directory listing
         let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;

         match RelDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => {
@@ -315,7 +319,7 @@ impl Timeline {
     }

     /// Look up given SLRU page version.
-    pub async fn get_slru_page_at_lsn(
+    pub(crate) async fn get_slru_page_at_lsn(
         &self,
         kind: SlruKind,
         segno: u32,
@@ -328,29 +332,29 @@ impl Timeline {
     }

     /// Get size of an SLRU segment
-    pub async fn get_slru_segment_size(
+    pub(crate) async fn get_slru_segment_size(
         &self,
         kind: SlruKind,
         segno: u32,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<BlockNumber, PageReconstructError> {
         let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = self.get(key, lsn, ctx).await?;
+        let mut buf = version.get(self, key, ctx).await?;
         Ok(buf.get_u32_le())
     }

     /// Get size of an SLRU segment
-    pub async fn get_slru_segment_exists(
+    pub(crate) async fn get_slru_segment_exists(
         &self,
         kind: SlruKind,
         segno: u32,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<bool, PageReconstructError> {
         // fetch directory listing
         let key = slru_dir_to_key(kind);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;

         match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => {
@@ -368,7 +372,7 @@ impl Timeline {
     /// so it's not well defined which LSN you get if there were multiple commits
     /// "in flight" at that point in time.
     ///
-    pub async fn find_lsn_for_timestamp(
+    pub(crate) async fn find_lsn_for_timestamp(
         &self,
         search_timestamp: TimestampTz,
         cancel: &CancellationToken,
@@ -448,7 +452,7 @@ impl Timeline {
     /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
     /// with a smaller/larger timestamp.
     ///
-    pub async fn is_latest_commit_timestamp_ge_than(
+    pub(crate) async fn is_latest_commit_timestamp_ge_than(
         &self,
         search_timestamp: TimestampTz,
         probe_lsn: Lsn,
@@ -471,7 +475,7 @@ impl Timeline {
     /// Obtain the possible timestamp range for the given lsn.
     ///
     /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub async fn get_timestamp_for_lsn(
+    pub(crate) async fn get_timestamp_for_lsn(
         &self,
         probe_lsn: Lsn,
         ctx: &RequestContext,
@@ -501,11 +505,11 @@ impl Timeline {
         mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
     ) -> Result<T, PageReconstructError> {
         for segno in self
-            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
+            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
             .await?
         {
             let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                 .await?;
             for blknum in (0..nblocks).rev() {
                 let clog_page = self
@@ -528,36 +532,36 @@ impl Timeline {
     }

     /// Get a list of SLRU segments
-    pub async fn list_slru_segments(
+    pub(crate) async fn list_slru_segments(
         &self,
         kind: SlruKind,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<HashSet<u32>, PageReconstructError> {
         // fetch directory entry
         let key = slru_dir_to_key(kind);

-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
         match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
             Ok(dir) => Ok(dir.segments),
             Err(e) => Err(PageReconstructError::from(e)),
         }
     }

-    pub async fn get_relmap_file(
+    pub(crate) async fn get_relmap_file(
         &self,
         spcnode: Oid,
         dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
         ctx: &RequestContext,
     ) -> Result<Bytes, PageReconstructError> {
         let key = relmap_file_key(spcnode, dbnode);

-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
         Ok(buf)
     }

-    pub async fn list_dbdirs(
+    pub(crate) async fn list_dbdirs(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -571,7 +575,7 @@ impl Timeline {
         }
     }

-    pub async fn get_twophase_file(
+    pub(crate) async fn get_twophase_file(
         &self,
         xid: TransactionId,
         lsn: Lsn,
@@ -582,7 +586,7 @@ impl Timeline {
         Ok(buf)
     }

-    pub async fn list_twophase_files(
+    pub(crate) async fn list_twophase_files(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -596,7 +600,7 @@ impl Timeline {
         }
     }

-    pub async fn get_control_file(
+    pub(crate) async fn get_control_file(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -604,7 +608,7 @@ impl Timeline {
         self.get(CONTROLFILE_KEY, lsn, ctx).await
     }

-    pub async fn get_checkpoint(
+    pub(crate) async fn get_checkpoint(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -612,7 +616,7 @@ impl Timeline {
         self.get(CHECKPOINT_KEY, lsn, ctx).await
     }

-    pub async fn list_aux_files(
+    pub(crate) async fn list_aux_files(
         &self,
         lsn: Lsn,
         ctx: &RequestContext,
@@ -652,7 +656,10 @@ impl Timeline {

         let mut total_size: u64 = 0;
         for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
+                .await?
+            {
                 if self.cancel.is_cancelled() {
                     return Err(CalculateLogicalSizeError::Cancelled);
                 }
@@ -692,7 +699,7 @@ impl Timeline {
             result.add_key(rel_dir_to_key(spcnode, dbnode));

             let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn, ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
                 .await?
                 .into_iter()
                 .collect();
@@ -799,18 +806,39 @@ pub struct DatadirModification<'a> {
     /// in the state in 'tline' yet.
     pub tline: &'a Timeline,

-    /// Lsn assigned by begin_modification
-    pub lsn: Lsn,
+    /// Current LSN of the modification
+    lsn: Lsn,

     // The modifications are not applied directly to the underlying key-value store.
     // The put-functions add the modifications here, and they are flushed to the
     // underlying key-value store by the 'finish' function.
-    pending_updates: HashMap<Key, Value>,
-    pending_deletions: Vec<Range<Key>>,
+    pending_lsns: Vec<Lsn>,
+    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
+    pending_deletions: Vec<(Range<Key>, Lsn)>,
     pending_nblocks: i64,
 }

 impl<'a> DatadirModification<'a> {
+    /// Get the current lsn
+    pub(crate) fn get_lsn(&self) -> Lsn {
+        self.lsn
+    }
+
+    /// Set the current lsn
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
+        ensure!(
+            lsn >= self.lsn,
+            "setting an older lsn {} than {} is not allowed",
+            lsn,
+            self.lsn
+        );
+        if lsn > self.lsn {
+            self.pending_lsns.push(self.lsn);
+            self.lsn = lsn;
+        }
+        Ok(())
+    }
+
     /// Initialize a completely new repository.
     ///
     /// This inserts the directory metadata entries that are assumed to
@@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> {
         dbnode: Oid,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let req_lsn = self.tline.get_last_record_lsn();
-
         let total_blocks = self
             .tline
-            .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
+            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
             .await?;

         // Remove entry from dbdir
@@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> {
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
+        if self
+            .tline
+            .get_rel_exists(rel, Version::Modified(self), true, ctx)
+            .await?
+        {
             let size_key = rel_size_to_key(rel);
             // Fetch the old size first
             let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> {
         let writer = self.tline.writer().await;

         // Flush relation and SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::new();
-        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(&key) || is_slru_block_key(key) {
-                // This bails out on first error without modifying pending_updates.
-                // That's Ok, cf this function's doc comment.
-                writer.put(key, self.lsn, &value, ctx).await?;
-            } else {
-                retained_pending_updates.insert(key, value);
+        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
+        for (key, values) in self.pending_updates.drain() {
+            for (lsn, value) in values {
+                if is_rel_block_key(&key) || is_slru_block_key(key) {
+                    // This bails out on first error without modifying pending_updates.
+                    // That's Ok, cf this function's doc comment.
+                    writer.put(key, lsn, &value, ctx).await?;
+                } else {
+                    retained_pending_updates
+                        .entry(key)
+                        .or_default()
+                        .push((lsn, value));
+                }
             }
         }
-        self.pending_updates.extend(retained_pending_updates);
+
+        self.pending_updates = retained_pending_updates;

         if pending_nblocks != 0 {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> {
     ///
     pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
         let writer = self.tline.writer().await;
-        let lsn = self.lsn;
+
         let pending_nblocks = self.pending_nblocks;
         self.pending_nblocks = 0;

-        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value, ctx).await?;
-        }
-        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn).await?;
+        if !self.pending_updates.is_empty() {
+            writer.put_batch(&self.pending_updates, ctx).await?;
+            self.pending_updates.clear();
         }

-        writer.finish_write(lsn);
+        if !self.pending_deletions.is_empty() {
+            writer.delete_batch(&self.pending_deletions).await?;
+            self.pending_deletions.clear();
+        }
+
+        self.pending_lsns.push(self.lsn);
+        for pending_lsn in self.pending_lsns.drain(..) {
+            // Ideally, we should be able to call writer.finish_write() only once
+            // with the highest LSN. However, the last_record_lsn variable in the
+            // timeline keeps track of the latest LSN and the immediate previous LSN
+            // so we need to record every LSN to not leave a gap between them.
+            writer.finish_write(pending_lsn);
+        }

         if pending_nblocks != 0 {
             writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }

-    pub(crate) fn is_empty(&self) -> bool {
-        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    pub(crate) fn len(&self) -> usize {
+        self.pending_updates.len() + self.pending_deletions.len()
     }

     // Internal helper functions to batch the modifications

     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the pending updated
+        // Have we already updated the same key? Read the latest pending updated
         // version in that case.
         //
         // Note: we don't check pending_deletions. It is an error to request a
         // value that has been removed, deletion only avoids leaking storage.
-        if let Some(value) = self.pending_updates.get(&key) {
-            if let Value::Image(img) = value {
-                Ok(img.clone())
-            } else {
-                // Currently, we never need to read back a WAL record that we
-                // inserted in the same "transaction". All the metadata updates
-                // work directly with Images, and we never need to read actual
-                // data pages. We could handle this if we had to, by calling
-                // the walredo manager, but let's keep it simple for now.
-                Err(PageReconstructError::from(anyhow::anyhow!(
-                    "unexpected pending WAL record"
-                )))
+        if let Some(values) = self.pending_updates.get(&key) {
+            if let Some((_, value)) = values.last() {
+                return if let Value::Image(img) = value {
+                    Ok(img.clone())
+                } else {
+                    // Currently, we never need to read back a WAL record that we
+                    // inserted in the same "transaction". All the metadata updates
+                    // work directly with Images, and we never need to read actual
+                    // data pages. We could handle this if we had to, by calling
+                    // the walredo manager, but let's keep it simple for now.
+                    Err(PageReconstructError::from(anyhow::anyhow!(
+                        "unexpected pending WAL record"
+                    )))
+                };
             }
-        } else {
-            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-            self.tline.get(key, lsn, ctx).await
         }
+
+        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+        self.tline.get(key, lsn, ctx).await
     }

     fn put(&mut self, key: Key, val: Value) {
-        self.pending_updates.insert(key, val);
+        let values = self.pending_updates.entry(key).or_default();
+        // Replace the previous value if it exists at the same lsn
+        if let Some((last_lsn, last_value)) = values.last_mut() {
+            if *last_lsn == self.lsn {
+                *last_value = val;
+                return;
+            }
+        }
+        values.push((self.lsn, val));
     }

     fn delete(&mut self, key_range: Range<Key>) {
         trace!("DELETE {}-{}", key_range.start, key_range.end);
-        self.pending_deletions.push(key_range);
+        self.pending_deletions.push((key_range, self.lsn));
     }
 }

+/// This struct facilitates accessing either a committed key from the timeline at a
+/// specific LSN, or the latest uncommitted key from a pending modification.
+/// During WAL ingestion, the records from multiple LSNs may be batched in the same
+/// modification before being flushed to the timeline. Hence, the routines in WalIngest
+/// need to look up the keys in the modification first before looking them up in the
+/// timeline to not miss the latest updates.
+#[derive(Clone, Copy)]
+pub enum Version<'a> {
+    Lsn(Lsn),
+    Modified(&'a DatadirModification<'a>),
+}
+
+impl<'a> Version<'a> {
+    async fn get(
+        &self,
+        timeline: &Timeline,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        match self {
+            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
+            Version::Modified(modification) => modification.get(key, ctx).await,
+        }
+    }
+
+    fn get_lsn(&self) -> Lsn {
+        match self {
+            Version::Lsn(lsn) => *lsn,
+            Version::Modified(modification) => modification.lsn,
+        }
    }
 }
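Aside: the Version enum and the Vec<(Lsn, Value)> pending buffers are the heart of this change: while several LSNs' worth of WAL records sit in one open modification, reads must see the newest uncommitted value for a key before falling back to the committed timeline, and at commit time finish_write must be called for every intermediate LSN. Below is a toy, self-contained model of that read-your-writes lookup and same-LSN coalescing; all names and types are illustrative stand-ins, not the pageserver's real definitions.

use std::collections::HashMap;

type Lsn = u64;

struct Modification {
    lsn: Lsn,
    pending_lsns: Vec<Lsn>,
    pending_updates: HashMap<&'static str, Vec<(Lsn, String)>>,
    committed: HashMap<&'static str, String>,
}

impl Modification {
    fn set_lsn(&mut self, lsn: Lsn) {
        assert!(lsn >= self.lsn, "LSNs must advance");
        if lsn > self.lsn {
            // Remember the LSN we are leaving so commit can finish_write it.
            self.pending_lsns.push(self.lsn);
            self.lsn = lsn;
        }
    }

    fn put(&mut self, key: &'static str, val: String) {
        let values = self.pending_updates.entry(key).or_default();
        // Coalesce repeated writes at the same LSN, as the diff's put() does.
        if let Some((last_lsn, last_val)) = values.last_mut() {
            if *last_lsn == self.lsn {
                *last_val = val;
                return;
            }
        }
        values.push((self.lsn, val));
    }

    fn get(&self, key: &str) -> Option<&String> {
        // Latest pending value wins; otherwise read the committed state.
        self.pending_updates
            .get(key)
            .and_then(|v| v.last().map(|(_, val)| val))
            .or_else(|| self.committed.get(key))
    }
}

fn main() {
    let mut m = Modification {
        lsn: 10,
        pending_lsns: Vec::new(),
        pending_updates: HashMap::new(),
        committed: HashMap::from([("a", "old".to_string())]),
    };
    m.put("a", "v10".into());
    m.set_lsn(20);
    m.put("a", "v20".into());
    // Read-your-writes across batched LSNs: the v20 value is visible.
    assert_eq!(m.get("a").map(String::as_str), Some("v20"));
    assert_eq!(m.get("b").map(String::as_str), None);
}

This is why Version::Modified exists: Timeline routines that used to take a bare Lsn can now be pointed at the in-flight modification, so mid-batch metadata lookups see records that have not yet been flushed.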
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
     // else, but that has not been needed in a long time.
     std::env::var("TOKIO_WORKER_THREADS")
         .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
 });

 #[derive(Debug, Clone, Copy)]
@@ -258,6 +258,9 @@ pub enum TaskKind {
     /// See [`crate::disk_usage_eviction_task`].
     DiskUsageEviction,

+    /// See [`crate::tenant::secondary`].
+    SecondaryDownloads,
+
     /// See [`crate::tenant::secondary`].
     SecondaryUploads,
@@ -33,6 +33,7 @@ use tracing::*;
 use utils::backoff;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
+use utils::failpoint_support;
 use utils::fs_ext;
 use utils::sync::gate::Gate;
 use utils::sync::gate::GateGuard;
@@ -55,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
+use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -594,10 +596,9 @@ impl Tenant {
         mode: SpawnMode,
         ctx: &RequestContext,
     ) -> anyhow::Result<Arc<Tenant>> {
-        // TODO(sharding): make WalRedoManager shard-aware
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
             conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
         )));

         let TenantSharedResources {
@@ -890,7 +891,7 @@ impl Tenant {
     ) -> anyhow::Result<()> {
         span::debug_assert_current_span_has_tenant_id();

-        crate::failpoint_support::sleep_millis_async!("before-attaching-tenant");
+        failpoint_support::sleep_millis_async!("before-attaching-tenant");

         let preload = match preload {
             Some(p) => p,
@@ -1002,7 +1003,7 @@ impl Tenant {
         // IndexPart is the source of truth.
         self.clean_up_timelines(&existent_timelines)?;

-        crate::failpoint_support::sleep_millis_async!("attach-before-activate");
+        failpoint_support::sleep_millis_async!("attach-before-activate");

         info!("Done");

@@ -1144,10 +1145,9 @@ impl Tenant {
         tenant_shard_id: TenantShardId,
         reason: String,
     ) -> Arc<Tenant> {
-        // TODO(sharding): make WalRedoManager shard-aware
         let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
             conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
         )));
         Arc::new(Tenant::new(
             TenantState::Broken {
@@ -1759,7 +1759,15 @@ impl Tenant {
             // decoding the new WAL might need to look up previous pages, relation
             // sizes etc. and that would get confused if the previous page versions
             // are not in the repository yet.
-            ancestor_timeline.wait_lsn(*lsn, ctx).await?;
+            ancestor_timeline
+                .wait_lsn(*lsn, ctx)
+                .await
+                .map_err(|e| match e {
+                    e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
+                        CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
+                    }
+                    WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
+                })?;
         }

         self.branch_timeline(
@@ -2839,9 +2847,7 @@ impl Tenant {
             }
         };

-        crate::failpoint_support::sleep_millis_async!(
-            "gc_iteration_internal_after_getting_gc_timelines"
-        );
+        failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");

         // If there is nothing to GC, we don't want any messages in the INFO log.
         if !gc_timelines.is_empty() {
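Aside: the wait_lsn change above replaces a bare `?` with an explicit translation of low-level wait failures into caller-facing errors. Below is a reduced sketch of that mapping pattern; the enums, the threshold, and branch_at are simplified stand-ins, not the real WaitLsnError/CreateTimelineError definitions.

#[derive(Debug)]
enum WaitLsnError {
    Timeout(String),
    BadState,
    Shutdown,
}

#[derive(Debug)]
enum CreateTimelineError {
    AncestorLsn(String),
    ShuttingDown,
}

fn wait_lsn(lsn: u64) -> Result<(), WaitLsnError> {
    // Stand-in behavior: pretend very high LSNs time out.
    if lsn > 1000 {
        Err(WaitLsnError::Timeout(format!("timed out waiting for {lsn}")))
    } else {
        Ok(())
    }
}

fn branch_at(lsn: u64) -> Result<(), CreateTimelineError> {
    wait_lsn(lsn).map_err(|e| match e {
        // Timeouts and bad states mean the requested ancestor LSN cannot be
        // served: surface that as an AncestorLsn error, as the diff does.
        e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
            CreateTimelineError::AncestorLsn(format!("{e:?}"))
        }
        // Shutdown is not the caller's fault and should not look like one.
        WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
    })
}

fn main() {
    assert!(branch_at(10).is_ok());
    println!("{:?}", branch_at(2000)); // Err(AncestorLsn("Timeout(...)"))
}

The point of the split is that a shutdown-induced failure can be reported as "shutting down" rather than as a bogus complaint about the requested ancestor LSN.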
@@ -46,6 +46,8 @@ pub mod defaults {
     pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
     pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
     pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
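Aside: this hunk only adds the constant; its consumer is not shown in this diff. The sketch below is therefore only a guess at the shape of a batched ingest loop that such a knob would control, purely for illustration — ingest and the println placeholders are hypothetical.

const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

fn ingest(records: &[&str]) {
    let mut pending: u64 = 0;
    for rec in records {
        // A real implementation would queue the record in the open modification.
        println!("queued {rec}");
        pending += 1;
        if pending >= DEFAULT_INGEST_BATCH_SIZE {
            // Committing once per batch amortizes writer overhead (locking,
            // finish_write bookkeeping) across many records.
            println!("commit batch of {pending}");
            pending = 0;
        }
    }
    if pending > 0 {
        println!("commit final batch of {pending}");
    }
}

fn main() {
    let recs: Vec<String> = (0..250).map(|i| format!("rec{i}")).collect();
    let refs: Vec<&str> = recs.iter().map(|s| s.as_str()).collect();
    ingest(&refs); // two full batches of 100, then a final batch of 50
}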
@@ -588,7 +588,7 @@ impl DeleteTenantFlow {
                     }
                     break;
                 }
-                TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
                     // This is unexpected: this secondary tenants should not have been created, and we
                     // are not in a position to shut it down from here.
                     tracing::warn!("Tenant transitioned to secondary mode while deleting!");
@@ -44,6 +44,7 @@ use utils::generation::Generation;
|
|||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
|
|
||||||
use super::delete::DeleteTenantError;
|
use super::delete::DeleteTenantError;
|
||||||
|
use super::secondary::SecondaryTenant;
|
||||||
use super::TenantSharedResources;
|
use super::TenantSharedResources;
|
||||||
|
|
||||||
/// For a tenant that appears in TenantsMap, it may either be
|
/// For a tenant that appears in TenantsMap, it may either be
|
||||||
@@ -57,7 +58,7 @@ use super::TenantSharedResources;
|
|||||||
/// having a properly acquired generation (Secondary doesn't need a generation)
|
/// having a properly acquired generation (Secondary doesn't need a generation)
|
||||||
pub(crate) enum TenantSlot {
|
pub(crate) enum TenantSlot {
|
||||||
Attached(Arc<Tenant>),
|
Attached(Arc<Tenant>),
|
||||||
Secondary,
|
Secondary(Arc<SecondaryTenant>),
|
||||||
/// In this state, other administrative operations acting on the TenantId should
|
/// In this state, other administrative operations acting on the TenantId should
|
||||||
/// block, or return a retry indicator equivalent to HTTP 503.
|
/// block, or return a retry indicator equivalent to HTTP 503.
|
||||||
InProgress(utils::completion::Barrier),
|
InProgress(utils::completion::Barrier),
|
||||||
@@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot {
|
|||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
|
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
|
||||||
Self::Secondary => write!(f, "Secondary"),
|
Self::Secondary(_) => write!(f, "Secondary"),
|
||||||
Self::InProgress(_) => write!(f, "InProgress"),
|
Self::InProgress(_) => write!(f, "InProgress"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -78,7 +79,7 @@ impl TenantSlot {
|
|||||||
fn get_attached(&self) -> Option<&Arc<Tenant>> {
|
fn get_attached(&self) -> Option<&Arc<Tenant>> {
|
||||||
match self {
|
match self {
|
||||||
Self::Attached(t) => Some(t),
|
Self::Attached(t) => Some(t),
|
||||||
Self::Secondary => None,
|
Self::Secondary(_) => None,
|
||||||
Self::InProgress(_) => None,
|
Self::InProgress(_) => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -130,7 +131,7 @@ impl TenantsMap {
|
|||||||
|
|
||||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||||
/// resolve this to a fully qualified TenantShardId.
|
/// resolve this to a fully qualified TenantShardId.
|
||||||
fn resolve_shard(
|
fn resolve_attached_shard(
|
||||||
&self,
|
&self,
|
||||||
tenant_id: &TenantId,
|
tenant_id: &TenantId,
|
||||||
selector: ShardSelector,
|
selector: ShardSelector,
|
||||||
@@ -140,25 +141,27 @@ impl TenantsMap {
|
|||||||
TenantsMap::Initializing => None,
|
TenantsMap::Initializing => None,
|
||||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||||
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
||||||
|
// Ignore all slots that don't contain an attached tenant
|
||||||
|
let tenant = match &slot.1 {
|
||||||
|
TenantSlot::Attached(t) => t,
|
||||||
|
_ => continue,
|
||||||
|
};
|
||||||
|
|
||||||
match selector {
|
match selector {
|
||||||
ShardSelector::First => return Some(*slot.0),
|
ShardSelector::First => return Some(*slot.0),
|
||||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||||
return Some(*slot.0)
|
return Some(*slot.0)
|
||||||
}
|
}
|
||||||
ShardSelector::Page(key) => {
|
ShardSelector::Page(key) => {
|
||||||
if let Some(tenant) = slot.1.get_attached() {
|
// First slot we see for this tenant, calculate the expected shard number
|
||||||
// First slot we see for this tenant, calculate the expected shard number
|
// for the key: we will use this for checking if this and subsequent
|
||||||
// for the key: we will use this for checking if this and subsequent
|
// slots contain the key, rather than recalculating the hash each time.
|
||||||
// slots contain the key, rather than recalculating the hash each time.
|
if want_shard.is_none() {
|
||||||
if want_shard.is_none() {
|
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if Some(tenant.shard_identity.number) == want_shard {
|
if Some(tenant.shard_identity.number) == want_shard {
|
||||||
return Some(*slot.0);
|
return Some(*slot.0);
|
||||||
}
|
|
||||||
} else {
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_ => continue,
|
_ => continue,
|
||||||
@@ -464,12 +467,18 @@ pub async fn init_tenant_mgr(
|
|||||||
*gen
|
*gen
|
||||||
} else {
|
} else {
|
||||||
match &location_conf.mode {
|
match &location_conf.mode {
|
||||||
LocationMode::Secondary(_) => {
|
LocationMode::Secondary(secondary_config) => {
|
||||||
// We do not require the control plane's permission for secondary mode
|
// We do not require the control plane's permission for secondary mode
|
||||||
// tenants, because they do no remote writes and hence require no
|
// tenants, because they do no remote writes and hence require no
|
||||||
// generation number
|
// generation number
|
||||||
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||||
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
|
tenants.insert(
|
||||||
|
tenant_shard_id,
|
||||||
|
TenantSlot::Secondary(SecondaryTenant::new(
|
||||||
|
tenant_shard_id,
|
||||||
|
secondary_config,
|
||||||
|
)),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
LocationMode::Attached(_) => {
|
LocationMode::Attached(_) => {
|
||||||
// TODO: augment re-attach API to enable the control plane to
|
// TODO: augment re-attach API to enable the control plane to
|
||||||
@@ -661,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
|||||||
|
|
||||||
total_attached += 1;
|
total_attached += 1;
|
||||||
}
|
}
|
||||||
TenantSlot::Secondary => {
|
TenantSlot::Secondary(state) => {
|
||||||
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
|
// We don't need to wait for this individually per-tenant: the
|
||||||
|
// downloader task will be waited on eventually, this cancel
|
||||||
|
// is just to encourage it to drop out if it is doing work
|
||||||
|
// for this tenant right now.
|
||||||
|
state.cancel.cancel();
|
||||||
|
|
||||||
|
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
|
||||||
}
|
}
|
||||||
TenantSlot::InProgress(notify) => {
|
TenantSlot::InProgress(notify) => {
|
||||||
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
|
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
|
||||||
@@ -845,12 +860,28 @@ impl TenantManager {
|
|||||||
Some(TenantSlot::InProgress(_)) => {
|
Some(TenantSlot::InProgress(_)) => {
|
||||||
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
|
||||||
}
|
}
|
||||||
None | Some(TenantSlot::Secondary) => {
|
None | Some(TenantSlot::Secondary(_)) => {
|
||||||
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn get_secondary_tenant_shard(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
) -> Option<Arc<SecondaryTenant>> {
|
||||||
|
let locked = self.tenants.read().unwrap();
|
||||||
|
|
||||||
|
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||||
|
.ok()
|
||||||
|
.flatten();
|
||||||
|
|
||||||
|
match peek_slot {
|
||||||
|
Some(TenantSlot::Secondary(s)) => Some(s.clone()),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||||
pub(crate) async fn upsert_location(
|
pub(crate) async fn upsert_location(
|
||||||
&self,
|
&self,
|
||||||
@@ -862,10 +893,15 @@ impl TenantManager {
|
|||||||
debug_assert_current_span_has_tenant_id();
|
debug_assert_current_span_has_tenant_id();
|
||||||
info!("configuring tenant location to state {new_location_config:?}");
|
info!("configuring tenant location to state {new_location_config:?}");
|
||||||
|
|
||||||
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
|
enum FastPathModified {
|
||||||
|
Attached(Arc<Tenant>),
|
||||||
|
Secondary(Arc<SecondaryTenant>),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
|
||||||
// then we do not need to set the slot to InProgress, we can just call into the
|
// then we do not need to set the slot to InProgress, we can just call into the
|
||||||
// existng tenant.
|
// existng tenant.
|
||||||
let modify_tenant = {
|
let fast_path_taken = {
|
||||||
let locked = self.tenants.read().unwrap();
|
let locked = self.tenants.read().unwrap();
|
||||||
let peek_slot =
|
let peek_slot =
|
||||||
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
|
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
|
||||||
@@ -879,12 +915,19 @@ impl TenantManager {
|
|||||||
new_location_config.clone(),
|
new_location_config.clone(),
|
||||||
)?);
|
)?);
|
||||||
|
|
||||||
Some(tenant.clone())
|
Some(FastPathModified::Attached(tenant.clone()))
|
||||||
} else {
|
} else {
|
||||||
// Different generations, fall through to general case
|
// Different generations, fall through to general case
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
(
|
||||||
|
LocationMode::Secondary(secondary_conf),
|
||||||
|
Some(TenantSlot::Secondary(secondary_tenant)),
|
||||||
|
) => {
|
||||||
|
secondary_tenant.set_config(secondary_conf);
|
||||||
|
Some(FastPathModified::Secondary(secondary_tenant.clone()))
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
// Not an Attached->Attached transition, fall through to general case
|
// Not an Attached->Attached transition, fall through to general case
|
||||||
None
|
None
|
||||||
@@ -893,34 +936,51 @@ impl TenantManager {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Fast-path continued: having dropped out of the self.tenants lock, do the async
|
// Fast-path continued: having dropped out of the self.tenants lock, do the async
|
||||||
// phase of waiting for flush, before returning.
|
// phase of writing config and/or waiting for flush, before returning.
|
||||||
if let Some(tenant) = modify_tenant {
|
match fast_path_taken {
|
||||||
// Transition to AttachedStale means we may well hold a valid generation
|
Some(FastPathModified::Attached(tenant)) => {
|
||||||
// still, and have been requested to go stale as part of a migration. If
|
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||||
// the caller set `flush`, then flush to remote storage.
|
.await
|
||||||
if let LocationMode::Attached(AttachedLocationConfig {
|
.map_err(SetNewTenantConfigError::Persist)?;
|
||||||
generation: _,
|
|
||||||
attach_mode: AttachmentMode::Stale,
|
// Transition to AttachedStale means we may well hold a valid generation
|
||||||
}) = &new_location_config.mode
|
// still, and have been requested to go stale as part of a migration. If
|
||||||
{
|
// the caller set `flush`, then flush to remote storage.
|
||||||
if let Some(flush_timeout) = flush {
|
if let LocationMode::Attached(AttachedLocationConfig {
|
||||||
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
|
generation: _,
|
||||||
Ok(Err(e)) => {
|
attach_mode: AttachmentMode::Stale,
|
||||||
return Err(e);
|
}) = &new_location_config.mode
|
||||||
}
|
{
|
||||||
Ok(Ok(_)) => return Ok(()),
|
if let Some(flush_timeout) = flush {
|
||||||
Err(_) => {
|
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
|
||||||
tracing::warn!(
|
Ok(Err(e)) => {
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
Ok(Ok(_)) => return Ok(()),
|
||||||
|
Err(_) => {
|
||||||
|
tracing::warn!(
|
||||||
timeout_ms = flush_timeout.as_millis(),
|
timeout_ms = flush_timeout.as_millis(),
|
||||||
"Timed out waiting for flush to remote storage, proceeding anyway."
|
"Timed out waiting for flush to remote storage, proceeding anyway."
|
||||||
)
|
)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
Some(FastPathModified::Secondary(_secondary_tenant)) => {
|
||||||
|
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
|
||||||
|
.await
|
||||||
|
.map_err(SetNewTenantConfigError::Persist)?;
|
||||||
|
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// Proceed with the general case procedure, where we will shutdown & remove any existing
|
||||||
|
// slot contents and replace with a fresh one
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
|
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
|
||||||
// InProgress value to the slot while we make whatever changes are required. The state for
|
// InProgress value to the slot while we make whatever changes are required. The state for
|
||||||
@@ -929,33 +989,47 @@ impl TenantManager {
         // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
         let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;

-        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
+        match slot_guard.get_old_value() {
+            Some(TenantSlot::Attached(tenant)) => {
                 // The case where we keep a Tenant alive was covered above in the special case
                 // for Attached->Attached transitions in the same generation. By this point,
                 // if we see an attached tenant we know it will be discarded and should be
                 // shut down.
                 let (_guard, progress) = utils::completion::channel();

                 match tenant.get_attach_mode() {
                     AttachmentMode::Single | AttachmentMode::Multi => {
                         // Before we leave our state as the presumed holder of the latest generation,
                         // flush any outstanding deletions to reduce the risk of leaking objects.
                         self.resources.deletion_queue_client.flush_advisory()
                     }
                     AttachmentMode::Stale => {
                         // If we're stale there's not point trying to flush deletions
                     }
                 };

                 info!("Shutting down attached tenant");
                 match tenant.shutdown(progress, false).await {
                     Ok(()) => {}
                     Err(barrier) => {
                         info!("Shutdown already in progress, waiting for it to complete");
                         barrier.wait().await;
                     }
                 }
-            slot_guard.drop_old_value().expect("We just shut it down");
+                slot_guard.drop_old_value().expect("We just shut it down");
+            }
+            Some(TenantSlot::Secondary(state)) => {
+                info!("Shutting down secondary tenant");
+                state.shutdown().await;
+            }
+            Some(TenantSlot::InProgress(_)) => {
+                // This should never happen: acquire_slot should error out
+                // if the contents of a slot were InProgress.
+                anyhow::bail!("Acquired an InProgress slot, this is a bug.")
+            }
+            None => {
+                // Slot was vacant, nothing needs shutting down.
+            }
         }

         let tenant_path = self.conf.tenant_path(&tenant_shard_id);
@@ -978,7 +1052,9 @@ impl TenantManager {
             .map_err(SetNewTenantConfigError::Persist)?;

         let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(_) => TenantSlot::Secondary,
+            LocationMode::Secondary(secondary_config) => {
+                TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
+            }
             LocationMode::Attached(_attach_config) => {
                 let shard_identity = new_location_config.shard;
                 let tenant = tenant_spawn(
@@ -1091,6 +1167,30 @@ impl TenantManager {
                 .collect(),
         }
     }

+    // Do some synchronous work for all tenant slots in Secondary state. The provided
+    // callback should be small and fast, as it will be called inside the global
+    // TenantsMap lock.
+    pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
+    where
+        // TODO: let the callback return a hint to drop out of the loop early
+        F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
+    {
+        let locked = self.tenants.read().unwrap();
+
+        let map = match &*locked {
+            TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
+            TenantsMap::Open(m) => m,
+        };
+
+        for (tenant_id, slot) in map {
+            if let TenantSlot::Secondary(state) = slot {
+                // Only expose secondary tenants that are not currently shutting down
+                if !state.cancel.is_cancelled() {
+                    func(tenant_id, state)
+                }
+            }
+        }
+    }
+
     pub(crate) async fn delete_tenant(
         &self,
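The callback-based accessor above deliberately keeps the TenantsMap lock scope small: callers copy out what they need and do the real work after the lock is released. A self-contained sketch of the same pattern, with hypothetical stand-ins for the pageserver's types:

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

// Illustrative stand-ins for TenantShardId / SecondaryTenant.
type ShardId = u32;
struct Secondary {
    name: &'static str,
}

struct Manager {
    tenants: Mutex<HashMap<ShardId, Arc<Secondary>>>,
}

impl Manager {
    /// Run a small, fast closure per entry while the lock is held.
    fn foreach_secondary<F: FnMut(&ShardId, &Arc<Secondary>)>(&self, mut func: F) {
        let locked = self.tenants.lock().unwrap();
        for (id, state) in locked.iter() {
            func(id, state);
        }
    }
}

fn main() {
    let mgr = Manager {
        tenants: Mutex::new(HashMap::from([(1, Arc::new(Secondary { name: "t1" }))])),
    };
    // Callers clone the Arcs out, then do slow work once the lock has dropped.
    let mut snapshot: Vec<Arc<Secondary>> = Vec::new();
    mgr.foreach_secondary(|_id, s| snapshot.push(s.clone()));
    assert_eq!(snapshot[0].name, "t1");
}
```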
@@ -1205,7 +1305,7 @@ pub(crate) fn get_tenant(
         Some(TenantSlot::InProgress(_)) => {
             Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
         }
-        None | Some(TenantSlot::Secondary) => {
+        None | Some(TenantSlot::Secondary(_)) => {
             Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
         }
     }
@@ -1257,9 +1357,11 @@ pub(crate) async fn get_active_tenant_with_timeout(
     let locked = TENANTS.read().unwrap();

     // Resolve TenantId to TenantShardId
-    let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-        GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-    )?;
+    let tenant_shard_id = locked
+        .resolve_attached_shard(&tenant_id, shard_selector)
+        .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+            tenant_id,
+        )))?;

     let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
         .map_err(GetTenantError::MapState)?;
@@ -1276,7 +1378,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                 }
             }
         }
-        Some(TenantSlot::Secondary) => {
+        Some(TenantSlot::Secondary(_)) => {
             return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
                 tenant_id,
             )))
@@ -1540,7 +1642,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
         Ok(m.iter()
             .filter_map(|(id, tenant)| match tenant {
                 TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-                TenantSlot::Secondary => None,
+                TenantSlot::Secondary(_) => None,
                 TenantSlot::InProgress(_) => None,
             })
             .collect())
@@ -1797,11 +1899,7 @@ impl SlotGuard {
     fn old_value_is_shutdown(&self) -> bool {
         match self.old_value.as_ref() {
             Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
-            Some(TenantSlot::Secondary) => {
-                // TODO: when adding secondary mode tenants, this will check for shutdown
-                // in the same way that we do for `Tenant` above
-                true
-            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
             Some(TenantSlot::InProgress(_)) => {
                 // A SlotGuard cannot be constructed for a slot that was already InProgress
                 unreachable!()
@@ -2011,26 +2109,19 @@ where
     let mut slot_guard =
         tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

-    // The SlotGuard allows us to manipulate the Tenant object without fear of some
-    // concurrent API request doing something else for the same tenant ID.
-    let attached_tenant = match slot_guard.get_old_value() {
-        Some(TenantSlot::Attached(t)) => Some(t),
-        _ => None,
-    };
-
     // allow pageserver shutdown to await for our completion
     let (_guard, progress) = completion::channel();

-    // If the tenant was attached, shut it down gracefully. For secondary
-    // locations this part is not necessary
-    match &attached_tenant {
-        Some(attached_tenant) => {
+    // The SlotGuard allows us to manipulate the Tenant object without fear of some
+    // concurrent API request doing something else for the same tenant ID.
+    let attached_tenant = match slot_guard.get_old_value() {
+        Some(TenantSlot::Attached(tenant)) => {
             // whenever we remove a tenant from memory, we don't want to flush and wait for upload
             let freeze_and_flush = false;

             // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
             // that we can continue safely to cleanup.
-            match attached_tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, freeze_and_flush).await {
                 Ok(()) => {}
                 Err(_other) => {
                     // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2039,11 +2130,19 @@ where
                     return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
                 }
             }
+            Some(tenant)
         }
-        None => {
-            // Nothing to wait on when not attached, proceed.
+        Some(TenantSlot::Secondary(secondary_state)) => {
+            tracing::info!("Shutting down in secondary mode");
+            secondary_state.shutdown().await;
+            None
         }
-    }
+        Some(TenantSlot::InProgress(_)) => {
+            // Acquiring a slot guarantees its old value was not InProgress
+            unreachable!();
+        }
+        None => None,
+    };

     match tenant_cleanup
         .await
pageserver/src/tenant/remote_timeline_client.rs
@@ -229,6 +229,7 @@ use crate::{
     tenant::upload_queue::{
         UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
     },
+    TENANT_HEATMAP_BASENAME,
 };

 use utils::id::{TenantId, TimelineId};
@@ -818,8 +819,25 @@ impl RemoteTimelineClient {
     fn schedule_deletion_of_unlinked0(
         self: &Arc<Self>,
         upload_queue: &mut UploadQueueInitialized,
-        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
     ) {
+        // Filter out any layers which were not created by this tenant shard. These are
+        // layers that originate from some ancestor shard after a split, and may still
+        // be referenced by other shards. We are free to delete them locally and remove
+        // them from our index (and would have already done so when we reach this point
+        // in the code), but we may not delete them remotely.
+        with_metadata.retain(|(name, meta)| {
+            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
+                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
+            if !retain {
+                tracing::debug!(
+                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
+                    meta.shard
+                );
+            }
+            retain
+        });
+
         for (name, meta) in &with_metadata {
             info!(
                 "scheduling deletion of layer {}{} (shard {})",
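The retain() above is the whole mechanism: a deletion stays in the batch only when the layer's shard identity matches this shard exactly, so layers inherited from an ancestor shard survive in remote storage. A simplified, standalone version of the same filter, with invented types for the example:

```rust
#[derive(Clone, Copy, Debug)]
struct ShardIdentity {
    shard_number: u8,
    shard_count: u8,
}

struct LayerMeta {
    shard: ShardIdentity,
}

/// Keep only deletions for layers this shard created itself; ancestor-shard
/// layers (from before a split) may still be referenced by sibling shards.
fn filter_deletions(me: ShardIdentity, mut batch: Vec<(String, LayerMeta)>) -> Vec<(String, LayerMeta)> {
    batch.retain(|(name, meta)| {
        let retain = meta.shard.shard_number == me.shard_number
            && meta.shard.shard_count == me.shard_count;
        if !retain {
            println!("skipping ancestor-shard layer {name} from {:?}", meta.shard);
        }
        retain
    });
    batch
}

fn main() {
    let me = ShardIdentity { shard_number: 1, shard_count: 4 };
    let batch = vec![
        ("mine".to_string(), LayerMeta { shard: me }),
        ("ancestor".to_string(), LayerMeta {
            shard: ShardIdentity { shard_number: 0, shard_count: 2 },
        }),
    ];
    let kept = filter_deletions(me, batch);
    assert_eq!(kept.len(), 1);
}
```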
@@ -1724,11 +1742,11 @@ pub fn remote_index_path(
         .expect("Failed to construct path")
 }

-pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
-        .expect("Failed to construct path")
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
+    ))
+    .expect("Failed to construct path")
 }

 /// Given the key of an index, parse out the generation part of the name
pageserver/src/tenant/secondary.rs
@@ -1,24 +1,48 @@
+mod downloader;
 pub mod heatmap;
 mod heatmap_uploader;
+mod scheduler;

 use std::sync::Arc;

 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};

-use self::heatmap_uploader::heatmap_uploader_task;
+use self::{
+    downloader::{downloader_task, SecondaryDetail},
+    heatmap_uploader::heatmap_uploader_task,
+};

-use super::mgr::TenantManager;
+use super::{config::SecondaryLocationConfig, mgr::TenantManager};

 use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;

 use tokio_util::sync::CancellationToken;
-use utils::completion::Barrier;
+use utils::{completion::Barrier, sync::gate::Gate};

+enum DownloadCommand {
+    Download(TenantShardId),
+}
 enum UploadCommand {
     Upload(TenantShardId),
 }

+impl UploadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Upload(id) => id,
+        }
+    }
+}
+
+impl DownloadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Download(id) => id,
+        }
+    }
+}
+
 struct CommandRequest<T> {
     payload: T,
     response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -28,12 +52,73 @@ struct CommandResponse {
     result: anyhow::Result<()>,
 }

+// Whereas [`Tenant`] represents an attached tenant, this type represents the work
+// we do for secondary tenant locations: where we are not serving clients or
+// ingesting WAL, but we are maintaining a warm cache of layer files.
+//
+// This type is all about the _download_ path for secondary mode. The upload path
+// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
+//
+// This structure coordinates TenantManager and SecondaryDownloader,
+// so that the downloader can indicate which tenants it is currently
+// operating on, and the manager can indicate when a particular
+// secondary tenant should cancel any work in flight.
+#[derive(Debug)]
+pub(crate) struct SecondaryTenant {
+    /// Carrying a tenant shard ID simplifies callers such as the downloader
+    /// which need to organize many of these objects by ID.
+    tenant_shard_id: TenantShardId,
+
+    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
+    /// any work for this tenant at the next opportunity.
+    pub(crate) cancel: CancellationToken,
+
+    pub(crate) gate: Gate,
+
+    detail: std::sync::Mutex<SecondaryDetail>,
+}
+
+impl SecondaryTenant {
+    pub(crate) fn new(
+        tenant_shard_id: TenantShardId,
+        config: &SecondaryLocationConfig,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            tenant_shard_id,
+            // todo: shall we make this a descendent of the
+            // main cancellation token, or is it sufficient that
+            // on shutdown we walk the tenants and fire their
+            // individual cancellations?
+            cancel: CancellationToken::new(),
+            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
+
+            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
+        })
+    }
+
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        // Wait for any secondary downloader work to complete
+        self.gate.close().await;
+    }
+
+    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
+        self.detail.lock().unwrap().config = config.clone();
+    }
+
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+}
+
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
 /// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
 /// where we want to immediately upload/download for a particular tenant. In normal operation
 /// uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
     upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
 }

 impl SecondaryController {
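SecondaryTenant::shutdown relies on a common token-plus-gate idiom: cancel first, then wait for in-flight workers, each of which holds a gate guard, to drain. A rough sketch of the idiom using tokio primitives; this shows the general pattern, not the pageserver's actual Gate implementation:

```rust
use std::sync::Arc;
use tokio::sync::RwLock;
use tokio_util::sync::CancellationToken;

struct Worker {
    cancel: CancellationToken,
    gate: Arc<RwLock<()>>, // held read = "I'm working"; taken write = "gate closed"
}

impl Worker {
    async fn run(&self) {
        // A worker holds a read guard for the duration of one unit of work,
        // and checks the token at each opportunity.
        let Ok(_guard) = self.gate.try_read() else { return };
        loop {
            if self.cancel.is_cancelled() {
                return;
            }
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        }
    }

    async fn shutdown(&self) {
        self.cancel.cancel(); // tell workers to stop at the next opportunity
        let _closed = self.gate.write().await; // wait until all read guards drop
    }
}

#[tokio::main]
async fn main() {
    let w = Arc::new(Worker {
        cancel: CancellationToken::new(),
        gate: Arc::new(RwLock::new(())),
    });
    let task = tokio::spawn({
        let w = w.clone();
        async move { w.run().await }
    });
    w.shutdown().await;
    task.await.unwrap();
}
```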
@@ -63,6 +148,13 @@ impl SecondaryController {
         self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
             .await
     }
+    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.dispatch(
+            &self.download_req_tx,
+            DownloadCommand::Download(tenant_shard_id),
+        )
+        .await
+    }
 }

 pub fn spawn_tasks(
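download_tenant follows the same pseudo-rpc shape as upload_tenant: send a command plus a oneshot response channel into the background task's queue, then await the reply. Sketched generically with assumed types, not the real CommandRequest:

```rust
use tokio::sync::{mpsc, oneshot};

struct Request {
    payload: u32, // stand-in for a DownloadCommand
    response_tx: oneshot::Sender<anyhow::Result<()>>,
}

async fn dispatch(tx: &mpsc::Sender<Request>, payload: u32) -> anyhow::Result<()> {
    let (response_tx, response_rx) = oneshot::channel();
    tx.send(Request { payload, response_tx })
        .await
        .map_err(|_| anyhow::anyhow!("task is shut down"))?;
    // The reply arrives only when the background task has finished the work.
    response_rx
        .await
        .map_err(|_| anyhow::anyhow!("task dropped the request"))?
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let (tx, mut rx) = mpsc::channel::<Request>(16);
    tokio::spawn(async move {
        while let Some(req) = rx.recv().await {
            let _ = req.response_tx.send(Ok(()));
        }
    });
    dispatch(&tx, 42).await
}
```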
@@ -71,9 +163,37 @@ pub fn spawn_tasks(
     background_jobs_can_start: Barrier,
     cancel: CancellationToken,
 ) -> SecondaryController {
+    let mgr_clone = tenant_manager.clone();
+    let storage_clone = remote_storage.clone();
+    let cancel_clone = cancel.clone();
+    let bg_jobs_clone = background_jobs_can_start.clone();
+
+    let (download_req_tx, download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
     let (upload_req_tx, upload_req_rx) =
         tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryDownloads,
+        None,
+        None,
+        "secondary tenant downloads",
+        false,
+        async move {
+            downloader_task(
+                mgr_clone,
+                storage_clone,
+                download_req_rx,
+                bg_jobs_clone,
+                cancel_clone,
+            )
+            .await;
+
+            Ok(())
+        },
+    );
+
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::SecondaryUploads,
@@ -89,16 +209,26 @@ pub fn spawn_tasks(
                 background_jobs_can_start,
                 cancel,
             )
-            .await
+            .await;
+
+            Ok(())
         },
     );

-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        download_req_tx,
+        upload_req_tx,
+    }
 }

 /// For running with remote storage disabled: a SecondaryController that is connected to nothing.
 pub fn null_controller() -> SecondaryController {
+    let (download_req_tx, _download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
     let (upload_req_tx, _upload_req_rx) =
         tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        upload_req_tx,
+        download_req_tx,
+    }
 }
pageserver/src/tenant/secondary/downloader.rs (new file, 801 lines)
@@ -0,0 +1,801 @@
use std::{
    collections::{HashMap, HashSet},
    pin::Pin,
    str::FromStr,
    sync::Arc,
    time::{Duration, Instant, SystemTime},
};

use crate::{
    config::PageServerConf,
    metrics::SECONDARY_MODE,
    tenant::{
        config::SecondaryLocationConfig,
        debug_assert_current_span_has_tenant_and_timeline_id,
        remote_timeline_client::{
            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
        },
        span::debug_assert_current_span_has_tenant_id,
        storage_layer::LayerFileName,
        tasks::{warn_when_period_overrun, BackgroundLoopKind},
    },
    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
};

use super::{
    heatmap::HeatMapLayer,
    scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
    SecondaryTenant,
};

use crate::tenant::{
    mgr::TenantManager,
    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
};

use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{DownloadError, GenericRemoteStorage};

use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, Instrument};
use utils::{
    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
};

use super::{
    heatmap::{HeatMapTenant, HeatMapTimeline},
    CommandRequest, DownloadCommand,
};

/// For each tenant, how long must have passed since the last download_tenant call before
/// calling it again. This is approximately the time by which local data is allowed
/// to fall behind remote data.
///
/// TODO: this should just be a default, and the actual period should be controlled
/// via the heatmap itself
/// `<https://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);

pub(super) async fn downloader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
) {
    let concurrency = tenant_manager.get_conf().secondary_download_concurrency;

    let generator = SecondaryDownloader {
        tenant_manager,
        remote_storage,
    };
    let mut scheduler = Scheduler::new(generator, concurrency);

    scheduler
        .run(command_queue, background_jobs_can_start, cancel)
        .instrument(info_span!("secondary_downloads"))
        .await
}

struct SecondaryDownloader {
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
}

#[derive(Debug, Clone)]
pub(super) struct OnDiskState {
    metadata: LayerFileMetadata,
    access_time: SystemTime,
}

impl OnDiskState {
    fn new(
        _conf: &'static PageServerConf,
        _tenant_shard_id: &TenantShardId,
        _timeline_id: &TimelineId,
        _name: LayerFileName,
        metadata: LayerFileMetadata,
        access_time: SystemTime,
    ) -> Self {
        Self {
            metadata,
            access_time,
        }
    }
}

#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,

    /// We remember when layers were evicted, to prevent re-downloading them.
    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
}

/// This state is written by the secondary downloader, it is opaque
/// to TenantManager
#[derive(Debug)]
pub(super) struct SecondaryDetail {
    pub(super) config: SecondaryLocationConfig,

    last_download: Option<Instant>,
    next_download: Option<Instant>,
    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}

/// Helper for logging SystemTime
fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
    let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
    datetime.format("%d/%m/%Y %T")
}

impl SecondaryDetail {
    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
        Self {
            config,
            last_download: None,
            next_download: None,
            timelines: HashMap::new(),
        }
    }
}

struct PendingDownload {
    secondary_state: Arc<SecondaryTenant>,
    last_download: Option<Instant>,
    target_time: Option<Instant>,
    period: Option<Duration>,
}

impl scheduler::PendingJob for PendingDownload {
    fn get_tenant_shard_id(&self) -> &TenantShardId {
        self.secondary_state.get_tenant_shard_id()
    }
}

struct RunningDownload {
    barrier: Barrier,
}

impl scheduler::RunningJob for RunningDownload {
    fn get_barrier(&self) -> Barrier {
        self.barrier.clone()
    }
}

struct CompleteDownload {
    secondary_state: Arc<SecondaryTenant>,
    completed_at: Instant,
}

impl scheduler::Completion for CompleteDownload {
    fn get_tenant_shard_id(&self) -> &TenantShardId {
        self.secondary_state.get_tenant_shard_id()
    }
}

type Scheduler = TenantBackgroundJobs<
    SecondaryDownloader,
    PendingDownload,
    RunningDownload,
    CompleteDownload,
    DownloadCommand,
>;

#[async_trait::async_trait]
impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
    for SecondaryDownloader
{
    #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
    fn on_completion(&mut self, completion: CompleteDownload) {
        let CompleteDownload {
            secondary_state,
            completed_at: _completed_at,
        } = completion;

        tracing::debug!("Secondary tenant download completed");

        // Update freshened_at even if there was an error: we don't want errored tenants to implicitly
        // take priority to run again.
        let mut detail = secondary_state.detail.lock().unwrap();
        detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
    }

    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
        let mut result = SchedulingResult {
            jobs: Vec::new(),
            want_interval: None,
        };

        // Step 1: identify some tenants that we may work on
        let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
        self.tenant_manager
            .foreach_secondary_tenants(|_id, secondary_state| {
                tenants.push(secondary_state.clone());
            });

        // Step 2: filter out tenants which are not yet elegible to run
        let now = Instant::now();
        result.jobs = tenants
            .into_iter()
            .filter_map(|secondary_tenant| {
                let (last_download, next_download) = {
                    let mut detail = secondary_tenant.detail.lock().unwrap();

                    if !detail.config.warm {
                        // Downloads are disabled for this tenant
                        detail.next_download = None;
                        return None;
                    }

                    if detail.next_download.is_none() {
                        // Initialize with a jitter: this spreads initial downloads on startup
                        // or mass-attach across our freshen interval.
                        let jittered_period =
                            rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
                        detail.next_download = Some(now.checked_add(jittered_period).expect(
                            "Using our constant, which is known to be small compared with clock range",
                        ));
                    }
                    (detail.last_download, detail.next_download.unwrap())
                };

                if now < next_download {
                    Some(PendingDownload {
                        secondary_state: secondary_tenant,
                        last_download,
                        target_time: Some(next_download),
                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
                    })
                } else {
                    None
                }
            })
            .collect();

        // Step 3: sort by target execution time to run most urgent first.
        result.jobs.sort_by_key(|j| j.target_time);

        result
    }

    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
        let tenant_shard_id = command.get_tenant_shard_id();

        let tenant = self
            .tenant_manager
            .get_secondary_tenant_shard(*tenant_shard_id);
        let Some(tenant) = tenant else {
            {
                return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
            }
        };

        Ok(PendingDownload {
            target_time: None,
            period: None,
            last_download: None,
            secondary_state: tenant,
        })
    }

    fn spawn(
        &mut self,
        job: PendingDownload,
    ) -> (
        RunningDownload,
        Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
    ) {
        let PendingDownload {
            secondary_state,
            last_download,
            target_time,
            period,
        } = job;

        let (completion, barrier) = utils::completion::channel();
        let remote_storage = self.remote_storage.clone();
        let conf = self.tenant_manager.get_conf();
        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
        (RunningDownload { barrier }, Box::pin(async move {
            let _completion = completion;

            match TenantDownloader::new(conf, &remote_storage, &secondary_state)
                .download()
                .await
            {
                Err(UpdateError::NoData) => {
                    tracing::info!("No heatmap found for tenant. This is fine if it is new.");
                },
                Err(UpdateError::NoSpace) => {
                    tracing::warn!("Insufficient space while downloading. Will retry later.");
                }
                Err(UpdateError::Cancelled) => {
                    tracing::debug!("Shut down while downloading");
                },
                Err(UpdateError::Deserialize(e)) => {
                    tracing::error!("Corrupt content while downloading tenant: {e}");
                },
                Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
                    tracing::error!("Error while downloading tenant: {e}");
                },
                Ok(()) => {}
            };

            // Irrespective of the result, we will reschedule ourselves to run after our usual period.

            // If the job had a target execution time, we may check our final execution
            // time against that for observability purposes.
            if let (Some(target_time), Some(period)) = (target_time, period) {
                // Only track execution lag if this isn't our first download: otherwise, it is expected
                // that execution will have taken longer than our configured interval, for example
                // when starting up a pageserver and
                if last_download.is_some() {
                    // Elapsed time includes any scheduling lag as well as the execution of the job
                    let elapsed = Instant::now().duration_since(target_time);

                    warn_when_period_overrun(
                        elapsed,
                        period,
                        BackgroundLoopKind::SecondaryDownload,
                    );
                }
            }

            CompleteDownload {
                secondary_state,
                completed_at: Instant::now(),
            }
        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
    }
}

/// This type is a convenience to group together the various functions involved in
/// freshening a secondary tenant.
struct TenantDownloader<'a> {
    conf: &'static PageServerConf,
    remote_storage: &'a GenericRemoteStorage,
    secondary_state: &'a SecondaryTenant,
}

/// Errors that may be encountered while updating a tenant
#[derive(thiserror::Error, Debug)]
enum UpdateError {
    #[error("No remote data found")]
    NoData,
    #[error("Insufficient local storage space")]
    NoSpace,
    #[error("Failed to download")]
    DownloadError(DownloadError),
    #[error(transparent)]
    Deserialize(#[from] serde_json::Error),
    #[error("Cancelled")]
    Cancelled,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

impl From<DownloadError> for UpdateError {
    fn from(value: DownloadError) -> Self {
        match &value {
            DownloadError::Cancelled => Self::Cancelled,
            DownloadError::NotFound => Self::NoData,
            _ => Self::DownloadError(value),
        }
    }
}

impl From<std::io::Error> for UpdateError {
    fn from(value: std::io::Error) -> Self {
        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
            UpdateError::NoSpace
        } else {
            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
            UpdateError::Other(anyhow::anyhow!(value))
        }
    }
}

impl<'a> TenantDownloader<'a> {
    fn new(
        conf: &'static PageServerConf,
        remote_storage: &'a GenericRemoteStorage,
        secondary_state: &'a SecondaryTenant,
    ) -> Self {
        Self {
            conf,
            remote_storage,
            secondary_state,
        }
    }

    async fn download(&self) -> Result<(), UpdateError> {
        debug_assert_current_span_has_tenant_id();

        // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
        // cover our access to local storage.
        let Ok(_guard) = self.secondary_state.gate.enter() else {
            // Shutting down
            return Ok(());
        };

        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
        // Download the tenant's heatmap
        let heatmap_bytes = tokio::select!(
            bytes = self.download_heatmap() => {bytes?},
            _ = self.secondary_state.cancel.cancelled() => return Ok(())
        );

        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;

        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
        // layer metadata without having to re-download it.
        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);

        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
        let heatmap_path_bg = heatmap_path.clone();
        tokio::task::spawn_blocking(move || {
            tokio::runtime::Handle::current().block_on(async move {
                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
            })
        })
        .await
        .expect("Blocking task is never aborted")
        .maybe_fatal_err(&context_msg)?;

        tracing::debug!("Wrote local heatmap to {}", heatmap_path);

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
            if self.secondary_state.cancel.is_cancelled() {
                return Ok(());
            }

            let timeline_id = timeline.timeline_id;
            self.download_timeline(timeline)
                .instrument(tracing::info_span!(
                    "secondary_download_timeline",
                    tenant_id=%tenant_shard_id.tenant_id,
                    shard_id=%tenant_shard_id.shard_slug(),
                    %timeline_id
                ))
                .await?;
        }

        Ok(())
    }

    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
        debug_assert_current_span_has_tenant_id();
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
        // TODO: make download conditional on ETag having changed since last download
        // (https://github.com/neondatabase/neon/issues/6199)
        tracing::debug!("Downloading heatmap for secondary tenant",);

        let heatmap_path = remote_heatmap_path(tenant_shard_id);

        let heatmap_bytes = backoff::retry(
            || async {
                let download = self
                    .remote_storage
                    .download(&heatmap_path)
                    .await
                    .map_err(UpdateError::from)?;
                let mut heatmap_bytes = Vec::new();
                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
                Ok(heatmap_bytes)
            },
            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "download heatmap",
            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
                UpdateError::Cancelled
            }),
        )
        .await?;

        SECONDARY_MODE.download_heatmap.inc();

        Ok(heatmap_bytes)
    }

    async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
        debug_assert_current_span_has_tenant_and_timeline_id();
        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
        let timeline_path = self
            .conf
            .timeline_path(tenant_shard_id, &timeline.timeline_id);

        // Accumulate updates to the state
        let mut touched = Vec::new();

        // Clone a view of what layers already exist on disk
        let timeline_state = self
            .secondary_state
            .detail
            .lock()
            .unwrap()
            .timelines
            .get(&timeline.timeline_id)
            .cloned();

        let timeline_state = match timeline_state {
            Some(t) => t,
            None => {
                // We have no existing state: need to scan local disk for layers first.
                let timeline_state =
                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;

                // Re-acquire detail lock now that we're done with async load from local FS
                self.secondary_state
                    .detail
                    .lock()
                    .unwrap()
                    .timelines
                    .insert(timeline.timeline_id, timeline_state.clone());
                timeline_state
            }
        };

        let layers_in_heatmap = timeline
            .layers
            .iter()
            .map(|l| &l.name)
            .collect::<HashSet<_>>();
        let layers_on_disk = timeline_state
            .on_disk_layers
            .iter()
            .map(|l| l.0)
            .collect::<HashSet<_>>();

        // Remove on-disk layers that are no longer present in heatmap
        for layer in layers_on_disk.difference(&layers_in_heatmap) {
            let local_path = timeline_path.join(layer.to_string());
            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
            tokio::fs::remove_file(&local_path)
                .await
                .or_else(fs_ext::ignore_not_found)
                .maybe_fatal_err("Removing secondary layer")?;
        }

        // Download heatmap layers that are not present on local disk, or update their
        // access time if they are already present.
        for layer in timeline.layers {
            if self.secondary_state.cancel.is_cancelled() {
                return Ok(());
            }

            // Existing on-disk layers: just update their access time.
            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
                tracing::debug!("Layer {} is already on disk", layer.name);
                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
                    || on_disk.access_time != layer.access_time
                {
                    // We already have this layer on disk. Update its access time.
                    tracing::debug!(
                        "Access time updated for layer {}: {} -> {}",
                        layer.name,
                        strftime(&on_disk.access_time),
                        strftime(&layer.access_time)
                    );
                    touched.push(layer);
                }
                continue;
            } else {
                tracing::debug!("Layer {} not present on disk yet", layer.name);
            }

            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
            // recently than it was evicted.
            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
                if &layer.access_time > evicted_at {
                    tracing::info!(
                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
                        layer.name,
                        strftime(&layer.access_time),
                        strftime(evicted_at)
                    );
                } else {
                    tracing::trace!(
                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
                        layer.name,
                        strftime(&layer.access_time),
                        strftime(evicted_at)
                    );
                    continue;
                }
            }

            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
            let downloaded_bytes = match download_layer_file(
                self.conf,
                self.remote_storage,
                *tenant_shard_id,
                timeline.timeline_id,
                &layer.name,
                &LayerFileMetadata::from(&layer.metadata),
                &self.secondary_state.cancel,
            )
            .await
            {
                Ok(bytes) => bytes,
                Err(e) => {
                    if let DownloadError::NotFound = e {
                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
                        // This is harmless: continue to download the next layer. It is expected during compaction
                        // GC.
                        tracing::debug!(
                            "Skipped downloading missing layer {}, raced with compaction/gc?",
                            layer.name
                        );
                        continue;
                    } else {
                        return Err(e.into());
                    }
                }
            };

            if downloaded_bytes != layer.metadata.file_size {
                let local_path = timeline_path.join(layer.name.to_string());

                tracing::warn!(
                    "Downloaded layer {} with unexpected size {} != {}. Removing download.",
                    layer.name,
                    downloaded_bytes,
                    layer.metadata.file_size
                );

                tokio::fs::remove_file(&local_path)
                    .await
                    .or_else(fs_ext::ignore_not_found)?;
            }

            SECONDARY_MODE.download_layer.inc();
            touched.push(layer)
        }

        // Write updates to state to record layers we just downloaded or touched.
        {
            let mut detail = self.secondary_state.detail.lock().unwrap();
            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();

            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());

            for t in touched {
                use std::collections::hash_map::Entry;
                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
                    Entry::Occupied(mut v) => {
                        v.get_mut().access_time = t.access_time;
                    }
                    Entry::Vacant(e) => {
                        e.insert(OnDiskState::new(
                            self.conf,
                            tenant_shard_id,
                            &timeline.timeline_id,
                            t.name,
                            LayerFileMetadata::from(&t.metadata),
                            t.access_time,
                        ));
                    }
                }
            }
        }

        Ok(())
    }
}

/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
async fn init_timeline_state(
    conf: &'static PageServerConf,
    tenant_shard_id: &TenantShardId,
    heatmap: &HeatMapTimeline,
) -> SecondaryDetailTimeline {
    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
    let mut detail = SecondaryDetailTimeline::default();

    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
        Ok(d) => d,
        Err(e) => {
            if e.kind() == std::io::ErrorKind::NotFound {
                let context = format!("Creating timeline directory {timeline_path}");
                tracing::info!("{}", context);
                tokio::fs::create_dir_all(&timeline_path)
                    .await
                    .fatal_err(&context);

                // No entries to report: drop out.
                return detail;
            } else {
                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
            }
        }
    };

    // As we iterate through layers found on disk, we will look up their metadata from this map.
    // Layers not present in metadata will be discarded.
    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
        heatmap.layers.iter().map(|l| (&l.name, l)).collect();

    while let Some(dentry) = dir
        .next_entry()
        .await
        .fatal_err(&format!("Listing {timeline_path}"))
    {
        let dentry_file_name = dentry.file_name();
        let file_name = dentry_file_name.to_string_lossy();
        let local_meta = dentry.metadata().await.fatal_err(&format!(
            "Read metadata on {}",
            dentry.path().to_string_lossy()
        ));

        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
        if file_name == METADATA_FILE_NAME {
            continue;
        }

        match LayerFileName::from_str(&file_name) {
            Ok(name) => {
                let remote_meta = heatmap_metadata.get(&name);
                match remote_meta {
                    Some(remote_meta) => {
                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
                        if local_meta.len() != remote_meta.metadata.file_size {
                            // This should not happen, because we do crashsafe write-then-rename when downloading
                            // layers, and layers in remote storage are immutable. Remove the local file because
                            // we cannot trust it.
                            tracing::warn!(
                                "Removing local layer {name} with unexpected local size {} != {}",
                                local_meta.len(),
                                remote_meta.metadata.file_size
                            );
                        } else {
                            // We expect the access time to be initialized immediately afterwards, when
                            // the latest heatmap is applied to the state.
                            detail.on_disk_layers.insert(
                                name.clone(),
                                OnDiskState::new(
                                    conf,
                                    tenant_shard_id,
                                    &heatmap.timeline_id,
                                    name,
                                    LayerFileMetadata::from(&remote_meta.metadata),
                                    remote_meta.access_time,
                                ),
                            );
                        }
                    }
                    None => {
                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
                        tracing::info!(
                            "Removing secondary local layer {} because it's absent in heatmap",
                            name
                        );
                        tokio::fs::remove_file(&dentry.path())
                            .await
                            .or_else(fs_ext::ignore_not_found)
                            .fatal_err(&format!(
                                "Removing layer {}",
                                dentry.path().to_string_lossy()
                            ));
                    }
                }
            }
            Err(_) => {
                // Ignore it.
                tracing::warn!("Unexpected file in timeline directory: {file_name}");
            }
        }
    }

    detail
}
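One detail worth pulling out of the downloader: first-time downloads are spread with random jitter so that a pageserver restart or mass attach does not fire every tenant's download at the same instant. The core computation in isolation, with the constant name borrowed from the file above:

```rust
use rand::Rng;
use std::time::{Duration, Instant};

const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);

/// Pick a first-run deadline uniformly within one freshen interval from now.
fn jittered_first_download(now: Instant) -> Instant {
    let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
    now.checked_add(jittered_period)
        .expect("interval is small compared with the clock range")
}

fn main() {
    let now = Instant::now();
    let first = jittered_first_download(now);
    assert!(first >= now && first <= now + DOWNLOAD_FRESHEN_INTERVAL);
}
```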
pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -1,5 +1,6 @@
 use std::{
     collections::HashMap,
+    pin::Pin,
     sync::{Arc, Weak},
     time::{Duration, Instant},
 };
@@ -7,35 +8,86 @@ use std::{
 use crate::{
     metrics::SECONDARY_MODE,
     tenant::{
-        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
-        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
+        config::AttachmentMode,
+        mgr::TenantManager,
+        remote_timeline_client::remote_heatmap_path,
+        span::debug_assert_current_span_has_tenant_id,
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+        Tenant,
     },
 };

+use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
+use rand::Rng;
 use remote_storage::GenericRemoteStorage;

-use tokio::task::JoinSet;
+use super::{
+    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
+    CommandRequest,
+};
 use tokio_util::sync::CancellationToken;
-use tracing::instrument;
-use utils::{backoff, completion::Barrier};
+use tracing::{info_span, instrument, Instrument};
+use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};

-use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
+use super::{heatmap::HeatMapTenant, UploadCommand};

-/// Period between heatmap uploader walking Tenants to look for work to do.
-/// If any tenants have a heatmap upload period lower than this, it will be adjusted
-/// downward to match.
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) {
+    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let generator = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tenants: HashMap::new(),
+    };
+    let mut scheduler = Scheduler::new(generator, concurrency);
+
+    scheduler
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("heatmap_uploader"))
+        .await
+}
+
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+}
+
 struct WriteInProgress {
     barrier: Barrier,
 }
+
+impl RunningJob for WriteInProgress {
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
 struct UploadPending {
     tenant: Arc<Tenant>,
     last_digest: Option<md5::Digest>,
+    target_time: Option<Instant>,
+    period: Option<Duration>,
+}
+
+impl scheduler::PendingJob for UploadPending {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.tenant.get_tenant_shard_id()
+    }
 }

 struct WriteComplete {
@@ -45,6 +97,12 @@ struct WriteComplete {
|
|||||||
next_upload: Option<Instant>,
|
next_upload: Option<Instant>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl scheduler::Completion for WriteComplete {
|
||||||
|
fn get_tenant_shard_id(&self) -> &TenantShardId {
|
||||||
|
&self.tenant_shard_id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
|
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
|
||||||
/// when we last did a write. We only populate this after doing at least one
|
/// when we last did a write. We only populate this after doing at least one
|
||||||
/// write for a tenant -- this avoids holding state for tenants that have
|
/// write for a tenant -- this avoids holding state for tenants that have
|
||||||
@@ -68,267 +126,111 @@ struct UploaderTenantState {
|
|||||||
next_upload: Option<Instant>,
|
next_upload: Option<Instant>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
|
type Scheduler = TenantBackgroundJobs<
|
||||||
/// handling loop and mutates it as needed: there are no locks here, because that event loop
|
HeatmapUploader,
|
||||||
/// can hold &mut references to this type throughout.
|
UploadPending,
|
||||||
struct HeatmapUploader {
|
WriteInProgress,
|
||||||
tenant_manager: Arc<TenantManager>,
|
WriteComplete,
|
||||||
remote_storage: GenericRemoteStorage,
|
UploadCommand,
|
||||||
cancel: CancellationToken,
|
>;
|
||||||
|
|
||||||
tenants: HashMap<TenantShardId, UploaderTenantState>,
|
#[async_trait::async_trait]
|
||||||
|
impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
|
||||||
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
|
for HeatmapUploader
|
||||||
/// limits permit it.
|
{
|
||||||
tenants_pending: std::collections::VecDeque<UploadPending>,
|
async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
|
||||||
|
|
||||||
/// Tenants for which a task in `tasks` has been spawned.
|
|
||||||
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
|
|
||||||
|
|
||||||
tasks: JoinSet<()>,
|
|
||||||
|
|
||||||
/// Channel for our child tasks to send results to: we use a channel for results rather than
|
|
||||||
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
|
|
||||||
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
|
|
||||||
/// behavior.
|
|
||||||
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
|
|
||||||
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
|
|
||||||
|
|
||||||
concurrent_uploads: usize,
|
|
||||||
|
|
||||||
scheduling_interval: Duration,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
|
|
||||||
/// tenants that require an upload, or handles any commands that have been sent into
|
|
||||||
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
|
|
||||||
/// spawn.
|
|
||||||
///
|
|
||||||
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
|
|
||||||
/// all tenants that require an upload, and in between scheduling iterations we will
|
|
||||||
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
|
|
||||||
///
|
|
||||||
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
|
|
||||||
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
|
|
||||||
/// we might block waiting on a Tenant.
|
|
||||||
pub(super) async fn heatmap_uploader_task(
|
|
||||||
tenant_manager: Arc<TenantManager>,
|
|
||||||
remote_storage: GenericRemoteStorage,
|
|
||||||
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
|
|
||||||
background_jobs_can_start: Barrier,
|
|
||||||
cancel: CancellationToken,
|
|
||||||
) -> anyhow::Result<()> {
|
|
||||||
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
|
|
||||||
|
|
||||||
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
|
|
||||||
|
|
||||||
let mut uploader = HeatmapUploader {
|
|
||||||
tenant_manager,
|
|
||||||
remote_storage,
|
|
||||||
cancel: cancel.clone(),
|
|
||||||
tasks: JoinSet::new(),
|
|
||||||
tenants: HashMap::new(),
|
|
||||||
tenants_pending: std::collections::VecDeque::new(),
|
|
||||||
tenants_uploading: HashMap::new(),
|
|
||||||
task_result_tx: result_tx,
|
|
||||||
task_result_rx: result_rx,
|
|
||||||
concurrent_uploads,
|
|
||||||
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
|
|
||||||
};
|
|
||||||
|
|
||||||
tracing::info!("Waiting for background_jobs_can start...");
|
|
||||||
background_jobs_can_start.wait().await;
|
|
||||||
tracing::info!("background_jobs_can is ready, proceeding.");
|
|
||||||
|
|
||||||
while !cancel.is_cancelled() {
|
|
||||||
// Look for new work: this is relatively expensive because we have to go acquire the lock on
|
|
||||||
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
|
|
||||||
// require an upload.
|
|
||||||
uploader.schedule_iteration().await?;
|
|
||||||
|
|
||||||
// Between scheduling iterations, we will:
|
|
||||||
// - Drain any complete tasks and spawn pending tasks
|
|
||||||
// - Handle incoming administrative commands
|
|
||||||
// - Check our cancellation token
|
|
||||||
let next_scheduling_iteration = Instant::now()
|
|
||||||
.checked_add(uploader.scheduling_interval)
|
|
||||||
.unwrap_or_else(|| {
|
|
||||||
tracing::warn!(
|
|
||||||
"Scheduling interval invalid ({}s), running immediately!",
|
|
||||||
uploader.scheduling_interval.as_secs_f64()
|
|
||||||
);
|
|
||||||
Instant::now()
|
|
||||||
});
|
|
||||||
loop {
|
|
||||||
tokio::select! {
|
|
||||||
_ = cancel.cancelled() => {
|
|
||||||
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
|
|
||||||
tracing::info!("Heatmap uploader joining tasks");
|
|
||||||
while let Some(_r) = uploader.tasks.join_next().await {};
|
|
||||||
tracing::info!("Heatmap uploader terminating");
|
|
||||||
|
|
||||||
break;
|
|
||||||
},
|
|
||||||
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
|
|
||||||
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
|
|
||||||
break;},
|
|
||||||
cmd = command_queue.recv() => {
|
|
||||||
tracing::debug!("heatmap_uploader_task: woke for command queue");
|
|
||||||
let cmd = match cmd {
|
|
||||||
Some(c) =>c,
|
|
||||||
None => {
|
|
||||||
// SecondaryController was destroyed, and this has raced with
|
|
||||||
// our CancellationToken
|
|
||||||
tracing::info!("Heatmap uploader terminating");
|
|
||||||
cancel.cancel();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let CommandRequest{
|
|
||||||
response_tx,
|
|
||||||
payload
|
|
||||||
} = cmd;
|
|
||||||
uploader.handle_command(payload, response_tx);
|
|
||||||
},
|
|
||||||
_ = uploader.process_next_completion() => {
|
|
||||||
if !cancel.is_cancelled() {
|
|
||||||
uploader.spawn_pending();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
impl HeatmapUploader {
|
|
||||||
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
|
|
||||||
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
|
|
||||||
// Cull any entries in self.tenants whose Arc<Tenant> is gone
|
// Cull any entries in self.tenants whose Arc<Tenant> is gone
|
||||||
self.tenants
|
self.tenants
|
||||||
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
|
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
|
||||||
|
|
||||||
// The priority order of previously scheduled work may be invalidated by current state: drop
|
|
||||||
// all pending work (it will be re-scheduled if still needed)
|
|
||||||
self.tenants_pending.clear();
|
|
||||||
|
|
||||||
// Used a fixed 'now' through the following loop, for efficiency and fairness.
|
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
|
||||||
// While iterating over the potentially-long list of tenants, we will periodically yield
|
let mut result = SchedulingResult {
|
||||||
// to avoid blocking executor.
|
jobs: Vec::new(),
|
||||||
const YIELD_ITERATIONS: usize = 1000;
|
want_interval: None,
|
||||||
|
};
|
||||||
|
|
||||||
// Iterate over tenants looking for work to do.
|
|
||||||
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
|
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
|
||||||
for (i, tenant) in tenants.into_iter().enumerate() {
|
|
||||||
// Process is shutting down, drop out
|
|
||||||
if self.cancel.is_cancelled() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Skip tenants that already have a write in flight
|
yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
|
||||||
if self
|
let period = match tenant.get_heatmap_period() {
|
||||||
.tenants_uploading
|
None => {
|
||||||
.contains_key(tenant.get_tenant_shard_id())
|
// Heatmaps are disabled for this tenant
|
||||||
{
|
return;
|
||||||
continue;
|
}
|
||||||
}
|
Some(period) => {
|
||||||
|
// If any tenant has asked for uploads more frequent than our scheduling interval,
|
||||||
|
// reduce it to match so that we can keep up. This is mainly useful in testing, where
|
||||||
|
// we may set rather short intervals.
|
||||||
|
result.want_interval = match result.want_interval {
|
||||||
|
None => Some(period),
|
||||||
|
Some(existing) => Some(std::cmp::min(period, existing)),
|
||||||
|
};
|
||||||
|
|
||||||
self.maybe_schedule_upload(&now, tenant);
|
period
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
if i + 1 % YIELD_ITERATIONS == 0 {
|
// Stale attachments do not upload anything: if we are in this state, there is probably some
|
||||||
tokio::task::yield_now().await;
|
// other attachment in mode Single or Multi running on another pageserver, and we don't
|
||||||
}
|
// want to thrash and overwrite their heatmap uploads.
|
||||||
}
|
if tenant.get_attach_mode() == AttachmentMode::Stale {
|
||||||
|
|
||||||
// Spawn tasks for as many of our pending tenants as we can.
|
|
||||||
self.spawn_pending();
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
/// Cancellation: this method is cancel-safe.
|
|
||||||
async fn process_next_completion(&mut self) {
|
|
||||||
match self.task_result_rx.recv().await {
|
|
||||||
Some(r) => {
|
|
||||||
self.on_completion(r);
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
unreachable!("Result sender is stored on Self");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The 'maybe' refers to the tenant's state: whether it is configured
|
|
||||||
/// for heatmap uploads at all, and whether sufficient time has passed
|
|
||||||
/// since the last upload.
|
|
||||||
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
|
|
||||||
match tenant.get_heatmap_period() {
|
|
||||||
None => {
|
|
||||||
// Heatmaps are disabled for this tenant
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Some(period) => {
|
|
||||||
// If any tenant has asked for uploads more frequent than our scheduling interval,
|
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
|
||||||
// reduce it to match so that we can keep up. This is mainly useful in testing, where
|
// with the completion time in on_completion.
|
||||||
// we may set rather short intervals.
|
let state = self
|
||||||
if period < self.scheduling_interval {
|
.tenants
|
||||||
self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
|
.entry(*tenant.get_tenant_shard_id())
|
||||||
}
|
.or_insert_with(|| {
|
||||||
|
let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
|
||||||
|
|
||||||
|
UploaderTenantState {
|
||||||
|
tenant: Arc::downgrade(&tenant),
|
||||||
|
last_upload: None,
|
||||||
|
next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
|
||||||
|
last_digest: None,
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Decline to do the upload if insufficient time has passed
|
||||||
|
if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Stale attachments do not upload anything: if we are in this state, there is probably some
|
let last_digest = state.last_digest;
|
||||||
// other attachment in mode Single or Multi running on another pageserver, and we don't
|
result.jobs.push(UploadPending {
|
||||||
// want to thrash and overwrite their heatmap uploads.
|
tenant,
|
||||||
if tenant.get_attach_mode() == AttachmentMode::Stale {
|
last_digest,
|
||||||
return;
|
target_time: state.next_upload,
|
||||||
}
|
period: Some(period),
|
||||||
|
|
||||||
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
|
|
||||||
// with the completion time in on_completion.
|
|
||||||
let state = self
|
|
||||||
.tenants
|
|
||||||
.entry(*tenant.get_tenant_shard_id())
|
|
||||||
.or_insert_with(|| UploaderTenantState {
|
|
||||||
tenant: Arc::downgrade(&tenant),
|
|
||||||
last_upload: None,
|
|
||||||
next_upload: Some(Instant::now()),
|
|
||||||
last_digest: None,
|
|
||||||
});
|
});
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
.ok();
|
||||||
|
|
||||||
// Decline to do the upload if insufficient time has passed
|
result
|
||||||
if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
|
}
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
let last_digest = state.last_digest;
|
fn spawn(
|
||||||
self.tenants_pending.push_back(UploadPending {
|
&mut self,
|
||||||
|
job: UploadPending,
|
||||||
|
) -> (
|
||||||
|
WriteInProgress,
|
||||||
|
Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
|
||||||
|
) {
|
||||||
|
let UploadPending {
|
||||||
tenant,
|
tenant,
|
||||||
last_digest,
|
last_digest,
|
||||||
})
|
target_time,
|
||||||
}
|
period,
|
||||||
|
} = job;
|
||||||
|
|
||||||
fn spawn_pending(&mut self) {
|
|
||||||
while !self.tenants_pending.is_empty()
|
|
||||||
&& self.tenants_uploading.len() < self.concurrent_uploads
|
|
||||||
{
|
|
||||||
// unwrap: loop condition includes !is_empty()
|
|
||||||
let pending = self.tenants_pending.pop_front().unwrap();
|
|
||||||
self.spawn_upload(pending.tenant, pending.last_digest);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
|
|
||||||
let remote_storage = self.remote_storage.clone();
|
let remote_storage = self.remote_storage.clone();
|
||||||
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
|
||||||
let (completion, barrier) = utils::completion::channel();
|
let (completion, barrier) = utils::completion::channel();
|
||||||
let result_tx = self.task_result_tx.clone();
|
let tenant_shard_id = *tenant.get_tenant_shard_id();
|
||||||
self.tasks.spawn(async move {
|
(WriteInProgress { barrier }, Box::pin(async move {
|
||||||
// Guard for the barrier in [`WriteInProgress`]
|
// Guard for the barrier in [`WriteInProgress`]
|
||||||
let _completion = completion;
|
let _completion = completion;
|
||||||
|
|
||||||
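Note on the scheduling hunk above: a tenant's first upload is now scheduled at a random offset within one heatmap period rather than immediately. A minimal sketch of the jitter calculation, using the same rand::Rng call as the diff (the helper function name here is invented for illustration):

    use rand::Rng;
    use std::time::{Duration, Instant};

    fn first_upload_at(now: Instant, period: Duration) -> Instant {
        // Spread initial uploads uniformly across one period so that many
        // tenants attached at the same time do not all upload at once.
        // As in the diff, this assumes a non-zero period.
        let jitter = rand::thread_rng().gen_range(Duration::ZERO..period);
        now.checked_add(jitter).unwrap_or(now)
    }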
@@ -362,22 +264,47 @@ impl HeatmapUploader {
         };

         let now = Instant::now();

+        // If the job had a target execution time, we may check our final execution
+        // time against that for observability purposes.
+        if let (Some(target_time), Some(period)) = (target_time, period) {
+            // Elapsed time includes any scheduling lag as well as the execution of the job
+            let elapsed = now.duration_since(target_time);
+
+            warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
+        }
+
         let next_upload = tenant
             .get_heatmap_period()
             .and_then(|period| now.checked_add(period));

-        result_tx
-            .send(WriteComplete {
+        WriteComplete {
             tenant_shard_id: *tenant.get_tenant_shard_id(),
             completed_at: now,
             digest,
             next_upload,
-            })
-            .ok();
-        });
-
-        self.tenants_uploading
-            .insert(tenant_shard_id, WriteInProgress { barrier });
+        }
+        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
+    }
+
+    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
+        let tenant_shard_id = command.get_tenant_shard_id();
+
+        tracing::info!(
+            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+            "Starting heatmap write on command");
+        let tenant = self
+            .tenant_manager
+            .get_attached_tenant_shard(*tenant_shard_id, true)
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        Ok(UploadPending {
+            // Ignore our state for last digest: this forces an upload even if nothing has changed
+            last_digest: None,
+            tenant,
+            target_time: None,
+            period: None,
+        })
     }

 #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -389,7 +316,6 @@ impl HeatmapUploader {
             digest,
             next_upload,
         } = completion;
-        self.tenants_uploading.remove(&tenant_shard_id);
         use std::collections::hash_map::Entry;
         match self.tenants.entry(tenant_shard_id) {
             Entry::Vacant(_) => {
@@ -402,69 +328,6 @@ impl HeatmapUploader {
             }
         }
     }
-
-    fn handle_command(
-        &mut self,
-        command: UploadCommand,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        match command {
-            UploadCommand::Upload(tenant_shard_id) => {
-                // If an upload was ongoing for this tenant, let it finish first.
-                let barrier = if let Some(writing_state) =
-                    self.tenants_uploading.get(&tenant_shard_id)
-                {
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap write to complete");
-                    writing_state.barrier.clone()
-                } else {
-                    // Spawn the upload then immediately wait for it. This will block processing of other commands and
-                    // starting of other background work.
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Starting heatmap write on command");
-                    let tenant = match self
-                        .tenant_manager
-                        .get_attached_tenant_shard(tenant_shard_id, true)
-                    {
-                        Ok(t) => t,
-                        Err(e) => {
-                            // Drop result of send: we don't care if caller dropped their receiver
-                            drop(response_tx.send(CommandResponse {
-                                result: Err(e.into()),
-                            }));
-                            return;
-                        }
-                    };
-                    self.spawn_upload(tenant, None);
-                    let writing_state = self
-                        .tenants_uploading
-                        .get(&tenant_shard_id)
-                        .expect("We just inserted this");
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap upload to complete");
-
-                    writing_state.barrier.clone()
-                };
-
-                // This task does no I/O: it only listens for a barrier's completion and then
-                // sends to the command response channel. It is therefore safe to spawn this without
-                // any gates/task_mgr hooks.
-                tokio::task::spawn(async move {
-                    barrier.wait().await;
-
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Heatmap upload complete");
-
-                    // Drop result of send: we don't care if caller dropped their receiver
-                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
-                });
-            }
-        }
-    }
 }

 enum UploadHeatmapOutcome {
@@ -487,7 +350,6 @@ enum UploadHeatmapError {

 /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
-#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
 async fn upload_tenant_heatmap(
     remote_storage: GenericRemoteStorage,
     tenant: &Arc<Tenant>,
pageserver/src/tenant/secondary/scheduler.rs — new file (361 additions)
@@ -0,0 +1,361 @@
+use async_trait;
+use futures::Future;
+use std::{
+    collections::HashMap,
+    marker::PhantomData,
+    pin::Pin,
+    time::{Duration, Instant},
+};
+
+use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use utils::{completion::Barrier, yielding_loop::yielding_loop};
+
+use super::{CommandRequest, CommandResponse};
+
+/// Scheduling interval is the time between calls to JobGenerator::schedule.
+/// When we schedule jobs, the job generator may provide a hint of its preferred
+/// interval, which we will respect within these intervals.
+const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
+
+/// Scheduling helper for background work across many tenants.
+///
+/// Systems that need to run background work across many tenants may use this type
+/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
+/// implementation to provide the work to execute. This is a simple scheduler that just
+/// polls the generator for outstanding work, replacing its queue of pending work with
+/// what the generator yields on each call: the job generator can change its mind about
+/// the order of jobs between calls. The job generator is notified when jobs complete,
+/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
+/// admin APIs).
+///
+/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
+///
+/// G: A JobGenerator that this scheduler will poll to find pending jobs
+/// PJ: 'Pending Job': type for job descriptors that are ready to run
+/// RJ: 'Running Job' type' for jobs that have been spawned
+/// C : 'Completion' type that spawned jobs will send when they finish
+/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
+pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
+where
+    G: JobGenerator<PJ, RJ, C, CMD>,
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+{
+    generator: G,
+
+    /// Ready to run. Will progress to `running` once concurrent limit is satisfied, or
+    /// be removed on next scheduling pass.
+    pending: std::collections::VecDeque<PJ>,
+
+    /// Tasks currently running in Self::tasks for these tenants. Check this map
+    /// before pushing more work into pending for the same tenant.
+    running: HashMap<TenantShardId, RJ>,
+
+    tasks: JoinSet<C>,
+
+    concurrency: usize,
+
+    /// How often we would like schedule_interval to be called.
+    pub(super) scheduling_interval: Duration,
+
+    _phantom: PhantomData<(PJ, RJ, C, CMD)>,
+}
+
+#[async_trait::async_trait]
+pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
+where
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+{
+    /// Called at each scheduling interval. Return a list of jobs to run, most urgent first.
+    ///
+    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
+    /// Implementations should take care to yield the executor periodically if running
+    /// very long loops.
+    ///
+    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
+    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
+    /// and re-generated.
+    async fn schedule(&mut self) -> SchedulingResult<PJ>;
+
+    /// Called when a pending job is ready to be run.
+    ///
+    /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it.
+    fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
+
+    /// Called when a job previously spawned with spawn() transmits its completion
+    fn on_completion(&mut self, completion: C);
+
+    /// Called when a command is received. A job will be spawned immediately if the return
+    /// value is Some, ignoring concurrency limits and the pending queue.
+    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
+}
+
+/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
+pub(super) struct SchedulingResult<PJ> {
+    pub(super) jobs: Vec<PJ>,
+    /// The job generator would like to be called again this soon
+    pub(super) want_interval: Option<Duration>,
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait PendingJob {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait Completion: Send + 'static {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait RunningJob {
+    fn get_barrier(&self) -> Barrier;
+}
+
+impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
+where
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+    G: JobGenerator<PJ, RJ, C, CMD>,
+{
+    pub(super) fn new(generator: G, concurrency: usize) -> Self {
+        Self {
+            generator,
+            pending: std::collections::VecDeque::new(),
+            running: HashMap::new(),
+            tasks: JoinSet::new(),
+            concurrency,
+            scheduling_interval: MAX_SCHEDULING_INTERVAL,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub(super) async fn run(
+        &mut self,
+        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
+        background_jobs_can_start: Barrier,
+        cancel: CancellationToken,
+    ) {
+        tracing::info!("Waiting for background_jobs_can start...");
+        background_jobs_can_start.wait().await;
+        tracing::info!("background_jobs_can is ready, proceeding.");
+
+        while !cancel.is_cancelled() {
+            // Look for new work: this is relatively expensive because we have to go acquire the lock on
+            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+            // require an upload.
+            self.schedule_iteration(&cancel).await;
+
+            if cancel.is_cancelled() {
+                return;
+            }
+
+            // Schedule some work, if concurrency limit permits it
+            self.spawn_pending();
+
+            // Between scheduling iterations, we will:
+            // - Drain any complete tasks and spawn pending tasks
+            // - Handle incoming administrative commands
+            // - Check our cancellation token
+            let next_scheduling_iteration = Instant::now()
+                .checked_add(self.scheduling_interval)
+                .unwrap_or_else(|| {
+                    tracing::warn!(
+                        "Scheduling interval invalid ({}s)",
+                        self.scheduling_interval.as_secs_f64()
+                    );
+                    // unwrap(): this constant is small, cannot fail to add to time unless
+                    // we are close to the end of the universe.
+                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
+                });
+            loop {
+                tokio::select! {
+                    _ = cancel.cancelled() => {
+                        tracing::info!("joining tasks");
+                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                        // It is the callers responsibility to make sure that the tasks they scheduled
+                        // respect an appropriate cancellation token, to shut down promptly.  It is only
+                        // safe to wait on joining these tasks because we can see the cancellation token
+                        // has been set.
+                        while let Some(_r) = self.tasks.join_next().await {}
+                        tracing::info!("terminating on cancellation token.");
+
+                        break;
+                    },
+                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                        tracing::debug!("woke for scheduling interval");
+                        break;},
+                    cmd = command_queue.recv() => {
+                        tracing::debug!("woke for command queue");
+                        let cmd = match cmd {
+                            Some(c) =>c,
+                            None => {
+                                // SecondaryController was destroyed, and this has raced with
+                                // our CancellationToken
+                                tracing::info!("terminating on command queue destruction");
+                                cancel.cancel();
+                                break;
+                            }
+                        };
+
+                        let CommandRequest{
+                            response_tx,
+                            payload
+                        } = cmd;
+                        self.handle_command(payload, response_tx);
+                    },
+                    _ = async {
+                        let completion = self.process_next_completion().await;
+                        match completion {
+                            Some(c) => {
+                                self.generator.on_completion(c);
+                                if !cancel.is_cancelled() {
+                                    self.spawn_pending();
+                                }
+                            },
+                            None => {
+                                // Nothing is running, so just wait: expect that this future
+                                // will be dropped when something in the outer select! fires.
+                                cancel.cancelled().await;
+                            }
+                        }
+
+                    } => {}
+                }
+            }
+        }
+    }
+
+    fn do_spawn(&mut self, job: PJ) {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        let (in_progress, fut) = self.generator.spawn(job);
+
+        self.tasks.spawn(fut);
+
+        self.running.insert(tenant_shard_id, in_progress);
+    }
+
+    /// For all pending tenants that are elegible for execution, spawn their task.
+    ///
+    /// Caller provides the spawn operation, we track the resulting execution.
+    fn spawn_pending(&mut self) {
+        while !self.pending.is_empty() && self.running.len() < self.concurrency {
+            // unwrap: loop condition includes !is_empty()
+            let pending = self.pending.pop_front().unwrap();
+            self.do_spawn(pending);
+        }
+    }
+
+    /// For administrative commands: skip the pending queue, ignore concurrency limits
+    fn spawn_now(&mut self, job: PJ) -> &RJ {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        self.do_spawn(job);
+        self.running
+            .get(&tenant_shard_id)
+            .expect("We just inserted this")
+    }
+
+    /// Wait until the next task completes, and handle its completion
+    ///
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) -> Option<C> {
+        match self.tasks.join_next().await {
+            Some(r) => {
+                // We use a channel to drive completions, but also
+                // need to drain the JoinSet to avoid completed tasks
+                // accumulating. These calls are 1:1 because every task
+                // we spawn into this joinset submits is result to the channel.
+                let completion = r.expect("Panic in background task");
+
+                self.running.remove(completion.get_tenant_shard_id());
+                Some(completion)
+            }
+            None => {
+                // Nothing is running, so we have nothing to wait for. We may drop out: the
+                // main even loop will call us again after the next time it has run something.
+                None
+            }
+        }
+    }
+
+    /// Convert the command into a pending job, spawn it, and when the spawned
+    /// job completes, send the result down `response_tx`.
+    fn handle_command(
+        &mut self,
+        cmd: CMD,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        let job = match self.generator.on_command(cmd) {
+            Ok(j) => j,
+            Err(e) => {
+                response_tx.send(CommandResponse { result: Err(e) }).ok();
+                return;
+            }
+        };
+
+        let tenant_shard_id = job.get_tenant_shard_id();
+        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+            barrier
+        } else {
+            let running = self.spawn_now(job);
+            running.get_barrier().clone()
+        };
+
+        // This task does no I/O: it only listens for a barrier's completion and then
+        // sends to the command response channel. It is therefore safe to spawn this without
+        // any gates/task_mgr hooks.
+        tokio::task::spawn(async move {
+            barrier.wait().await;
+
+            response_tx.send(CommandResponse { result: Ok(()) }).ok();
+        });
+    }
+
+    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
+        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
+    }
+
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    ///
+    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
+    ///
+    /// This function resets the pending list: it is assumed that the caller may change their mind about
+    /// which tenants need work between calls to schedule_iteration.
+    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
+        let SchedulingResult {
+            jobs,
+            want_interval,
+        } = self.generator.schedule().await;
+
+        // Adjust interval based on feedback from the job generator
+        if let Some(want_interval) = want_interval {
+            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
+            self.scheduling_interval = Duration::from_secs(std::cmp::min(
+                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
+                MAX_SCHEDULING_INTERVAL.as_secs(),
+            ));
+        }
+
+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.pending.clear();
+
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
+            // Skip tenants that already have a write in flight
+            if !self.running.contains_key(job.get_tenant_shard_id()) {
+                self.pending.push_back(job);
+            }
+        })
+        .await
+        .ok();
+    }
+}
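To illustrate the contract scheduler.rs defines, here is a minimal, hypothetical JobGenerator implementation (not part of the commit). Everything named Noop* is invented for illustration; the traits, SchedulingResult, and utils::completion::channel() are from the new file above. The real implementation is the HeatmapUploader earlier in this compare.

    // Jobs that just sleep briefly, one per known tenant shard.
    struct NoopPending(TenantShardId);
    impl PendingJob for NoopPending {
        fn get_tenant_shard_id(&self) -> &TenantShardId {
            &self.0
        }
    }

    struct NoopRunning(TenantShardId, Barrier);
    impl RunningJob for NoopRunning {
        fn get_barrier(&self) -> Barrier {
            self.1.clone()
        }
    }

    struct NoopComplete(TenantShardId);
    impl Completion for NoopComplete {
        fn get_tenant_shard_id(&self) -> &TenantShardId {
            &self.0
        }
    }

    struct NoopGenerator {
        tenants: Vec<TenantShardId>,
    }

    #[async_trait::async_trait]
    impl JobGenerator<NoopPending, NoopRunning, NoopComplete, TenantShardId> for NoopGenerator {
        async fn schedule(&mut self) -> SchedulingResult<NoopPending> {
            SchedulingResult {
                // Offer one job per tenant; the scheduler drops any that already
                // have a job running, and re-asks on the next interval.
                jobs: self.tenants.iter().copied().map(NoopPending).collect(),
                want_interval: None,
            }
        }

        fn spawn(
            &mut self,
            job: NoopPending,
        ) -> (NoopRunning, Pin<Box<dyn Future<Output = NoopComplete> + Send>>) {
            let (completion, barrier) = utils::completion::channel();
            let id = job.0;
            (
                NoopRunning(id, barrier),
                Box::pin(async move {
                    let _completion = completion; // releases the barrier on drop
                    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
                    NoopComplete(id)
                }),
            )
        }

        fn on_completion(&mut self, _completion: NoopComplete) {}

        fn on_command(&mut self, cmd: TenantShardId) -> anyhow::Result<NoopPending> {
            Ok(NoopPending(cmd))
        }
    }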
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::RwLock;
+use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{DeltaLayerWriter, ResidentLayer};

@@ -246,16 +246,43 @@ impl InMemoryLayer {

     /// Common subroutine of the public put_wal_record() and put_page_image() functions.
     /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+    pub(crate) async fn put_value(
         &self,
         key: Key,
         lsn: Lsn,
         val: &Value,
         ctx: &RequestContext,
     ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let inner: &mut _ = &mut *self.inner.write().await;
+        let mut inner = self.inner.write().await;
         self.assert_writable();
+        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
+    }
+
+    pub(crate) async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        let mut inner = self.inner.write().await;
+        self.assert_writable();
+        for (key, vals) in values {
+            for (lsn, val) in vals {
+                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
+                    .await?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_value_locked(
+        &self,
+        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);

         let off = {
             // Avoid doing allocations for "small" values.
@@ -264,7 +291,7 @@ impl InMemoryLayer {
             let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
             buf.clear();
             val.ser_into(&mut buf)?;
-            inner
+            locked_inner
                 .file
                 .write_blob(
                     &buf,
@@ -275,7 +302,7 @@ impl InMemoryLayer {
                 .await?
         };

-        let vec_map = inner.index.entry(key).or_default();
+        let vec_map = locked_inner.index.entry(key).or_default();
         let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
         if old.is_some() {
             // We already had an entry for this LSN. That's odd..
@@ -285,13 +312,11 @@ impl InMemoryLayer {
         Ok(())
     }

-    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
         // TODO: Currently, we just leak the storage for any deleted keys

         Ok(())
     }

-    /// Make the layer non-writeable. Only call once.
     /// Records the end_lsn for non-dropped layers.
     /// `end_lsn` is exclusive
     pub async fn freeze(&self, end_lsn: Lsn) {
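The batching above exists so that callers can take the in-memory layer's write lock once per batch instead of once per value. A sketch of the intended call pattern (the ingest_records helper is invented for illustration; put_batch on TimelineWriter appears in the timeline.rs hunks later in this compare):

    use std::collections::HashMap;

    // Hypothetical caller: group decoded WAL records by key, then write them
    // all through TimelineWriter::put_batch, which reaches put_values() above
    // and serializes every record under a single lock acquisition.
    async fn ingest_records(
        writer: &TimelineWriter<'_>,
        decoded: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut batch: HashMap<Key, Vec<(Lsn, Value)>> = HashMap::new();
        for (key, lsn, value) in decoded {
            batch.entry(key).or_default().push((lsn, value));
        }
        writer.put_batch(&batch, ctx).await
    }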
@@ -1118,6 +1118,7 @@ impl LayerInner {
                 tracing::info!("evicted layer after unknown residence period");
             }
         }
+        timeline.metrics.evictions.inc();
         timeline
             .metrics
             .resident_physical_size_sub(self.desc.file_size);
@@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind {
     ConsumptionMetricsCollectMetrics,
     ConsumptionMetricsSyntheticSizeWorker,
     InitialLogicalSizeCalculation,
+    HeatmapUpload,
+    SecondaryDownload,
 }

 impl BackgroundLoopKind {
@@ -373,15 +373,20 @@ pub struct GcInfo {
 }

 /// An error happened in a get() operation.
-#[derive(thiserror::Error)]
-pub enum PageReconstructError {
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum PageReconstructError {
     #[error(transparent)]
     Other(#[from] anyhow::Error),

+    #[error("Ancestor LSN wait error: {0}")]
+    AncestorLsnTimeout(#[from] WaitLsnError),
+
     /// The operation was cancelled
+    #[error("Cancelled")]
     Cancelled,

     /// The ancestor of this is being stopped
+    #[error("ancestor timeline {0} is being stopped")]
     AncestorStopping(TimelineId),

     /// An error happened replaying WAL records
@@ -402,32 +407,6 @@ enum FlushLayerError {
     Other(#[from] anyhow::Error),
 }

-impl std::fmt::Debug for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        match self {
-            Self::Other(err) => err.fmt(f),
-            Self::Cancelled => write!(f, "cancelled"),
-            Self::AncestorStopping(timeline_id) => {
-                write!(f, "ancestor timeline {timeline_id} is being stopped")
-            }
-            Self::WalRedo(err) => err.fmt(f),
-        }
-    }
-}
-
-impl std::fmt::Display for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        match self {
-            Self::Other(err) => err.fmt(f),
-            Self::Cancelled => write!(f, "cancelled"),
-            Self::AncestorStopping(timeline_id) => {
-                write!(f, "ancestor timeline {timeline_id} is being stopped")
-            }
-            Self::WalRedo(err) => err.fmt(f),
-        }
-    }
-}
-
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
     Initial,
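The hand-written Debug and Display impls removed above become redundant once PageReconstructError derives Debug and carries #[error(...)] attributes (see the earlier hunk): thiserror generates Display from those strings. A minimal sketch of the pattern, with an invented enum name:

    // Invented example: #[error] strings supply Display; Debug is the
    // standard structural derive, replacing the hand-written impls.
    #[derive(thiserror::Error, Debug)]
    enum ExampleError {
        #[error("ancestor timeline {0} is being stopped")]
        AncestorStopping(TimelineId),
        #[error("Cancelled")]
        Cancelled,
    }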
@@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum WaitLsnError {
+    // Called on a timeline which is shutting down
+    #[error("Shutdown")]
+    Shutdown,
+
+    // Called on an timeline not in active state or shutting down
+    #[error("Bad state (not active)")]
+    BadState,
+
+    // Timeout expired while waiting for LSN to catch up with goal.
+    #[error("{0}")]
+    Timeout(String),
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -486,7 +480,7 @@ impl Timeline {
     /// # Cancel-Safety
     ///
     /// This method is cancellation-safe.
-    pub async fn get(
+    pub(crate) async fn get(
         &self,
         key: Key,
         lsn: Lsn,
@@ -496,6 +490,11 @@ impl Timeline {
             return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
         }

+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
         // XXX: structured stats collection for layer eviction here.
         trace!(
             "get page request for {}@{} from task kind {:?}",
@@ -629,24 +628,28 @@ impl Timeline {
     /// You should call this before any of the other get_* or list_* functions. Calling
     /// those functions with an LSN that has been processed yet is an error.
     ///
-    pub async fn wait_lsn(
+    pub(crate) async fn wait_lsn(
         &self,
         lsn: Lsn,
         _ctx: &RequestContext, /* Prepare for use by cancellation */
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
+    ) -> Result<(), WaitLsnError> {
+        if self.cancel.is_cancelled() {
+            return Err(WaitLsnError::Shutdown);
+        } else if !self.is_active() {
+            return Err(WaitLsnError::BadState);
+        }
+
         // This should never be called from the WAL receiver, because that could lead
         // to a deadlock.
-        anyhow::ensure!(
+        debug_assert!(
             task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
             "wait_lsn cannot be called in WAL receiver"
         );
-        anyhow::ensure!(
+        debug_assert!(
             task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
             "wait_lsn cannot be called in WAL receiver"
         );
-        anyhow::ensure!(
+        debug_assert!(
             task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
             "wait_lsn cannot be called in WAL receiver"
         );
@@ -660,18 +663,22 @@ impl Timeline {
         {
             Ok(()) => Ok(()),
             Err(e) => {
-                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
-                drop(_timer);
-                let walreceiver_status = self.walreceiver_status();
-                Err(anyhow::Error::new(e).context({
-                    format!(
+                use utils::seqwait::SeqWaitError::*;
+                match e {
+                    Shutdown => Err(WaitLsnError::Shutdown),
+                    Timeout => {
+                        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
+                        drop(_timer);
+                        let walreceiver_status = self.walreceiver_status();
+                        Err(WaitLsnError::Timeout(format!(
                         "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
                         lsn,
                         self.get_last_record_lsn(),
                         self.get_disk_consistent_lsn(),
                         walreceiver_status,
-                    )
-                }))
+                        )))
+                    }
+                }
             }
         }
     }
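With wait_lsn returning a typed WaitLsnError instead of anyhow::Error, callers can branch on the failure mode rather than string-matching. A minimal sketch with an invented caller (the variants are from the enum introduced above; the in-tree caller is the ancestor-wait hunk further below):

    // Hypothetical caller: classify the typed error into a response message.
    async fn wait_or_classify(
        timeline: &Timeline,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<(), String> {
        match timeline.wait_lsn(lsn, ctx).await {
            Ok(()) => Ok(()),
            Err(WaitLsnError::Shutdown) => Err("shutting down".to_string()),
            Err(WaitLsnError::BadState) => Err("timeline not active".to_string()),
            Err(WaitLsnError::Timeout(msg)) => Err(msg),
        }
    }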
@@ -1459,6 +1466,7 @@ impl Timeline {
                 max_lsn_wal_lag,
                 auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                 availability_zone: self.conf.availability_zone.clone(),
+                ingest_batch_size: self.conf.ingest_batch_size,
             },
             broker_client,
             ctx,
@@ -2223,13 +2231,13 @@ impl Timeline {
             return Err(layer_traversal_error(
                 if cfg!(test) {
                     format!(
-                        "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
-                        key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                        "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
+                        key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                     )
                 } else {
                     format!(
-                        "could not find data for key {} at LSN {}, for request at LSN {}",
-                        key, cont_lsn, request_lsn
+                        "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                        key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
                     )
                 },
                 traversal_path,
@@ -2289,11 +2297,12 @@ impl Timeline {
             ancestor
                 .wait_lsn(timeline.ancestor_lsn, ctx)
                 .await
-                .with_context(|| {
-                    format!(
-                        "wait for lsn {} on ancestor timeline_id={}",
-                        timeline.ancestor_lsn, ancestor.timeline_id
-                    )
+                .map_err(|e| match e {
+                    e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
+                    WaitLsnError::Shutdown => PageReconstructError::Cancelled,
+                    e @ WaitLsnError::BadState => {
+                        PageReconstructError::Other(anyhow::anyhow!(e))
+                    }
                 })?;

             timeline_owned = ancestor;
@@ -2471,9 +2480,27 @@ impl Timeline {
         Ok(())
     }

-    async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
-        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_tombstone(key_range, lsn).await?;
+    async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Pick the first LSN in the batch to get the layer to write to.
+        for lsns in values.values() {
+            if let Some((lsn, _)) = lsns.first() {
+                let layer = self.get_layer_for_write(*lsn).await?;
+                layer.put_values(values, ctx).await?;
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = tombstones.first() {
+            let layer = self.get_layer_for_write(*lsn).await?;
+            layer.put_tombstones(tombstones).await?;
+        }
         Ok(())
     }

@@ -3035,6 +3062,15 @@ impl Timeline {
         for range in &partition.ranges {
             let mut key = range.start;
             while key < range.end {
+                if self.shard_identity.is_key_disposable(&key) {
+                    debug!(
+                        "Dropping key {} during compaction (it belongs on shard {:?})",
+                        key,
+                        self.shard_identity.get_shard_number(&key)
+                    );
+                    key = key.next();
+                    continue;
+                }
                 let img = match self.get(key, lsn, ctx).await {
                     Ok(img) => img,
                     Err(err) => {
@@ -3061,6 +3097,7 @@ impl Timeline {
                     }
                 }
             };
+
             image_layer_writer.put_image(key, &img).await?;
             key = key.next();
         }
@@ -3631,7 +3668,15 @@ impl Timeline {
             )))
         });

-        writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+        if !self.shard_identity.is_key_disposable(&key) {
+            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+        } else {
+            debug!(
+                "Dropping key {} during compaction (it belongs on shard {:?})",
+                key,
+                self.shard_identity.get_shard_number(&key)
+            );
+        }

         if !new_layers.is_empty() {
             fail_point!("after-timeline-compacted-first-L1");
@@ -4186,7 +4231,7 @@ impl Timeline {
             .context("Failed to reconstruct a page image:")
         {
             Ok(img) => img,
-            Err(e) => return Err(PageReconstructError::from(e)),
+            Err(e) => return Err(PageReconstructError::WalRedo(e)),
         };

         if img.len() == page_cache::PAGE_SZ {
@@ -4529,8 +4574,16 @@ impl<'a> TimelineWriter<'a> {
|
|||||||
self.tl.put_value(key, lsn, value, ctx).await
|
self.tl.put_value(key, lsn, value, ctx).await
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
|
pub(crate) async fn put_batch(
|
||||||
self.tl.put_tombstone(key_range, lsn).await
|
&self,
|
||||||
|
batch: &HashMap<Key, Vec<(Lsn, Value)>>,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
self.tl.put_values(batch, ctx).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
|
||||||
|
self.tl.put_tombstones(batch).await
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Track the end of the latest digested WAL record.
|
/// Track the end of the latest digested WAL record.
|
||||||
@@ -4541,11 +4594,11 @@ impl<'a> TimelineWriter<'a> {
|
|||||||
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
|
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
|
||||||
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
|
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
|
||||||
/// the latest and can be read.
|
/// the latest and can be read.
|
||||||
pub fn finish_write(&self, new_lsn: Lsn) {
|
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
|
||||||
self.tl.finish_write(new_lsn);
|
self.tl.finish_write(new_lsn);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update_current_logical_size(&self, delta: i64) {
|
pub(crate) fn update_current_logical_size(&self, delta: i64) {
|
||||||
self.tl.update_current_logical_size(delta)
|
self.tl.update_current_logical_size(delta)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
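The writer's surface thus changes from single-key `put`/`delete` to `put_batch`/`delete_batch`, so ingest can stage many writes and flush them in one call. A minimal sketch of the accumulate-then-flush pattern this enables, again with simplified stand-in types:

```rust
use std::collections::HashMap;
use std::ops::Range;

// Stand-in types; the real pageserver types carry more structure.
type Key = u64;
type Lsn = u64;
type Value = Vec<u8>;

/// Writes are staged per key, deletions per key range, and both are
/// handed to the writer in one put_batch/delete_batch call on commit.
#[derive(Default)]
struct PendingModification {
    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
}

impl PendingModification {
    fn put(&mut self, key: Key, lsn: Lsn, value: Value) {
        self.pending_updates.entry(key).or_default().push((lsn, value));
    }

    fn delete(&mut self, range: Range<Key>, lsn: Lsn) {
        self.pending_deletions.push((range, lsn));
    }

    /// On commit, the staged batches are drained and passed to the
    /// writer (writer.put_batch(...) / writer.delete_batch(...) above).
    fn take_batches(&mut self) -> (HashMap<Key, Vec<(Lsn, Value)>>, Vec<(Range<Key>, Lsn)>) {
        (
            std::mem::take(&mut self.pending_updates),
            std::mem::take(&mut self.pending_deletions),
        )
    }
}

fn main() {
    let mut m = PendingModification::default();
    m.put(1, 0x10, b"page".to_vec());
    m.delete(100..200, 0x20);
    let (updates, deletions) = m.take_batches();
    assert_eq!(updates[&1].len(), 1);
    assert_eq!(deletions.len(), 1);
}
```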
@@ -58,6 +58,7 @@ pub struct WalReceiverConf {
     pub max_lsn_wal_lag: NonZeroU64,
     pub auth_token: Option<Arc<String>>,
     pub availability_zone: Option<String>,
+    pub ingest_batch_size: u64,
 }

 pub struct WalReceiver {
@@ -411,6 +411,7 @@ impl ConnectionManagerState {

         let node_id = new_sk.safekeeper_id;
         let connect_timeout = self.conf.wal_connect_timeout;
+        let ingest_batch_size = self.conf.ingest_batch_size;
         let timeline = Arc::clone(&self.timeline);
         let ctx = ctx.detached_child(
             TaskKind::WalReceiverConnectionHandler,
@@ -430,6 +431,7 @@ impl ConnectionManagerState {
                     connect_timeout,
                     ctx,
                     node_id,
+                    ingest_batch_size,
                 )
                 .await;

@@ -1345,6 +1347,7 @@ mod tests {
                 max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
                 auth_token: None,
                 availability_zone: None,
+                ingest_batch_size: 1,
             },
             wal_connection: None,
             wal_stream_candidates: HashMap::new(),
@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
 use super::TaskStateUpdate;
 use crate::{
     context::RequestContext,
-    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
+    metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
     task_mgr,
     task_mgr::TaskKind,
     task_mgr::WALRECEIVER_RUNTIME,
@@ -106,6 +106,7 @@ impl From<WalDecodeError> for WalReceiverError {

 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
+#[allow(clippy::too_many_arguments)]
 pub(super) async fn handle_walreceiver_connection(
     timeline: Arc<Timeline>,
     wal_source_connconf: PgConnectionConfig,
@@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection(
     connect_timeout: Duration,
     ctx: RequestContext,
     node: NodeId,
+    ingest_batch_size: u64,
 ) -> Result<(), WalReceiverError> {
     debug_assert_current_span_has_tenant_and_timeline_id();

@@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection(

             {
                 let mut decoded = DecodedWALRecord::default();
-                let mut modification = timeline.begin_modification(endlsn);
+                let mut modification = timeline.begin_modification(startlsn);
+                let mut uncommitted_records = 0;
+                let mut filtered_records = 0;
                 while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                     // It is important to deal with the aligned records as lsn in getPage@LSN is
                     // aligned and can be several bytes bigger. Without this alignment we are
@@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection(
                         return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                     }

-                    walingest
+                    // Ingest the records without immediately committing them.
+                    let ingested = walingest
                         .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
                         .await
                         .with_context(|| format!("could not ingest record at {lsn}"))?;
+                    if !ingested {
+                        tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+                        WAL_INGEST.records_filtered.inc();
+                        filtered_records += 1;
+                    }

                     fail_point!("walreceiver-after-ingest");

                     last_rec_lsn = lsn;

+                    // Commit every ingest_batch_size records. Even if we filtered out
+                    // all records, we still need to call commit to advance the LSN.
+                    uncommitted_records += 1;
+                    if uncommitted_records >= ingest_batch_size {
+                        WAL_INGEST
+                            .records_committed
+                            .inc_by(uncommitted_records - filtered_records);
+                        modification.commit(&ctx).await?;
+                        uncommitted_records = 0;
+                        filtered_records = 0;
+                    }
+                }
+
+                // Commit the remaining records.
+                if uncommitted_records > 0 {
+                    WAL_INGEST
+                        .records_committed
+                        .inc_by(uncommitted_records - filtered_records);
+                    modification.commit(&ctx).await?;
                 }
             }

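The hunk above changes the commit cadence: instead of committing after every record, the loop commits once per `ingest_batch_size` records and then once more for any remainder, so the last-record LSN still advances even when every record in a batch was filtered out. The counting logic in isolation (record and commit types are stand-ins):

```rust
/// Sketch of the commit cadence above: commit every `batch_size`
/// records, then once more for the remainder.
fn ingest_all(records: &[u64], batch_size: u64, mut commit: impl FnMut(u64)) {
    let mut uncommitted: u64 = 0;
    for _record in records {
        // ... ingest the record into the open modification ...
        uncommitted += 1;
        if uncommitted >= batch_size {
            commit(uncommitted);
            uncommitted = 0;
        }
    }
    // Commit the remaining records.
    if uncommitted > 0 {
        commit(uncommitted);
    }
}

fn main() {
    let mut commits = Vec::new();
    ingest_all(&[1, 2, 3, 4, 5], 2, |n| commits.push(n));
    assert_eq!(commits, vec![2, 2, 1]); // two full batches plus the remainder
}
```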
@@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
 use anyhow::{bail, Context, Result};
 use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;
+use utils::failpoint_support;

 use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
@@ -47,20 +48,18 @@ use postgres_ffi::TransactionId;
 use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

-pub struct WalIngest<'a> {
+pub struct WalIngest {
     shard: ShardIdentity,
-    timeline: &'a Timeline,
-
     checkpoint: CheckPoint,
     checkpoint_modified: bool,
 }

-impl<'a> WalIngest<'a> {
+impl WalIngest {
     pub async fn new(
-        timeline: &'a Timeline,
+        timeline: &Timeline,
         startpoint: Lsn,
-        ctx: &'_ RequestContext,
-    ) -> anyhow::Result<WalIngest<'a>> {
+        ctx: &RequestContext,
+    ) -> anyhow::Result<WalIngest> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -69,7 +68,6 @@ impl<'a> WalIngest<'a> {

         Ok(WalIngest {
             shard: *timeline.get_shard_identity(),
-            timeline,
             checkpoint,
             checkpoint_modified: false,
         })
@@ -83,6 +81,8 @@ impl<'a> WalIngest<'a> {
     /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
     /// relations/pages that the record affects.
     ///
+    /// This function returns `true` if the record was ingested, and `false` if it was filtered out
+    ///
     pub async fn ingest_record(
         &mut self,
         recdata: Bytes,
@@ -90,11 +90,13 @@ impl<'a> WalIngest<'a> {
         modification: &mut DatadirModification<'_>,
         decoded: &mut DecodedWALRecord,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> anyhow::Result<bool> {
         WAL_INGEST.records_received.inc();
+        let pg_version = modification.tline.pg_version;
+        let prev_len = modification.len();

-        modification.lsn = lsn;
+        modification.set_lsn(lsn)?;
-        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
+        decode_wal_record(recdata, decoded, pg_version)?;

         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
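`ingest_record` now reports whether a record actually produced writes by snapshotting `modification.len()` before processing and comparing afterwards; a record filtered out (for example, pages owned by another shard) leaves the length unchanged. A minimal sketch of that `prev_len` trick, with stand-in types:

```rust
/// Stand-in for DatadirModification: `len()` counts pending writes.
#[derive(Default)]
struct Modification {
    pending: Vec<(u64, Vec<u8>)>,
}

impl Modification {
    fn len(&self) -> usize {
        self.pending.len()
    }
    fn put(&mut self, key: u64, value: Vec<u8>) {
        self.pending.push((key, value));
    }
}

/// A record counts as "ingested" only if it added at least one
/// pending write; otherwise it was filtered out.
fn ingest_record(modification: &mut Modification, writes: &[(u64, Vec<u8>)]) -> bool {
    let prev_len = modification.len();
    for (key, value) in writes {
        modification.put(*key, value.clone());
    }
    modification.len() > prev_len
}

fn main() {
    let mut m = Modification::default();
    assert!(ingest_record(&mut m, &[(1, b"x".to_vec())]));
    assert!(!ingest_record(&mut m, &[])); // filtered record: no writes
}
```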
@@ -131,9 +133,9 @@ impl<'a> WalIngest<'a> {
             }
             pg_constants::RM_DBASE_ID => {
                 let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-                debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
+                debug!(%info, %pg_version, "handle RM_DBASE_ID");

-                if self.timeline.pg_version == 14 {
+                if pg_version == 14 {
                     if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
                         let createdb = XlCreateDatabase::decode(&mut buf);
                         debug!("XLOG_DBASE_CREATE v14");
@@ -149,7 +151,7 @@ impl<'a> WalIngest<'a> {
                             .await?;
                         }
                     }
-                } else if self.timeline.pg_version == 15 {
+                } else if pg_version == 15 {
                     if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                         debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                     } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -169,7 +171,7 @@ impl<'a> WalIngest<'a> {
                             .await?;
                         }
                     }
-                } else if self.timeline.pg_version == 16 {
+                } else if pg_version == 16 {
                     if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
                         debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
                     } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -344,9 +346,7 @@ impl<'a> WalIngest<'a> {
                     // particular point in the WAL. For more fine-grained control,
                     // we could peek into the message and only pause if it contains
                     // a particular string, for example, but this is enough for now.
-                    crate::failpoint_support::sleep_millis_async!(
-                        "wal-ingest-logical-message-sleep"
-                    );
+                    failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
                 } else if let Some(path) = prefix.strip_prefix("neon-file:") {
                     modification.put_file(path, message, ctx).await?;
                 }
@@ -400,19 +400,11 @@ impl<'a> WalIngest<'a> {
             self.checkpoint_modified = false;
         }

-        if modification.is_empty() {
-            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
-            WAL_INGEST.records_filtered.inc();
-            modification.tline.finish_write(lsn);
-        } else {
-            WAL_INGEST.records_committed.inc();
-            modification.commit(ctx).await?;
-        }
+        // Note that at this point this record is only cached in the modification
+        // until commit() is called to flush the data into the repository and update
+        // the latest LSN.

-        // Now that this record has been fully handled, including updating the
-        // checkpoint data, let the repository know that it is up-to-date to this LSN.
-        Ok(())
+        Ok(modification.len() > prev_len)
     }

     /// Do not store this block, but observe it for the purposes of updating our relation size state.
@@ -459,7 +451,7 @@ impl<'a> WalIngest<'a> {
             && (decoded.xl_info == pg_constants::XLOG_FPI
                 || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
             // compression of WAL is not yet supported: fall back to storing the original WAL record
-            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
+            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
             // do not materialize null pages because them most likely be soon replaced with real data
             && blk.bimg_len != 0
         {
@@ -512,7 +504,7 @@ impl<'a> WalIngest<'a> {
         let mut old_heap_blkno: Option<u32> = None;
         let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;

-        match self.timeline.pg_version {
+        match modification.tline.pg_version {
             14 => {
                 if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
                     let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -736,7 +728,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
+            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -817,10 +809,11 @@ impl<'a> WalIngest<'a> {
         let mut new_heap_blkno: Option<u32> = None;
         let mut old_heap_blkno: Option<u32> = None;
         let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
+        let pg_version = modification.tline.pg_version;

         assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);

-        match self.timeline.pg_version {
+        match pg_version {
             16 => {
                 let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;

@@ -883,7 +876,7 @@ impl<'a> WalIngest<'a> {
             }
             _ => bail!(
                 "Neon RMGR has no known compatibility with PostgreSQL version {}",
-                self.timeline.pg_version
+                pg_version
             ),
         }

@@ -906,7 +899,7 @@ impl<'a> WalIngest<'a> {
             // replaying it would fail to find the previous image of the page, because
             // it doesn't exist. So check if the VM page(s) exist, and skip the WAL
             // record if it doesn't.
-            let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
+            let vm_size = get_relsize(modification, vm_rel, ctx).await?;
             if let Some(blknum) = new_vm_blk {
                 if blknum >= vm_size {
                     new_vm_blk = None;
@@ -984,16 +977,14 @@ impl<'a> WalIngest<'a> {
         let src_db_id = rec.src_db_id;
         let src_tablespace_id = rec.src_tablespace_id;

-        // Creating a database is implemented by copying the template (aka. source) database.
-        // To copy all the relations, we need to ask for the state as of the same LSN, but we
-        // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
-        // the last valid LSN to advance up to it. So we use the previous record's LSN in the
-        // get calls instead.
-        let req_lsn = modification.tline.get_last_record_lsn();
-
         let rels = modification
             .tline
-            .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
+            .list_rels(
+                src_tablespace_id,
+                src_db_id,
+                Version::Modified(modification),
+                ctx,
+            )
             .await?;

         debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -1001,7 +992,12 @@ impl<'a> WalIngest<'a> {
         // Copy relfilemap
         let filemap = modification
             .tline
-            .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
+            .get_relmap_file(
+                src_tablespace_id,
+                src_db_id,
+                Version::Modified(modification),
+                ctx,
+            )
             .await?;
         modification
             .put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1015,7 +1011,7 @@ impl<'a> WalIngest<'a> {

             let nblocks = modification
                 .tline
-                .get_rel_size(src_rel, req_lsn, true, ctx)
+                .get_rel_size(src_rel, Version::Modified(modification), true, ctx)
                 .await?;
             let dst_rel = RelTag {
                 spcnode: tablespace_id,
@@ -1033,7 +1029,13 @@ impl<'a> WalIngest<'a> {

                 let content = modification
                     .tline
-                    .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
+                    .get_rel_page_at_lsn(
+                        src_rel,
+                        blknum,
+                        Version::Modified(modification),
+                        true,
+                        ctx,
+                    )
                     .await?;
                 modification.put_rel_page_image(dst_rel, blknum, content)?;
                 num_blocks_copied += 1;
@@ -1104,7 +1106,7 @@ impl<'a> WalIngest<'a> {
             modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
             fsm_physical_page_no += 1;
         }
-        let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
+        let nblocks = get_relsize(modification, rel, ctx).await?;
         if nblocks > fsm_physical_page_no {
             // check if something to do: FSM is larger than truncate position
             self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1126,7 +1128,7 @@ impl<'a> WalIngest<'a> {
             modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
             vm_page_no += 1;
         }
-        let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
+        let nblocks = get_relsize(modification, rel, ctx).await?;
         if nblocks > vm_page_no {
             // check if something to do: VM is larger than truncate position
             self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1199,10 +1201,9 @@ impl<'a> WalIngest<'a> {
                 dbnode: xnode.dbnode,
                 relnode: xnode.relnode,
             };
-            let last_lsn = self.timeline.get_last_record_lsn();
             if modification
                 .tline
-                .get_rel_exists(rel, last_lsn, true, ctx)
+                .get_rel_exists(rel, Version::Modified(modification), true, ctx)
                 .await?
             {
                 self.put_rel_drop(modification, rel, ctx).await?;
@@ -1256,10 +1257,9 @@ impl<'a> WalIngest<'a> {
         // will block waiting for the last valid LSN to advance up to
         // it. So we use the previous record's LSN in the get calls
         // instead.
-        let req_lsn = modification.tline.get_last_record_lsn();
         for segno in modification
             .tline
-            .list_slru_segments(SlruKind::Clog, req_lsn, ctx)
+            .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
             .await?
         {
             let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1471,20 +1471,6 @@ impl<'a> WalIngest<'a> {
         Ok(())
     }

-    async fn get_relsize(
-        &mut self,
-        rel: RelTag,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<BlockNumber> {
-        let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? {
-            0
-        } else {
-            self.timeline.get_rel_size(rel, lsn, true, ctx).await?
-        };
-        Ok(nblocks)
-    }
-
     async fn handle_rel_extend(
         &mut self,
         modification: &mut DatadirModification<'_>,
@@ -1496,7 +1482,6 @@ impl<'a> WalIngest<'a> {
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
-        let last_lsn = modification.lsn;

         // Get current size and put rel creation if rel doesn't exist
         //
@@ -1504,11 +1489,14 @@ impl<'a> WalIngest<'a> {
         // check the cache too. This is because eagerly checking the cache results in
         // less work overall and 10% better performance. It's more work on cache miss
         // but cache miss is rare.
-        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
+        let old_nblocks = if let Some(nblocks) = modification
+            .tline
+            .get_cached_rel_size(&rel, modification.get_lsn())
+        {
             nblocks
-        } else if !self
-            .timeline
-            .get_rel_exists(rel, last_lsn, true, ctx)
+        } else if !modification
+            .tline
+            .get_rel_exists(rel, Version::Modified(modification), true, ctx)
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
@@ -1518,7 +1506,10 @@ impl<'a> WalIngest<'a> {
             .context("Relation Error")?;
             0
         } else {
-            self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
+            modification
+                .tline
+                .get_rel_size(rel, Version::Modified(modification), true, ctx)
+                .await?
         };

         if new_nblocks > old_nblocks {
@@ -1571,10 +1562,9 @@ impl<'a> WalIngest<'a> {
         // Check if the relation exists. We implicitly create relations on first
         // record.
         // TODO: would be nice if to be more explicit about it
-        let last_lsn = self.timeline.get_last_record_lsn();
-        let old_nblocks = if !self
-            .timeline
-            .get_slru_segment_exists(kind, segno, last_lsn, ctx)
+        let old_nblocks = if !modification
+            .tline
+            .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
@@ -1583,8 +1573,9 @@ impl<'a> WalIngest<'a> {
             .await?;
             0
         } else {
-            self.timeline
-                .get_slru_segment_size(kind, segno, last_lsn, ctx)
+            modification
+                .tline
+                .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
                 .await?
         };

@@ -1607,6 +1598,26 @@ impl<'a> WalIngest<'a> {
     }
 }

+async fn get_relsize(
+    modification: &DatadirModification<'_>,
+    rel: RelTag,
+    ctx: &RequestContext,
+) -> anyhow::Result<BlockNumber> {
+    let nblocks = if !modification
+        .tline
+        .get_rel_exists(rel, Version::Modified(modification), true, ctx)
+        .await?
+    {
+        0
+    } else {
+        modification
+            .tline
+            .get_rel_size(rel, Version::Modified(modification), true, ctx)
+            .await?
+    };
+    Ok(nblocks)
+}
+
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
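The `Version` argument threaded through the `get_*` calls above replaces a raw `Lsn`: `Version::Lsn` pins a committed LSN, while `Version::Modified` lets reads during ingest see the in-progress modification's own uncommitted writes. A simplified stand-in for that enum (the real one also falls back to committed storage, elided here):

```rust
use std::collections::HashMap;

type Lsn = u64;

struct Modification {
    lsn: Lsn,
    pending: HashMap<u64, Vec<u8>>,
}

/// Either a pinned LSN, or "read through the in-progress modification
/// so its uncommitted writes are visible".
enum Version<'a> {
    Lsn(Lsn),
    Modified(&'a Modification),
}

impl<'a> Version<'a> {
    fn get_lsn(&self) -> Lsn {
        match self {
            Version::Lsn(lsn) => *lsn,
            Version::Modified(m) => m.lsn,
        }
    }

    /// Reads check the uncommitted buffer first; the fall-through to
    /// committed storage is elided in this sketch.
    fn get(&self, key: u64) -> Option<&[u8]> {
        match self {
            Version::Lsn(_) => None, // would go to committed storage
            Version::Modified(m) => m.pending.get(&key).map(|v| v.as_slice()),
        }
    }
}

fn main() {
    let mut pending = HashMap::new();
    pending.insert(7u64, b"page".to_vec());
    let m = Modification { lsn: 0x40, pending };
    assert_eq!(Version::Modified(&m).get(7), Some(&b"page"[..]));
    assert_eq!(Version::Lsn(0x40).get_lsn(), 0x40);
}
```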
@@ -1633,10 +1644,7 @@ mod tests {

     static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);

-    async fn init_walingest_test<'a>(
-        tline: &'a Timeline,
-        ctx: &RequestContext,
-    ) -> Result<WalIngest<'a>> {
+    async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
         let mut m = tline.begin_modification(Lsn(0x10));
         m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
         m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1681,29 +1689,29 @@ mod tests {
         // The relation was created at LSN 2, not visible at LSN 1 yet.
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
                 .await?,
             false
         );
         assert!(tline
-            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
+            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
             .await
             .is_err());
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             1
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             3
         );
@@ -1711,46 +1719,46 @@ mod tests {
         // Check page contents at each LSN
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 2")
         );

         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );

         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1 at 4")
         );

         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1 at 4")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 2 at 5")
         );
@@ -1766,19 +1774,19 @@ mod tests {
         // Check reported size and contents after truncation
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             2
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 0 at 3")
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1 at 4")
         );
@@ -1786,13 +1794,13 @@ mod tests {
         // should still see the truncated block with older LSN
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             3
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 2 at 5")
         );
@@ -1805,7 +1813,7 @@ mod tests {
         m.commit(&ctx).await?;
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
                 .await?,
             0
         );
@@ -1818,19 +1826,19 @@ mod tests {
         m.commit(&ctx).await?;
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
             2
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
             ZERO_PAGE
         );
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1")
         );
@@ -1843,21 +1851,21 @@ mod tests {
         m.commit(&ctx).await?;
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             1501
         );
         for blk in 2..1500 {
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
                     .await?,
                 ZERO_PAGE
             );
         }
         assert_eq!(
             tline
-                .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
+                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             TEST_IMG("foo blk 1500")
         );
@@ -1884,13 +1892,13 @@ mod tests {
         // Check that rel exists and size is correct
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             1
         );
@@ -1903,7 +1911,7 @@ mod tests {
         // Check that rel is not visible anymore
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
                 .await?,
             false
         );
@@ -1921,13 +1929,13 @@ mod tests {
         // Check that rel exists and size is correct
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
                 .await?,
             1
         );
@@ -1960,24 +1968,24 @@ mod tests {
         // The relation was created at LSN 20, not visible at LSN 1 yet.
         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
                 .await?,
             false
         );
         assert!(tline
-            .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
+            .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
             .await
             .is_err());

         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
                 .await?,
             relsize
         );
@@ -1988,7 +1996,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2005,7 +2013,7 @@ mod tests {
         // Check reported size and contents after truncation
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
                 .await?,
             1
         );
@@ -2015,7 +2023,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2024,7 +2032,7 @@ mod tests {
         // should still see all blocks with older LSN
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
                 .await?,
             relsize
         );
@@ -2033,7 +2041,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2053,13 +2061,13 @@ mod tests {

         assert_eq!(
             tline
-                .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
+                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             true
         );
         assert_eq!(
             tline
-                .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
                 .await?,
             relsize
         );
@@ -2069,7 +2077,7 @@ mod tests {
             let data = format!("foo blk {} at {}", blkno, lsn);
             assert_eq!(
                 tline
-                    .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
+                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
                     .await?,
                 TEST_IMG(&data)
             );
@@ -2102,7 +2110,9 @@ mod tests {
         assert_current_logical_size(&tline, Lsn(lsn));

        assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             RELSEG_SIZE + 1
         );

@@ -2114,7 +2124,9 @@ mod tests {
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             RELSEG_SIZE
         );
         assert_current_logical_size(&tline, Lsn(lsn));
@@ -2127,7 +2139,9 @@ mod tests {
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             RELSEG_SIZE - 1
         );
         assert_current_logical_size(&tline, Lsn(lsn));
@@ -2143,7 +2157,9 @@ mod tests {
             .await?;
         m.commit(&ctx).await?;
         assert_eq!(
-            tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
+            tline
+                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
+                .await?,
             size as BlockNumber
         );

@@ -2180,7 +2196,7 @@ mod tests {
         let wal_segment_path = format!("{path}/000000010000000000000001.zst");
         let source_initdb_path = format!("{path}/{INITDB_PATH}");
         let startpoint = Lsn::from_hex("14AEC08").unwrap();
-        let endpoint = Lsn::from_hex("1FFFF98").unwrap();
+        let _endpoint = Lsn::from_hex("1FFFF98").unwrap();

         let harness = TenantHarness::create("test_ingest_real_wal").unwrap();
         let (tenant, ctx) = harness.load().await;
@@ -2222,7 +2238,7 @@ mod tests {
         let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
             .await
             .unwrap();
-        let mut modification = tline.begin_modification(endpoint);
+        let mut modification = tline.begin_modification(startpoint);
         let mut decoded = DecodedWALRecord::default();
         println!("decoding {} bytes", bytes.len() - xlogoff);

@@ -2236,6 +2252,7 @@ mod tests {
                 .await
                 .unwrap();
             }
+            modification.commit(&ctx).await.unwrap();
         }

         let duration = started_at.elapsed();
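Note that the test now begins the modification at `startpoint` rather than `endpoint`, matching the walreceiver change earlier: the modification starts at the first record's LSN and `set_lsn` advances it record by record, with an explicit `commit` at the end. A sketch of the assumed forward-only invariant (stand-in types; the real `DatadirModification::set_lsn` returns `anyhow::Result`, and whether it rejects backwards moves exactly this way is an assumption):

```rust
type Lsn = u64;

struct Modification {
    lsn: Lsn,
}

impl Modification {
    fn begin(startpoint: Lsn) -> Self {
        Modification { lsn: startpoint }
    }

    /// Assumed invariant: the LSN may only move forward while records
    /// are ingested; moving backwards would indicate a bug.
    fn set_lsn(&mut self, lsn: Lsn) -> Result<(), String> {
        if lsn < self.lsn {
            return Err(format!("LSN moved backwards: {} < {}", lsn, self.lsn));
        }
        self.lsn = lsn;
        Ok(())
    }
}

fn main() {
    let mut m = Modification::begin(0x10);
    assert!(m.set_lsn(0x20).is_ok());
    assert!(m.set_lsn(0x18).is_err());
}
```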
@@ -22,6 +22,7 @@ use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
+use pageserver_api::shard::TenantShardId;
 use serde::Serialize;
 use std::collections::VecDeque;
 use std::io;
@@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
 use tracing::*;
-use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
+use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};

 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};

-#[cfg(feature = "testing")]
-use pageserver_api::shard::TenantShardId;
-
 use crate::config::PageServerConf;
 use crate::metrics::{
     WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -92,7 +90,7 @@ struct ProcessOutput {
 /// records.
 ///
 pub struct PostgresRedoManager {
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     conf: &'static PageServerConf,
     last_redo_at: std::sync::Mutex<Option<Instant>>,
     redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -186,10 +184,13 @@ impl PostgresRedoManager {
     ///
     /// Create a new PostgresRedoManager.
     ///
-    pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
+    pub fn new(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+    ) -> PostgresRedoManager {
         // The actual process is launched lazily, on first request.
         PostgresRedoManager {
-            tenant_id,
+            tenant_shard_id,
             conf,
             last_redo_at: std::sync::Mutex::default(),
             redo_process: RwLock::new(None),
@@ -244,8 +245,12 @@ impl PostgresRedoManager {
                     let timer =
                         WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                     let proc = Arc::new(
-                        WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
-                            .context("launch walredo process")?,
+                        WalRedoProcess::launch(
+                            self.conf,
+                            self.tenant_shard_id,
+                            pg_version,
+                        )
+                        .context("launch walredo process")?,
                     );
                     timer.observe_duration();
                     *proc_guard = Some(Arc::clone(&proc));
@@ -638,7 +643,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 struct WalRedoProcess {
     #[allow(dead_code)]
     conf: &'static PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     // Some() on construction, only becomes None on Drop.
     child: Option<NoLeakChild>,
     stdout: Mutex<ProcessOutput>,
@@ -652,10 +657,10 @@ impl WalRedoProcess {
     //
     // Start postgres binary in special WAL redo mode.
     //
-    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
+    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
     fn launch(
         conf: &'static PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         pg_version: u32,
     ) -> anyhow::Result<Self> {
         let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -680,7 +685,7 @@ impl WalRedoProcess {
             // as close-on-exec by default, but that's not enough, since we use
             // libraries that directly call libc open without setting that flag.
             .close_fds()
-            .spawn_no_leak_child(tenant_id)
+            .spawn_no_leak_child(tenant_shard_id)
             .context("spawn process")?;
         WAL_REDO_PROCESS_COUNTERS.started.inc();
         let mut child = scopeguard::guard(child, |child| {
@@ -741,12 +746,12 @@ impl WalRedoProcess {
                         error!(error=?e, "failed to read from walredo stderr");
                     }
                 }
-            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
         );

         Ok(Self {
             conf,
-            tenant_id,
+            tenant_shard_id,
             child: Some(child),
             stdin: Mutex::new(ProcessInput {
                 stdin,
@@ -772,7 +777,7 @@ impl WalRedoProcess {
     // Apply given WAL records ('records') over an old page image. Returns
     // new page image.
     //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
     fn apply_wal_records(
         &self,
         tag: BufferTag,
@@ -966,11 +971,7 @@ impl WalRedoProcess {
                     // these files will be collected to an allure report
                     let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

-                    // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
-                    let path = self
-                        .conf
-                        .tenant_path(&TenantShardId::unsharded(self.tenant_id))
-                        .join(&filename);
+                    let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);

                     let res = std::fs::OpenOptions::new()
                         .write(true)
@@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess {
 /// Wrapper type around `std::process::Child` which guarantees that the child
 /// will be killed and waited-for by this process before being dropped.
 struct NoLeakChild {
-    tenant_id: TenantId,
+    tenant_id: TenantShardId,
     child: Option<Child>,
 }

@@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild {
 }

 impl NoLeakChild {
-    fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
+    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
         let child = command.spawn()?;
         Ok(NoLeakChild {
             tenant_id,
@@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild {
             Some(child) => child,
             None => return,
         };
-        let tenant_id = self.tenant_id;
+        let tenant_shard_id = self.tenant_id;
         // Offload the kill+wait of the child process into the background.
         // If someone stops the runtime, we'll leak the child process.
         // We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild {
         tokio::task::spawn_blocking(move || {
             // Intentionally don't inherit the tracing context from whoever is dropping us.
             // This thread here is going to outlive of our dropper.
-            let span = tracing::info_span!("walredo", %tenant_id);
+            let span = tracing::info_span!(
+                "walredo",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug()
+            );
             let _entered = span.enter();
             Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
         })
@@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild {
 }

 trait NoLeakChildCommandExt {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
 }

 impl NoLeakChildCommandExt for Command {
-    fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
+    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
         NoLeakChild::spawn(tenant_id, self)
     }
|
}
|
||||||
}
|
}
|
||||||
@@ -1155,6 +1160,7 @@ mod tests {
|
|||||||
use crate::repository::Key;
|
use crate::repository::Key;
|
||||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use utils::{id::TenantId, lsn::Lsn};
|
use utils::{id::TenantId, lsn::Lsn};
|
||||||
|
|
||||||
@@ -1264,9 +1270,9 @@ mod tests {
|
|||||||
let repo_dir = camino_tempfile::tempdir()?;
|
let repo_dir = camino_tempfile::tempdir()?;
|
||||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||||
let conf = Box::leak(Box::new(conf));
|
let conf = Box::leak(Box::new(conf));
|
||||||
let tenant_id = TenantId::generate();
|
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||||
|
|
||||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||||
|
|
||||||
Ok(RedoHarness {
|
Ok(RedoHarness {
|
||||||
_repo_dir: repo_dir,
|
_repo_dir: repo_dir,
|
||||||
|
|||||||
@@ -35,7 +35,8 @@

 #define PageStoreTrace DEBUG5

-#define RECONNECT_INTERVAL_USEC 1000000
+#define MIN_RECONNECT_INTERVAL_USEC 1000
+#define MAX_RECONNECT_INTERVAL_USEC 1000000

 bool connected = false;
 PGconn *pageserver_conn = NULL;
@@ -133,6 +134,11 @@ pageserver_connect(int elevel)
 const char *values[3];
 int n;

+static TimestampTz last_connect_time = 0;
+static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC;
+TimestampTz now;
+uint64_t us_since_last_connect;
+
 Assert(!connected);

 if (CheckConnstringUpdated())
@@ -140,6 +146,22 @@ pageserver_connect(int elevel)
     ReloadConnstring();
 }

+now = GetCurrentTimestamp();
+us_since_last_connect = now - last_connect_time;
+if (us_since_last_connect < delay_us)
+{
+    pg_usleep(delay_us - us_since_last_connect);
+    delay_us *= 2;
+    if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
+        delay_us = MAX_RECONNECT_INTERVAL_USEC;
+    last_connect_time = GetCurrentTimestamp();
+}
+else
+{
+    delay_us = MIN_RECONNECT_INTERVAL_USEC;
+    last_connect_time = now;
+}
+
 /*
  * Connect using the connection string we got from the
  * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request)
 {
     HandleMainLoopInterrupts();
     n_reconnect_attempts += 1;
-    pg_usleep(RECONNECT_INTERVAL_USEC);
 }
 n_reconnect_attempts = 0;
 }
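The hunks above replace the fixed reconnect policy in pageserver_connect: instead of sleeping a constant RECONNECT_INTERVAL_USEC (1 s) before every retry, the first retry now waits MIN_RECONNECT_INTERVAL_USEC (1 ms), each consecutive failure doubles the delay, and the delay is clamped at MAX_RECONNECT_INTERVAL_USEC (1 s); a long enough quiet period since the last attempt resets the delay to the minimum. A minimal standalone sketch of the same backoff pattern, with no Postgres dependencies (retry_delay_us, backoff_wait, and backoff_reset are illustrative names, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MIN_RECONNECT_INTERVAL_USEC 1000    /* 1 ms */
#define MAX_RECONNECT_INTERVAL_USEC 1000000 /* 1 s  */

/* Doubles the delay on every consecutive failure, clamping at the max. */
static uint64_t retry_delay_us = MIN_RECONNECT_INTERVAL_USEC;

static void
backoff_wait(void)
{
    usleep(retry_delay_us);
    retry_delay_us *= 2;
    if (retry_delay_us > MAX_RECONNECT_INTERVAL_USEC)
        retry_delay_us = MAX_RECONNECT_INTERVAL_USEC;
}

static void
backoff_reset(void)
{
    /* Call once enough time has passed since the last attempt. */
    retry_delay_us = MIN_RECONNECT_INTERVAL_USEC;
}

int
main(void)
{
    /* Simulate six consecutive failed connection attempts. */
    for (int attempt = 1; attempt <= 6; attempt++)
    {
        printf("attempt %d: waiting %llu us\n",
               attempt, (unsigned long long) retry_delay_us);
        backoff_wait();
    }
    backoff_reset();
    return 0;
}

Keeping the state in statics makes the policy per-backend, which matches the function-local statics the patch introduces.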
@@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 port = strchr(host, ':');
 if (port == NULL)
 {
-    walprop_log(FATAL, "port is not specified");
+    wp_log(FATAL, "port is not specified");
 }
 *port++ = '\0';
 sep = strchr(port, ',');
@@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 *sep++ = '\0';
 if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS)
 {
-    walprop_log(FATAL, "Too many safekeepers");
+    wp_log(FATAL, "too many safekeepers");
 }
 wp->safekeeper[wp->n_safekeepers].host = host;
 wp->safekeeper[wp->n_safekeepers].port = port;
@@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'",
 sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant);
 if (written > MAXCONNINFO || written < 0)
-    walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
+    wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port);
 }

 initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf);
@@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 }
 if (wp->n_safekeepers < 1)
 {
-    walprop_log(FATAL, "Safekeepers addresses are not specified");
+    wp_log(FATAL, "safekeepers addresses are not specified");
 }
 wp->quorum = wp->n_safekeepers / 2 + 1;

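The quorum computed here is a simple majority, n_safekeepers / 2 + 1, so a three-safekeeper cluster keeps making progress with one node down and a five-node cluster with two down. A quick standalone check of the arithmetic (majority_quorum is an illustrative helper, not part of the patch):

#include <assert.h>
#include <stdio.h>

/* Smallest group that forms a majority of n members. */
static int
majority_quorum(int n_safekeepers)
{
    return n_safekeepers / 2 + 1;
}

int
main(void)
{
    assert(majority_quorum(1) == 1);
    assert(majority_quorum(3) == 2); /* tolerates 1 failure */
    assert(majority_quorum(5) == 3); /* tolerates 2 failures */
    printf("quorum(3)=%d quorum(5)=%d\n",
           majority_quorum(3), majority_quorum(5));
    return 0;
}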
@@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId));
 wp->greetRequest.systemId = wp->config->systemId;
 if (!wp->config->neon_timeline)
-    walprop_log(FATAL, "neon.timeline_id is not provided");
+    wp_log(FATAL, "neon.timeline_id is not provided");
 if (*wp->config->neon_timeline != '\0' &&
     !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16))
-    walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline);
+    wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline);
 if (!wp->config->neon_tenant)
-    walprop_log(FATAL, "neon.tenant_id is not provided");
+    wp_log(FATAL, "neon.tenant_id is not provided");
 if (*wp->config->neon_tenant != '\0' &&
     !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16))
-    walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant);
+    wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant);

 wp->greetRequest.timeline = wp->config->pgTimeline;
 wp->greetRequest.walSegSize = wp->config->wal_segment_size;
@@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp)
 if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
                                wp->config->safekeeper_connection_timeout))
 {
-    walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
+    wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
         sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout);
     ShutdownConnection(sk);
 }
 }
@@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk)
 *
 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
 */
-walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
+wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s",
     sk->host, sk->port, wp->api.conn_error_message(sk));

 /*
  * Even though the connection failed, we still need to clean up the
@@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk)
  * (see libpqrcv_connect, defined in
  * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c)
  */
-walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port);
+wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port);

 sk->state = SS_CONNECTING_WRITE;
 sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
@@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp)
 static void
 AdvancePollState(Safekeeper *sk, uint32 events)
 {
-#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */
+#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */
 WalProposer *wp = sk->wp;
 #endif

@@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
  * ResetConnection
  */
 case SS_OFFLINE:
-    walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
+    wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline",
         sk->host, sk->port);
     break; /* actually unreachable, but prevents
             * -Wimplicit-fallthrough */

@@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
  * requests.
  */
 case SS_VOTING:
-    walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
+    wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
         sk->port, FormatSafekeeperState(sk));
     ResetConnection(sk);
     return;

@@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events)
  * Idle state for waiting votes from quorum.
  */
 case SS_IDLE:
-    walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
+    wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
         sk->port, FormatSafekeeperState(sk));
     ResetConnection(sk);
     return;

@@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk)
 switch (result)
 {
 case WP_CONN_POLLING_OK:
-    walprop_log(LOG, "connected with node %s:%s", sk->host,
+    wp_log(LOG, "connected with node %s:%s", sk->host,
         sk->port);
     sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

 /*
@@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk)
 break;

 case WP_CONN_POLLING_FAILED:
-    walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
+    wp_log(WARNING, "failed to connect to node '%s:%s': %s",
         sk->host, sk->port, wp->api.conn_error_message(sk));

 /*
  * If connecting failed, we don't want to restart the connection
@@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk)

 if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
 {
-    walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
+    wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
         sk->host, sk->port, wp->api.conn_error_message(sk));
     ShutdownConnection(sk);
     return;
 }
@@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk)
 break;

 case WP_EXEC_FAILED:
-    walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
+    wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s",
         sk->host, sk->port, wp->api.conn_error_message(sk));
     ShutdownConnection(sk);
     return;

@@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk)
  * wrong"
  */
 case WP_EXEC_UNEXPECTED_SUCCESS:
-    walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
+    wp_log(WARNING, "received bad response from safekeeper %s:%s query execution",
         sk->host, sk->port);
     ShutdownConnection(sk);
     return;
 }
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
     return;

-walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);

 /* Protocol is all good, move to voting. */
 sk->state = SS_VOTING;
@@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
 if (wp->n_connected == wp->quorum)
 {
     wp->propTerm++;
-    walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
+    wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);

     wp->voteRequest = (VoteRequest)
     {
@@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 else if (sk->greetResponse.term > wp->propTerm)
 {
     /* Another compute with higher term is running. */
-    walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+    wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
         sk->host, sk->port,
         sk->greetResponse.term, wp->propTerm);
 }

 /*
@@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk)
 WalProposer *wp = sk->wp;

 /* We have quorum for voting, send our vote request */
-walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
+wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term);
 /* On failure, logging & resetting is handled */
 if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT))
     return;
@@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk)
 if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse))
     return;

-walprop_log(LOG,
+wp_log(LOG,
     "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
     sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
     LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
     LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
     LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

 /*
  * In case of acceptor rejecting our vote, bail out, but only if either it
@@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk)
 if ((!sk->voteResponse.voteGiven) &&
     (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
 {
-    walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
+    wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
         sk->host, sk->port,
         sk->voteResponse.term, wp->propTerm);
 }
 Assert(sk->voteResponse.term == wp->propTerm);

@@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp)
 */
 if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
 {
-    walprop_log(FATAL, "failed to download WAL for logical replicaiton");
+    wp_log(FATAL, "failed to download WAL for logical replicaiton");
 }

 if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers)
@@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp)
 if (wp->timelineStartLsn != InvalidXLogRecPtr &&
     wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
 {
-    walprop_log(WARNING,
+    wp_log(WARNING,
         "inconsistent timelineStartLsn: current %X/%X, received %X/%X",
         LSN_FORMAT_ARGS(wp->timelineStartLsn),
         LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
 }
 wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
 }
@@ -969,7 +969,7 @@ DetermineEpochStartLsn(WalProposer *wp)
 {
     wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp);
 }
-walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
+wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn));
 }

 /*
@@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

-walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
+wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
     wp->quorum,
     wp->propTerm,
     LSN_FORMAT_ARGS(wp->propEpochStartLsn),
     wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
     LSN_FORMAT_ARGS(wp->truncateLsn));

 /*
  * Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp)
  * scenario.
  */
 disable_core_dump();
-walprop_log(PANIC,
+wp_log(PANIC,
     "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
     LSN_FORMAT_ARGS(wp->propEpochStartLsn),
     LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
 }
 }
 walprop_shared->mineLastElectedTerm = wp->propTerm;
@@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk)
 {
 /* safekeeper is empty or no common point, start from the beginning */
 sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
-
-if (sk->startStreamingAt < wp->truncateLsn)
-{
-    /*
-     * There's a gap between the WAL starting point and a truncateLsn,
-     * which can't appear in a normal working cluster. That gap means
-     * that all safekeepers reported that they have persisted WAL up
-     * to the truncateLsn before, but now current safekeeper tells
-     * otherwise.
-     *
-     * Also we have a special condition here, which is empty
-     * safekeeper with no history. In combination with a gap, that can
-     * happen when we introduce a new safekeeper to the cluster. This
-     * is a rare case, which is triggered manually for now, and should
-     * be treated with care.
-     */
-
-    /*
-     * truncateLsn will not change without ack from current
-     * safekeeper, and it's aligned to the WAL record, so we can
-     * safely start streaming from this point.
-     */
-    sk->startStreamingAt = wp->truncateLsn;
-
-    walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
-                sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
-                LSN_FORMAT_ARGS(sk->startStreamingAt));
-}
+wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
+    sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
+/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
+Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
 }
 else
 {
@@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk)
 }
 }

-Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);
+Assert(sk->startStreamingAt <= wp->availableLsn);

 msg.tag = 'e';
 msg.term = wp->propTerm;
@@ -1150,9 +1126,9 @@ SendProposerElected(Safekeeper *sk)
 msg.timelineStartLsn = wp->timelineStartLsn;

 lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
-walprop_log(LOG,
+wp_log(LOG,
     "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
     sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

 resetStringInfo(&sk->outbuf);
 pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1261,8 +1237,8 @@ HandleActiveState(Safekeeper *sk, uint32 events)
 /* expected never to happen, c.f. walprop_pg_active_state_update_event_set */
 if (events & WL_SOCKET_CLOSED)
 {
-    walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
+    wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket",
         sk->host, sk->port);
     ShutdownConnection(sk);
     return;
 }
@@ -1323,12 +1299,12 @@ SendAppendRequests(Safekeeper *sk)
 req = &sk->appendRequest;
 PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

-walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
+wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
     req->endLsn - req->beginLsn,
     LSN_FORMAT_ARGS(req->beginLsn),
     LSN_FORMAT_ARGS(req->endLsn),
     LSN_FORMAT_ARGS(req->commitLsn),
     LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

 resetStringInfo(&sk->outbuf);

@@ -1355,8 +1331,8 @@ SendAppendRequests(Safekeeper *sk)
 case NEON_WALREAD_WOULDBLOCK:
     return true;
 case NEON_WALREAD_ERROR:
-    walprop_log(WARNING, "WAL reading for node %s:%s failed: %s",
+    wp_log(WARNING, "WAL reading for node %s:%s failed: %s",
         sk->host, sk->port, errmsg);
     ShutdownConnection(sk);
     return false;
 default:
@@ -1388,9 +1364,9 @@ SendAppendRequests(Safekeeper *sk)
     return true;

 case PG_ASYNC_WRITE_FAIL:
-    walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s",
+    wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
         sk->host, sk->port, FormatSafekeeperState(sk),
         wp->api.conn_error_message(sk));
     ShutdownConnection(sk);
     return false;
 default:
@@ -1429,11 +1405,11 @@ RecvAppendResponses(Safekeeper *sk)
 if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse))
     break;

-walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
+wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
     sk->appendResponse.term,
     LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
     LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
     sk->host, sk->port);

 if (sk->appendResponse.term > wp->propTerm)
 {
@@ -1443,9 +1419,9 @@ RecvAppendResponses(Safekeeper *sk)
  * core as this is kinda expected scenario.
  */
 disable_core_dump();
-walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
+wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
     sk->host, sk->port,
     sk->appendResponse.term, wp->propTerm);
 }

 readAnything = true;
@@ -1489,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
 pq_getmsgint(reply_message, sizeof(int32));
 /* read value length */
 rf->currentClusterSize = pq_getmsgint64(reply_message);
-walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
+wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
     rf->currentClusterSize);
 }
 else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
 {
 pq_getmsgint(reply_message, sizeof(int32));
 /* read value length */
 rf->last_received_lsn = pq_getmsgint64(reply_message);
-walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
+wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
     LSN_FORMAT_ARGS(rf->last_received_lsn));
 }
 else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
 {
 pq_getmsgint(reply_message, sizeof(int32));
 /* read value length */
 rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
-walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
+wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
     LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
 }
 else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
 {
 pq_getmsgint(reply_message, sizeof(int32));
 /* read value length */
 rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
-walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
+wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
     LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
 }
 else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
 {
@@ -1526,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese

 /* Copy because timestamptz_to_str returns a static buffer */
 replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
-walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
+wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
     rf->replytime, replyTimeStr);

 pfree(replyTimeStr);
 }
@@ -1541,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
  * Skip unknown keys to support backward compatibile protocol
  * changes
  */
-walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
+wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
 pq_getmsgbytes(reply_message, len);
 };
 }
@@ -1606,7 +1582,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn)

 if (wp->n_votes < wp->quorum)
 {
-    walprop_log(WARNING, "GetDonor called before elections are won");
+    wp_log(WARNING, "GetDonor called before elections are won");
     return NULL;
 }

@@ -1734,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
     return false;

 case PG_ASYNC_READ_FAIL:
-    walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
+    wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host,
         sk->port, FormatSafekeeperState(sk),
         wp->api.conn_error_message(sk));
     ShutdownConnection(sk);
     return false;
 }
@@ -1774,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 tag = pq_getmsgint64_le(&s);
 if (tag != anymsg->tag)
 {
-    walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
+    wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
         sk->port, FormatSafekeeperState(sk));
     ResetConnection(sk);
     return false;
 }
@@ -1851,9 +1827,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes

 if (!wp->api.conn_blocking_write(sk, msg, msg_size))
 {
-    walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
+    wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
         sk->host, sk->port, FormatSafekeeperState(sk),
         wp->api.conn_error_message(sk));
     ShutdownConnection(sk);
     return false;
 }
@@ -1904,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
     wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);
     return false;
 case PG_ASYNC_WRITE_FAIL:
-    walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
+    wp_log(WARNING, "failed to send to node %s:%s in %s state: %s",
         sk->host, sk->port, FormatSafekeeperState(sk),
         wp->api.conn_error_message(sk));
     ShutdownConnection(sk);
     return false;
 default:
@@ -1943,9 +1919,9 @@ AsyncFlush(Safekeeper *sk)
 /* Nothing to do; try again when the socket's ready */
 return false;
 case -1:
-    walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
+    wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s",
         sk->host, sk->port, FormatSafekeeperState(sk),
         wp->api.conn_error_message(sk));
     ResetConnection(sk);
     return false;
 default:
@@ -1974,11 +1950,11 @@ CompareLsn(const void *a, const void *b)
 *
 * The strings are intended to be used as a prefix to "state", e.g.:
 *
-* walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
+* wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk));
 *
 * If this sort of phrasing doesn't fit the message, instead use something like:
 *
-* walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
+* wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk));
 */
 static char *
 FormatSafekeeperState(Safekeeper *sk)
@@ -2059,8 +2035,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
  * To give a descriptive message in the case of failure, we use elog
  * and then an assertion that's guaranteed to fail.
  */
-walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
+wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
     FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk));
 Assert(events_ok_for_state);
 }
 }
@@ -2199,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events)

 if (events & (~all_flags))
 {
-    walprop_log(WARNING, "Event formatting found unexpected component %d",
+    wp_log(WARNING, "event formatting found unexpected component %d",
         events & (~all_flags));
     return_str[6] = '*';
     return_str[7] = '\0';
 }
@@ -707,11 +707,23 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
 #define WPEVENT 1337 /* special log level for walproposer internal
                       * events */

+#define WP_LOG_PREFIX "[WP] "
+
+/*
+ * wp_log is used in pure wp code (walproposer.c), allowing API callback to
+ * catch logging.
+ */
 #ifdef WALPROPOSER_LIB
 extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
-#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
+#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__)
 #else
-#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
+#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
 #endif

+/*
+ * And wpg_log is used all other (postgres specific) walproposer code, just
+ * adding prefix.
+ */
+#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__)
+
 #endif /* __NEON_WALPROPOSER_H__ */
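These header hunks split the old walprop_log macro in two: wp_log for the pure walproposer code (routed through WalProposerLibLog in library builds so the embedding API can capture logging) and wpg_log for the Postgres-specific code, with the elog paths prepending WP_LOG_PREFIX. The prefix is attached by string-literal concatenation, so the format argument must now be a literal, and the GNU ## __VA_ARGS__ extension swallows the trailing comma when a call passes no variadic arguments. A minimal sketch of the same macro pattern (log_demo and LOG_PREFIX are illustrative names; the real macros expand to elog or WalProposerLibLog):

#include <stdio.h>

/* String-literal concatenation glues the prefix onto fmt at compile time,
 * so fmt must be a literal. The GNU "## __VA_ARGS__" extension drops the
 * trailing comma when no variadic arguments are given. */
#define LOG_PREFIX "[WP] "
#define log_demo(fmt, ...) printf(LOG_PREFIX fmt "\n", ## __VA_ARGS__)

int
main(void)
{
    log_demo("no arguments");                     /* comma swallowed */
    log_demo("connected to %s:%d", "sk-1", 5454); /* prefixed output */
    return 0;
}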
@@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos)
|
|||||||
{
|
{
|
||||||
StartReplicationCmd cmd;
|
StartReplicationCmd cmd;
|
||||||
|
|
||||||
elog(LOG, "WAL proposer starts streaming at %X/%X",
|
wpg_log(LOG, "WAL proposer starts streaming at %X/%X",
|
||||||
LSN_FORMAT_ARGS(startpos));
|
LSN_FORMAT_ARGS(startpos));
|
||||||
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
cmd.slotname = WAL_PROPOSER_SLOT_NAME;
|
||||||
cmd.timeline = wp->greetRequest.timeline;
|
cmd.timeline = wp->greetRequest.timeline;
|
||||||
cmd.startpoint = startpos;
|
cmd.startpoint = startpos;
|
||||||
@@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void)
|
|||||||
{
|
{
|
||||||
load_file("libpqwalreceiver", false);
|
load_file("libpqwalreceiver", false);
|
||||||
if (WalReceiverFunctions == NULL)
|
if (WalReceiverFunctions == NULL)
|
||||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Helper function */
|
/* Helper function */
|
||||||
@@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo)
|
|||||||
* PGconn structure"
|
* PGconn structure"
|
||||||
*/
|
*/
|
||||||
if (!pg_conn)
|
if (!pg_conn)
|
||||||
elog(FATAL, "failed to allocate new PGconn object");
|
wpg_log(FATAL, "failed to allocate new PGconn object");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* And in theory this allocation can fail as well, but it's incredibly
|
* And in theory this allocation can fail as well, but it's incredibly
|
||||||
@@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk)
|
|||||||
* unused. We'll expect it's never returned.
|
* unused. We'll expect it's never returned.
|
||||||
*/
|
*/
|
||||||
case PGRES_POLLING_ACTIVE:
|
case PGRES_POLLING_ACTIVE:
|
||||||
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
|
wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This return is never actually reached, but it's here to make
|
* This return is never actually reached, but it's here to make
|
||||||
@@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
|||||||
*/
|
*/
|
||||||
if (!result)
|
if (!result)
|
||||||
{
|
{
|
||||||
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
|
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
|
||||||
return WP_EXEC_UNEXPECTED_SUCCESS;
|
return WP_EXEC_UNEXPECTED_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (unexpected_success)
|
if (unexpected_success)
|
||||||
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
|
wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
|
||||||
|
|
||||||
return return_val;
|
return return_val;
|
||||||
}
|
}
|
||||||
@@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
|||||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||||
|
|
||||||
if (status != PGRES_FATAL_ERROR)
|
if (status != PGRES_FATAL_ERROR)
|
||||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If there was actually an error, it'll be properly reported
|
* If there was actually an error, it'll be properly reported
|
||||||
@@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
|||||||
case -1:
|
case -1:
|
||||||
return PG_ASYNC_WRITE_FAIL;
|
return PG_ASYNC_WRITE_FAIL;
|
||||||
default:
|
default:
|
||||||
elog(FATAL, "invalid return %d from PQputCopyData", result);
|
wpg_log(FATAL, "invalid return %d from PQputCopyData", result);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
|||||||
case -1:
|
case -1:
|
||||||
return PG_ASYNC_WRITE_FAIL;
|
return PG_ASYNC_WRITE_FAIL;
|
||||||
default:
|
default:
|
||||||
elog(FATAL, "invalid return %d from PQflush", result);
|
wpg_log(FATAL, "invalid return %d from PQflush", result);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1237,19 +1237,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
|
|||||||
return true; /* recovery not needed */
|
return true; /* recovery not needed */
|
||||||
endpos = wp->propEpochStartLsn;
|
endpos = wp->propEpochStartLsn;
|
||||||
|
|
||||||
/*
|
|
||||||
* If we need to download more than a max_slot_wal_keep_size, cap to it to
|
|
||||||
* avoid risk of exploding pg_wal. Logical replication won't work until
|
|
||||||
* recreated, but at least compute would start; this also follows
|
|
||||||
* max_slot_wal_keep_size semantics.
|
|
||||||
*/
|
|
||||||
download_range_mb = (endpos - startpos) / 1024 / 1024;
|
|
||||||
if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
|
|
||||||
{
|
|
||||||
startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024;
|
|
||||||
walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB",
|
|
||||||
LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb);
|
|
||||||
}
|
|
||||||
timeline = wp->greetRequest.timeline;
|
timeline = wp->greetRequest.timeline;
|
||||||
|
|
||||||
 	if (!neon_auth_token)
@@ -1262,7 +1249,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 		written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo);
 		if (written > MAXCONNINFO || written < 0)
-			elog(FATAL, "could not append password to the safekeeper connection string");
+			wpg_log(FATAL, "could not append password to the safekeeper connection string");
 	}
 
 #if PG_MAJORVERSION_NUM < 16
@@ -1279,11 +1266,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk)
 						err)));
 		return false;
 	}
-	elog(LOG,
+	wpg_log(LOG,
 		"start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline "
 		"%d",
 		sk->host, sk->port, (uint32) (startpos >> 32),
 		(uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline);
 
 	options.logical = false;
 	options.startpoint = startpos;
@@ -1481,11 +1468,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
 {
 	char		log_prefix[64];
 
-	snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
+	snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
 	Assert(!sk->xlogreader);
 	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
 	if (sk->xlogreader == NULL)
-		elog(FATAL, "Failed to allocate xlog reader");
+		wpg_log(FATAL, "failed to allocate xlog reader");
 }
 
 static NeonWALReadResult
@@ -1549,7 +1536,7 @@ static void
 walprop_pg_init_event_set(WalProposer *wp)
 {
 	if (waitEvents)
-		elog(FATAL, "double-initialization of event set");
+		wpg_log(FATAL, "double-initialization of event set");
 
 	/* for each sk, we have socket plus potentially socket for neon walreader */
 	waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
@@ -1581,7 +1568,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events)
 	Assert(sk->nwrEventPos == -1);
 	sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
 	sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader);
-	elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
+	wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
 }
 
 static void
@@ -1680,8 +1667,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
 {
 	WalProposer *wp = to_remove->wp;
 
-	elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
+	wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d",
 		to_remove->host, to_remove->port, is_sk);
 
 	/*
 	 * Shortpath for exiting if have nothing to do. We never call this
@@ -1835,13 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp)
 	rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
 	rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime;
 
-	elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
+	wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
 		" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
 		rf->currentClusterSize,
 		LSN_FORMAT_ARGS(rf->last_received_lsn),
 		LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
 		LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
 		rf->replytime);
 }
 
 /*
@@ -1987,7 +1974,7 @@ GetLogRepRestartLSN(WalProposer *wp)
 {
 	uint64		download_range_mb;
 
-	elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+	wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
 
 	/*
 	 * If we need to download more than a max_slot_wal_keep_size,
@@ -1999,8 +1986,8 @@ GetLogRepRestartLSN(WalProposer *wp)
 	download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB;
 	if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb)
 	{
-		walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
+		wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB",
 			LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb);
 		return InvalidXLogRecPtr;
 	}
 
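Taken together, the hunks above migrate the walproposer's logging from PostgreSQL's bare elog() onto a wpg_log() wrapper (normalizing a couple of message capitalizations along the way); the WP_LOG_PREFIX spliced into the NeonWALReader log prefix suggests the goal is a common, greppable prefix on every walproposer message. wpg_log's definition is outside this excerpt, so reading it as an elog-compatible macro is an assumption.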
poetry.lock | 174 lines changed (generated)
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "aiohttp"
@@ -288,70 +288,21 @@ files = [
     {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
 ]
 
-[[package]]
-name = "black"
-version = "23.3.0"
-description = "The uncompromising code formatter."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
-    {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
-    {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
-    {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
-    {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
-    {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
-    {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
-    {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
-    {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
-    {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
-    {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
-    {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
-    {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
-    {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
-    {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
-    {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
-    {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
-    {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
-]
-
-[package.dependencies]
-click = ">=8.0.0"
-mypy-extensions = ">=0.4.3"
-packaging = ">=22.0"
-pathspec = ">=0.9.0"
-platformdirs = ">=2"
-tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
-typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
-
-[package.extras]
-colorama = ["colorama (>=0.4.3)"]
-d = ["aiohttp (>=3.7.4)"]
-jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
-uvloop = ["uvloop (>=0.15.2)"]
-
 [[package]]
 name = "boto3"
-version = "1.26.16"
+version = "1.34.11"
 description = "The AWS SDK for Python"
 optional = false
-python-versions = ">= 3.7"
+python-versions = ">= 3.8"
 files = [
-    {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"},
-    {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"},
+    {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"},
+    {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"},
 ]
 
 [package.dependencies]
-botocore = ">=1.29.16,<1.30.0"
+botocore = ">=1.34.11,<1.35.0"
 jmespath = ">=0.7.1,<2.0.0"
-s3transfer = ">=0.6.0,<0.7.0"
+s3transfer = ">=0.10.0,<0.11.0"
 
 [package.extras]
 crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
@@ -702,22 +653,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"]
 
 [[package]]
 name = "botocore"
-version = "1.29.16"
+version = "1.34.11"
 description = "Low-level, data-driven core of boto 3."
 optional = false
-python-versions = ">= 3.7"
+python-versions = ">= 3.8"
 files = [
-    {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"},
-    {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"},
+    {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"},
+    {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"},
 ]
 
 [package.dependencies]
 jmespath = ">=0.7.1,<2.0.0"
 python-dateutil = ">=2.1,<3.0.0"
-urllib3 = ">=1.25.4,<1.27"
+urllib3 = [
+    {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""},
+    {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""},
+]
 
 [package.extras]
-crt = ["awscrt (==0.14.0)"]
+crt = ["awscrt (==0.19.19)"]
 
 [[package]]
 name = "botocore-stubs"
@@ -1624,17 +1578,6 @@ files = [
     {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]
 
-[[package]]
-name = "pathspec"
-version = "0.9.0"
-description = "Utility library for gitignore style pattern matching of file paths."
-optional = false
-python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
-files = [
-    {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"},
-    {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"},
-]
-
 [[package]]
 name = "pbr"
 version = "5.9.0"
@@ -1646,21 +1589,6 @@ files = [
     {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"},
 ]
 
-[[package]]
-name = "platformdirs"
-version = "2.5.2"
-description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"},
-    {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"},
-]
-
-[package.extras]
-docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"]
-test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"]
-
 [[package]]
 name = "pluggy"
 version = "1.0.0"
@@ -1889,13 +1817,13 @@ files = [
 
 [[package]]
 name = "pytest"
-version = "7.3.1"
+version = "7.4.4"
 description = "pytest: simple powerful testing with Python"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"},
-    {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"},
+    {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"},
+    {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"},
 ]
 
 [package.dependencies]
@@ -1907,7 +1835,7 @@ pluggy = ">=0.12,<2.0"
 tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 
 [package.extras]
-testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
+testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
 
 [[package]]
 name = "pytest-asyncio"
@@ -2204,46 +2132,46 @@ pyasn1 = ">=0.1.3"
 
 [[package]]
 name = "ruff"
-version = "0.0.269"
-description = "An extremely fast Python linter, written in Rust."
+version = "0.1.11"
+description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"},
-    {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"},
-    {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"},
-    {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"},
-    {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"},
-    {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"},
-    {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"},
-    {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"},
+    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
+    {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
+    {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
+    {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
+    {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
+    {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
+    {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
+    {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
 ]
 
 [[package]]
 name = "s3transfer"
-version = "0.6.0"
+version = "0.10.0"
 description = "An Amazon S3 Transfer Manager"
 optional = false
-python-versions = ">= 3.7"
+python-versions = ">= 3.8"
 files = [
-    {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"},
-    {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"},
+    {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"},
+    {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"},
 ]
 
 [package.dependencies]
-botocore = ">=1.12.36,<2.0a.0"
+botocore = ">=1.33.2,<2.0a.0"
 
 [package.extras]
-crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
+crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
 
 [[package]]
 name = "sarif-om"
@@ -2493,16 +2421,6 @@ files = [
     {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
     {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
     {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
     {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2740,4 +2658,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
+content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482"
@@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
     return cmd
 
 
-def black(fix_inplace: bool) -> str:
-    cmd = "poetry run black"
-    if not fix_inplace:
-        cmd += " --diff --check"
+def ruff_check(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff check"
+    if fix_inplace:
+        cmd += " --fix"
     return cmd
 
 
-def ruff(fix_inplace: bool) -> str:
-    cmd = "poetry run ruff"
-    if fix_inplace:
-        cmd += " --fix"
+def ruff_format(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff format"
+    if not fix_inplace:
+        cmd += " --diff --check"
     return cmd
 
 
@@ -109,16 +109,16 @@ if __name__ == "__main__":
         no_color=args.no_color,
     )
     check(
-        name="black",
+        name="ruff check",
         suffix=".py",
-        cmd=black(fix_inplace=args.fix_inplace),
+        cmd=ruff_check(fix_inplace=args.fix_inplace),
         changed_files=files,
         no_color=args.no_color,
    )
     check(
-        name="ruff",
+        name="ruff format",
         suffix=".py",
-        cmd=ruff(fix_inplace=args.fix_inplace),
+        cmd=ruff_format(fix_inplace=args.fix_inplace),
         changed_files=files,
         no_color=args.no_color,
     )
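Net effect of the two renames above: with --fix-inplace the lint wrapper now runs `poetry run ruff check --fix` and `poetry run ruff format`; without it, `poetry run ruff check` and `poetry run ruff format --diff --check`. ruff thus takes over black's formatter role while the wrapper's check/fix split stays the same.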
@@ -6,7 +6,7 @@ authors = []
 
 [tool.poetry.dependencies]
 python = "^3.9"
-pytest = "^7.3.1"
+pytest = "^7.4.4"
 psycopg2-binary = "^2.9.6"
 typing-extensions = "^4.6.1"
 PyJWT = {version = "^2.1.0", extras = ["crypto"]}
@@ -17,7 +17,7 @@ aiopg = "^1.4.0"
 Jinja2 = "^3.0.2"
 types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.10"
-boto3 = "^1.26.16"
+boto3 = "^1.34.11"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
 moto = {extras = ["server"], version = "^4.1.2"}
 backoff = "^2.2.1"
@@ -40,22 +40,13 @@ pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
 
 [tool.poetry.group.dev.dependencies]
-black = "^23.3.0"
 mypy = "==1.3.0"
-ruff = "^0.0.269"
+ruff = "^0.1.11"
 
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 
-[tool.black]
-line-length = 100
-extend-exclude = '''
-/(
-    vendor
-)/
-'''
-
 [tool.mypy]
 exclude = "^vendor/"
 check_untyped_defs = true
@@ -82,7 +73,9 @@ ignore_missing_imports = true
 [tool.ruff]
 target-version = "py39"
 extend-exclude = ["vendor/"]
-ignore = ["E501"]
+ignore = [
+    "E501", # Line too long, we don't want to be too strict about it
+]
 select = [
     "E", # pycodestyle
     "F", # Pyflakes
@@ -90,3 +83,4 @@ select = [
     "W", # pycodestyle
     "B", # bugbear
 ]
+line-length = 100 # this setting is rather guidance; it won't fail if it can't make the line shorter
@@ -6,6 +6,7 @@ license.workspace = true
 
 [dependencies]
 aws-sdk-s3.workspace = true
+aws-smithy-async.workspace = true
 either.workspace = true
 tokio-rustls.workspace = true
 anyhow.workspace = true
@@ -39,3 +40,5 @@ tracing-subscriber.workspace = true
 clap.workspace = true
 tracing-appender = "0.2"
 histogram = "0.7"
+
+futures.workspace = true
@@ -16,10 +16,12 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider;
 use aws_config::imds::credentials::ImdsCredentialsProvider;
 use aws_config::meta::credentials::CredentialsProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
+use aws_config::retry::RetryConfig;
 use aws_config::sso::SsoCredentialsProvider;
 use aws_config::BehaviorVersion;
-use aws_sdk_s3::config::Region;
+use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
 use aws_sdk_s3::{Client, Config};
+use aws_smithy_async::rt::sleep::TokioSleep;
 
 use clap::ValueEnum;
 use pageserver::tenant::TENANTS_SEGMENT_NAME;
@@ -283,9 +285,13 @@ pub fn init_s3_client(account_id: Option<String>, bucket_region: Region) -> Client
         )
     };
 
+    let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
     let mut builder = Config::builder()
         .behavior_version(BehaviorVersion::v2023_11_09())
         .region(bucket_region)
+        .retry_config(RetryConfig::adaptive().with_max_attempts(3))
+        .sleep_impl(SharedAsyncSleep::from(sleep_impl))
         .credentials_provider(credentials_provider);
 
     if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
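A note on why a sleep implementation shows up next to the retry policy: retry strategies back off between attempts, and a Config assembled by hand through Config::builder() (unlike one loaded via the aws_config defaults) does not necessarily carry an async sleep, hence TokioSleep wrapped into SharedAsyncSleep alongside the adaptive RetryConfig capped at 3 attempts. This rationale is inferred from the code, not stated anywhere in the change.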
@@ -1,3 +1,4 @@
+use pageserver_api::shard::TenantShardId;
 use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
 use s3_scrubber::scan_metadata::scan_metadata;
 use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth};
@@ -34,6 +35,8 @@ enum Command {
     ScanMetadata {
         #[arg(short, long, default_value_t = false)]
         json: bool,
+        #[arg(long = "tenant-id", num_args = 0..)]
+        tenant_ids: Vec<TenantShardId>,
     },
 }
 
@@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> {
     ));
 
     match cli.command {
-        Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
-            Err(e) => {
-                tracing::error!("Failed: {e}");
-                Err(e)
-            }
-            Ok(summary) => {
-                if json {
-                    println!("{}", serde_json::to_string(&summary).unwrap())
-                } else {
-                    println!("{}", summary.summary_string());
-                }
-                if summary.is_fatal() {
-                    Err(anyhow::anyhow!("Fatal scrub errors detected"))
-                } else if summary.is_empty() {
-                    // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
-                    // scrubber they were likely expecting to scan something, and if we see no timelines
-                    // at all then it's likely due to some configuration issues like a bad prefix
-                    Err(anyhow::anyhow!(
-                        "No timelines found in bucket {} prefix {}",
-                        bucket_config.bucket,
-                        bucket_config
-                            .prefix_in_bucket
-                            .unwrap_or("<none>".to_string())
-                    ))
-                } else {
-                    Ok(())
-                }
-            }
-        },
+        Command::ScanMetadata { json, tenant_ids } => {
+            match scan_metadata(bucket_config.clone(), tenant_ids).await {
+                Err(e) => {
+                    tracing::error!("Failed: {e}");
+                    Err(e)
+                }
+                Ok(summary) => {
+                    if json {
+                        println!("{}", serde_json::to_string(&summary).unwrap())
+                    } else {
+                        println!("{}", summary.summary_string());
+                    }
+                    if summary.is_fatal() {
+                        Err(anyhow::anyhow!("Fatal scrub errors detected"))
+                    } else if summary.is_empty() {
+                        // Strictly speaking an empty bucket is a valid bucket, but if someone ran the
+                        // scrubber they were likely expecting to scan something, and if we see no timelines
+                        // at all then it's likely due to some configuration issues like a bad prefix
+                        Err(anyhow::anyhow!(
+                            "No timelines found in bucket {} prefix {}",
+                            bucket_config.bucket,
+                            bucket_config
+                                .prefix_in_bucket
+                                .unwrap_or("<none>".to_string())
+                        ))
+                    } else {
+                        Ok(())
+                    }
+                }
+            }
+        }
         Command::FindGarbage {
             node_kind,
             depth,
@@ -187,10 +187,17 @@ Timeline layer count: {6}
 }
 
 /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.
-pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result<MetadataSummary> {
+pub async fn scan_metadata(
+    bucket_config: BucketConfig,
+    tenant_ids: Vec<TenantShardId>,
+) -> anyhow::Result<MetadataSummary> {
     let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;
 
-    let tenants = stream_tenants(&s3_client, &target);
+    let tenants = if tenant_ids.is_empty() {
+        futures::future::Either::Left(stream_tenants(&s3_client, &target))
+    } else {
+        futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok)))
+    };
 
     // How many tenants to process in parallel. We need to be mindful of pageservers
     // accessing the same per tenant prefixes, so use a lower setting than pageservers.
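The Either trick above is worth a note: stream_tenants(...) and futures::stream::iter(...) are different concrete types, so neither can be assigned to the same binding directly, while futures::future::Either implements Stream when both arms do, unifying them without boxing. A minimal sketch of the pattern (names and item type are illustrative, not taken from the scrubber):

use futures::executor::block_on;
use futures::future::Either;
use futures::stream::{self, Stream, StreamExt};

/// One binding that is either a "list everything" stream or a fixed,
/// caller-supplied set; Either implements Stream when both arms do.
fn tenants(requested: Vec<u32>) -> impl Stream<Item = u32> {
    if requested.is_empty() {
        Either::Left(stream::iter(0..3)) // stand-in for stream_tenants(..)
    } else {
        Either::Right(stream::iter(requested))
    }
}

fn main() {
    let listed: Vec<u32> = block_on(tenants(vec![]).collect());
    let fixed: Vec<u32> = block_on(tenants(vec![7, 9]).collect());
    assert_eq!(listed, vec![0, 1, 2]);
    assert_eq!(fixed, vec![7, 9]);
}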
@@ -4,6 +4,12 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+default = []
+# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
+# which adds some runtime cost to run tests on outage conditions
+testing = ["fail/failpoints"]
+
 [dependencies]
 async-stream.workspace = true
 anyhow.workspace = true
@@ -16,6 +22,7 @@ chrono.workspace = true
 clap = { workspace = true, features = ["derive"] }
 const_format.workspace = true
 crc32c.workspace = true
+fail.workspace = true
 fs2.workspace = true
 git-version.workspace = true
 hex.workspace = true
@@ -47,6 +54,7 @@ postgres_ffi.workspace = true
 pq_proto.workspace = true
 remote_storage.workspace = true
 safekeeper_api.workspace = true
+sha2.workspace = true
 sd-notify.workspace = true
 storage_broker.workspace = true
 tokio-stream.workspace = true
@@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id";
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
+const FEATURES: &[&str] = &[
+    #[cfg(feature = "testing")]
+    "testing",
+];
+
+fn version() -> String {
+    format!(
+        "{GIT_VERSION} failpoints: {}, features: {:?}",
+        fail::has_failpoints(),
+        FEATURES,
+    )
+}
+
 const ABOUT: &str = r#"
 A fleet of safekeepers is responsible for reliably storing WAL received from
 compute, passing it through consensus (mitigating potential computes brain
@@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> {
     // getting 'argument cannot be used multiple times' error. This seems to be
     // impossible with pure Derive API, so convert struct to Command, modify it,
     // parse arguments, and then fill the struct back.
-    let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
+    let cmd = <Args as clap::CommandFactory>::command()
+        .args_override_self(true)
+        .version(version());
     let mut matches = cmd.get_matches();
     let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
 
@@ -66,12 +66,10 @@ impl FileStorage {
 
     /// Create file storage for a new timeline, but don't persist it yet.
     pub fn create_new(
-        ttid: &TenantTimelineId,
+        timeline_dir: Utf8PathBuf,
         conf: &SafeKeeperConf,
         state: SafeKeeperState,
     ) -> Result<FileStorage> {
-        let timeline_dir = conf.timeline_dir(ttid);
-
         let store = FileStorage {
             timeline_dir,
             conf: conf.clone(),
@@ -277,7 +275,8 @@ mod test {
             .await
             .expect("failed to create timeline dir");
         let state = SafeKeeperState::empty();
-        let storage = FileStorage::create_new(ttid, conf, state.clone())?;
+        let timeline_dir = conf.timeline_dir(ttid);
+        let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
         Ok((storage, state))
     }
 
safekeeper/src/copy_timeline.rs | 250 lines added (new file)
@@ -0,0 +1,250 @@
+use std::sync::Arc;
+
+use anyhow::{bail, Result};
+use camino::Utf8PathBuf;
+
+use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use tokio::{
+    fs::OpenOptions,
+    io::{AsyncSeekExt, AsyncWriteExt},
+};
+use tracing::{info, warn};
+use utils::{id::TenantTimelineId, lsn::Lsn};
+
+use crate::{
+    control_file::{FileStorage, Storage},
+    pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
+    safekeeper::SafeKeeperState,
+    timeline::{Timeline, TimelineError},
+    wal_backup::copy_s3_segments,
+    wal_storage::{wal_file_paths, WalReader},
+    GlobalTimelines, SafeKeeperConf,
+};
+
+// we don't want to have more than 10 segments on disk after copy, because they take space
+const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
+
+pub struct Request {
+    pub source: Arc<Timeline>,
+    pub until_lsn: Lsn,
+    pub destination_ttid: TenantTimelineId,
+}
+
+pub async fn handle_request(request: Request) -> Result<()> {
+    // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
+    // if LSN will point to the middle of a WAL record, timeline will be in "broken" state
+
+    match GlobalTimelines::get(request.destination_ttid) {
+        // timeline already exists. would be good to check that this timeline is the copy
+        // of the source timeline, but it isn't obvious how to do that
+        Ok(_) => return Ok(()),
+        // timeline not found, we are going to create it
+        Err(TimelineError::NotFound(_)) => {}
+        // error, probably timeline was deleted
+        res => {
+            res?;
+        }
+    }
+
+    let conf = &GlobalTimelines::get_global_config();
+    let ttid = request.destination_ttid;
+
+    let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
+
+    let (mem_state, state) = request.source.get_state().await;
+    let start_lsn = state.timeline_start_lsn;
+    if start_lsn == Lsn::INVALID {
+        bail!("timeline is not initialized");
+    }
+    let backup_lsn = mem_state.backup_lsn;
+
+    {
+        let commit_lsn = mem_state.commit_lsn;
+        let flush_lsn = request.source.get_flush_lsn().await;
+
+        info!(
+            "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
+            start_lsn, backup_lsn, commit_lsn, flush_lsn
+        );
+
+        assert!(backup_lsn >= start_lsn);
+        assert!(commit_lsn >= start_lsn);
+        assert!(flush_lsn >= start_lsn);
+
+        if request.until_lsn > flush_lsn {
+            bail!("requested LSN is beyond the end of the timeline");
+        }
+        if request.until_lsn < start_lsn {
+            bail!("requested LSN is before the start of the timeline");
+        }
+
+        if request.until_lsn > commit_lsn {
+            warn!("copy_timeline WAL is not fully committed");
+        }
+
+        if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG {
+            // we have a lot of segments that are not backed up. we can try to wait here until
+            // segments will be backed up to remote storage, but it's not clear how long to wait
+            bail!("too many segments are not backed up");
+        }
+    }
+
+    let wal_seg_size = state.server.wal_seg_size as usize;
+    if wal_seg_size == 0 {
+        bail!("wal_seg_size is not set");
+    }
+
+    let first_segment = start_lsn.segment_number(wal_seg_size);
+    let last_segment = request.until_lsn.segment_number(wal_seg_size);
+
+    let new_backup_lsn = {
+        // we can't have new backup_lsn greater than existing backup_lsn or start of the last segment
+        let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64));
+
+        if max_backup_lsn <= start_lsn {
+            // probably we are starting from the first segment, which was not backed up yet.
+            // note that start_lsn can be in the middle of the segment
+            start_lsn
+        } else {
+            // we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up
+            assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0);
+            max_backup_lsn
+        }
+    };
+
+    // all previous segments will be copied inside S3
+    let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size);
+    assert!(first_ondisk_segment <= last_segment);
+    assert!(first_ondisk_segment >= first_segment);
+
+    copy_s3_segments(
+        wal_seg_size,
+        &request.source.ttid,
+        &request.destination_ttid,
+        first_segment,
+        first_ondisk_segment,
+    )
+    .await?;
+
+    copy_disk_segments(
+        conf,
+        &state,
+        wal_seg_size,
+        &request.source.ttid,
+        new_backup_lsn,
+        request.until_lsn,
+        &tli_dir_path,
+    )
+    .await?;
+
+    let mut new_state = SafeKeeperState::new(
+        &request.destination_ttid,
+        state.server.clone(),
+        vec![],
+        request.until_lsn,
+        start_lsn,
+    );
+    new_state.timeline_start_lsn = start_lsn;
+    new_state.peer_horizon_lsn = request.until_lsn;
+    new_state.backup_lsn = new_backup_lsn;
+
+    let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?;
+    file_storage.persist(&new_state).await?;
+
+    // now we have a ready timeline in a temp directory
+    validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
+    load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
+
+    Ok(())
+}
+
+async fn copy_disk_segments(
+    conf: &SafeKeeperConf,
+    persisted_state: &SafeKeeperState,
+    wal_seg_size: usize,
+    source_ttid: &TenantTimelineId,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+    tli_dir_path: &Utf8PathBuf,
+) -> Result<()> {
+    let mut wal_reader = WalReader::new(
+        conf.workdir.clone(),
+        conf.timeline_dir(source_ttid),
+        persisted_state,
+        start_lsn,
+        true,
+    )?;
+
+    let mut buf = [0u8; MAX_SEND_SIZE];
+
+    let first_segment = start_lsn.segment_number(wal_seg_size);
+    let last_segment = end_lsn.segment_number(wal_seg_size);
+
+    for segment in first_segment..=last_segment {
+        let segment_start = segment * wal_seg_size as u64;
+        let segment_end = segment_start + wal_seg_size as u64;
+
+        let copy_start = segment_start.max(start_lsn.0);
+        let copy_end = segment_end.min(end_lsn.0);
+
+        let copy_start = copy_start - segment_start;
+        let copy_end = copy_end - segment_start;
+
+        let wal_file_path = {
+            let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?;
+
+            if segment == last_segment {
+                partial
+            } else {
+                normal
+            }
+        };
+
+        write_segment(
+            &mut buf,
+            &wal_file_path,
+            wal_seg_size as u64,
+            copy_start,
+            copy_end,
+            &mut wal_reader,
+        )
+        .await?;
+    }
+
+    Ok(())
+}
+
+async fn write_segment(
+    buf: &mut [u8],
+    file_path: &Utf8PathBuf,
+    wal_seg_size: u64,
+    from: u64,
+    to: u64,
+    reader: &mut WalReader,
+) -> Result<()> {
+    assert!(from <= to);
+    assert!(to <= wal_seg_size);
+
+    let mut file = OpenOptions::new()
+        .create(true)
+        .write(true)
+        .open(&file_path)
+        .await?;
+
+    // maybe fill with zeros, as in wal_storage.rs?
+    file.set_len(wal_seg_size).await?;
+    file.seek(std::io::SeekFrom::Start(from)).await?;
+
+    let mut bytes_left = to - from;
+    while bytes_left > 0 {
+        let len = bytes_left as usize;
+        let len = len.min(buf.len());
+        let len = reader.read(&mut buf[..len]).await?;
+        file.write_all(&buf[..len]).await?;
+        bytes_left -= len as u64;
+    }
+
+    file.flush().await?;
+    file.sync_all().await?;
+    Ok(())
+}
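One subtlety in copy_disk_segments above is the per-segment copy window: the global [start_lsn, end_lsn) range is clamped to each segment, then rebased to an offset inside that segment's file. A standalone sketch of the arithmetic, with plain u64s in place of Lsn (values in the usage are illustrative):

/// Clamp a global byte range to one segment, returning (from, to) offsets
/// within the segment file; mirrors the copy_start/copy_end math above.
fn copy_window(segment: u64, seg_size: u64, start: u64, end: u64) -> (u64, u64) {
    let seg_start = segment * seg_size;
    let seg_end = seg_start + seg_size;
    (seg_start.max(start) - seg_start, seg_end.min(end) - seg_start)
}

fn main() {
    let seg = 16 * 1024 * 1024; // 16 MiB WAL segments
    // A range starting mid-segment 0 and ending mid-segment 1:
    assert_eq!(copy_window(0, seg, 100, seg + 50), (100, seg));
    assert_eq!(copy_window(1, seg, 100, seg + 50), (0, 50));
}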
@@ -7,13 +7,16 @@ use std::io::Read;
 use std::path::PathBuf;
 use std::sync::Arc;
 
+use anyhow::bail;
 use anyhow::Result;
 use camino::Utf8Path;
 use chrono::{DateTime, Utc};
 use postgres_ffi::XLogSegNo;
+use postgres_ffi::MAX_SEND_SIZE;
 use serde::Deserialize;
 use serde::Serialize;
 
+use sha2::{Digest, Sha256};
 use utils::id::NodeId;
 use utils::id::TenantTimelineId;
 use utils::id::{TenantId, TimelineId};
@@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory;
 use crate::SafeKeeperConf;
 
 use crate::send_wal::WalSenderState;
+use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
 
 /// Various filters that influence the resulting JSON output.
@@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config {
         wal_backup_enabled: config.wal_backup_enabled,
     }
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineDigestRequest {
+    pub from_lsn: Lsn,
+    pub until_lsn: Lsn,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct TimelineDigest {
+    pub sha256: String,
+}
+
+pub async fn calculate_digest(
+    tli: &Arc<crate::timeline::Timeline>,
+    request: TimelineDigestRequest,
+) -> Result<TimelineDigest> {
+    if request.from_lsn > request.until_lsn {
+        bail!("from_lsn is greater than until_lsn");
+    }
+
+    let conf = GlobalTimelines::get_global_config();
+    let (_, persisted_state) = tli.get_state().await;
+
+    if persisted_state.timeline_start_lsn > request.from_lsn {
+        bail!("requested LSN is before the start of the timeline");
+    }
+
+    let mut wal_reader = WalReader::new(
+        conf.workdir.clone(),
+        tli.timeline_dir.clone(),
+        &persisted_state,
+        request.from_lsn,
+        true,
+    )?;
+
+    let mut hasher = Sha256::new();
+    let mut buf = [0u8; MAX_SEND_SIZE];
+
+    let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize;
+    while bytes_left > 0 {
+        let bytes_to_read = std::cmp::min(buf.len(), bytes_left);
+        let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?;
+        if bytes_read == 0 {
+            bail!("wal_reader.read returned 0 bytes");
+        }
+        hasher.update(&buf[..bytes_read]);
+        bytes_left -= bytes_read;
+    }
+
+    let digest = hasher.finalize();
+    let digest = hex::encode(digest);
+    Ok(TimelineDigest { sha256: digest })
+}
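calculate_digest hashes the WAL range in fixed-size chunks rather than buffering it whole. The same loop shape, reduced to a synchronous sketch (the closure stands in for WalReader::read, and 8 KiB for MAX_SEND_SIZE; hex and sha2 are the crates the diff itself pulls in):

use sha2::{Digest, Sha256};

/// Hash `remaining` bytes pulled from `read` in bounded chunks,
/// returning the hex-encoded SHA-256, as the digest handler does.
fn hash_range(mut remaining: usize, mut read: impl FnMut(&mut [u8]) -> usize) -> String {
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 8192];
    while remaining > 0 {
        let want = remaining.min(buf.len());
        let got = read(&mut buf[..want]);
        assert!(got > 0, "reader returned no bytes");
        hasher.update(&buf[..got]);
        remaining -= got;
    }
    hex::encode(hasher.finalize())
}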
|
@@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri};
 
 use once_cell::sync::Lazy;
 use postgres_ffi::WAL_SEGMENT_SIZE;
-use safekeeper_api::models::SkTimelineInfo;
+use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
 use serde::{Deserialize, Serialize};
 use std::collections::{HashMap, HashSet};
 use std::fmt;
@@ -12,19 +12,23 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use tokio::fs::File;
 use tokio::io::AsyncReadExt;
+use tokio_util::sync::CancellationToken;
+use utils::failpoint_support::failpoints_handler;
+use utils::http::request::parse_query_param;
 
 use std::io::Write as _;
 use tokio::sync::mpsc;
 use tokio_stream::wrappers::ReceiverStream;
-use tracing::info_span;
+use tracing::{info_span, Instrument};
 use utils::http::endpoint::{request_span, ChannelWriter};
 
+use crate::debug_dump::TimelineDigestRequest;
 use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::Term;
 use crate::safekeeper::{ServerInfo, TermLsn};
 use crate::send_wal::WalSenderState;
 use crate::timeline::PeerInfo;
-use crate::{debug_dump, pull_timeline};
+use crate::{copy_timeline, debug_dump, pull_timeline};
 
 use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
@@ -202,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
     json_response(StatusCode::OK, resp)
 }
 
+async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+
+    let request_data: TimelineCopyRequest = json_request(&mut request).await?;
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "source_timeline_id")?,
+    );
+
+    let source = GlobalTimelines::get(ttid)?;
+
+    copy_timeline::handle_request(copy_timeline::Request{
+        source,
+        until_lsn: request_data.until_lsn,
+        destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
+    })
+    .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
+    .await
+    .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let ttid = TenantTimelineId::new(
+        parse_request_param(&request, "tenant_id")?,
+        parse_request_param(&request, "timeline_id")?,
+    );
+    check_permission(&request, Some(ttid.tenant_id))?;
+
+    let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
+    let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
+
+    let request = TimelineDigestRequest {
+        from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "from_lsn is required"
+        )))?,
+        until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "until_lsn is required"
+        )))?,
+    };
+
+    let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
+
+    let response = debug_dump::calculate_digest(&tli, request)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, response)
+}
+
 /// Download a file from the timeline directory.
 // TODO: figure out a better way to copy files between safekeepers
 async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -444,6 +498,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
         .data(Arc::new(conf))
         .data(auth)
         .get("/v1/status", |r| request_span(r, status_handler))
+        .put("/v1/failpoints", |r| {
+            request_span(r, move |r| async {
+                let cancel = CancellationToken::new();
+                failpoints_handler(r, cancel).await
+            })
+        })
         // Will be used in the future instead of implicit timeline creation
         .post("/v1/tenant/timeline", |r| {
             request_span(r, timeline_create_handler)
@@ -464,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
             "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
             |r| request_span(r, timeline_files_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
+            |r| request_span(r, timeline_copy_handler),
+        )
         // for tests
         .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
             request_span(r, record_safekeeper_info)
         })
         .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
+            request_span(r, timeline_digest_handler)
+        })
 }
 
 #[cfg(test)]
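
For reference, a hedged sketch of driving the two new routes directly with `requests`; the base URL, IDs, and LSNs are placeholders, and the JSON body mirrors the TimelineCopyRequest fields the copy handler above reads:

    import requests

    SK = "http://localhost:7676"  # placeholder safekeeper HTTP address

    def copy_timeline(tenant_id: str, source_timeline_id: str,
                      target_timeline_id: str, until_lsn: str) -> None:
        # POST /v1/tenant/:tenant_id/timeline/:source_timeline_id/copy
        r = requests.post(
            f"{SK}/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy",
            json={"target_timeline_id": target_timeline_id, "until_lsn": until_lsn},
        )
        r.raise_for_status()

    def timeline_digest(tenant_id: str, timeline_id: str,
                        from_lsn: str, until_lsn: str) -> str:
        # GET .../digest?from_lsn=..&until_lsn=.. returns the TimelineDigest JSON
        r = requests.get(
            f"{SK}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
            params={"from_lsn": from_lsn, "until_lsn": until_lsn},
        )
        r.raise_for_status()
        return r.json()["sha256"]
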
@@ -16,6 +16,7 @@ mod auth;
 pub mod broker;
 pub mod control_file;
 pub mod control_file_upgrade;
+pub mod copy_timeline;
 pub mod debug_dump;
 pub mod handler;
 pub mod http;
@@ -1,16 +1,24 @@
+use std::sync::Arc;
+
+use camino::Utf8PathBuf;
+use camino_tempfile::Utf8TempDir;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 
 use anyhow::{bail, Context, Result};
 use tokio::io::AsyncWriteExt;
 use tracing::info;
-use utils::id::{TenantId, TenantTimelineId, TimelineId};
+use utils::{
+    id::{TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+};
 
 use crate::{
     control_file, debug_dump,
     http::routes::TimelineStatus,
+    timeline::{Timeline, TimelineError},
     wal_storage::{self, Storage},
-    GlobalTimelines,
+    GlobalTimelines, SafeKeeperConf,
 };
 
 /// Info about timeline on safekeeper ready for reporting.
@@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result<Response> {
 async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response> {
     let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
     info!(
-        "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
+        "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
         ttid,
         host,
         status.commit_lsn,
@@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
 
     if dump.timelines.len() != 1 {
         bail!(
-            "Expected to fetch single timeline, got {} timelines",
+            "expected to fetch single timeline, got {} timelines",
            dump.timelines.len()
         );
     }
 
     let timeline = dump.timelines.into_iter().next().unwrap();
     let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!(
-        "Timeline {} doesn't have disk content",
+        "timeline {} doesn't have disk content",
         ttid
     ))?;
 
@@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
     filenames.insert(0, "safekeeper.control".to_string());
 
     info!(
-        "Downloading {} files from safekeeper {}",
+        "downloading {} files from safekeeper {}",
         filenames.len(),
         host
     );
 
-    // Creating temp directory for a new timeline. It needs to be
-    // located on the same filesystem as the rest of the timelines.
-
-    // conf.workdir is usually /storage/safekeeper/data
-    // will try to transform it into /storage/safekeeper/tmp
-    let temp_base = conf
-        .workdir
-        .parent()
-        .ok_or(anyhow::anyhow!("workdir has no parent"))?
-        .join("tmp");
-
-    tokio::fs::create_dir_all(&temp_base).await?;
-
-    let tli_dir = camino_tempfile::Builder::new()
-        .suffix("_temptli")
-        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
-        .tempdir_in(temp_base)?;
-    let tli_dir_path = tli_dir.path().to_path_buf();
+    let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
 
     // Note: some time happens between fetching list of files and fetching files themselves.
     // It's possible that some files will be removed from safekeeper and we will fail to fetch them.
@@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
     // TODO: fsync?
 
     // Let's create timeline from temp directory and verify that it's correct
+    let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?;
+    info!(
+        "finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
+        ttid, commit_lsn, flush_lsn
+    );
+    assert!(status.commit_lsn <= status.flush_lsn);
 
-    let control_path = tli_dir_path.join("safekeeper.control");
+    // Finally, load the timeline.
+    let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?;
+
+    Ok(Response {
+        safekeeper_host: host,
+    })
+}
+
+/// Create temp directory for a new timeline. It needs to be located on the same
+/// filesystem as the rest of the timelines. It will be automatically deleted when
+/// Utf8TempDir goes out of scope.
+pub async fn create_temp_timeline_dir(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+) -> Result<(Utf8TempDir, Utf8PathBuf)> {
+    // conf.workdir is usually /storage/safekeeper/data
+    // will try to transform it into /storage/safekeeper/tmp
+    let temp_base = conf
+        .workdir
+        .parent()
+        .ok_or(anyhow::anyhow!("workdir has no parent"))?
+        .join("tmp");
+
+    tokio::fs::create_dir_all(&temp_base).await?;
+
+    let tli_dir = camino_tempfile::Builder::new()
+        .suffix("_temptli")
+        .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
+        .tempdir_in(temp_base)?;
+
+    let tli_dir_path = tli_dir.path().to_path_buf();
+
+    Ok((tli_dir, tli_dir_path))
+}
+
+/// Do basic validation of a temp timeline, before moving it to the global map.
+pub async fn validate_temp_timeline(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    path: &Utf8PathBuf,
+) -> Result<(Lsn, Lsn)> {
+    let control_path = path.join("safekeeper.control");
 
     let control_store = control_file::FileStorage::load_control_file(control_path)?;
     if control_store.server.wal_seg_size == 0 {
         bail!("wal_seg_size is not set");
     }
 
-    let wal_store =
-        wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?;
+    let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
 
-    let commit_lsn = status.commit_lsn;
+    let commit_lsn = control_store.commit_lsn;
     let flush_lsn = wal_store.flush_lsn();
 
-    info!(
-        "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
-        ttid, commit_lsn, flush_lsn
-    );
-    assert!(status.commit_lsn <= status.flush_lsn);
+    Ok((commit_lsn, flush_lsn))
+}
+
+/// Move timeline from a temp directory to the main storage, and load it to the global map.
+/// This operation is done under a lock to prevent bugs if several concurrent requests are
+/// trying to load the same timeline. Note that it doesn't guard against creating the
+/// timeline with the same ttid, but no one should be doing this anyway.
+pub async fn load_temp_timeline(
+    conf: &SafeKeeperConf,
+    ttid: TenantTimelineId,
+    tmp_path: &Utf8PathBuf,
+) -> Result<Arc<Timeline>> {
+    // Take a lock to prevent concurrent loadings
+    let load_lock = GlobalTimelines::loading_lock().await;
+    let guard = load_lock.lock().await;
+
+    if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) {
+        bail!("timeline already exists, cannot overwrite it")
+    }
 
     // Move timeline dir to the correct location
     let timeline_path = conf.timeline_dir(&ttid);
 
     info!(
-        "Moving timeline {} from {} to {}",
-        ttid, tli_dir_path, timeline_path
+        "moving timeline {} from {} to {}",
+        ttid, tmp_path, timeline_path
     );
     tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
-    tokio::fs::rename(tli_dir_path, &timeline_path).await?;
+    tokio::fs::rename(tmp_path, &timeline_path).await?;
 
-    let tli = GlobalTimelines::load_timeline(ttid)
+    let tli = GlobalTimelines::load_timeline(&guard, ttid)
         .await
         .context("Failed to load timeline after copy")?;
 
     info!(
-        "Loaded timeline {}, flush_lsn={}",
+        "loaded timeline {}, flush_lsn={}",
         ttid,
         tli.get_flush_lsn().await
     );
 
-    Ok(Response {
-        safekeeper_host: host,
-    })
+    Ok(tli)
 }
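
The helpers above split pull_timeline into create/validate/load steps. The key design point is that the temp directory is created next to the data directory, on the same filesystem, so the final move into place is a single atomic rename. A Python sketch of the same pattern under that assumption (all names here are illustrative):

    import os
    import tempfile

    def install_from_temp(workdir: str, ttid: str, populate) -> str:
        """Build a timeline in a temp dir beside the data dir, then atomically
        move it into place; a crash mid-populate leaves no partial timeline."""
        temp_base = os.path.join(os.path.dirname(workdir), "tmp")
        os.makedirs(temp_base, exist_ok=True)
        tli_dir = tempfile.mkdtemp(prefix=f"{ttid}_", suffix="_temptli", dir=temp_base)
        populate(tli_dir)                 # download files, validate control file, etc.
        final_path = os.path.join(workdir, ttid)
        os.rename(tli_dir, final_path)    # atomic because both paths share a filesystem
        return final_path

A rename across filesystems would silently become a copy, which is exactly what the "same filesystem" requirement in the doc comment guards against.
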
@@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
 use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
 use serde::{Deserialize, Serialize};
 use tokio::io::{AsyncRead, AsyncWrite};
+use utils::failpoint_support;
 use utils::id::TenantTimelineId;
 use utils::lsn::AtomicLsn;
 use utils::pageserver_feedback::PageserverFeedback;
@@ -391,15 +392,8 @@ impl SafekeeperPostgresHandler {
         // application_name: give only committed WAL (used by pageserver) or all
         // existing WAL (up to flush_lsn, used by walproposer or peer recovery).
         // The second case is always driven by a consensus leader which term
-        // must generally be also supplied. However we're sloppy to do this in
-        // walproposer recovery which will be removed soon. So TODO is to make
-        // it not Option'al then.
-        //
-        // Fetching WAL without term in recovery creates a small risk of this
-        // WAL getting concurrently garbaged if another compute rises which
-        // collects majority and starts fixing log on this safekeeper itself.
-        // That's ok as (old) proposer will never be able to commit such WAL.
-        let end_watch = if self.is_walproposer_recovery() {
+        // must be supplied.
+        let end_watch = if term.is_some() {
             EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
         } else {
             EndWatch::Commit(tli.get_commit_lsn_watch_rx())
@@ -535,12 +529,19 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
         );
 
         // try to send as much as available, capped by MAX_SEND_SIZE
-        let mut send_size = self
-            .end_pos
-            .checked_sub(self.start_pos)
-            .context("reading wal without waiting for it first")?
-            .0 as usize;
-        send_size = min(send_size, self.send_buf.len());
+        let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64;
+        // if we went behind available WAL, back off
+        if chunk_end_pos >= self.end_pos {
+            chunk_end_pos = self.end_pos;
+        } else {
+            // If sending not up to end pos, round down to page boundary to
+            // avoid breaking WAL record not at page boundary, as protocol
+            // demands. See walsender.c (XLogSendPhysical).
+            chunk_end_pos = chunk_end_pos
+                .checked_sub(chunk_end_pos.block_offset())
+                .unwrap();
+        }
+        let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize;
         let send_buf = &mut self.send_buf[..send_size];
         let send_size: usize;
         {
@@ -551,7 +552,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
             } else {
                 None
             };
-            // read wal into buffer
+            // Read WAL into buffer. send_size can be additionally capped to
+            // segment boundary here.
             send_size = self.wal_reader.read(send_buf).await?
         };
         let send_buf = &send_buf[..send_size];
@@ -566,6 +568,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
             }))
             .await?;
 
+        if let Some(appname) = &self.appname {
+            if appname == "replica" {
+                failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep");
+            }
+        }
         trace!(
             "sent {} bytes of WAL {}-{}",
             send_size,
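
The new chunking logic caps each XLogData chunk at MAX_SEND_SIZE and, when the chunk does not run right up to the available end position, rounds the cut point down to a WAL page boundary so a record is never split mid-page. A small Python sketch of the same arithmetic (XLOG_BLCKSZ = 8192 is an assumption for the sketch; `block_offset()` in the Rust is the position modulo the page size):

    XLOG_BLCKSZ = 8192  # Postgres WAL page size, assumed here

    def chunk_end(start_pos: int, end_pos: int, max_send_size: int) -> int:
        """End position of the next send chunk, mirroring the rounding rule."""
        pos = start_pos + max_send_size
        if pos >= end_pos:
            return end_pos                    # ran past available WAL: back off
        return pos - (pos % XLOG_BLCKSZ)      # round down to a page boundary

For example, with start_pos = 0, end_pos = 1_000_000 and max_send_size = 131_072, the chunk ends at 131_072 (already page-aligned); with start_pos = 100, it ends at 131_072 rather than 131_172.
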
@@ -141,7 +141,8 @@ impl SharedState {
 
         // We don't want to write anything to disk, because we may have existing timeline there.
         // These functions should not change anything on disk.
-        let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
+        let timeline_dir = conf.timeline_dir(ttid);
+        let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
         let wal_store =
             wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
         let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;
@@ -21,8 +21,12 @@ struct GlobalTimelinesState {
     timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
     wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
     conf: Option<SafeKeeperConf>,
+    load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
 }
 
+// Used to prevent concurrent timeline loading.
+pub struct TimelineLoadLock;
+
 impl GlobalTimelinesState {
     /// Get configuration, which must be set once during init.
     fn get_conf(&self) -> &SafeKeeperConf {
@@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
         timelines: HashMap::new(),
         wal_backup_launcher_tx: None,
         conf: None,
+        load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
     })
 });
 
@@ -174,8 +179,16 @@ impl GlobalTimelines {
         Ok(())
     }
 
+    /// Take a lock for timeline loading.
+    pub async fn loading_lock() -> Arc<tokio::sync::Mutex<TimelineLoadLock>> {
+        TIMELINES_STATE.lock().unwrap().load_lock.clone()
+    }
+
     /// Load timeline from disk to the memory.
-    pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+    pub async fn load_timeline<'a>(
+        _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
+        ttid: TenantTimelineId,
+    ) -> Result<Arc<Timeline>> {
         let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
 
         match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
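
The load lock above serializes timeline loading so that two concurrent pull/copy requests cannot both move a directory into place and register the same timeline; requiring the MutexGuard as a parameter makes "caller holds the lock" a compile-time fact. An asyncio sketch of the same guarded-load pattern (the dict and loader callback are stand-ins, not the real global map):

    import asyncio

    _load_lock = asyncio.Lock()   # analogue of GlobalTimelines::loading_lock()
    _timelines: dict = {}

    async def load_timeline(ttid, load_from_disk):
        """Serialize loads: check-then-install runs atomically under the lock."""
        async with _load_lock:
            if ttid in _timelines:
                raise RuntimeError("timeline already exists, cannot overwrite it")
            tli = await load_from_disk(ttid)
            _timelines[ttid] = tli
            return tli
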
@@ -7,7 +7,7 @@ use tokio::task::JoinHandle;
 use utils::id::NodeId;
 
 use std::cmp::min;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;
@@ -531,3 +531,62 @@ pub async fn read_object(
 
     Ok(Box::pin(reader))
 }
+
+/// Copy segments from one timeline to another. Used in copy_timeline.
+pub async fn copy_s3_segments(
+    wal_seg_size: usize,
+    src_ttid: &TenantTimelineId,
+    dst_ttid: &TenantTimelineId,
+    from_segment: XLogSegNo,
+    to_segment: XLogSegNo,
+) -> Result<()> {
+    const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;
+
+    let storage = REMOTE_STORAGE
+        .get()
+        .expect("failed to get remote storage")
+        .as_ref()
+        .unwrap();
+
+    let relative_dst_path =
+        Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
+
+    let remote_path = RemotePath::new(&relative_dst_path)?;
+
+    let files = storage.list_files(Some(&remote_path)).await?;
+    let uploaded_segments = &files
+        .iter()
+        .filter_map(|file| file.object_name().map(ToOwned::to_owned))
+        .collect::<HashSet<_>>();
+
+    debug!(
+        "these segments have already been uploaded: {:?}",
+        uploaded_segments
+    );
+
+    let relative_src_path =
+        Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
+
+    for segno in from_segment..to_segment {
+        if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
+            info!("copied all segments from {} until {}", from_segment, segno);
+        }
+
+        let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size);
+        if uploaded_segments.contains(&segment_name) {
+            continue;
+        }
+        debug!("copying segment {}", segment_name);
+
+        let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
+        let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
+
+        storage.copy_object(&from, &to).await?;
+    }
+
+    info!(
+        "finished copying segments from {} until {}",
+        from_segment, to_segment
+    );
+    Ok(())
+}
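
copy_s3_segments lists the destination prefix once and skips segment names that already exist there, so a retried copy does no duplicate work. A Python sketch of that idempotent loop against a hypothetical storage client with list/copy methods (not the real remote-storage trait):

    def copy_segments(storage, src_prefix: str, dst_prefix: str, names: list[str]) -> None:
        """Skip names already present at the destination; safe to re-run after
        a partial failure."""
        existing = set(storage.list(dst_prefix))   # one listing up front
        for name in names:
            if name in existing:
                continue                           # already copied on a prior run
            storage.copy(f"{src_prefix}/{name}", f"{dst_prefix}/{name}")
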
@@ -565,6 +565,9 @@ impl WalReader {
         })
     }
 
+    /// Read WAL at current position into provided buf, returns number of bytes
+    /// read. It can be smaller than buf size only if segment boundary is
+    /// reached.
     pub async fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
         // If this timeline is new, we may not have a full segment yet, so
         // we pad the first bytes of the timeline's first WAL segment with 0s
@@ -725,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
 }
 
 /// Helper returning full path to WAL segment file and its .partial brother.
-fn wal_file_paths(
+pub fn wal_file_paths(
     timeline_dir: &Utf8Path,
     segno: XLogSegNo,
     wal_seg_size: usize,
@@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
     If those files already exist, we will overwrite them.
     Returns basepath for files with captured output.
     """
-    assert type(cmd) is list
+    assert isinstance(cmd, list)
     base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
     basepath = os.path.join(capture_dir, base)
     stdout_filename = basepath + ".stdout"

@@ -6,5 +6,5 @@ set -euox pipefail
 echo 'Reformatting Rust code'
 cargo fmt
 echo 'Reformatting Python code'
-poetry run ruff --fix test_runner scripts
-poetry run black test_runner scripts
+poetry run ruff check --fix test_runner scripts
+poetry run ruff format test_runner scripts
@@ -347,7 +349,9 @@ class PgProtocol:
         """
         return self.safe_psql_many([query], **kwargs)[0]
 
-    def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
+    def safe_psql_many(
+        self, queries: List[str], log_query=True, **kwargs: Any
+    ) -> List[List[Tuple[Any, ...]]]:
         """
         Execute queries against the node and return all rows.
         This method passes all extra params to connstr.
@@ -356,7 +358,8 @@ class PgProtocol:
         with closing(self.connect(**kwargs)) as conn:
             with conn.cursor() as cur:
                 for query in queries:
-                    log.info(f"Executing query: {query}")
+                    if log_query:
+                        log.info(f"Executing query: {query}")
                     cur.execute(query)
 
                     if cur.description is None:
@@ -365,11 +368,11 @@ class PgProtocol:
                     result.append(cur.fetchall())
         return result
 
-    def safe_psql_scalar(self, query) -> Any:
+    def safe_psql_scalar(self, query, log_query=True) -> Any:
         """
         Execute query returning single row with single column.
         """
-        return self.safe_psql(query)[0][0]
+        return self.safe_psql(query, log_query=log_query)[0][0]
 
 
 @dataclass
@@ -890,8 +893,8 @@ class NeonEnv:
         """Get list of safekeeper endpoints suitable for safekeepers GUC"""
         return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
 
-    def get_pageserver_version(self) -> str:
-        bin_pageserver = str(self.neon_binpath / "pageserver")
+    def get_binary_version(self, binary_name: str) -> str:
+        bin_pageserver = str(self.neon_binpath / binary_name)
         res = subprocess.run(
             [bin_pageserver, "--version"],
             check=True,
@@ -1098,8 +1101,8 @@ class AbstractNeonCli(abc.ABC):
         If `local_binpath` is true, then we are invoking a test utility
         """
 
-        assert type(arguments) == list
-        assert type(self.COMMAND) == str
+        assert isinstance(arguments, list)
+        assert isinstance(self.COMMAND, str)
 
         if local_binpath:
             # Test utility
@@ -1656,7 +1659,7 @@ class NeonPageserver(PgProtocol):
         self.running = False
         self.service_port = port
         self.config_override = config_override
-        self.version = env.get_pageserver_version()
+        self.version = env.get_binary_version("pageserver")
 
         # After a test finishes, we will scrape the log to see if there are any
         # unexpected error messages. If your test expects an error, add it to
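
A short usage sketch of the new log_query flag, which exists so polling loops do not flood the test log (the `endpoint` argument stands for any PgProtocol-derived fixture):

    def quiet_replay_lsn(endpoint) -> str:
        # log_query=False suppresses the per-query log.info() line added above.
        return endpoint.safe_psql_scalar(
            "SELECT pg_last_wal_replay_lsn()::text", log_query=False
        )
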
@@ -2924,7 +2927,10 @@ class Safekeeper:
         return res
 
     def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient:
-        return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token)
+        is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper")
+        return SafekeeperHttpClient(
+            port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled
+        )
 
     def data_dir(self) -> str:
         return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
@@ -2975,10 +2981,11 @@ class SafekeeperMetrics:
 class SafekeeperHttpClient(requests.Session):
     HTTPError = requests.HTTPError
 
-    def __init__(self, port: int, auth_token: Optional[str] = None):
+    def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False):
         super().__init__()
         self.port = port
         self.auth_token = auth_token
+        self.is_testing_enabled = is_testing_enabled
 
         if auth_token is not None:
             self.headers["Authorization"] = f"Bearer {auth_token}"
@@ -2986,6 +2993,30 @@ class SafekeeperHttpClient(requests.Session):
     def check_status(self):
         self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
 
+    def is_testing_enabled_or_skip(self):
+        if not self.is_testing_enabled:
+            pytest.skip("safekeeper was built without 'testing' feature")
+
+    def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
+        self.is_testing_enabled_or_skip()
+
+        if isinstance(config_strings, tuple):
+            pairs = [config_strings]
+        else:
+            pairs = config_strings
+
+        log.info(f"Requesting config failpoints: {repr(pairs)}")
+
+        res = self.put(
+            f"http://localhost:{self.port}/v1/failpoints",
+            json=[{"name": name, "actions": actions} for name, actions in pairs],
+        )
+        log.info(f"Got failpoints request response code {res.status_code}")
+        res.raise_for_status()
+        res_json = res.json()
+        assert res_json is None
+        return res_json
+
     def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
         params = params or {}
         res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
@@ -3001,6 +3032,28 @@ class SafekeeperHttpClient(requests.Session):
         assert isinstance(res_json, dict)
         return res_json
 
+    def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
+            json=body,
+        )
+        res.raise_for_status()
+
+    def timeline_digest(
+        self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
+    ) -> Dict[str, Any]:
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
+            params={
+                "from_lsn": str(from_lsn),
+                "until_lsn": str(until_lsn),
+            },
+        )
+        res.raise_for_status()
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
     def timeline_create(
         self,
         tenant_id: TenantId,
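
A hedged sketch of how these client methods compose in a test: copy a timeline, then compare digests of the shared LSN range on source and destination (IDs and LSNs are placeholders supplied by the caller; the body keys mirror the TimelineCopyRequest fields used by the Rust handler):

    def assert_copy_identical(client, tenant_id, source_id, target_id, from_lsn, until_lsn):
        # `client` is a SafekeeperHttpClient as constructed above.
        client.copy_timeline(
            tenant_id,
            source_id,
            body={"target_timeline_id": str(target_id), "until_lsn": str(until_lsn)},
        )
        src = client.timeline_digest(tenant_id, source_id, from_lsn, until_lsn)
        dst = client.timeline_digest(tenant_id, target_id, from_lsn, until_lsn)
        assert src["sha256"] == dst["sha256"]
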
@@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
         self.verbose_error(res)
 
+    def tenant_secondary_download(self, tenant_id: TenantId):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
+        self.verbose_error(res)
+
     def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
         assert "tenant_id" not in config.keys()
         res = self.put(
@@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session):
         assert isinstance(res, dict)
         assert TenantId(res["id"]) == tenant_id
         size = res["size"]
-        assert type(size) == int
+        assert isinstance(size, int)
         inputs = res["inputs"]
-        assert type(inputs) is dict
+        assert isinstance(inputs, dict)
         return (size, inputs)
 
     def tenant_size_debug(self, tenant_id: TenantId) -> str:
@@ -714,7 +718,7 @@ class PageserverHttpClient(requests.Session):
         )
         self.verbose_error(res)
 
-        assert res.status_code == 200
+        assert res.status_code in (200, 304)
 
     def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
         info = self.layer_map_info(tenant_id, timeline_id)
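
A sketch of one explicit synchronization cycle built from the heatmap-upload and secondary-download methods above (fixture objects are assumed; the full flow appears in test_secondary_downloads near the end of this diff):

    def sync_secondary(ps_attached, ps_secondary, tenant_id):
        # Push the attached location's heatmap, then pull it (and the layers
        # it names) on the secondary location.
        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
        ps_secondary.http_client().tenant_secondary_download(tenant_id)
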
@@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare):
 # Please do not alter the label for the query, as it is used to identify it.
 # Labels for ClickBench queries match the labels in ClickBench reports
 # on https://benchmark.clickhouse.com/ (the DB size may differ).
+#
+# Disable auto formatting for the list of queries so that it's easier to read
+# fmt: off
 QUERIES: Tuple[LabelledQuery, ...] = (
-    # Disable `black` formatting for the list of queries so that it's easier to read
-    # fmt: off
     ### ClickBench queries:
     LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"),
     LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"),
@@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = (
     # LabelledQuery("NQ0", r"..."),
     # LabelledQuery("NQ1", r"..."),
     # ...
-    # fmt: on
 )
+# fmt: on
 
 EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"
 
@@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare:
     else:
         assert (
             len(x) == 2
-        ), f"request param ({request.param}) should have a format of \
-`neon_{{safekeepers_enable_fsync}}`"
+        ), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`"
 
     # `NeonCompare` interface
     neon_env_builder = request.getfixturevalue("neon_env_builder")
@@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv):
     assert set(our_tenant_config.effective_config.keys()) == set(
         fully_custom_config.keys()
     ), "ensure we cover all config options"
-    assert {
-        k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
-        for k in fully_custom_config.keys()
-    } == {
-        k: True for k in fully_custom_config.keys()
-    }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
+    assert (
+        {
+            k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
+            for k in fully_custom_config.keys()
+        }
+        == {k: True for k in fully_custom_config.keys()}
+    ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
 
     ps_http.tenant_detach(tenant_id)
     env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)
@@ -186,9 +186,7 @@ def test_backward_compatibility(
     else:
         raise
 
-    assert (
-        not breaking_changes_allowed
-    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
+    assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
 @check_ondisk_data_compatibility_if_enabled
@@ -247,9 +245,7 @@ def test_forward_compatibility(
     else:
         raise
 
-    assert (
-        not breaking_changes_allowed
-    ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
+    assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
 def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
@@ -2,7 +2,6 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
 
-
 # Restart nodes with WAL end having specially crafted shape, like last record
 # crossing segment boundary, to test decoding issues.
@@ -1,19 +1,59 @@
+import os
+import re
 import time
 
-from fixtures.neon_fixtures import NeonEnv
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv
+
+
+def wait_caughtup(primary: Endpoint, secondary: Endpoint):
+    primary_lsn = primary.safe_psql_scalar(
+        "SELECT pg_current_wal_insert_lsn()::text", log_query=False
+    )
+    while True:
+        secondary_lsn = secondary.safe_psql_scalar(
+            "SELECT pg_last_wal_replay_lsn()", log_query=False
+        )
+        caught_up = secondary_lsn >= primary_lsn
+        log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}")
+        if caught_up:
+            return
+        time.sleep(1)
+
+
+# Check for corrupted WAL messages which might otherwise go unnoticed if
+# reconnection fixes this.
+def scan_standby_log_for_errors(secondary):
+    log_path = secondary.endpoint_path() / "compute.log"
+    with log_path.open("r") as f:
+        markers = re.compile(
+            r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr"
+        )
+        for line in f:
+            if markers.search(line):
+                log.info(f"bad error in standby log: {line}")
+                raise AssertionError()
+
+
 def test_hot_standby(neon_simple_env: NeonEnv):
     env = neon_simple_env
+
+    # We've had a bug caused by WAL records split across multiple XLogData
+    # messages resulting in corrupted WAL complains on standby. It reproduced
+    # only when sending from safekeeper is slow enough to grab full
+    # MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in
+    # one conf to decrease test time.
+    slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "")
+    if slow_down_send:
+        sk_http = env.safekeepers[0].http_client()
+        sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")])
+
     with env.endpoints.create_start(
         branch_name="main",
         endpoint_id="primary",
     ) as primary:
         time.sleep(1)
         with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
-            primary_lsn = None
-            caught_up = False
             queries = [
                 "SHOW neon.timeline_id",
                 "SHOW neon.tenant_id",
@@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                 with p_con.cursor() as p_cur:
                     p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i")
 
-                # Explicit commit to make sure other connections (and replicas) can
-                # see the changes of this commit.
-                p_con.commit()
-
-                with p_con.cursor() as p_cur:
-                    p_cur.execute("SELECT pg_current_wal_insert_lsn()::text")
-                    res = p_cur.fetchone()
-                    assert res is not None
-                    (lsn,) = res
-                    primary_lsn = lsn
-
-                # Explicit commit to make sure other connections (and replicas) can
-                # see the changes of this commit.
-                # Note that this may generate more WAL if the transaction has changed
-                # things, but we don't care about that.
-                p_con.commit()
-
                 for query in queries:
                     with p_con.cursor() as p_cur:
                         p_cur.execute(query)
@@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                         response = res
                         responses[query] = response
 
+            # insert more data to make safekeeper send MAX_SEND_SIZE messages
+            if slow_down_send:
+                primary.safe_psql("create table t(key int, value text)")
+                primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'")
+
+            wait_caughtup(primary, secondary)
+
             with secondary.connect() as s_con:
                 with s_con.cursor() as s_cur:
                     s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()")
                     res = s_cur.fetchone()
                     assert res is not None
 
-                while not caught_up:
-                    with s_con.cursor() as secondary_cursor:
-                        secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()")
-                        res = secondary_cursor.fetchone()
-                        assert res is not None
-                        (secondary_lsn,) = res
-                        # There may be more changes on the primary after we got our LSN
-                        # due to e.g. autovacuum, but that shouldn't impact the content
-                        # of the tables, so we check whether we've replayed up to at
-                        # least after the commit of the `test` table.
-                        caught_up = secondary_lsn >= primary_lsn
-
-                # Explicit commit to flush any transient transaction-level state.
-                s_con.commit()
-
                 for query in queries:
                     with s_con.cursor() as secondary_cursor:
                         secondary_cursor.execute(query)
                         response = secondary_cursor.fetchone()
                         assert response is not None
                         assert response == responses[query]
+
+            scan_standby_log_for_errors(secondary)
+
+    # clean up
+    if slow_down_send:
+        sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off"))
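
wait_caughtup above compares the LSNs exactly as Postgres returns them, i.e. as strings. A hedged variant that parses them with the test suite's Lsn type instead, which avoids lexicographic misordering once the hex field widths differ:

    from fixtures.types import Lsn

    def caught_up(primary, secondary) -> bool:
        # Parse pg_lsn text into Lsn before comparing; raw string comparison
        # orders "0/FFFFFF" after "0/1000000", which is numerically wrong.
        primary_lsn = Lsn(
            primary.safe_psql_scalar("SELECT pg_current_wal_insert_lsn()::text", log_query=False)
        )
        secondary_lsn = Lsn(
            secondary.safe_psql_scalar("SELECT pg_last_wal_replay_lsn()::text", log_query=False)
        )
        return secondary_lsn >= primary_lsn
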
@@ -102,9 +102,7 @@ def test_basic_eviction(
     ), f"Did not expect to find {local_layer} layer after evicting"
 
     empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
-    assert (
-        not empty_layers
-    ), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
+    assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
 
     evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
     assert (
@@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
 
     env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.extend(
+        [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
+    )
 
     ps_http = env.pageserver.http_client()
 
@@ -145,8 +145,7 @@ def expect_updated_msg_lsn(
     last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"])
     assert (
         prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn
-    ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \
-compared to the previous message's LSN {prev_msg_lsn}"
+    ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}"
 
     return last_msg_lsn
|||||||
@@ -1,9 +1,11 @@
|
|||||||
import random
|
import random
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from fixtures.log_helper import log
|
from fixtures.log_helper import log
|
||||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
|
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber
|
||||||
|
from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
|
||||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
|
||||||
from fixtures.types import TenantId, TimelineId
|
from fixtures.types import TenantId, TimelineId
|
||||||
from fixtures.utils import wait_until
|
from fixtures.utils import wait_until
|
||||||
@@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
|
|||||||
flush_ms=5000,
|
flush_ms=5000,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Encourage the new location to download while still in secondary mode
|
||||||
|
pageserver_b.http_client().tenant_secondary_download(tenant_id)
|
||||||
|
|
||||||
migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
|
migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
|
||||||
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
|
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
|
||||||
assert migrated_generation == initial_generation + 1
|
assert migrated_generation == initial_generation + 1
|
||||||
@@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
|
|||||||
# Writes and reads still work in AttachedStale.
|
# Writes and reads still work in AttachedStale.
|
||||||
workload.validate(pageserver_a.id)
|
workload.validate(pageserver_a.id)
|
||||||
|
|
||||||
# TODO: call into secondary mode API hooks to do an upload/download sync
|
|
||||||
|
|
||||||
# Generate some more dirty writes: we expect the origin to ingest WAL in
|
# Generate some more dirty writes: we expect the origin to ingest WAL in
|
||||||
# in AttachedStale
|
# in AttachedStale
|
||||||
workload.churn_rows(64, pageserver_a.id, upload=False)
|
workload.churn_rows(64, pageserver_a.id, upload=False)
|
||||||
@@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
     log.info(f"Read back heatmap: {heatmap_second}")
     assert heatmap_second != heatmap_first
     validate_heatmap(heatmap_second)
+
+
+def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
+    """
+    Inspect local storage on a pageserver to discover which layer files are present.
+
+    :return: list of relative paths to layers, from the timeline root.
+    """
+    timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
+
+    def relative(p: Path) -> Path:
+        return p.relative_to(timeline_path)
+
+    return sorted(
+        list(
+            map(
+                relative,
+                filter(
+                    lambda path: path.name != "metadata"
+                    and "ephemeral" not in path.name
+                    and "temp" not in path.name,
+                    timeline_path.glob("*"),
+                ),
+            )
+        )
+    )
+
+
+def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the overall data flow in secondary mode:
+    - Heatmap uploads from the attached location
+    - Heatmap & layer downloads from the secondary location
+    - Eviction of layers on the attached location results in deletion
+      on the secondary location as well.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.enable_pageserver_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+    )
+    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
+    assert env.attachment_service is not None
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps_attached = env.pageservers[0]
+    ps_secondary = env.pageservers[1]
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageservers[0].id)
+    workload.write_rows(256, ps_attached.id)
+
+    # Configure a secondary location
+    log.info("Setting up secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Secondary",
+            "secondary_conf": {"warm": True},
+            "tenant_conf": {},
+        },
+    )
+    readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
+    log.info(f"Read back conf: {readback_conf}")
+
+    # Explicit upload/download cycle
+    # ==============================
+    log.info("Synchronizing after initial write...")
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Make changes on attached pageserver, check secondary downloads them
+    # ===================================================================
+    log.info("Synchronizing after subsequent write...")
+    workload.churn_rows(128, ps_attached.id)
+
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
+    # walreceiver is still doing something.
+    import time
+
+    time.sleep(5)
+
+    # Do evictions on attached pageserver, check secondary follows along
+    # ==================================================================
+    log.info("Evicting a layer...")
+    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
+    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
+
+    log.info("Synchronizing after eviction...")
+    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
+    ps_secondary.http_client().tenant_secondary_download(tenant_id)
+
+    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
+    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
+        ps_secondary, tenant_id, timeline_id
+    )
+
+    # Scrub the remote storage
+    # ========================
+    # This confirms that the scrubber isn't upset by the presence of the heatmap
+    S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata()
+
+    # Detach secondary and delete tenant
+    # ===================================
+    # This confirms that the heatmap gets cleaned up as well as other normal content.
+    log.info("Detaching secondary location...")
+    ps_secondary.tenant_location_configure(
+        tenant_id,
+        {
+            "mode": "Detached",
+            "secondary_conf": None,
+            "tenant_conf": {},
+        },
+    )
+
+    log.info("Deleting tenant...")
+    tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
+
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(tenant_id),
+            )
+        ),
+    )
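test_secondary_downloads repeats the same heatmap-upload, secondary-download, compare cycle three times. A sketch of how the cycle could be factored into one helper, using only calls that appear in the test (the helper name is illustrative):

    def sync_and_compare(ps_attached, ps_secondary, tenant_id, timeline_id):
        # One synchronization cycle: the attached location publishes its heatmap,
        # the secondary downloads what the heatmap references, and the two local
        # layer sets must converge.
        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
        ps_secondary.http_client().tenant_secondary_download(tenant_id)
        assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
            ps_secondary, tenant_id, timeline_id
        )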
@@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
     tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
     assert (
         tenant_id not in tenants_after_detach
-    ), f"Ignored and then detached tenant {tenant_id} \
-should not be present in pageserver's memory"
+    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
 
 
 # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
@@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
     tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
     assert (
         tenant_id not in tenants_after_detach
-    ), f"Ignored and then detached tenant {tenant_id} \
-should not be present in pageserver's memory"
+    ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
 
 
 def test_detach_while_attaching(
@@ -817,9 +815,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
         if found_broken:
             break
         time.sleep(0.5)
-    assert (
-        found_broken
-    ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
+    assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"
 
     env.pageserver.tenant_load(env.initial_tenant)
 
@@ -837,6 +833,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
             break
         time.sleep(0.5)
 
-    assert (
-        found_active
-    ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
+    assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
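Both collapsed assertions close out the same poll-then-assert pattern: sample a metric every half second, break on success, then assert with a descriptive message. A generic sketch of that pattern (poll_until and the example condition are illustrative names, not part of the test suite):

    import time

    def poll_until(condition, attempts: int = 20, interval: float = 0.5) -> bool:
        # Sample the condition on a fixed interval; succeed early, else give up.
        for _ in range(attempts):
            if condition():
                return True
            time.sleep(interval)
        return False

    # e.g. assert poll_until(lambda: found_broken), "broken should still be in set"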
Some files were not shown because too many files have changed in this diff.