mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 07:00:38 +00:00
Compare commits
54 Commits
release-85
...
problame/b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a80790e0c4 | ||
|
|
906a963351 | ||
|
|
9e7556bef2 | ||
|
|
4f4214eea3 | ||
|
|
3dffbda428 | ||
|
|
04fb256e0f | ||
|
|
02fe35831c | ||
|
|
5f7bc3ce60 | ||
|
|
e40b1c79fa | ||
|
|
2f416267dc | ||
|
|
d52d560f16 | ||
|
|
de908a7b0d | ||
|
|
687fb25d41 | ||
|
|
eaa91291ae | ||
|
|
fc9f38dd2d | ||
|
|
c689110ad6 | ||
|
|
ba6abe203d | ||
|
|
d16e024d49 | ||
|
|
a4b9335b73 | ||
|
|
2cea7a7838 | ||
|
|
e1c1aa74fe | ||
|
|
ee775a24a0 | ||
|
|
428f532f08 | ||
|
|
5efb0d8072 | ||
|
|
36ba2b8e44 | ||
|
|
1de0f41403 | ||
|
|
4d2f27a33f | ||
|
|
88a3c9e7fd | ||
|
|
df36b9aa62 | ||
|
|
18a43eeab3 | ||
|
|
39039d1be7 | ||
|
|
9ee75ceee6 | ||
|
|
f5210a367d | ||
|
|
f36520eb94 | ||
|
|
afa35eea87 | ||
|
|
8eb853b731 | ||
|
|
a95015d967 | ||
|
|
3836ee8539 | ||
|
|
a6bd4a3be6 | ||
|
|
24d96e4372 | ||
|
|
29ea89b61d | ||
|
|
322e742e4c | ||
|
|
cdb6479c8a | ||
|
|
81c557d87e | ||
|
|
e963129678 | ||
|
|
4f0a9fc569 | ||
|
|
81c6a5a796 | ||
|
|
8e05639dbf | ||
|
|
deed46015d | ||
|
|
532d9b646e | ||
|
|
55f91cf10b | ||
|
|
baafcc5d41 | ||
|
|
aa22572d8c | ||
|
|
2d247375b3 |
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -963,7 +963,7 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Verify docker-compose example and test extensions
|
||||
timeout-minutes: 20
|
||||
timeout-minutes: 60
|
||||
env:
|
||||
TAG: >-
|
||||
${{
|
||||
|
||||
121
Cargo.lock
generated
121
Cargo.lock
generated
@@ -1112,6 +1112,12 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "cfg_aliases"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
|
||||
|
||||
[[package]]
|
||||
name = "cgroups-rs"
|
||||
version = "0.3.3"
|
||||
@@ -1306,7 +1312,7 @@ dependencies = [
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"nix 0.30.1",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
@@ -1429,7 +1435,7 @@ dependencies = [
|
||||
"humantime-serde",
|
||||
"hyper 0.14.30",
|
||||
"jsonwebtoken",
|
||||
"nix 0.27.1",
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
@@ -3512,9 +3518,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.169"
|
||||
version = "0.2.172"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
||||
checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
@@ -3788,6 +3794,16 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"nix 0.30.1",
|
||||
"tempfile",
|
||||
"thiserror 1.0.69",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "never-say-never"
|
||||
version = "6.6.666"
|
||||
@@ -3821,12 +3837,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.27.1"
|
||||
version = "0.30.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
|
||||
checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
"cfg-if",
|
||||
"cfg_aliases",
|
||||
"libc",
|
||||
"memoffset 0.9.0",
|
||||
]
|
||||
@@ -4280,7 +4297,7 @@ dependencies = [
|
||||
"jsonwebtoken",
|
||||
"md5",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"nix 0.30.1",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
@@ -4356,7 +4373,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"itertools 0.10.5",
|
||||
"nix 0.27.1",
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
@@ -4417,6 +4434,16 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_page_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"prost 0.13.3",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "papaya"
|
||||
version = "0.2.1"
|
||||
@@ -6037,8 +6064,10 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"sk_ps_discovery",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"storage_controller_client",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
@@ -6050,6 +6079,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
@@ -6545,6 +6575,76 @@ version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||
|
||||
[[package]]
|
||||
name = "sk_ps_discovery"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"desim",
|
||||
"env_logger",
|
||||
"fail",
|
||||
"futures",
|
||||
"hex",
|
||||
"http 1.1.0",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"parking_lot 0.12.1",
|
||||
"pem",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rustls 0.23.18",
|
||||
"safekeeper_api",
|
||||
"safekeeper_client",
|
||||
"scopeguard",
|
||||
"sd-notify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"wal_decoder",
|
||||
"walproposer",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.8"
|
||||
@@ -6651,6 +6751,7 @@ dependencies = [
|
||||
"rustls 0.23.18",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
@@ -6663,6 +6764,7 @@ name = "storage_controller"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
@@ -7899,7 +8001,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"nix 0.27.1",
|
||||
"nix 0.30.1",
|
||||
"once_cell",
|
||||
"pem",
|
||||
"pin-project-lite",
|
||||
@@ -8475,6 +8577,7 @@ dependencies = [
|
||||
"log",
|
||||
"memchr",
|
||||
"nix 0.26.4",
|
||||
"nix 0.30.1",
|
||||
"nom",
|
||||
"num",
|
||||
"num-bigint",
|
||||
|
||||
@@ -9,6 +9,7 @@ members = [
|
||||
"pageserver/ctl",
|
||||
"pageserver/client",
|
||||
"pageserver/pagebench",
|
||||
"pageserver/page_api",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"safekeeper/client",
|
||||
@@ -23,6 +24,7 @@ members = [
|
||||
"libs/postgres_ffi",
|
||||
"libs/safekeeper_api",
|
||||
"libs/desim",
|
||||
"libs/neon-shmem",
|
||||
"libs/utils",
|
||||
"libs/consumption_metrics",
|
||||
"libs/postgres_backend",
|
||||
@@ -41,7 +43,7 @@ members = [
|
||||
"libs/proxy/postgres-protocol2",
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"endpoint_storage", "libs/sk_ps_discovery",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -127,7 +129,7 @@ md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.9"
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
|
||||
# Do not update to >= 7.0.0, at least. The update will have a significant impact
|
||||
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
|
||||
notify = "6.0.0"
|
||||
@@ -251,6 +253,7 @@ pageserver = { path = "./pageserver" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
pageserver_client = { path = "./pageserver/client" }
|
||||
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
|
||||
pageserver_page_api = { path = "./pageserver/page_api" }
|
||||
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
@@ -259,6 +262,7 @@ pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
safekeeper_client = { path = "./safekeeper/client" }
|
||||
sk_ps_discovery = { path = "./libs/sk_ps_discovery" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_controller_client = { path = "./storage_controller/client" }
|
||||
|
||||
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.86.0
|
||||
ENV RUSTC_VERSION=1.87.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
|
||||
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
|
||||
RelationGetRelationName(index));
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(index->rd_smgr);
|
||||
+ smgr_start_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
initRumState(&buildstate.rumstate, index);
|
||||
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
|
||||
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
|
||||
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
|
||||
}
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+ smgr_end_unlogged_build(RelationGetSmgr(index));
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
use std::ffi::OsStr;
|
||||
use std::io::Write;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
use std::os::fd::AsFd;
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
@@ -356,7 +356,7 @@ where
|
||||
let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
|
||||
// Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
|
||||
// remains locked after exec.
|
||||
nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
|
||||
nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
|
||||
.expect("remove FD_CLOEXEC");
|
||||
// Don't run drop(file), it would close the file before we actually exec.
|
||||
std::mem::forget(file);
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::fs::File;
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
@@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage_controller::{
|
||||
NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
|
||||
};
|
||||
use nix::fcntl::{FlockArg, flock};
|
||||
use nix::fcntl::{Flock, FlockArg};
|
||||
use pageserver_api::config::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
|
||||
@@ -749,16 +748,16 @@ struct TimelineTreeEl {
|
||||
|
||||
/// A flock-based guard over the neon_local repository directory
|
||||
struct RepoLock {
|
||||
_file: File,
|
||||
_file: Flock<File>,
|
||||
}
|
||||
|
||||
impl RepoLock {
|
||||
fn new() -> Result<Self> {
|
||||
let repo_dir = File::open(local_env::base_path())?;
|
||||
let repo_dir_fd = repo_dir.as_raw_fd();
|
||||
flock(repo_dir_fd, FlockArg::LockExclusive)?;
|
||||
|
||||
Ok(Self { _file: repo_dir })
|
||||
match Flock::lock(repo_dir, FlockArg::LockExclusive) {
|
||||
Ok(f) => Ok(Self { _file: f }),
|
||||
Err((_, e)) => Err(e).context("flock error"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
13
libs/neon-shmem/Cargo.toml
Normal file
13
libs/neon-shmem/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "neon-shmem"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
thiserror.workspace = true
|
||||
nix.workspace=true
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[target.'cfg(target_os = "macos")'.dependencies]
|
||||
tempfile = "3.14.0"
|
||||
418
libs/neon-shmem/src/lib.rs
Normal file
418
libs/neon-shmem/src/lib.rs
Normal file
@@ -0,0 +1,418 @@
|
||||
//! Shared memory utilities for neon communicator
|
||||
|
||||
use std::num::NonZeroUsize;
|
||||
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
|
||||
use std::ptr::NonNull;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::mman::MapFlags;
|
||||
use nix::sys::mman::ProtFlags;
|
||||
use nix::sys::mman::mmap as nix_mmap;
|
||||
use nix::sys::mman::munmap as nix_munmap;
|
||||
use nix::unistd::ftruncate as nix_ftruncate;
|
||||
|
||||
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
|
||||
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
|
||||
/// specified at creation.
|
||||
///
|
||||
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
|
||||
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
|
||||
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
|
||||
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
|
||||
/// future.
|
||||
pub struct ShmemHandle {
|
||||
/// memfd file descriptor
|
||||
fd: OwnedFd,
|
||||
|
||||
max_size: usize,
|
||||
|
||||
// Pointer to the beginning of the shared memory area. The header is stored there.
|
||||
shared_ptr: NonNull<SharedStruct>,
|
||||
|
||||
// Pointer to the beginning of the user data
|
||||
pub data_ptr: NonNull<u8>,
|
||||
}
|
||||
|
||||
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
    /// Total length of the mapping in bytes, including this header.
    max_size: usize,

    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
    current_size: AtomicUsize,
}

/// Flag stored in the high-order bit of `SharedStruct::current_size`.
///
/// Derived from the width of `usize` instead of a hard-coded `1 << 63`, which only compiles on
/// targets where `usize` is 64 bits wide. The value on 64-bit targets is unchanged.
const RESIZE_IN_PROGRESS: usize = 1 << (usize::BITS - 1);

/// Number of bytes occupied by the header at the start of the area.
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
|
||||
|
||||
/// Error type returned by the ShmemHandle functions.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
#[error("{msg}: {errno}")]
|
||||
pub struct Error {
|
||||
pub msg: String,
|
||||
pub errno: Errno,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
fn new(msg: &str, errno: Errno) -> Error {
|
||||
Error {
|
||||
msg: msg.to_string(),
|
||||
errno,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ShmemHandle {
|
||||
/// Create a new shared memory area. To communicate between processes, the processes need to be
|
||||
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
|
||||
///
|
||||
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
|
||||
/// processes can continue using it, however.
|
||||
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
|
||||
// create the backing anonymous file.
|
||||
let fd = create_backing_file(name)?;
|
||||
|
||||
Self::new_with_fd(fd, initial_size, max_size)
|
||||
}
|
||||
|
||||
fn new_with_fd(
|
||||
fd: OwnedFd,
|
||||
initial_size: usize,
|
||||
max_size: usize,
|
||||
) -> Result<ShmemHandle, Error> {
|
||||
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
|
||||
// is a little larger than this because of the SharedStruct header. Make the upper limit
|
||||
// somewhat smaller than that, because with anything close to that, you'll run out of
|
||||
// memory anyway.
|
||||
if max_size >= 1 << 48 {
|
||||
panic!("max size {} too large", max_size);
|
||||
}
|
||||
if initial_size > max_size {
|
||||
panic!("initial size {initial_size} larger than max size {max_size}");
|
||||
}
|
||||
|
||||
// The actual initial / max size is the one given by the caller, plus the size of
|
||||
// 'SharedStruct'.
|
||||
let initial_size = HEADER_SIZE + initial_size;
|
||||
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
|
||||
|
||||
// Reserve address space for it with mmap
|
||||
//
|
||||
// TODO: Use MAP_HUGETLB if possible
|
||||
let start_ptr = unsafe {
|
||||
nix_mmap(
|
||||
None,
|
||||
max_size,
|
||||
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
|
||||
MapFlags::MAP_SHARED,
|
||||
&fd,
|
||||
0,
|
||||
)
|
||||
}
|
||||
.map_err(|e| Error::new("mmap failed: {e}", e))?;
|
||||
|
||||
// Reserve space for the initial size
|
||||
enlarge_file(fd.as_fd(), initial_size as u64)?;
|
||||
|
||||
// Initialize the header
|
||||
let shared: NonNull<SharedStruct> = start_ptr.cast();
|
||||
unsafe {
|
||||
shared.write(SharedStruct {
|
||||
max_size: max_size.into(),
|
||||
current_size: AtomicUsize::new(initial_size),
|
||||
})
|
||||
};
|
||||
|
||||
// The user data begins after the header
|
||||
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
|
||||
|
||||
Ok(ShmemHandle {
|
||||
fd,
|
||||
max_size: max_size.into(),
|
||||
shared_ptr: shared,
|
||||
data_ptr,
|
||||
})
|
||||
}
|
||||
|
||||
// return reference to the header
|
||||
fn shared(&self) -> &SharedStruct {
|
||||
unsafe { self.shared_ptr.as_ref() }
|
||||
}
|
||||
|
||||
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
|
||||
/// when creating the area.
|
||||
///
|
||||
/// This may only be called from one process/thread concurrently. We detect that case
|
||||
/// and return an Error.
|
||||
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
|
||||
let new_size = new_size + HEADER_SIZE;
|
||||
let shared = self.shared();
|
||||
|
||||
if new_size > self.max_size {
|
||||
panic!(
|
||||
"new size ({} is greater than max size ({})",
|
||||
new_size, self.max_size
|
||||
);
|
||||
}
|
||||
assert_eq!(self.max_size, shared.max_size);
|
||||
|
||||
// Lock the area by setting the bit in 'current_size'
|
||||
//
|
||||
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
|
||||
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
|
||||
// since this is not performance-critical, better safe than sorry .
|
||||
let mut old_size = shared.current_size.load(Ordering::Acquire);
|
||||
loop {
|
||||
if (old_size & RESIZE_IN_PROGRESS) != 0 {
|
||||
return Err(Error::new(
|
||||
"concurrent resize detected",
|
||||
Errno::UnknownErrno,
|
||||
));
|
||||
}
|
||||
match shared.current_size.compare_exchange(
|
||||
old_size,
|
||||
new_size,
|
||||
Ordering::Acquire,
|
||||
Ordering::Relaxed,
|
||||
) {
|
||||
Ok(_) => break,
|
||||
Err(x) => old_size = x,
|
||||
}
|
||||
}
|
||||
|
||||
// Ok, we got the lock.
|
||||
//
|
||||
// NB: If anything goes wrong, we *must* clear the bit!
|
||||
let result = {
|
||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
||||
match new_size.cmp(&old_size) {
|
||||
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
|
||||
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
|
||||
}),
|
||||
Equal => Ok(()),
|
||||
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
|
||||
}
|
||||
};
|
||||
|
||||
// Unlock
|
||||
shared.current_size.store(
|
||||
if result.is_ok() { new_size } else { old_size },
|
||||
Ordering::Release,
|
||||
);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
/// Returns the current user-visible size of the shared memory segment.
|
||||
///
|
||||
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
|
||||
/// responsibility not to access the area beyond the current size.
|
||||
pub fn current_size(&self) -> usize {
|
||||
let total_current_size =
|
||||
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
|
||||
total_current_size - HEADER_SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ShmemHandle {
|
||||
fn drop(&mut self) {
|
||||
// SAFETY: The pointer was obtained from mmap() with the given size.
|
||||
// We unmap the entire region.
|
||||
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
|
||||
// The fd is dropped automatically by OwnedFd.
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
|
||||
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
|
||||
/// development and testing, but in production we want the file to stay in memory.
|
||||
///
|
||||
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
|
||||
#[allow(unused_variables)]
|
||||
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
|
||||
.map_err(|e| Error::new("memfd_create failed: {e}", e))
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
let file = tempfile::tempfile().map_err(|e| {
|
||||
Error::new(
|
||||
"could not create temporary file to back shmem area: {e}",
|
||||
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
|
||||
)
|
||||
})?;
|
||||
Ok(OwnedFd::from(file))
|
||||
}
|
||||
}
|
||||
|
||||
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
|
||||
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
|
||||
// we don't get a segfault later when trying to actually use it.
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
{
|
||||
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
|
||||
Error::new(
|
||||
"could not grow shmem segment, posix_fallocate failed: {e}",
|
||||
e,
|
||||
)
|
||||
})
|
||||
}
|
||||
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
nix::unistd::ftruncate(fd, size as i64)
|
||||
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use nix::unistd::ForkResult;
    use std::ops::Range;

    /// check that all bytes in given range have the expected value.
    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
        for i in range {
            let b = unsafe { *(ptr.add(i)) };
            assert_eq!(expected, b, "unexpected byte at offset {}", i);
        }
    }

    /// Write 'b' to all bytes in the given range
    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
    }

    // simple single-process test of growing and shrinking
    #[test]
    fn test_shmem_resize() -> Result<(), Error> {
        let max_size = 1024 * 1024;
        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;

        assert_eq!(init_struct.current_size(), 0);

        // Initial grow
        let size1 = 10000;
        init_struct.set_size(size1).unwrap();
        assert_eq!(init_struct.current_size(), size1);

        // Write some data
        let data_ptr = init_struct.data_ptr.as_ptr();
        write_range(data_ptr, 0xAA, 0..size1);
        assert_range(data_ptr, 0xAA, 0..size1);

        // Shrink
        let size2 = 5000;
        init_struct.set_size(size2).unwrap();
        assert_eq!(init_struct.current_size(), size2);

        // Grow again
        let size3 = 20000;
        init_struct.set_size(size3).unwrap();
        assert_eq!(init_struct.current_size(), size3);

        // Try to read it. The area that was shrunk and grown again should read as all zeros now
        assert_range(data_ptr, 0xAA, 0..5000);
        assert_range(data_ptr, 0, 5000..size1);

        // Growing beyond max_size is not exercised here: set_size() panics in that case
        // instead of returning an error.

        // Dropping init_struct should unmap the memory
        drop(init_struct);

        Ok(())
    }

    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
    /// but is stored in the shared memory area and works across processes. It's implemented by
    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
    struct SimpleBarrier {
        num_procs: usize,
        count: AtomicUsize,
    }

    impl SimpleBarrier {
        /// Initialize a barrier for 'num_procs' participants at 'ptr'.
        ///
        /// 'ptr' must be valid for writes and suitably aligned for a SimpleBarrier.
        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
            // The target memory does not hold an initialized SimpleBarrier yet, so use write(),
            // which does not read or drop the previous contents like an assignment would.
            unsafe {
                ptr.write(SimpleBarrier {
                    num_procs,
                    count: AtomicUsize::new(0),
                })
            }
        }

        /// Block, by polling, until all 'num_procs' participants have called wait().
        pub fn wait(&self) {
            let old = self.count.fetch_add(1, Ordering::Relaxed);

            // The counter is never reset. Each round of 'num_procs' arrivals is one generation.
            let generation = old / self.num_procs;

            let mut current = old + 1;
            while current < (generation + 1) * self.num_procs {
                std::thread::sleep(std::time::Duration::from_millis(10));
                current = self.count.load(Ordering::Relaxed);
            }
        }
    }

    #[test]
    fn test_multi_process() {
        // Initialize
        let max_size = 1_000_000_000_000;
        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
        let ptr = init_struct.data_ptr.as_ptr();

        // Store the SimpleBarrier in the first 1k of the area.
        init_struct.set_size(10000).unwrap();
        let barrier_ptr: *mut SimpleBarrier = unsafe {
            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
                .cast()
        };
        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };

        // Fork another test process. The code after this runs in both processes concurrently.
        let fork_result = unsafe { nix::unistd::fork().unwrap() };

        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, 1000..2000);
        } else {
            write_range(ptr, 0xBB, 2000..3000);
        }
        barrier.wait();
        // Verify the contents. (in both processes)
        assert_range(ptr, 0xAA, 1000..2000);
        assert_range(ptr, 0xBB, 2000..3000);

        // Grow, from the child this time
        let size = 10_000_000;
        if !fork_result.is_parent() {
            init_struct.set_size(size).unwrap();
        }
        barrier.wait();

        // make some writes at the end
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, (size - 10)..size);
        } else {
            write_range(ptr, 0xBB, (size - 20)..(size - 10));
        }
        barrier.wait();

        // Verify the contents. (This runs in both processes)
        assert_range(ptr, 0, (size - 1000)..(size - 20));
        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
        assert_range(ptr, 0xAA, (size - 10)..size);

        // Only the parent has a child to wait for
        if let ForkResult::Parent { child } = fork_result {
            nix::sys::wait::waitpid(child, None).unwrap();
        }
    }
}
|
||||
@@ -6,9 +6,11 @@ use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::membership::Configuration;
|
||||
use crate::{ServerInfo, Term};
|
||||
@@ -309,3 +311,29 @@ pub struct PullTimelineResponse {
|
||||
pub safekeeper_host: Option<String>,
|
||||
// TODO: add more fields?
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[serde(tag = "action")]
|
||||
pub enum TenantShardPageserverAttachmentChange {
|
||||
Attach(TenantShardPageserverAttachment),
|
||||
Detach(TenantShardPageserverAttachment),
|
||||
}
|
||||
|
||||
impl TenantShardPageserverAttachmentChange {
|
||||
pub fn attachment(&self) -> &TenantShardPageserverAttachment {
|
||||
match self {
|
||||
TenantShardPageserverAttachmentChange::Attach(a) => a,
|
||||
TenantShardPageserverAttachmentChange::Detach(a) => a,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct TenantShardPageserverAttachment {
|
||||
pub shard_id: ShardIndex,
|
||||
pub generation: Generation,
|
||||
pub ps_id: NodeId,
|
||||
// TODO: avoid transmitting this with every request.
|
||||
// How nice things could be if there were simple DNS records for ps-$node_id.$cell.$region.$cloud.neon.tech
|
||||
pub ps_hostname: String, // TODO: some type safety
|
||||
}
|
||||
|
||||
81
libs/sk_ps_discovery/Cargo.toml
Normal file
81
libs/sk_ps_discovery/Cargo.toml
Normal file
@@ -0,0 +1,81 @@
|
||||
[package]
|
||||
name = "sk_ps_discovery"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
workspace_hack.workspace = true
|
||||
|
||||
async-stream.workspace = true
|
||||
anyhow.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
chrono.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
crc32c.workspace = true
|
||||
fail.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
http.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
futures.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
rustls.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
smallvec.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util = { workspace = true }
|
||||
tonic = { workspace = true }
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
metrics.workspace = true
|
||||
pem.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
safekeeper_client.workspace = true
|
||||
sha2.workspace = true
|
||||
sd-notify.workspace = true
|
||||
storage_broker.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
itertools.workspace = true
|
||||
walproposer.workspace = true
|
||||
rand.workspace = true
|
||||
desim.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
|
||||
[[bench]]
|
||||
name = "bench"
|
||||
harness = false
|
||||
97
libs/sk_ps_discovery/benches/bench.rs
Normal file
97
libs/sk_ps_discovery/benches/bench.rs
Normal file
@@ -0,0 +1,97 @@
|
||||
//! Benchmarks for the `sk_ps_discovery` crate.
|
||||
|
||||
use std::time::Instant;
|
||||
|
||||
use criterion::{Criterion, criterion_group, criterion_main};
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use sk_ps_discovery::{
|
||||
AttachmentUpdate, RemoteConsistentLsnAdv, TenantShardAttachmentId, TimelineAttachmentId,
|
||||
};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::ShardIndex,
|
||||
};
|
||||
|
||||
/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[unsafe(export_name = "malloc_conf")]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = bench_simple,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
fn bench_simple(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("simple");
|
||||
|
||||
// setup
|
||||
let mut world = sk_ps_discovery::World::default();
|
||||
|
||||
// Simplified view: lots of unsharded tenants with one timeline each
|
||||
let n_pageservers = 20;
|
||||
let n_tenant_shards_per_pageserver = 2000;
|
||||
for ps_id in 1..=n_pageservers {
|
||||
for _ in ..n_tenant_shards_per_pageserver {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
for generation in 10..=11 {
|
||||
let tenant_shard_attachment_id = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(generation),
|
||||
};
|
||||
let timeline_attachment = TimelineAttachmentId {
|
||||
tenant_timeline_id: TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(generation),
|
||||
};
|
||||
world.update_attachment(AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action: sk_ps_discovery::AttachmentUpdateAction::Attach {
|
||||
ps_id: NodeId(ps_id),
|
||||
},
|
||||
});
|
||||
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
|
||||
remote_consistent_lsn: Lsn(23),
|
||||
attachment: timeline_attachment,
|
||||
});
|
||||
}
|
||||
world.handle_commit_lsn_advancement(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
Lsn(42),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// setup done
|
||||
let world = world;
|
||||
g.bench_function("get_commit_lsn_advertisements", |bencher| {
|
||||
bencher.iter_custom(|iters| {
|
||||
let started = Instant::now();
|
||||
|
||||
for _ in 0..iters {
|
||||
criterion::black_box(world.get_commit_lsn_advertisements());
|
||||
}
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
elapsed
|
||||
});
|
||||
});
|
||||
|
||||
g.finish();
|
||||
}
|
||||
515
libs/sk_ps_discovery/src/lib.rs
Normal file
515
libs/sk_ps_discovery/src/lib.rs
Normal file
@@ -0,0 +1,515 @@
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet, btree_map, hash_map},
|
||||
ops::RangeInclusive,
|
||||
};
|
||||
|
||||
use tracing::{info, warn};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
merge_join,
|
||||
shard::ShardIndex,
|
||||
};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct World {
|
||||
attachments: BTreeMap<TenantShardAttachmentId, NodeId>,
|
||||
attachment_count: HashMap<TenantId, u16>,
|
||||
nodes_timelines: HashMap<NodeId, HashMap<TenantTimelineId, u16>>, // u16 is a refcount from each timeline attachment id
|
||||
// continously maintained aggregate for efficient decisionmaking on quiescing;
|
||||
// quiesced timelines are always caught up
|
||||
// can quiesce one == attachment_count (TODO: this requires enforcing foreign key relationship between attachments and remote_consistent_lsn)
|
||||
caught_up_count: HashMap<TenantTimelineId, u16>,
|
||||
|
||||
// BEGIN quiescing/active split
|
||||
quiesced_timelines: BTreeMap<TenantTimelineId, Lsn>,
|
||||
// ^
|
||||
// either a timeline is in quiesced_timelines
|
||||
// or it is below
|
||||
// v
|
||||
commit_lsns: BTreeMap<TenantTimelineId, Lsn>,
|
||||
remote_consistent_lsns: BTreeMap<TimelineAttachmentId, Lsn>,
|
||||
// END quiescing/active split
|
||||
|
||||
// other fields
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
|
||||
pub struct TenantShardAttachmentId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_id: ShardIndex,
|
||||
pub generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
|
||||
pub struct TimelineAttachmentId {
|
||||
pub tenant_timeline_id: TenantTimelineId,
|
||||
pub shard_id: ShardIndex,
|
||||
pub generation: Generation,
|
||||
}
|
||||
|
||||
pub struct AttachmentUpdate {
|
||||
pub tenant_shard_attachment_id: TenantShardAttachmentId,
|
||||
pub action: AttachmentUpdateAction,
|
||||
}
|
||||
|
||||
pub enum AttachmentUpdateAction {
|
||||
Attach { ps_id: NodeId },
|
||||
Detach,
|
||||
}
|
||||
|
||||
pub struct RemoteConsistentLsnAdv {
|
||||
pub attachment: TimelineAttachmentId,
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl World {
|
||||
fn check_invariants(&self) {
|
||||
if !cfg!(debug_assertions) {
|
||||
return;
|
||||
}
|
||||
|
||||
// caught_up_count maintenance
|
||||
{
|
||||
for (tenant_timeline_id, caught_up_count) in
|
||||
self.caught_up_count.iter().map(|(k, v)| (*k, *v))
|
||||
{
|
||||
let attachment_count = *self
|
||||
.attachment_count
|
||||
.get(&tenant_timeline_id.tenant_id)
|
||||
.unwrap();
|
||||
assert!(caught_up_count <= attachment_count);
|
||||
if caught_up_count == attachment_count {
|
||||
self.quiesced_timelines.contains_key(&tenant_timeline_id);
|
||||
// remote_consistent_lsn and commit_lsns is empty, checked by "quiescing XOR ..." below
|
||||
} else {
|
||||
let commit_lsn = self.commit_lsns[&&tenant_timeline_id];
|
||||
let mut validate_caught_up = 0;
|
||||
let mut validate_not_caught_up = 0;
|
||||
for (_, r_c_lsn) in self
|
||||
.remote_consistent_lsns
|
||||
.range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
|
||||
.map(|(k, v)| (*k, *v))
|
||||
{
|
||||
if r_c_lsn == commit_lsn {
|
||||
validate_caught_up += 1;
|
||||
} else {
|
||||
assert!(r_c_lsn < commit_lsn);
|
||||
validate_not_caught_up += 1;
|
||||
}
|
||||
}
|
||||
assert_eq!(validate_caught_up, caught_up_count);
|
||||
assert_eq!(
|
||||
validate_caught_up + validate_not_caught_up,
|
||||
attachment_count
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// quiescing XOR ...
|
||||
{
|
||||
let quiesced_timelines: HashSet<TenantTimelineId> =
|
||||
self.quiesced_timelines.keys().cloned().collect();
|
||||
let commit_lsn_timelines: HashSet<TenantTimelineId> =
|
||||
self.commit_lsns.keys().cloned().collect();
|
||||
let remote_consistent_lsn_timelines: HashSet<TenantTimelineId> = self
|
||||
.remote_consistent_lsns
|
||||
.keys()
|
||||
.map(|tlaid: &TimelineAttachmentId| tlaid.tenant_timeline_id)
|
||||
.collect();
|
||||
#[rustfmt::skip]
|
||||
assert_eq!(0, quiesced_timelines.intersection(&commit_lsn_timelines).count());
|
||||
#[rustfmt::skip]
|
||||
assert_eq!(0, quiesced_timelines.intersection(&remote_consistent_lsn_timelines).count());
|
||||
}
|
||||
|
||||
// nodes_timelines maintenance
|
||||
{
|
||||
let mut expect: HashMap<NodeId, HashMap<TenantTimelineId, u16>> = HashMap::new();
|
||||
let all_ttids: BTreeSet<TenantTimelineId> = self
|
||||
.quiesced_timelines
|
||||
.keys()
|
||||
.cloned()
|
||||
.chain(
|
||||
self.remote_consistent_lsns
|
||||
.keys()
|
||||
.cloned()
|
||||
.map(|tlaid| tlaid.tenant_timeline_id),
|
||||
)
|
||||
.collect();
|
||||
for ttid in all_ttids {
|
||||
for (_, node_id) in self
|
||||
.attachments
|
||||
.range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
|
||||
.map(|(k, v)| (*k, *v))
|
||||
{
|
||||
let expect = expect.entry(node_id).or_default();
|
||||
let refcount = expect.entry(ttid).or_default();
|
||||
*refcount += 1;
|
||||
}
|
||||
}
|
||||
assert_eq!(expect, self.nodes_timelines);
|
||||
}
|
||||
}
|
||||
pub fn update_attachment(&mut self, upd: AttachmentUpdate) {
|
||||
self.check_invariants();
|
||||
use AttachmentUpdateAction::*;
|
||||
use btree_map::Entry::*;
|
||||
let AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action,
|
||||
} = upd;
|
||||
match (action, self.attachments.entry(tenant_shard_attachment_id)) {
|
||||
(Attach { ps_id }, Occupied(e)) if *e.get() == ps_id => {
|
||||
info!("attachment is already known")
|
||||
}
|
||||
(Attach { ps_id }, Occupied(e)) => {
|
||||
warn!(current_node=%e.get(), proposed_node=%ps_id, "ignoring update that moves attachment to a different pageserver");
|
||||
}
|
||||
(Attach { ps_id }, Vacant(e)) => {
|
||||
e.insert(ps_id);
|
||||
// Keep attachmount_count up to date
|
||||
let attachment_count = self
|
||||
.attachment_count
|
||||
.entry(tenant_shard_attachment_id.tenant_id)
|
||||
.or_default();
|
||||
*attachment_count += attachment_count.checked_add(1).unwrap();
|
||||
// Keep nodes_timelines up to date
|
||||
let nodes_timelines = self.nodes_timelines.entry(ps_id).or_default();
|
||||
for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
|
||||
tenant_shard_attachment_id.tenant_id,
|
||||
)) {
|
||||
let refcount = nodes_timelines.entry(*ttid).or_default();
|
||||
*refcount = refcount.checked_add(1).unwrap();
|
||||
}
|
||||
if nodes_timelines.is_empty() {
|
||||
self.nodes_timelines.remove(&ps_id);
|
||||
}
|
||||
// New shards may start at an older LSN than where we quiesced => activate all quiesced timelines.
|
||||
let activate_range =
|
||||
TenantTimelineId::tenant_range(tenant_shard_attachment_id.tenant_id);
|
||||
let activate: HashSet<TenantTimelineId> = self
|
||||
.quiesced_timelines
|
||||
.range(activate_range)
|
||||
.map(|(ttid, _quiesced_lsn)| *ttid)
|
||||
.collect();
|
||||
for tenant_timeline_id in activate {
|
||||
self.activate_timeline(tenant_timeline_id);
|
||||
}
|
||||
}
|
||||
(Detach, Occupied(e)) => {
|
||||
let ps_id = e.remove();
|
||||
// Keep attachment count up to date
|
||||
let attachment_count = self
|
||||
.attachment_count
|
||||
.get_mut(&tenant_shard_attachment_id.tenant_id)
|
||||
.expect("attachment action initializes the hasmap entry");
|
||||
*attachment_count = attachment_count.checked_sub(1).unwrap();
|
||||
// Keep nodes_timelines up to date
|
||||
let nodes_timelines = self
|
||||
.nodes_timelines
|
||||
.get_mut(&ps_id)
|
||||
.expect("attachment action initializes hashmap entry");
|
||||
for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
|
||||
tenant_shard_attachment_id.tenant_id,
|
||||
)) {
|
||||
let refcount = nodes_timelines.entry(*ttid).or_default();
|
||||
*refcount = refcount.checked_sub(1).unwrap();
|
||||
}
|
||||
}
|
||||
(Detach, Vacant(_)) => {
|
||||
info!("detachment is already known");
|
||||
}
|
||||
}
|
||||
self.check_invariants();
|
||||
}
|
||||
pub fn handle_remote_consistent_lsn_advertisement(&mut self, adv: RemoteConsistentLsnAdv) {
|
||||
self.check_invariants();
|
||||
let RemoteConsistentLsnAdv {
|
||||
attachment,
|
||||
remote_consistent_lsn,
|
||||
} = adv;
|
||||
|
||||
match self.remote_consistent_lsns.entry(attachment) {
|
||||
btree_map::Entry::Occupied(mut occupied_entry) => {
|
||||
let current = occupied_entry.get_mut();
|
||||
use std::cmp::Ordering::*;
|
||||
match (*current).cmp(&remote_consistent_lsn) {
|
||||
Less => {
|
||||
*current = remote_consistent_lsn;
|
||||
let caught_up_count = self
|
||||
.caught_up_count
|
||||
.get_mut(&attachment.tenant_timeline_id)
|
||||
.unwrap();
|
||||
*caught_up_count = caught_up_count.checked_add(1).unwrap();
|
||||
if *caught_up_count
|
||||
== self.attachment_count[&attachment.tenant_timeline_id.tenant_id]
|
||||
{
|
||||
self.quiesce_timeline(attachment.tenant_timeline_id);
|
||||
}
|
||||
}
|
||||
Equal => {
|
||||
info!("ignoring no-op update, likely duplicate delivery");
|
||||
}
|
||||
Greater => {
|
||||
warn!(
|
||||
"ignoring advertisement because remote_consistent_lsn is moving backwards"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
btree_map::Entry::Vacant(_) => {
|
||||
let ttid = attachment.tenant_timeline_id;
|
||||
match self.quiesced_timelines.get(&ttid).cloned() {
|
||||
Some(quiesced_lsn) if quiesced_lsn == remote_consistent_lsn => {
|
||||
info!("ignoring no-op update for quiesced timeline");
|
||||
}
|
||||
Some(_) => {
|
||||
self.activate_timeline(ttid);
|
||||
// recurse one level, guarnateed to hit `Occupied` case above
|
||||
self.handle_remote_consistent_lsn_advertisement(adv);
|
||||
}
|
||||
None => {
|
||||
info!("ignoring advertisement because timeline is not known");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.check_invariants();
|
||||
}
|
||||
pub fn handle_commit_lsn_advancement(&mut self, ttid: TenantTimelineId, update: Lsn) {
|
||||
self.check_invariants();
|
||||
match self.commit_lsns.entry(ttid) {
|
||||
btree_map::Entry::Occupied(mut entry) => {
|
||||
let current = entry.get_mut();
|
||||
use std::cmp::Ordering::*;
|
||||
match (*current).cmp(&update) {
|
||||
Less => {
|
||||
*current = update;
|
||||
// We never allow remote_consistent_lsn to be ahead of commit_lsn.
|
||||
// Therefore, it is safe to say nothing is caught up anymore.
|
||||
let caught_up_count = self.caught_up_count.get_mut(&ttid).unwrap();
|
||||
*caught_up_count = 0;
|
||||
}
|
||||
Equal => {
|
||||
// This code runs in safekeeper impl, no reason why there would be duplicate delivery.
|
||||
warn!("ignoring no-op update; why is this happening?");
|
||||
}
|
||||
Greater => {
|
||||
panic!(
|
||||
"proposed commit_lsn would move it backwards: current={} update={}",
|
||||
current, update
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
btree_map::Entry::Vacant(entry) => {
|
||||
match self.quiesced_timelines.get(&ttid).cloned() {
|
||||
Some(quiesced_lsn) if quiesced_lsn == update => {
|
||||
info!("ignoring no-op update for quiesced timeline");
|
||||
}
|
||||
Some(_) => {
|
||||
self.activate_timeline(ttid);
|
||||
// recurse one level, guarnateed to hit `Occupied` case above
|
||||
self.handle_commit_lsn_advancement(ttid, update);
|
||||
}
|
||||
None => {
|
||||
info!("first time hearing about this timeline, initializing");
|
||||
entry.insert(update);
|
||||
let replaced = self.caught_up_count.insert(ttid, 0);
|
||||
// only commit_lsn advancement makes timelines known to world
|
||||
assert_eq!(None, replaced);
|
||||
for (attachment, node_id) in self
|
||||
.attachments
|
||||
.range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
|
||||
{
|
||||
let replaced = self.remote_consistent_lsns.insert(
|
||||
attachment.timeline_attachment_id(ttid.timeline_id),
|
||||
Lsn(0),
|
||||
);
|
||||
// only commit_lsn advancement makes timelines known to World
|
||||
assert_eq!(None, replaced);
|
||||
|
||||
let nodes_timelines = self.nodes_timelines.entry(*node_id).or_default();
|
||||
let refcount = nodes_timelines.entry(ttid).or_default();
|
||||
*refcount = refcount.checked_add(1).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.check_invariants();
|
||||
}
|
||||
|
||||
pub fn get_commit_lsn_advertisements(&self) -> HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> {
|
||||
let mut commit_lsn_advertisements_by_node: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> =
|
||||
HashMap::with_capacity(self.nodes_timelines.len());
|
||||
let commit_lsns_iter = self.commit_lsns.iter().map(|(k, v)| (*k, *v));
|
||||
let attachments_iter = self.attachments.iter().map(|(k, v)| (*k, *v));
|
||||
let remote_consistent_lsns_iter = self.remote_consistent_lsns.iter().map(|(k, v)| (*k, *v));
|
||||
|
||||
let join = merge_join::inner_equi_join_with_merge_strategy(
|
||||
commit_lsns_iter,
|
||||
attachments_iter,
|
||||
|(tenant_timeline_id, _)| tenant_timeline_id.tenant_id,
|
||||
|(shard_attachment_id, _)| shard_attachment_id.tenant_id,
|
||||
);
|
||||
let join = merge_join::left_equi_join_with_merge_strategy(
|
||||
join,
|
||||
remote_consistent_lsns_iter,
|
||||
|((ttid, _), _)| ttid.tenant_id,
|
||||
|(tlaid, _)| tlaid.tenant_timeline_id.tenant_id,
|
||||
);
|
||||
for ((c, a), r) in join {
|
||||
let (tenant_timeline_id, commit_lsn): (TenantTimelineId, Lsn) = c;
|
||||
let (_, node_id): (TenantShardAttachmentId, NodeId) = a;
|
||||
match r {
|
||||
// TODO: can > ever happen?
|
||||
Some((_, remote_consistent_lsn)) if remote_consistent_lsn >= commit_lsn => {
|
||||
// this timeline shard attachment is already caught up
|
||||
continue;
|
||||
}
|
||||
Some(_) | None => {
|
||||
// need to advertise
|
||||
// -> fallthrough
|
||||
}
|
||||
};
|
||||
// DISTINCT node_id, array_agg(DISTINCT tenant_shard_id )
|
||||
let for_node = commit_lsn_advertisements_by_node
|
||||
.entry(node_id)
|
||||
.or_insert_with(|| HashMap::with_capacity(self.nodes_timelines[&node_id].len()));
|
||||
match for_node.entry(tenant_timeline_id) {
|
||||
hash_map::Entry::Vacant(vacant_entry) => {
|
||||
vacant_entry.insert(commit_lsn);
|
||||
}
|
||||
hash_map::Entry::Occupied(occupied_entry) => {
|
||||
assert_eq!(*occupied_entry.get(), commit_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
commit_lsn_advertisements_by_node
|
||||
}
|
||||
|
||||
fn activate_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
|
||||
let quiesced_lsn = self
|
||||
.quiesced_timelines
|
||||
.remove(&tenant_timeline_id)
|
||||
.expect("must call this function only on quiesced tenant_timeline_id");
|
||||
let replaced = self.commit_lsns.insert(tenant_timeline_id, quiesced_lsn);
|
||||
assert_eq!(None, replaced);
|
||||
let reconstruct_remote_consistent_lsn_entries = self
|
||||
.attachments
|
||||
.range(TenantShardAttachmentId::tenant_range(
|
||||
tenant_timeline_id.tenant_id,
|
||||
))
|
||||
.map(|(k, _)| *k)
|
||||
.map(|tenant_shard_attachment_id| {
|
||||
(
|
||||
tenant_shard_attachment_id
|
||||
.timeline_attachment_id(tenant_timeline_id.timeline_id),
|
||||
quiesced_lsn,
|
||||
)
|
||||
});
|
||||
for (key, value) in reconstruct_remote_consistent_lsn_entries {
|
||||
let replaced = self.remote_consistent_lsns.insert(key, value);
|
||||
assert_eq!(None, replaced);
|
||||
}
|
||||
}
|
||||
|
||||
fn quiesce_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
|
||||
self.check_invariants();
|
||||
if self.quiesced_timelines.contains_key(&tenant_timeline_id) {
|
||||
panic!("only call this function on active timelines");
|
||||
}
|
||||
let quiesced_lsn = self
|
||||
.commit_lsns
|
||||
.remove(&tenant_timeline_id)
|
||||
.expect("inconsistent: we checked it's not in quiesced_timelines, so, must be active");
|
||||
let caught_up_count = self
|
||||
.caught_up_count
|
||||
.remove(&tenant_timeline_id)
|
||||
.expect("inconsistent: we checked it's not in quiesced_timleines, so, must be active");
|
||||
let mut remove_remote_consistent_lsns = Vec::new();
|
||||
for (k, remote_consistent_lsn) in self
|
||||
.remote_consistent_lsns
|
||||
.range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
|
||||
{
|
||||
assert_eq!(*remote_consistent_lsn, quiesced_lsn);
|
||||
remove_remote_consistent_lsns.push(*k);
|
||||
}
|
||||
assert_eq!(
|
||||
caught_up_count,
|
||||
u16::try_from(remove_remote_consistent_lsns.len()).unwrap()
|
||||
);
|
||||
for k in remove_remote_consistent_lsns {
|
||||
let removed = self.remote_consistent_lsns.remove(&k);
|
||||
assert!(removed.is_some(), "we just added");
|
||||
}
|
||||
let replaced = self
|
||||
.quiesced_timelines
|
||||
.insert(tenant_timeline_id, quiesced_lsn);
|
||||
assert_eq!(None, replaced); // we checked at function entry
|
||||
self.check_invariants();
|
||||
}
|
||||
}
|
||||
|
||||
impl TimelineAttachmentId {
|
||||
pub fn timeline_range(ttid: TenantTimelineId) -> RangeInclusive<Self> {
|
||||
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
|
||||
let generation_range: RangeInclusive<_> = Generation::RANGE;
|
||||
RangeInclusive::new(
|
||||
TimelineAttachmentId {
|
||||
tenant_timeline_id: ttid,
|
||||
shard_id: *shard_index_range.start(),
|
||||
generation: *generation_range.start(),
|
||||
},
|
||||
TimelineAttachmentId {
|
||||
tenant_timeline_id: ttid,
|
||||
shard_id: *shard_index_range.end(),
|
||||
generation: *generation_range.end(),
|
||||
},
|
||||
)
|
||||
}
|
||||
pub fn tenant_shard_attachment_id(self) -> TenantShardAttachmentId {
|
||||
TenantShardAttachmentId {
|
||||
tenant_id: self.tenant_timeline_id.tenant_id,
|
||||
shard_id: self.shard_id,
|
||||
generation: self.generation,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantShardAttachmentId {
|
||||
pub fn timeline_attachment_id(self, timeline_id: TimelineId) -> TimelineAttachmentId {
|
||||
TimelineAttachmentId {
|
||||
tenant_timeline_id: TenantTimelineId {
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
shard_id: self.shard_id,
|
||||
generation: self.generation,
|
||||
}
|
||||
}
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
|
||||
let generation_range: RangeInclusive<_> = Generation::RANGE;
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_id: *shard_index_range.start(),
|
||||
generation: *generation_range.start(),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_id: *shard_index_range.end(),
|
||||
generation: *generation_range.end(),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
224
libs/sk_ps_discovery/src/tests.rs
Normal file
224
libs/sk_ps_discovery/src/tests.rs
Normal file
@@ -0,0 +1,224 @@
|
||||
use utils::{id::TenantId, logging};
|
||||
|
||||
use super::*;
|
||||
use crate::World;
|
||||
|
||||
#[track_caller]
|
||||
fn validate_advertisements(
|
||||
actual: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>>,
|
||||
expect: Vec<(NodeId, Vec<(TenantTimelineId, Lsn)>)>,
|
||||
) {
|
||||
let expect: HashMap<_, _> = expect
|
||||
.into_iter()
|
||||
.map(|(node_id, innermap)| (node_id, innermap.into_iter().collect()))
|
||||
.collect();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
|
||||
/// Walks through out-of-order delivery: remote_consistent_lsn advertisements arrive
/// before the attachment and the commit_lsn are known.
#[test]
fn basic() {
    let mut world = World::default();

    let tenant_id = TenantId::from_array([0xff; 16]);
    let timeline_id = TimelineId::from_array([1; 16]);
    let timeline2 = TimelineId::from_array([2; 16]);
    let ttid1 = TenantTimelineId {
        tenant_id,
        timeline_id,
    };
    let ttid2 = TenantTimelineId {
        tenant_id,
        timeline_id: timeline2,
    };

    let attachment1 = TenantShardAttachmentId {
        tenant_id,
        shard_id: ShardIndex::unsharded(),
        generation: Generation::Valid(2),
    };
    let attachment2 = TenantShardAttachmentId {
        tenant_id,
        shard_id: ShardIndex::unsharded(),
        generation: Generation::Valid(3),
    };

    let ps1 = NodeId(0x100);

    // Out of order; in the happy path commit_lsn advances first. Test the case where
    // the safekeeper does not know about the attachments yet, before extending the
    // scenario to the happy path.
    for (attachment, lsn) in [(attachment1, Lsn(0x23)), (attachment2, Lsn(0x42))] {
        world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
            attachment: attachment.timeline_attachment_id(timeline_id),
            remote_consistent_lsn: lsn,
        });
    }
    // SK authoritative info on which advertisements ought to exist is still empty
    assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());

    world.update_attachment(AttachmentUpdate {
        tenant_shard_attachment_id: attachment1,
        action: AttachmentUpdateAction::Attach { ps_id: ps1 },
    });
    // No commit_lsn info has been inserted yet, so still no advertisements expected
    assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());

    // Insert commit_lsn info for a different timeline
    world.handle_commit_lsn_advancement(ttid2, Lsn(0x66));
    // That timeline is now advertised to the pageserver holding attachment1
    validate_advertisements(
        world.get_commit_lsn_advertisements(),
        vec![(ps1, vec![(ttid2, Lsn(0x66))])],
    );

    // Ok, out of order part tested. Now Safekeeper learns about the attachments.

    // Insert commit_lsn info for the timeline we have remote_consistent_lsn info for
    world.handle_commit_lsn_advancement(ttid1, Lsn(0x55));
    dbg!(&world);
    // Now advertisements to attachment1 will be sent out, but attachment2 is still not
    // known, so, no advertisements to it.
    validate_advertisements(
        world.get_commit_lsn_advertisements(),
        vec![(ps1, vec![(ttid1, Lsn(0x55))])],
    );
}
|
||||
|
||||
/// A timeline that becomes known after its tenant was attached is advertised to the
/// attachment's pageserver.
#[test]
fn advertisement_for_new_timeline() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();
    let ttid = TenantTimelineId {
        tenant_id,
        timeline_id,
    };
    let ps_id = NodeId(0x100);

    let mut world = World::default();
    world.update_attachment(AttachmentUpdate {
        tenant_shard_attachment_id: TenantShardAttachmentId {
            tenant_id,
            shard_id: ShardIndex::unsharded(),
            generation: Generation::Valid(2),
        },
        action: AttachmentUpdateAction::Attach { ps_id },
    });
    world.handle_commit_lsn_advancement(ttid, Lsn(23));

    validate_advertisements(
        world.get_commit_lsn_advertisements(),
        vec![(ps_id, vec![(ttid, Lsn(23))])],
    );
}
|
||||
|
||||
/// A timeline is quiesced once its only attachment advertises a remote consistent LSN
/// equal to the commit LSN.
#[test]
fn quiescing_timeline_catchup() {
    let _guard = logging::init(
        logging::LogFormat::Test,
        logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
        logging::Output::Stdout,
    )
    .unwrap();

    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();
    let ttid = TenantTimelineId {
        tenant_id,
        timeline_id,
    };
    let tenant_shard_attachment_id = TenantShardAttachmentId {
        tenant_id,
        shard_id: ShardIndex::unsharded(),
        generation: Generation::Valid(2),
    };
    let ps_id = NodeId(0x100);

    let mut world = World::default();
    world.update_attachment(AttachmentUpdate {
        tenant_shard_attachment_id,
        action: AttachmentUpdateAction::Attach { ps_id },
    });
    world.handle_commit_lsn_advancement(ttid, Lsn(23));

    // Nothing has caught up yet
    assert!(world.quiesced_timelines.is_empty());

    world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
        attachment: tenant_shard_attachment_id.timeline_attachment_id(timeline_id),
        remote_consistent_lsn: Lsn(23),
    });

    // The only attachment caught up, so the timeline must be quiesced now
    assert!(world.quiesced_timelines.contains_key(&ttid));
}
|
||||
|
||||
/// `nodes_timelines` gets an entry for a node only once a timeline of an attached
/// tenant becomes known.
#[test]
fn nodes_timelines() {
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::from_array([0x1; 16]);
    let ttid = TenantTimelineId {
        tenant_id,
        timeline_id,
    };
    let ps_id = NodeId(0x100);

    let mut world = World::default();
    world.update_attachment(AttachmentUpdate {
        tenant_shard_attachment_id: TenantShardAttachmentId {
            tenant_id,
            shard_id: ShardIndex::unsharded(),
            generation: Generation::Valid(2),
        },
        action: AttachmentUpdateAction::Attach { ps_id },
    });

    // No timeline known yet, hence no entry for the node
    assert!(!world.nodes_timelines.contains_key(&ps_id));

    world.handle_commit_lsn_advancement(ttid, Lsn(0x23));

    assert_eq!(world.nodes_timelines[&ps_id].len(), 1);

    // An advertisement for a second timeline, which has no commit_lsn yet
    let timeline2 = TimelineId::from_array([0x2; 16]);
    let unknown_timeline_attachment = TimelineAttachmentId {
        tenant_timeline_id: TenantTimelineId {
            tenant_id,
            timeline_id: timeline2,
        },
        shard_id: ShardIndex::unsharded(),
        generation: Generation::Valid(2),
    };
    world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
        attachment: unknown_timeline_attachment,
        remote_consistent_lsn: Lsn(0x42),
    });
}
|
||||
|
||||
// TODO: need more tests, esp for the removal path
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{self, Write};
|
||||
use std::os::fd::AsRawFd;
|
||||
use std::os::fd::AsFd;
|
||||
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
@@ -210,13 +210,13 @@ pub fn overwrite(
|
||||
|
||||
/// Syncs the filesystem for the given file descriptor.
|
||||
#[cfg_attr(target_os = "macos", allow(unused_variables))]
|
||||
pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
|
||||
pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> {
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use anyhow::Context;
|
||||
nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
|
||||
nix::unistd::syncfs(fd).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
|
||||
@@ -11,9 +11,9 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
|
||||
#[cfg(all(target_os = "linux", target_env = "gnu"))]
|
||||
{
|
||||
nix::fcntl::renameat2(
|
||||
None,
|
||||
nix::fcntl::AT_FDCWD,
|
||||
src,
|
||||
None,
|
||||
nix::fcntl::AT_FDCWD,
|
||||
dst,
|
||||
nix::fcntl::RenameFlags::RENAME_NOREPLACE,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::fmt::Debug;
|
||||
use std::{fmt::Debug, ops::RangeInclusive};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -25,7 +25,9 @@ pub enum Generation {
|
||||
/// scenarios where pageservers might otherwise issue conflicting writes to
|
||||
/// remote storage
|
||||
impl Generation {
|
||||
pub const MIN: Self = Self::None;
|
||||
pub const MAX: Self = Self::Valid(u32::MAX);
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
|
||||
/// Create a new Generation that represents a legacy key format with
|
||||
/// no generation suffix
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::fmt;
|
||||
use std::num::ParseIntError;
|
||||
use std::ops::RangeInclusive;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -320,6 +321,19 @@ impl TenantTimelineId {
|
||||
pub fn empty() -> Self {
|
||||
Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16]))
|
||||
}
|
||||
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
timeline_id: TimelineId::from_array([u8::MIN; 16]),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
timeline_id: TimelineId::from_array([u8::MAX; 16]),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantTimelineId {
|
||||
|
||||
@@ -95,6 +95,9 @@ pub mod guard_arc_swap;
|
||||
|
||||
pub mod elapsed_accum;
|
||||
|
||||
pub mod merge_join;
|
||||
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod linux_socket_ioctl;
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! A module to create and read lock files.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||
//! File locking is done using [`nix::fcntl::Flock`] exclusive locks.
|
||||
//! The only consumer of this module is currently
|
||||
//! [`pid_file`](crate::pid_file). See the module-level comment
|
||||
//! there for potential pitfalls with lock files that are used
|
||||
@@ -9,26 +9,25 @@
|
||||
use std::fs;
|
||||
use std::io::{Read, Write};
|
||||
use std::ops::Deref;
|
||||
use std::os::unix::prelude::AsRawFd;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use nix::errno::Errno::EAGAIN;
|
||||
use nix::fcntl;
|
||||
use nix::fcntl::{Flock, FlockArg};
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
/// A handle to an open and unlocked, but not-yet-written lock file.
|
||||
/// A handle to an open and flocked, but not-yet-written lock file.
|
||||
/// Returned by [`create_exclusive`].
|
||||
#[must_use]
|
||||
pub struct UnwrittenLockFile {
|
||||
path: Utf8PathBuf,
|
||||
file: fs::File,
|
||||
file: Flock<fs::File>,
|
||||
}
|
||||
|
||||
/// Returned by [`UnwrittenLockFile::write_content`].
|
||||
#[must_use]
|
||||
pub struct LockFileGuard(fs::File);
|
||||
pub struct LockFileGuard(Flock<fs::File>);
|
||||
|
||||
impl Deref for LockFileGuard {
|
||||
type Target = fs::File;
|
||||
@@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLo
|
||||
.open(lock_file_path)
|
||||
.context("open lock file")?;
|
||||
|
||||
let res = fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
);
|
||||
let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
|
||||
match res {
|
||||
Ok(()) => Ok(UnwrittenLockFile {
|
||||
Ok(lock_file) => Ok(UnwrittenLockFile {
|
||||
path: lock_file_path.to_owned(),
|
||||
file: lock_file,
|
||||
}),
|
||||
Err(EAGAIN) => anyhow::bail!("file is already locked"),
|
||||
Err(e) => Err(e).context("flock error"),
|
||||
Err((_, EAGAIN)) => anyhow::bail!("file is already locked"),
|
||||
Err((_, e)) => Err(e).context("flock error"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,32 +101,37 @@ pub enum LockFileRead {
|
||||
/// Check the [`LockFileRead`] variants for details.
|
||||
pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
|
||||
let res = fs::OpenOptions::new().read(true).open(path);
|
||||
let mut lock_file = match res {
|
||||
let lock_file = match res {
|
||||
Ok(f) => f,
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
|
||||
_ => return Err(e).context("open lock file"),
|
||||
},
|
||||
};
|
||||
let res = fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
);
|
||||
let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
|
||||
// We need the content regardless of lock success / failure.
|
||||
// But, read it after flock so that, if it succeeded, the content is consistent.
|
||||
let mut content = String::new();
|
||||
lock_file
|
||||
.read_to_string(&mut content)
|
||||
.context("read lock file")?;
|
||||
match res {
|
||||
Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
|
||||
LockFileGuard(lock_file),
|
||||
content,
|
||||
)),
|
||||
Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
|
||||
not_locked_file: lock_file,
|
||||
content,
|
||||
}),
|
||||
Err(e) => Err(e).context("flock error"),
|
||||
Ok(mut locked_file) => {
|
||||
let mut content = String::new();
|
||||
locked_file
|
||||
.read_to_string(&mut content)
|
||||
.context("read lock file")?;
|
||||
Ok(LockFileRead::NotHeldByAnyProcess(
|
||||
LockFileGuard(locked_file),
|
||||
content,
|
||||
))
|
||||
}
|
||||
Err((mut not_locked_file, EAGAIN)) => {
|
||||
let mut content = String::new();
|
||||
not_locked_file
|
||||
.read_to_string(&mut content)
|
||||
.context("read lock file")?;
|
||||
Ok(LockFileRead::LockedByOtherProcess {
|
||||
not_locked_file,
|
||||
content,
|
||||
})
|
||||
}
|
||||
Err((_, e)) => Err(e).context("flock error"),
|
||||
}
|
||||
}
|
||||
|
||||
164
libs/utils/src/merge_join.rs
Normal file
164
libs/utils/src/merge_join.rs
Normal file
@@ -0,0 +1,164 @@
|
||||
/// Inner equi-join of two key-sorted iterators, computed in a single merge pass.
///
/// `key_l` and `key_r` extract the join key from a left and a right item. Both
/// inputs are expected to arrive in ascending key order, because the merge only
/// ever moves forward on either side.
///
/// Yields `(left, right)` for every right item whose key equals the key of the
/// left item currently at the head of `l`. Items on either side that have no
/// partner are skipped, and iteration ends as soon as either input runs dry.
///
/// A right item is consumed when it is emitted. Consequently, if several left
/// items share a key, only the first of them is paired with the matching right
/// items; the later ones are skipped.
///
/// # Panics
///
/// Panics if two keys compare neither `<` nor `>` but are also not `==`,
/// i.e. if `K`'s comparison operators are inconsistent with each other.
pub fn inner_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
    l: L,
    r: R,
    key_l: FL,
    key_r: FR,
) -> impl Iterator<Item = (LI, RI)>
where
    L: Iterator<Item = LI>, // + Sorted
    R: Iterator<Item = RI>, // + Sorted
    FL: 'static + Fn(&LI) -> K,
    FR: 'static + Fn(&RI) -> K,
    LI: Copy,
    RI: Copy,
    K: PartialEq + Eq + Ord,
{
    // Pair every item with its key once, so that peeking never recomputes it.
    let mut left = l.map(move |item| (item, key_l(&item))).peekable();
    let mut right = r.map(move |item| (item, key_r(&item))).peekable();
    std::iter::from_fn(move || {
        loop {
            // Once either side is exhausted no further pair can be formed.
            let (Some((left_item, left_key)), Some((_, right_key))) = (left.peek(), right.peek())
            else {
                return None;
            };
            if left_key < right_key {
                // The left head has no (more) partners: discard it.
                left.next();
            } else if left_key > right_key {
                // The right head has no partner on the left: discard it.
                right.next();
            } else {
                assert!(left_key == right_key);
                // Emit the pair and advance only the right side, so that the
                // same left item can match further right items.
                let (right_item, _) = right.next().unwrap();
                return Some((left_item.clone(), right_item));
            }
        }
    })
}
|
||||
|
||||
/// Left equi-join of two key-sorted iterators, computed in a single merge pass.
///
/// `key_l` and `key_r` extract the join key from a left and a right item. Both
/// inputs are expected to arrive in ascending key order.
///
/// For every left item this yields either one `(left, Some(right))` per right
/// item with an equal key, or a single `(left, None)` if no right item matched.
/// Right items without a partner are skipped. Iteration ends once the left
/// input is exhausted.
///
/// A right item is consumed when it is emitted. Consequently, if several left
/// items share a key, only the first of them is paired with the matching right
/// items; the later ones are reported as `(left, None)`.
///
/// # Panics
///
/// Panics if two keys compare neither `<` nor `>` but are also not `==`,
/// i.e. if `K`'s comparison operators are inconsistent with each other.
pub fn left_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
    l: L,
    r: R,
    key_l: FL,
    key_r: FR,
) -> impl Iterator<Item = (LI, Option<RI>)>
where
    L: Iterator<Item = LI>, // + Sorted
    R: Iterator<Item = RI>, // + Sorted
    FL: 'static + Fn(&LI) -> K,
    FR: 'static + Fn(&RI) -> K,
    LI: Copy,
    RI: Copy,
    K: PartialEq + Eq + Ord,
{
    // Pair every item with its key once, so that peeking never recomputes it.
    let mut left = l.map(move |item| (item, key_l(&item))).peekable();
    let mut right = r.map(move |item| (item, key_r(&item))).peekable();
    // Whether the left item currently at the head was already emitted with a partner.
    let mut head_was_matched = false;
    std::iter::from_fn(move || {
        loop {
            match (left.peek(), right.peek()) {
                // Left side exhausted: the join is complete.
                (None, _) => return None,
                // Right side exhausted: the left head is finished, retire it below.
                (Some(_), None) => {}
                // No further partner for the left head: retire it below.
                (Some((_, lk)), Some((_, rk))) if lk < rk => {}
                // The right head has no partner on the left: discard it.
                (Some((_, lk)), Some((_, rk))) if lk > rk => {
                    right.next();
                    continue;
                }
                (Some((left_item, lk)), Some((_, rk))) => {
                    head_was_matched = true;
                    assert!(lk == rk);
                    // Advance only the right side, so that the same left item
                    // can match further right items.
                    let (right_item, _) = right.next().unwrap();
                    return Some((left_item.clone(), Some(right_item)));
                }
            }

            // Retire the left head. It is emitted with `None` only if it never
            // found a partner; either way the flag is reset for the next item.
            let (left_item, _) = left.next().unwrap();
            if std::mem::take(&mut head_was_matched) {
                continue;
            }
            return Some((left_item, None));
        }
    })
}
|
||||
#[cfg(test)]
mod tests {
    use super::{inner_equi_join_with_merge_strategy, left_equi_join_with_merge_strategy};

    // In all tests the join key is the first byte of each item.

    /// Every right item is paired with the left item sharing its first byte;
    /// right items without a partner ("b*", "dd") are dropped.
    #[test]
    fn inner_equi_basic() {
        let left = vec![b"a", b"c"];
        let right = vec![b"aa", b"ad", b"ba", b"bb", b"ca", b"cb", b"cd", b"dd"];

        let joined: Vec<_> = inner_equi_join_with_merge_strategy(
            left.into_iter(),
            right.into_iter(),
            |item| &item[0..1],
            |item| &item[0..1],
        )
        .collect();

        let expected = vec![
            (b"a", b"aa"),
            (b"a", b"ad"),
            (b"c", b"ca"),
            (b"c", b"cb"),
            (b"c", b"cd"),
        ];
        assert_eq!(joined, expected);
    }

    /// A left item without a partner is reported once, with `None`.
    #[test]
    fn left_equi_basic() {
        /*
        create table aleft (id text, aleft text);
        create table aright (id text, aright text);
        insert into aleft values ('a', 'a'), ('b', 'b');
        insert into aright values ('a', 'aa'), ('a', 'ab'), ('c', 'cd');
        select * from aleft left join aright using ("id");
        */

        let left = vec![b"a", b"b"];
        let right = vec![b"aa", b"ab", b"cd"];

        let joined: Vec<_> = left_equi_join_with_merge_strategy(
            left.into_iter(),
            right.into_iter(),
            |item| &item[0..1],
            |item| &item[0..1],
        )
        .collect();

        let expected = vec![(b"a", Some(b"aa")), (b"a", Some(b"ab")), (b"b", None)];
        assert_eq!(joined, expected);
    }

    /// Right items that sort before the first left item are skipped, and a
    /// matched left item is not additionally reported with `None`.
    #[test]
    fn left_equi_basic_2() {
        let left = vec![b"b"];
        let right = vec![b"aa", b"ab", b"bb"];

        let joined: Vec<_> = left_equi_join_with_merge_strategy(
            left.into_iter(),
            right.into_iter(),
            |item| &item[0..1],
            |item| &item[0..1],
        )
        .collect();

        assert_eq!(joined, vec![(b"b", Some(b"bb"))])
    }
}
|
||||
@@ -52,6 +52,7 @@ pub struct TenantShardId {
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const MIN: Self = Self(0);
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
@@ -85,7 +86,9 @@ impl ShardCount {
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
pub const MIN: Self = Self(0);
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
@@ -100,16 +103,17 @@ impl TenantShardId {
|
||||
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
|
||||
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
shard_number: shard_index_range.start().shard_number,
|
||||
shard_count: shard_index_range.start().shard_count,
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
shard_number: shard_index_range.end().shard_number,
|
||||
shard_count: shard_index_range.end().shard_count,
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -241,6 +245,16 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub const MIN: Self = ShardIndex {
|
||||
shard_number: ShardNumber::MIN,
|
||||
shard_count: ShardCount::MIN,
|
||||
};
|
||||
pub const MAX: Self = ShardIndex {
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
};
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
shard_number: number,
|
||||
|
||||
@@ -4,3 +4,5 @@ pub mod duplex;
|
||||
pub mod gate;
|
||||
|
||||
pub mod spsc_fold;
|
||||
|
||||
pub mod spsc_watch;
|
||||
|
||||
@@ -56,7 +56,7 @@ impl<T: Send> Sender<T> {
|
||||
/// # Panics
|
||||
///
|
||||
/// If `try_fold` panics, any subsequent call to `send` panics.
|
||||
pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), SendError>
|
||||
pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), (T, SendError)>
|
||||
where
|
||||
F: Fn(&mut T, T) -> Result<(), T>,
|
||||
{
|
||||
@@ -104,7 +104,9 @@ impl<T: Send> Sender<T> {
|
||||
}
|
||||
Poll::Pending
|
||||
}
|
||||
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
|
||||
State::ReceiverGone => {
|
||||
Poll::Ready(Err((value.take().unwrap(), SendError::ReceiverGone)))
|
||||
}
|
||||
State::SenderGone(_)
|
||||
| State::AllGone
|
||||
| State::SenderDropping
|
||||
|
||||
55
libs/utils/src/sync/spsc_watch.rs
Normal file
55
libs/utils/src/sync/spsc_watch.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
//! watch is probably not the right word, because we do take out
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::sync::spsc_fold;
|
||||
|
||||
pub fn channel<T: Send>() -> (Sender<T>, Receiver<T>) {
|
||||
let (tx, rx) = spsc_fold::channel();
|
||||
let cancel = CancellationToken::new();
|
||||
(
|
||||
Sender {
|
||||
tx,
|
||||
_cancel: cancel.clone().drop_guard(),
|
||||
},
|
||||
Receiver { rx, cancel },
|
||||
)
|
||||
}
|
||||
|
||||
/// Sending half of the channel created by [`channel`].
pub struct Sender<T> {
    /// Underlying `spsc_fold` sender that carries the values.
    tx: spsc_fold::Sender<T>,
    /// Drop guard for the cancellation token shared with the [`Receiver`].
    /// It is never read; it is only held so that it gets dropped together with
    /// this `Sender` (per tokio-util, dropping a `DropGuard` cancels its token).
    _cancel: tokio_util::sync::DropGuard,
}
|
||||
|
||||
/// Receiving half of the channel created by [`channel`].
pub struct Receiver<T> {
    /// Underlying `spsc_fold` receiver that yields the values.
    rx: spsc_fold::Receiver<T>,
    /// Cancellation token shared with the [`Sender`]'s drop guard;
    /// awaited by [`Receiver::cancelled`].
    cancel: CancellationToken,
}
|
||||
|
||||
impl<T: Send> Sender<T> {
|
||||
pub fn send_replace(&mut self, value: T) -> Result<(), (T, spsc_fold::SendError)> {
|
||||
poll_ready(self.tx.send(value, |old, new| {
|
||||
*old = new;
|
||||
Ok(())
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Send> Receiver<T> {
|
||||
pub async fn recv(&mut self) -> Result<T, spsc_fold::RecvError> {
|
||||
self.rx.recv().await
|
||||
}
|
||||
pub async fn cancelled(&mut self) {
|
||||
self.cancel.cancelled().await
|
||||
}
|
||||
}
|
||||
|
||||
/// Polls `f` exactly once and returns its output.
///
/// This is for futures that are known to complete without ever suspending.
/// The future is polled directly with a no-op waker instead of being handed to
/// an executor: since a `Pending` result is treated as a bug, no wake-up is
/// ever needed, and the call cannot fail for being nested inside another
/// executor.
///
/// # Panics
///
/// Panics if the future is not ready on its first poll.
fn poll_ready<F: Future<Output = O>, O>(f: F) -> O {
    let mut cx = std::task::Context::from_waker(std::task::Waker::noop());
    // Pin on the stack; `F` is not required to be `Unpin`.
    let mut f = std::pin::pin!(f);
    match f.as_mut().poll(&mut cx) {
        std::task::Poll::Ready(r) => r,
        std::task::Poll::Pending => unreachable!("expecting future to always return Ready"),
    }
}
|
||||
13
pageserver/page_api/Cargo.toml
Normal file
13
pageserver/page_api/Cargo.toml
Normal file
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "pageserver_page_api"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
prost.workspace = true
|
||||
tonic.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build.workspace = true
|
||||
7
pageserver/page_api/build.rs
Normal file
7
pageserver/page_api/build.rs
Normal file
@@ -0,0 +1,7 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Generates Rust code from .proto Protobuf schemas.
|
||||
tonic_build::configure()
|
||||
.bytes(["."])
|
||||
.compile_protos(&["proto/page_service.proto"], &["proto"])
|
||||
.map_err(|err| err.into())
|
||||
}
|
||||
220
pageserver/page_api/proto/page_service.proto
Normal file
220
pageserver/page_api/proto/page_service.proto
Normal file
@@ -0,0 +1,220 @@
|
||||
// Page service, presented by pageservers for computes.
|
||||
//
|
||||
// This is the compute read path. It primarily serves page versions at given
|
||||
// LSNs, but also base backups, SLRU segments, and relation metadata.
|
||||
//
|
||||
// EXPERIMENTAL: this is still under development and subject to change.
|
||||
//
|
||||
// Request metadata headers:
|
||||
// - authorization: JWT token ("Bearer <token>"), if auth is enabled
|
||||
// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
|
||||
// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
|
||||
// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
|
||||
//
|
||||
// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
|
||||
// However, this will require reconnecting when changing modes.
|
||||
//
|
||||
// TODO: write implementation guidance on
|
||||
// - Health checks
|
||||
// - Tracing, OpenTelemetry
|
||||
// - Compression
|
||||
|
||||
syntax = "proto3";
|
||||
package page_service;
|
||||
|
||||
service PageService {
|
||||
// Returns whether a relation exists.
|
||||
rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
|
||||
|
||||
// Fetches a base backup.
|
||||
rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
|
||||
|
||||
// Returns the total size of a database, as # of bytes.
|
||||
rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
|
||||
|
||||
// Fetches pages.
|
||||
//
|
||||
// This is implemented as a bidirectional streaming RPC for performance. Unary
|
||||
// requests incur costs for e.g. HTTP/2 stream setup, header parsing,
|
||||
// authentication, and so on -- with streaming, we only pay these costs during
|
||||
// the initial stream setup. This ~doubles throughput in benchmarks. Other
|
||||
// RPCs use regular unary requests, since they are not as frequent and
|
||||
// performance-critical, and this simplifies implementation.
|
||||
//
|
||||
// NB: a status response (e.g. errors) will terminate the stream. The stream
|
||||
// may be shared by e.g. multiple Postgres backends, so we should avoid this.
|
||||
// Most errors are therefore sent as GetPageResponse.status instead.
|
||||
rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
|
||||
|
||||
// Returns the size of a relation, as # of blocks.
|
||||
rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
|
||||
|
||||
// Fetches an SLRU segment.
|
||||
rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
|
||||
}
|
||||
|
||||
// The LSN a request should read at.
|
||||
message ReadLsn {
|
||||
// The request's read LSN. Required.
|
||||
uint64 request_lsn = 1;
|
||||
// If given, the caller guarantees that the page has not been modified since
|
||||
// this LSN. Must be smaller than or equal to request_lsn. This allows the
|
||||
// Pageserver to serve an old page without waiting for the request LSN to
|
||||
// arrive. Valid for all request types.
|
||||
//
|
||||
// It is undefined behaviour to make a request such that the page was, in
|
||||
// fact, modified between request_lsn and not_modified_since_lsn. The
|
||||
// Pageserver might detect it and return an error, or it might return the old
|
||||
// page version or the new page version. Setting not_modified_since_lsn equal
|
||||
// to request_lsn is always safe, but can lead to unnecessary waiting.
|
||||
uint64 not_modified_since_lsn = 2;
|
||||
}
|
||||
|
||||
// A relation identifier.
|
||||
message RelTag {
|
||||
uint32 spc_oid = 1;
|
||||
uint32 db_oid = 2;
|
||||
uint32 rel_number = 3;
|
||||
uint32 fork_number = 4;
|
||||
}
|
||||
|
||||
// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
|
||||
// other shards will error.
|
||||
message CheckRelExistsRequest {
|
||||
ReadLsn read_lsn = 1;
|
||||
RelTag rel = 2;
|
||||
}
|
||||
|
||||
message CheckRelExistsResponse {
|
||||
bool exists = 1;
|
||||
}
|
||||
|
||||
// Requests a base backup at a given LSN.
|
||||
message GetBaseBackupRequest {
|
||||
// The LSN to fetch a base backup at.
|
||||
ReadLsn read_lsn = 1;
|
||||
// If true, logical replication slots will not be created.
|
||||
bool replica = 2;
|
||||
}
|
||||
|
||||
// Base backup response chunk, returned as an ordered stream.
|
||||
message GetBaseBackupResponseChunk {
|
||||
// A basebackup data chunk. The size is undefined, but bounded by the 4 MB
|
||||
// gRPC message size limit.
|
||||
bytes chunk = 1;
|
||||
}
|
||||
|
||||
// Requests the size of a database, as # of bytes. Only valid on shard 0, other
|
||||
// shards will error.
|
||||
message GetDbSizeRequest {
|
||||
ReadLsn read_lsn = 1;
|
||||
uint32 db_oid = 2;
|
||||
}
|
||||
|
||||
message GetDbSizeResponse {
|
||||
uint64 num_bytes = 1;
|
||||
}
|
||||
|
||||
// Requests one or more pages.
|
||||
message GetPageRequest {
|
||||
// A request ID. Will be included in the response. Should be unique for
|
||||
// in-flight requests on the stream.
|
||||
uint64 request_id = 1;
|
||||
// The request class.
|
||||
GetPageClass request_class = 2;
|
||||
// The LSN to read at.
|
||||
ReadLsn read_lsn = 3;
|
||||
// The relation to read from.
|
||||
RelTag rel = 4;
|
||||
// Page numbers to read. Must belong to the remote shard.
|
||||
//
|
||||
// Multiple pages will be executed as a single batch by the Pageserver,
|
||||
// amortizing layer access costs and parallelizing them. This may increase the
|
||||
// latency of any individual request, but improves the overall latency and
|
||||
// throughput of the batch as a whole.
|
||||
//
|
||||
// TODO: this causes an allocation in the common single-block case. The sender
|
||||
// can use a SmallVec to stack-allocate it, but Prost will always deserialize
|
||||
// into a heap-allocated Vec. Consider optimizing this.
|
||||
//
|
||||
// TODO: we might be able to avoid a sort or something if we mandate that these
|
||||
// are always in order. But we can't currently rely on this on the server, because
|
||||
// of compatibility with the libpq protocol handler.
|
||||
repeated uint32 block_number = 5;
|
||||
}
|
||||
|
||||
// A GetPageRequest class. Primarily intended for observability, but may also be
|
||||
// used for prioritization in the future.
|
||||
enum GetPageClass {
|
||||
// Unknown class. For forwards compatibility: used when the client sends a
|
||||
// class that the server doesn't know about.
|
||||
GET_PAGE_CLASS_UNKNOWN = 0;
|
||||
// A normal request. This is the default.
|
||||
GET_PAGE_CLASS_NORMAL = 1;
|
||||
// A prefetch request. NB: can only be classified on pg < 18.
|
||||
GET_PAGE_CLASS_PREFETCH = 2;
|
||||
// A background request (e.g. vacuum).
|
||||
GET_PAGE_CLASS_BACKGROUND = 3;
|
||||
}
|
||||
|
||||
// A GetPage response.
|
||||
//
|
||||
// A batch response will contain all of the requested pages. We could eagerly
|
||||
// emit individual pages as soon as they are ready, but on a readv() Postgres
|
||||
// holds buffer pool locks on all pages in the batch and we'll only return once
|
||||
// the entire batch is ready, so no one can make use of the individual pages.
|
||||
message GetPageResponse {
|
||||
// The original request's ID.
|
||||
uint64 request_id = 1;
|
||||
// The response status code.
|
||||
GetPageStatus status = 2;
|
||||
// A string describing the status, if any.
|
||||
string reason = 3;
|
||||
// The 8KB page images, in the same order as the request. Empty if status != OK.
|
||||
repeated bytes page_image = 4;
|
||||
}
|
||||
|
||||
// A GetPageResponse status code. Since we use a bidirectional stream, we don't
|
||||
// want to send errors as gRPC statuses, since this would terminate the stream.
|
||||
enum GetPageStatus {
|
||||
// Unknown status. For forwards compatibility: used when the server sends a
|
||||
// status code that the client doesn't know about.
|
||||
GET_PAGE_STATUS_UNKNOWN = 0;
|
||||
// The request was successful.
|
||||
GET_PAGE_STATUS_OK = 1;
|
||||
// The page did not exist. The tenant/timeline/shard has already been
|
||||
// validated during stream setup.
|
||||
GET_PAGE_STATUS_NOT_FOUND = 2;
|
||||
// The request was invalid.
|
||||
GET_PAGE_STATUS_INVALID = 3;
|
||||
// The tenant is rate limited. Slow down and retry later.
|
||||
GET_PAGE_STATUS_SLOW_DOWN = 4;
|
||||
// TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
|
||||
// layer download. This could free up the server task to process other
|
||||
// requests while the layer download is in progress.
|
||||
}
|
||||
|
||||
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
|
||||
// shard 0, other shards will error.
|
||||
message GetRelSizeRequest {
|
||||
ReadLsn read_lsn = 1;
|
||||
RelTag rel = 2;
|
||||
}
|
||||
|
||||
message GetRelSizeResponse {
|
||||
uint32 num_blocks = 1;
|
||||
}
|
||||
|
||||
// Requests an SLRU segment. Only valid on shard 0, other shards will error.
|
||||
message GetSlruSegmentRequest {
|
||||
ReadLsn read_lsn = 1;
|
||||
uint32 kind = 2;
|
||||
uint32 segno = 3;
|
||||
}
|
||||
|
||||
// Returns an SLRU segment.
|
||||
//
|
||||
// These are up to 32 pages (256 KB), so we can send them as a single response.
|
||||
message GetSlruSegmentResponse {
|
||||
bytes segment = 1;
|
||||
}
|
||||
14
pageserver/page_api/src/lib.rs
Normal file
14
pageserver/page_api/src/lib.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
//! This crate provides the Pageserver's page API. It contains:
|
||||
//!
|
||||
//! * proto/page_service.proto: the Protobuf schema for the page API.
|
||||
//! * proto: auto-generated Protobuf types for gRPC.
|
||||
//!
|
||||
//! This crate is used by both the client and the server. Try to keep it slim.
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod proto {
    // Includes the Rust code that tonic generated for the `page_service`
    // Protobuf package.
    tonic::include_proto!("page_service");

    // Re-export the generated client and server items directly from this
    // module, so users do not have to name the generated submodules.
    pub use page_service_client::PageServiceClient;
    pub use page_service_server::{PageService, PageServiceServer};
}
|
||||
@@ -423,11 +423,14 @@ fn start_pageserver(
|
||||
.map(storage_broker::Certificate::from_pem),
|
||||
);
|
||||
// Note: we do not attempt connecting here (but validate endpoints sanity).
|
||||
storage_broker::connect(
|
||||
let service_client = storage_broker::connect(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
tls_config,
|
||||
)
|
||||
)?;
|
||||
anyhow::Ok(storage_broker::TimelineUpdatesSubscriber::new(
|
||||
service_client,
|
||||
))
|
||||
})
|
||||
.with_context(|| {
|
||||
format!(
|
||||
|
||||
@@ -100,7 +100,7 @@ pub struct State {
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
allowlist_routes: &'static [&'static str],
|
||||
remote_storage: GenericRemoteStorage,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
@@ -114,7 +114,7 @@ impl State {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
|
||||
@@ -1278,7 +1278,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
#[instrument(level = tracing::Level::DEBUG, skip_all)]
|
||||
async fn pagesteam_handle_batched_message<IO>(
|
||||
async fn pagestream_handle_batched_message<IO>(
|
||||
&mut self,
|
||||
pgb_writer: &mut PostgresBackend<IO>,
|
||||
batch: BatchedFeMessage,
|
||||
@@ -1733,7 +1733,7 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
let result = self
|
||||
.pagesteam_handle_batched_message(
|
||||
.pagestream_handle_batched_message(
|
||||
pgb_writer,
|
||||
msg,
|
||||
io_concurrency.clone(),
|
||||
@@ -1909,7 +1909,7 @@ impl PageServerHandler {
|
||||
return Err(e);
|
||||
}
|
||||
};
|
||||
self.pagesteam_handle_batched_message(
|
||||
self.pagestream_handle_batched_message(
|
||||
pgb_writer,
|
||||
batch,
|
||||
io_concurrency.clone(),
|
||||
|
||||
@@ -48,7 +48,6 @@ use remote_timeline_client::{
|
||||
download_tenant_manifest,
|
||||
};
|
||||
use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
|
||||
use timeline::import_pgdata::ImportingTimeline;
|
||||
use timeline::offload::{OffloadError, offload_timeline};
|
||||
@@ -153,7 +152,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
/// as the shared remote storage client and process initialization state.
|
||||
#[derive(Clone)]
|
||||
pub struct TenantSharedResources {
|
||||
pub broker_client: storage_broker::BrokerClientChannel,
|
||||
pub broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
pub l0_flush_global_state: L0FlushGlobalState,
|
||||
@@ -2107,7 +2106,7 @@ impl TenantShard {
|
||||
async fn unoffload_timeline(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
@@ -2242,7 +2241,7 @@ impl TenantShard {
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
new_state: TimelineArchivalState,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), TimelineArchivalError> {
|
||||
info!("setting timeline archival config");
|
||||
@@ -2571,7 +2570,7 @@ impl TenantShard {
|
||||
pub(crate) async fn create_timeline(
|
||||
self: &Arc<TenantShard>,
|
||||
params: CreateTimelineParams,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
if !self.is_active() {
|
||||
@@ -3299,7 +3298,7 @@ impl TenantShard {
|
||||
/// to delay background jobs. Background jobs can be started right away when None is given.
|
||||
fn activate(
|
||||
self: &Arc<Self>,
|
||||
broker_client: BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
|
||||
@@ -668,7 +668,9 @@ impl From<DownloadError> for UpdateError {
|
||||
|
||||
impl From<std::io::Error> for UpdateError {
|
||||
fn from(value: std::io::Error) -> Self {
|
||||
if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
|
||||
if let Some(nix::errno::Errno::ENOSPC) =
|
||||
value.raw_os_error().map(nix::errno::Errno::from_raw)
|
||||
{
|
||||
UpdateError::NoSpace
|
||||
} else if value
|
||||
.get_ref()
|
||||
|
||||
@@ -61,7 +61,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
|
||||
use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::{Notify, oneshot, watch};
|
||||
@@ -2080,7 +2079,7 @@ impl Timeline {
|
||||
pub(crate) fn activate(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<crate::tenant::TenantShard>,
|
||||
broker_client: BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
@@ -3114,7 +3113,7 @@ impl Timeline {
|
||||
fn launch_wal_receiver(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestContext,
|
||||
broker_client: BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
) {
|
||||
info!(
|
||||
"launching WAL receiver for timeline {} of tenant {}",
|
||||
|
||||
@@ -161,7 +161,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
tenant: Arc<TenantShard>,
|
||||
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
self.write(|raw_timeline| async move {
|
||||
|
||||
@@ -28,7 +28,6 @@ use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -70,7 +69,7 @@ impl WalReceiver {
|
||||
pub fn start(
|
||||
timeline: Arc<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
mut broker_client: BrokerClientChannel,
|
||||
mut broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: &RequestContext,
|
||||
) -> Self {
|
||||
let tenant_shard_id = timeline.tenant_shard_id;
|
||||
|
||||
@@ -17,19 +17,12 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use storage_broker::proto::{
|
||||
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
|
||||
SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription,
|
||||
TypedMessage,
|
||||
};
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use storage_broker::proto::SafekeeperDiscoveryResponse;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::backoff::{
|
||||
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
|
||||
};
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::{
|
||||
@@ -56,7 +49,7 @@ pub(crate) struct Cancelled;
|
||||
///
|
||||
/// Not cancellation-safe. Use `cancel` token to request cancellation.
|
||||
pub(super) async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
broker_client: &mut storage_broker::TimelineUpdatesSubscriber,
|
||||
connection_manager_state: &mut ConnectionManagerState,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
@@ -81,11 +74,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
WALRECEIVER_ACTIVE_MANAGERS.dec();
|
||||
}
|
||||
|
||||
let id = TenantTimelineId {
|
||||
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
|
||||
timeline_id: connection_manager_state.timeline.timeline_id,
|
||||
};
|
||||
|
||||
let mut timeline_state_updates = connection_manager_state
|
||||
.timeline
|
||||
.subscribe_for_state_updates();
|
||||
@@ -101,7 +89,12 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
let (timeline_updates, mut discovery_requester) = broker_client.subscribe(
|
||||
connection_manager_state.timeline.tenant_shard_id,
|
||||
connection_manager_state.timeline.timeline_id,
|
||||
cancel,
|
||||
);
|
||||
let mut timeline_updates = Box::pin(timeline_updates);
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
@@ -155,29 +148,10 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
|
||||
// tonic's error handling doesn't provide a clear code for disconnections: we get
|
||||
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
|
||||
// => https://github.com/neondatabase/neon/issues/9562
|
||||
info!("broker disconnected: {status}");
|
||||
},
|
||||
_ => {
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
// Got a new update from the broker.
|
||||
// The stream ends with None if and only if `cancel` is cancelled.
|
||||
Some(timeline_update) = timeline_updates.next() => {
|
||||
connection_manager_state.register_timeline_update(timeline_update)
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
@@ -258,32 +232,11 @@ pub(super) async fn connection_manager_loop_step(
|
||||
tokio::time::sleep(next_discovery_ts - now).await;
|
||||
}
|
||||
|
||||
let tenant_timeline_id = Some(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
});
|
||||
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
|
||||
let msg = TypedMessage {
|
||||
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
|
||||
safekeeper_timeline_info: None,
|
||||
safekeeper_discovery_request: Some(request),
|
||||
safekeeper_discovery_response: None,
|
||||
};
|
||||
info!("No active connection and no candidates, sending discovery request to the broker");
|
||||
discovery_requester.request().await;
|
||||
|
||||
last_discovery_ts = Some(std::time::Instant::now());
|
||||
info!("No active connection and no candidates, sending discovery request to the broker");
|
||||
|
||||
// Cancellation safety: we want to send a message to the broker, but publish_one()
|
||||
// function can get cancelled by the other select! arm. This is absolutely fine, because
|
||||
// we just want to receive broker updates and discovery is not important if we already
|
||||
// receive updates.
|
||||
//
|
||||
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
|
||||
// This is totally fine because of the reason above.
|
||||
|
||||
// This is a fire-and-forget request, we don't care about the response
|
||||
let _ = broker_client.publish_one(msg).await;
|
||||
debug!("Discovery request sent to the broker");
|
||||
None
|
||||
} => {}
|
||||
}
|
||||
@@ -298,63 +251,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
}
|
||||
|
||||
/// Endlessly try to subscribe for broker updates for a given timeline.
|
||||
async fn subscribe_for_timeline_updates(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Streaming<TypedMessage>, Cancelled> {
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
// subscribe to the specific timeline
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperTimelineInfo as i32,
|
||||
},
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
|
||||
},
|
||||
],
|
||||
tenant_timeline_id: Some(FilterTenantTimelineId {
|
||||
enabled: true,
|
||||
tenant_timeline_id: Some(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
}),
|
||||
}),
|
||||
};
|
||||
|
||||
match {
|
||||
tokio::select! {
|
||||
r = broker_client.subscribe_by_filter(request) => { r }
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
}
|
||||
} {
|
||||
Ok(resp) => {
|
||||
return Ok(resp.into_inner());
|
||||
}
|
||||
Err(e) => {
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
|
||||
info!(
|
||||
"Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1;
|
||||
const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0;
|
||||
const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
|
||||
@@ -695,44 +591,14 @@ impl ConnectionManagerState {
|
||||
}
|
||||
|
||||
/// Adds another broker timeline into the state, if it's more recent than the one already added there for the same key.
|
||||
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
|
||||
let mut is_discovery = false;
|
||||
let timeline_update = match typed_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => {
|
||||
let info = match typed_msg.safekeeper_timeline_info {
|
||||
Some(info) => info,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_timeline_info");
|
||||
return;
|
||||
}
|
||||
};
|
||||
SafekeeperDiscoveryResponse {
|
||||
safekeeper_id: info.safekeeper_id,
|
||||
tenant_timeline_id: info.tenant_timeline_id,
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
standby_horizon: info.standby_horizon,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
is_discovery = true;
|
||||
match typed_msg.safekeeper_discovery_response {
|
||||
Some(response) => response,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_discovery_response");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// unexpected message
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
fn register_timeline_update(&mut self, timeline_update: storage_broker::TimelineShardUpdate) {
|
||||
WALRECEIVER_BROKER_UPDATES.inc();
|
||||
|
||||
let storage_broker::TimelineShardUpdate {
|
||||
is_discovery,
|
||||
inner: timeline_update,
|
||||
} = timeline_update;
|
||||
|
||||
trace!(
|
||||
"safekeeper info update: standby_horizon(cutoff)={}",
|
||||
timeline_update.standby_horizon
|
||||
@@ -1013,7 +879,7 @@ impl ConnectionManagerState {
|
||||
shard_stripe_size,
|
||||
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
|
||||
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
|
||||
availability_zone: self.conf.availability_zone.as_deref()
|
||||
availability_zone: self.conf.availability_zone.as_deref(),
|
||||
};
|
||||
|
||||
match wal_stream_connection_config(connection_conf_args) {
|
||||
|
||||
@@ -408,7 +408,7 @@ impl OpenFiles {
|
||||
/// error types may be eligible for retry.
|
||||
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
|
||||
use nix::errno::Errno::*;
|
||||
match e.raw_os_error().map(nix::errno::from_i32) {
|
||||
match e.raw_os_error().map(nix::errno::Errno::from_raw) {
|
||||
Some(EIO) => {
|
||||
// Terminate on EIO because we no longer trust the device to store
|
||||
// data safely, or to uphold persistence guarantees on fsync.
|
||||
|
||||
@@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std(
|
||||
) -> std::io::Error {
|
||||
match e {
|
||||
tokio_epoll_uring::Error::Op(e) => e,
|
||||
tokio_epoll_uring::Error::System(system) => {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, system)
|
||||
}
|
||||
tokio_epoll_uring::Error::System(system) => std::io::Error::other(system),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg)
|
||||
lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
|
||||
}
|
||||
|
||||
void
|
||||
lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry *entry;
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
|
||||
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
if (LFC_ENABLED())
|
||||
{
|
||||
for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
|
||||
{
|
||||
tag.blockNum = blkno;
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
|
||||
if (entry != NULL)
|
||||
{
|
||||
for (int i = 0; i < lfc_blocks_per_chunk; i++)
|
||||
{
|
||||
if (GET_STATE(entry, i) == AVAILABLE)
|
||||
{
|
||||
lfc_ctl->used_pages -= 1;
|
||||
SET_STATE(entry, i, UNAVAILABLE);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if page is present in the cache.
|
||||
|
||||
@@ -28,6 +28,7 @@ typedef struct FileCacheState
|
||||
extern bool lfc_store_prefetch_result;
|
||||
|
||||
/* functions for local file cache */
|
||||
extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
|
||||
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
|
||||
BlockNumber blkno, const void *const *buffers,
|
||||
BlockNumber nblocks);
|
||||
|
||||
@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
|
||||
|
||||
#define InvalidRelFileNumber InvalidOid
|
||||
|
||||
#define SMgrRelGetRelInfo(reln) \
|
||||
#define SMgrRelGetRelInfo(reln) \
|
||||
(reln->smgr_rnode.node)
|
||||
|
||||
#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
|
||||
@@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
|
||||
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
|
||||
#endif
|
||||
|
||||
/*
 * NRelFileInfoInvalidate(rinfo)
 *
 * Overwrite all three components of rinfo with their "invalid" values:
 * the tablespace OID and database OID become InvalidOid and the relation
 * number becomes InvalidRelFileNumber.  The NInfoGet* accessors are used
 * as assignment targets here, so they must expand to lvalues.
 * Wrapped in do { } while (0) so the macro can be used as one statement.
 */
#define NRelFileInfoInvalidate(rinfo) do { \
|
||||
NInfoGetSpcOid(rinfo) = InvalidOid; \
|
||||
NInfoGetDbOid(rinfo) = InvalidOid; \
|
||||
NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
|
||||
} while (0)
|
||||
|
||||
#if PG_MAJORVERSION_NUM < 17
|
||||
#define ProcNumber BackendId
|
||||
#define INVALID_PROC_NUMBER InvalidBackendId
|
||||
|
||||
@@ -108,7 +108,7 @@ typedef enum
|
||||
UNLOGGED_BUILD_NOT_PERMANENT
|
||||
} UnloggedBuildPhase;
|
||||
|
||||
static SMgrRelation unlogged_build_rel = NULL;
|
||||
static NRelFileInfo unlogged_build_rel_info;
|
||||
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
|
||||
static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
|
||||
@@ -912,16 +912,19 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
case 0:
|
||||
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
mdextend(reln, forkNum, blkno, buffer, skipFsync);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
case RELPERSISTENCE_UNLOGGED:
|
||||
mdextend(reln, forkNum, blkno, buffer, skipFsync);
|
||||
/* Update LFC in case of unlogged index build */
|
||||
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
|
||||
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
|
||||
return;
|
||||
|
||||
default:
|
||||
@@ -1003,21 +1006,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
{
|
||||
case 0:
|
||||
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
case RELPERSISTENCE_UNLOGGED:
|
||||
mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
|
||||
/* Update LFC in case of unlogged index build */
|
||||
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
|
||||
{
|
||||
for (int i = 0; i < nblocks; i++)
|
||||
{
|
||||
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
|
||||
}
|
||||
}
|
||||
return;
|
||||
|
||||
default:
|
||||
@@ -1387,8 +1388,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
{
|
||||
case 0:
|
||||
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
mdread(reln, forkNum, blkno, buffer);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
@@ -1474,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
{
|
||||
case 0:
|
||||
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
mdreadv(reln, forknum, blocknum, buffers, nblocks);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
@@ -1608,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
|
||||
#else
|
||||
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
@@ -1617,9 +1639,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
|
||||
#else
|
||||
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
|
||||
#endif
|
||||
/* Update LFC in case of unlogged index build */
|
||||
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
|
||||
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
|
||||
return;
|
||||
default:
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
@@ -1680,14 +1699,16 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
case RELPERSISTENCE_UNLOGGED:
|
||||
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
|
||||
/* Update LFC in case of unlogged index build */
|
||||
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
|
||||
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
|
||||
return;
|
||||
default:
|
||||
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
@@ -1723,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
return mdnblocks(reln, forknum);
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
@@ -1792,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
|
||||
{
|
||||
mdtruncate(reln, forknum, old_blocks, nblocks);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
@@ -1930,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
*/
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
|
||||
neon_log(ERROR, "unlogged relation build is already in progress");
|
||||
Assert(unlogged_build_rel == NULL);
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
|
||||
@@ -1947,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
|
||||
case RELPERSISTENCE_TEMP:
|
||||
case RELPERSISTENCE_UNLOGGED:
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_rel_info = InfoFromSMgrRel(reln);
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (!IsParallelWorker())
|
||||
@@ -1968,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
|
||||
#endif
|
||||
|
||||
unlogged_build_rel = reln;
|
||||
unlogged_build_rel_info = InfoFromSMgrRel(reln);
|
||||
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
|
||||
|
||||
/* Make the relation look like it's unlogged */
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
|
||||
|
||||
/*
|
||||
* Create the local file. In a parallel build, the leader is expected to
|
||||
* call this first and do it.
|
||||
@@ -2000,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
static void
|
||||
neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
{
|
||||
Assert(unlogged_build_rel == reln);
|
||||
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
|
||||
RelFileInfoFmt((unlogged_build_rel_info)))));
|
||||
|
||||
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
return;
|
||||
|
||||
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
|
||||
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
|
||||
|
||||
/*
|
||||
* In a parallel build, (only) the leader process performs the 2nd
|
||||
@@ -2018,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
*/
|
||||
if (IsParallelWorker())
|
||||
{
|
||||
unlogged_build_rel = NULL;
|
||||
NRelFileInfoInvalidate(unlogged_build_rel_info);
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
}
|
||||
else
|
||||
@@ -2039,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
{
|
||||
NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
|
||||
|
||||
Assert(unlogged_build_rel == reln);
|
||||
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
|
||||
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
|
||||
RelFileInfoFmt(unlogged_build_rel_info))));
|
||||
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
{
|
||||
@@ -2051,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
BlockNumber nblocks;
|
||||
|
||||
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
|
||||
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
|
||||
|
||||
/*
|
||||
* Update the last-written LSN cache.
|
||||
@@ -2072,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
InfoFromNInfoB(rinfob),
|
||||
MAIN_FORKNUM);
|
||||
|
||||
/* Make the relation look permanent again */
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
|
||||
|
||||
/* Remove local copy */
|
||||
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||
{
|
||||
@@ -2083,6 +2104,8 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
forknum);
|
||||
|
||||
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
|
||||
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
|
||||
|
||||
mdclose(reln, forknum);
|
||||
#ifndef DEBUG_COMPARE_LOCAL
|
||||
/* use isRedo == true, so that we drop it immediately */
|
||||
@@ -2093,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
mdunlink(rinfob, INIT_FORKNUM, true);
|
||||
#endif
|
||||
}
|
||||
unlogged_build_rel = NULL;
|
||||
NRelFileInfoInvalidate(unlogged_build_rel_info);
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
}
|
||||
|
||||
@@ -2166,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg)
|
||||
* Forget about any build we might have had in progress. The local
|
||||
* file will be unlinked by smgrDoPendingDeletes()
|
||||
*/
|
||||
unlogged_build_rel = NULL;
|
||||
NRelFileInfoInvalidate(unlogged_build_rel_info);
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
break;
|
||||
|
||||
@@ -2178,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg)
|
||||
case XACT_EVENT_PRE_PREPARE:
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
|
||||
{
|
||||
unlogged_build_rel = NULL;
|
||||
NRelFileInfoInvalidate(unlogged_build_rel_info);
|
||||
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
|
||||
15
poetry.lock
generated
15
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohappyeyeballs"
|
||||
@@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"]
|
||||
|
||||
[[package]]
|
||||
name = "flask-cors"
|
||||
version = "5.0.0"
|
||||
description = "A Flask extension adding a decorator for CORS support"
|
||||
version = "6.0.0"
|
||||
description = "A Flask extension simplifying CORS support"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
python-versions = "<4.0,>=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
|
||||
{file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
|
||||
{file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
|
||||
{file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Flask = ">=0.9"
|
||||
flask = ">=0.9"
|
||||
Werkzeug = ">=0.7"
|
||||
|
||||
[[package]]
|
||||
name = "frozenlist"
|
||||
|
||||
@@ -394,6 +394,7 @@ async fn handle_client(
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
enum Connection {
|
||||
Raw(tokio::net::TcpStream),
|
||||
Tls(tokio_rustls::client::TlsStream<tokio::net::TcpStream>),
|
||||
|
||||
@@ -43,11 +43,12 @@ project_build_tag!(BUILD_TAG);
|
||||
use clap::{Parser, ValueEnum};
|
||||
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
#[clap(rename_all = "kebab-case")]
|
||||
enum AuthBackendType {
|
||||
#[value(name("cplane-v1"), alias("control-plane"))]
|
||||
ControlPlaneV1,
|
||||
#[clap(alias("cplane-v1"))]
|
||||
ControlPlane,
|
||||
|
||||
#[value(name("link"), alias("control-redirect"))]
|
||||
#[clap(alias("link"))]
|
||||
ConsoleRedirect,
|
||||
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
@@ -707,7 +708,7 @@ fn build_auth_backend(
|
||||
args: &ProxyCliArgs,
|
||||
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
|
||||
match &args.auth_backend {
|
||||
AuthBackendType::ControlPlaneV1 => {
|
||||
AuthBackendType::ControlPlane => {
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
let project_info_cache_config: ProjectInfoCacheOptions =
|
||||
args.project_info_cache.parse()?;
|
||||
@@ -862,7 +863,7 @@ async fn configure_redis(
|
||||
("irsa", _) => match (&args.redis_host, args.redis_port) {
|
||||
(Some(host), Some(port)) => Some(
|
||||
ConnectionWithCredentialsProvider::new_with_credentials_provider(
|
||||
host.to_string(),
|
||||
host.clone(),
|
||||
port,
|
||||
elasticache::CredentialsProvider::new(
|
||||
args.aws_region.clone(),
|
||||
|
||||
@@ -78,7 +78,7 @@ struct RequestContextInner {
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum AuthMethod {
|
||||
// aka passwordless, fka link
|
||||
// aka link
|
||||
ConsoleRedirect,
|
||||
ScramSha256,
|
||||
ScramSha256Plus,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.86.0"
|
||||
channel = "1.87.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -52,6 +52,7 @@ tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util = { workspace = true }
|
||||
tonic = { workspace = true }
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
metrics.workspace = true
|
||||
@@ -62,9 +63,11 @@ pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
safekeeper_client.workspace = true
|
||||
sk_ps_discovery.workspace = true
|
||||
sha2.workspace = true
|
||||
sd-notify.workspace = true
|
||||
storage_broker.workspace = true
|
||||
storage_controller_client.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -8,11 +8,12 @@ use std::error::Error as _;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use safekeeper_api::models::{
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
|
||||
TimelineStatus,
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
|
||||
TenantShardPageserverAttachmentChange, TimelineCreateRequest, TimelineStatus,
|
||||
};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
@@ -189,6 +190,20 @@ impl Client {
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn post_tenant_shard_pageserver_attachments(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
body: TenantShardPageserverAttachmentChange,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/pageserver_attachments",
|
||||
tenant_shard_id.tenant_id,
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
let resp = self.post(uri, body).await?;
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn post<B: serde::Serialize, U: IntoUrl>(
|
||||
&self,
|
||||
uri: U,
|
||||
|
||||
@@ -22,9 +22,10 @@ use safekeeper::defaults::{
|
||||
DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE,
|
||||
DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
|
||||
};
|
||||
use safekeeper::wal_backup::WalBackup;
|
||||
use safekeeper::{
|
||||
BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
|
||||
WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service,
|
||||
WAL_ADVERTISER_RUNTIME, WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
|
||||
};
|
||||
use sd_notify::NotifyState;
|
||||
use storage_broker::{DEFAULT_ENDPOINT, Uri};
|
||||
@@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
|
||||
let wal_backup = Arc::new(WalBackup::new(&conf).await?);
|
||||
|
||||
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone()));
|
||||
|
||||
// Register metrics collector for active timelines. It's important to do this
|
||||
// after daemonizing, otherwise process collector will be upset.
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
wal_backup::init_remote_storage(&conf).await;
|
||||
|
||||
// Keep handles to main tasks to die if any of them disappears.
|
||||
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
||||
FuturesUnordered::new();
|
||||
@@ -625,6 +626,30 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
.map(|res| ("broker main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(broker_task_handle));
|
||||
|
||||
let ps_connectivity_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| HTTP_RUNTIME.handle())
|
||||
.spawn(
|
||||
global_timelines
|
||||
.get_pageserver_connectivity()
|
||||
.task_main()
|
||||
.instrument(info_span!("pageserver_connectivity")),
|
||||
)
|
||||
.map(|res| ("pageserver connectivity".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(ps_connectivity_handle));
|
||||
|
||||
let wal_advertiser_task_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_ADVERTISER_RUNTIME.handle())
|
||||
.spawn(
|
||||
global_timelines
|
||||
.get_wal_advertiser()
|
||||
.task_main()
|
||||
.instrument(info_span!("wal_advertiser_main")),
|
||||
)
|
||||
.map(|res| ("wal advertiser task handle".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_advertiser_task_handle));
|
||||
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
|
||||
// TODO: update tokio-stream, convert to real async Stream with
|
||||
|
||||
@@ -50,7 +50,8 @@ async fn push_loop(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
make_tls_config(&conf),
|
||||
)?;
|
||||
)?
|
||||
.into_raw_grpc_client();
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
|
||||
let outbound = async_stream::stream! {
|
||||
@@ -97,7 +98,8 @@ async fn pull_loop(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
make_tls_config(&conf),
|
||||
)?;
|
||||
)?
|
||||
.into_raw_grpc_client();
|
||||
|
||||
// TODO: subscribe only to local timelines instead of all
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
@@ -153,7 +155,8 @@ async fn discover_loop(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
make_tls_config(&conf),
|
||||
)?;
|
||||
)?
|
||||
.into_raw_grpc_client();
|
||||
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![TypeSubscription {
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::sync::Arc;
|
||||
use anyhow::{Result, bail};
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use safekeeper_api::membership::Configuration;
|
||||
use tokio::fs::OpenOptions;
|
||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
||||
@@ -30,6 +31,7 @@ pub struct Request {
|
||||
pub async fn handle_request(
|
||||
request: Request,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
) -> Result<()> {
|
||||
// TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
|
||||
// if LSN will point to the middle of a WAL record, timeline will be in "broken" state
|
||||
@@ -127,6 +129,7 @@ pub async fn handle_request(
|
||||
assert!(first_ondisk_segment >= first_segment);
|
||||
|
||||
copy_s3_segments(
|
||||
&storage,
|
||||
wal_seg_size,
|
||||
&request.source_ttid,
|
||||
&request.destination_ttid,
|
||||
|
||||
@@ -67,6 +67,19 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
})
|
||||
}
|
||||
|
||||
async fn post_tenant_pageserver_attachments(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let body: models::TenantShardPageserverAttachmentChange = json_request(&mut request).await?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let wal_advertiser = global_timelines.get_wal_advertiser();
|
||||
wal_advertiser
|
||||
.update_pageserver_attachments(tenant_id, body)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Deactivates all timelines for the tenant and removes its data directory.
|
||||
/// See `timeline_delete_handler`.
|
||||
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -258,6 +271,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let storage = global_timelines.get_wal_backup().get_storage();
|
||||
|
||||
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
|
||||
// so create the chan and write to it in another task.
|
||||
@@ -269,6 +283,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
conf.my_id,
|
||||
destination,
|
||||
tx,
|
||||
storage,
|
||||
));
|
||||
|
||||
let rx_stream = ReceiverStream::new(rx);
|
||||
@@ -390,12 +405,18 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let wal_backup = global_timelines.get_wal_backup();
|
||||
let storage = wal_backup
|
||||
.get_storage()
|
||||
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Remote Storage is not configured"
|
||||
)))?;
|
||||
|
||||
copy_timeline::handle_request(copy_timeline::Request{
|
||||
source_ttid,
|
||||
until_lsn: request_data.until_lsn,
|
||||
destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
|
||||
}, global_timelines)
|
||||
}, global_timelines, storage)
|
||||
.instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -710,6 +731,9 @@ pub fn make_router(
|
||||
})
|
||||
})
|
||||
.get("/v1/utilization", |r| request_span(r, utilization_handler))
|
||||
.post("/v1/tenant/:tenant_id/pageserver_attachments", |r| {
|
||||
request_span(r, post_tenant_pageserver_attachments)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_handler)
|
||||
})
|
||||
|
||||
@@ -38,11 +38,13 @@ pub mod timeline_eviction;
|
||||
pub mod timeline_guard;
|
||||
pub mod timeline_manager;
|
||||
pub mod timelines_set;
|
||||
pub mod wal_advertiser;
|
||||
pub mod wal_backup;
|
||||
pub mod wal_backup_partial;
|
||||
pub mod wal_reader_stream;
|
||||
pub mod wal_service;
|
||||
pub mod wal_storage;
|
||||
pub(crate) mod pageserver_connectivity;
|
||||
|
||||
#[cfg(any(test, feature = "benchmarking"))]
|
||||
pub mod test_utils;
|
||||
@@ -123,12 +125,7 @@ pub struct SafeKeeperConf {
|
||||
pub ssl_ca_certs: Vec<Pem>,
|
||||
pub use_https_safekeeper_api: bool,
|
||||
pub enable_tls_wal_service_api: bool,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
pub fn is_wal_backup_enabled(&self) -> bool {
|
||||
self.remote_storage.is_some() && self.wal_backup_enabled
|
||||
}
|
||||
pub storage_controller_api: Option<Uri>,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -174,6 +171,7 @@ impl SafeKeeperConf {
|
||||
ssl_ca_certs: Vec::new(),
|
||||
use_https_safekeeper_api: false,
|
||||
enable_tls_wal_service_api: false,
|
||||
storage_controller_api: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -204,6 +202,14 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_ADVERTISER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("wal advertiser worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL backup worker")
|
||||
|
||||
117
safekeeper/src/pageserver_connectivity.rs
Normal file
117
safekeeper/src/pageserver_connectivity.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
use desim::world::Node;
|
||||
use hyper::Uri;
|
||||
use pageserver_api::controller_api;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::timeline::Timeline;
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, hash_map},
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tracing::{Instrument, error, info, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
sync::{spsc_fold, spsc_watch},
|
||||
};
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
// Map from tenant/timeline id to an LSN.
// NOTE(review): nothing else in this file refers to `Advs` — confirm it is still needed.
type Advs = HashMap<TenantTimelineId, Lsn>;
|
||||
|
||||
/// Handle to the pageserver connectivity main task.
///
/// `inner` starts out empty and is filled with the task's message sender by `task_main`.
#[derive(Default)]
|
||||
pub struct GlobalState {
|
||||
inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
|
||||
}
|
||||
|
||||
/// Requests received by `MainTask` over its mpsc channel.
enum Message {
|
||||
/// Request concerning pageserver `ps_id`; `reply` is a oneshot sender for a watch
/// receiver of a `hyper::Uri`.
/// NOTE(review): the handler in `MainTask::task` does not send on `reply` yet.
Resolve {
|
||||
ps_id: NodeId,
|
||||
reply: tokio::sync::oneshot::Sender<tokio::sync::watch::Receiver<hyper::Uri>>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Error type returned by `ResolutionTask::run`.
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
/// Displayed as "cancelled".
/// NOTE(review): no code in this file constructs this variant yet.
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl GlobalState {
|
||||
/// Builds the main task via `MainTask::prepare_run`, stores its sender in `self.inner`,
/// and returns the task future for the caller to run.
///
/// Must be called at most once: on a second call `inner` is already initialized, the
/// init closure is not run, `ret` stays `None`, and the `expect` below panics.
pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
|
||||
// Filled in by the init closure; this is how the future gets out of `get_or_init`.
let mut ret = None;
|
||||
self.inner.get_or_init(|| {
|
||||
let (tx, task_fut) = MainTask::prepare_run();
|
||||
ret = Some(task_fut);
|
||||
tx
|
||||
});
|
||||
ret.expect("must only call this method once")
|
||||
}
|
||||
}
|
||||
|
||||
/// State of the main task: the receiving half of the `Message` channel.
struct MainTask {
|
||||
rx: tokio::sync::mpsc::Receiver<Message>,
|
||||
}
|
||||
|
||||
impl MainTask {
|
||||
/// Creates the `Message` channel (capacity 100) and returns the sender together with
/// the future of the task that owns the receiver.
fn prepare_run() -> (
|
||||
tokio::sync::mpsc::Sender<Message>,
|
||||
impl Future<Output = anyhow::Result<()>> + Send,
|
||||
) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
|
||||
let task = MainTask { rx };
|
||||
(tx, task.task())
|
||||
}
|
||||
/// Receive loop: handles one `Message` per iteration until `recv` yields `None`.
///
/// NOTE(review): work in progress. `storcon_client` is a `todo!()` placeholder, and
/// nothing is inserted into `resolution` or sent on `reply`. The `Occupied` arm is
/// empty and the spawned task's handle is not kept. There is no final `Ok(())`.
/// Confirm the intended design before relying on this.
async fn task(mut self) -> anyhow::Result<()> {
|
||||
// TODO: persistence
|
||||
|
||||
let storcon_client = todo!();
|
||||
|
||||
// Per-pageserver watch senders, keyed by node id.
let mut resolution: HashMap<NodeId, tokio::sync::watch::Sender<hyper::Uri>> =
|
||||
HashMap::new();
|
||||
|
||||
while let Some(rx) = self.rx.recv().await {
|
||||
match rx {
|
||||
Message::Resolve { ps_id, reply } => match resolution.entry(ps_id) {
|
||||
hash_map::Entry::Occupied(e) => {}
|
||||
hash_map::Entry::Vacant(e) => {
|
||||
// First request for this pageserver: start a resolution task for it.
tokio::spawn(
|
||||
ResolutionTask { ps_id, storcon_client }.run()
|
||||
)
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Inputs of one resolution task: the pageserver node to look up and the storage
/// controller client used for the lookup.
struct ResolutionTask {
|
||||
ps_id: NodeId,
|
||||
storcon_client: storage_controller_client::control_api::Client,
|
||||
}
|
||||
|
||||
impl ResolutionTask {
|
||||
pub async fn run(self) -> Result<Uri, Error> {
|
||||
loop {
|
||||
// XXX: well-defined upcall API?
|
||||
let res = self
|
||||
.storcon_client
|
||||
.dispatch(
|
||||
reqwest::Method::GET,
|
||||
format!("control/v1/node/{}", self.node_id),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
let node: NodeDescribeResponse = match res {
|
||||
Ok(res) => res,
|
||||
Err(err) => {
|
||||
warn!("storcon upcall failed")
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -9,6 +9,7 @@ use chrono::{DateTime, Utc};
|
||||
use futures::{SinkExt, StreamExt, TryStreamExt};
|
||||
use http_utils::error::ApiError;
|
||||
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use reqwest::Certificate;
|
||||
use safekeeper_api::Term;
|
||||
use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus};
|
||||
@@ -43,6 +44,7 @@ pub async fn stream_snapshot(
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) {
|
||||
match tli.try_wal_residence_guard().await {
|
||||
Err(e) => {
|
||||
@@ -53,10 +55,32 @@ pub async fn stream_snapshot(
|
||||
Ok(maybe_resident_tli) => {
|
||||
if let Err(e) = match maybe_resident_tli {
|
||||
Some(resident_tli) => {
|
||||
stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
|
||||
.await
|
||||
stream_snapshot_resident_guts(
|
||||
resident_tli,
|
||||
source,
|
||||
destination,
|
||||
tx.clone(),
|
||||
storage,
|
||||
)
|
||||
.await
|
||||
}
|
||||
None => {
|
||||
if let Some(storage) = storage {
|
||||
stream_snapshot_offloaded_guts(
|
||||
tli,
|
||||
source,
|
||||
destination,
|
||||
tx.clone(),
|
||||
&storage,
|
||||
)
|
||||
.await
|
||||
} else {
|
||||
tx.send(Err(anyhow!("remote storage not configured")))
|
||||
.await
|
||||
.ok();
|
||||
return;
|
||||
}
|
||||
}
|
||||
None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
|
||||
} {
|
||||
// Error type/contents don't matter as they can't reach the client
|
||||
// (hyper likely doesn't do anything with it), but http stream will be
|
||||
@@ -123,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts(
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> Result<()> {
|
||||
let mut ar = prepare_tar_stream(tx);
|
||||
|
||||
tli.snapshot_offloaded(&mut ar, source, destination).await?;
|
||||
tli.snapshot_offloaded(&mut ar, source, destination, storage)
|
||||
.await?;
|
||||
|
||||
ar.finish().await?;
|
||||
|
||||
@@ -139,10 +165,13 @@ pub async fn stream_snapshot_resident_guts(
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
tx: mpsc::Sender<Result<Bytes>>,
|
||||
storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) -> Result<()> {
|
||||
let mut ar = prepare_tar_stream(tx);
|
||||
|
||||
let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
|
||||
let bctx = tli
|
||||
.start_snapshot(&mut ar, source, destination, storage)
|
||||
.await?;
|
||||
pausable_failpoint!("sk-snapshot-after-list-pausable");
|
||||
|
||||
let tli_dir = tli.get_timeline_dir();
|
||||
@@ -182,6 +211,7 @@ impl Timeline {
|
||||
ar: &mut tokio_tar::Builder<W>,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> Result<()> {
|
||||
// Take initial copy of control file, then release state lock
|
||||
let mut control_file = {
|
||||
@@ -216,6 +246,7 @@ impl Timeline {
|
||||
// can fail if the timeline was un-evicted and modified in the background.
|
||||
let remote_timeline_path = &self.remote_path;
|
||||
wal_backup::copy_partial_segment(
|
||||
storage,
|
||||
&replace.previous.remote_path(remote_timeline_path),
|
||||
&replace.current.remote_path(remote_timeline_path),
|
||||
)
|
||||
@@ -262,6 +293,7 @@ impl WalResidentTimeline {
|
||||
ar: &mut tokio_tar::Builder<W>,
|
||||
source: NodeId,
|
||||
destination: NodeId,
|
||||
storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) -> Result<SnapshotContext> {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
let wal_seg_size = shared_state.get_wal_seg_size();
|
||||
@@ -283,6 +315,7 @@ impl WalResidentTimeline {
|
||||
|
||||
let remote_timeline_path = &self.tli.remote_path;
|
||||
wal_backup::copy_partial_segment(
|
||||
&*storage.context("remote storage not configured")?,
|
||||
&replace.previous.remote_path(remote_timeline_path),
|
||||
&replace.current.remote_path(remote_timeline_path),
|
||||
)
|
||||
|
||||
@@ -18,7 +18,7 @@ use crate::send_wal::EndWatch;
|
||||
use crate::state::{TimelinePersistentState, TimelineState};
|
||||
use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::remote_timeline_path;
|
||||
use crate::wal_backup::{WalBackup, remote_timeline_path};
|
||||
use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage};
|
||||
|
||||
/// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop.
|
||||
@@ -101,18 +101,23 @@ impl Env {
|
||||
let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?;
|
||||
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
|
||||
|
||||
let wal_backup = Arc::new(WalBackup::new(&conf).await?);
|
||||
|
||||
let timeline = Timeline::new(
|
||||
ttid,
|
||||
&timeline_dir,
|
||||
&remote_path,
|
||||
shared_state,
|
||||
conf.clone(),
|
||||
wal_backup.clone(),
|
||||
);
|
||||
timeline.bootstrap(
|
||||
&mut timeline.write_shared_state().await,
|
||||
&conf,
|
||||
Arc::new(TimelinesSet::default()), // ignored for now
|
||||
RateLimiter::new(0, 0),
|
||||
wal_backup,
|
||||
todo!(),
|
||||
);
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -35,10 +35,13 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim
|
||||
use crate::timeline_guard::ResidenceGuard;
|
||||
use crate::timeline_manager::{AtomicStatus, ManagerCtl};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::{self, remote_timeline_path};
|
||||
use crate::wal_backup;
|
||||
use crate::wal_backup::{WalBackup, remote_timeline_path};
|
||||
use crate::wal_backup_partial::PartialRemoteSegment;
|
||||
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
|
||||
use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage};
|
||||
use crate::{
|
||||
SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_advertiser, wal_storage,
|
||||
};
|
||||
|
||||
fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
|
||||
PeerInfo {
|
||||
@@ -452,6 +455,8 @@ pub struct Timeline {
|
||||
manager_ctl: ManagerCtl,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
|
||||
pub(crate) wal_backup: Arc<WalBackup>,
|
||||
|
||||
remote_deletion: std::sync::Mutex<Option<RemoteDeletionReceiver>>,
|
||||
|
||||
/// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding
|
||||
@@ -476,6 +481,7 @@ impl Timeline {
|
||||
remote_path: &RemotePath,
|
||||
shared_state: SharedState,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
) -> Arc<Self> {
|
||||
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
|
||||
watch::channel(shared_state.sk.state().commit_lsn);
|
||||
@@ -509,6 +515,7 @@ impl Timeline {
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
last_removed_segno: AtomicU64::new(0),
|
||||
mgr_status: AtomicStatus::new(),
|
||||
wal_backup,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -516,6 +523,7 @@ impl Timeline {
|
||||
pub fn load_timeline(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
ttid: TenantTimelineId,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
|
||||
|
||||
@@ -529,6 +537,7 @@ impl Timeline {
|
||||
&remote_path,
|
||||
shared_state,
|
||||
conf,
|
||||
wal_backup,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -539,6 +548,8 @@ impl Timeline {
|
||||
conf: &SafeKeeperConf,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
partial_backup_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
wal_advertiser: Arc<wal_advertiser::GlobalState>,
|
||||
) {
|
||||
let (tx, rx) = self.manager_ctl.bootstrap_manager();
|
||||
|
||||
@@ -561,6 +572,8 @@ impl Timeline {
|
||||
tx,
|
||||
rx,
|
||||
partial_backup_rate_limiter,
|
||||
wal_backup,
|
||||
wal_advertiser,
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -606,9 +619,10 @@ impl Timeline {
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.close_wal_store();
|
||||
|
||||
if !only_local && self.conf.is_wal_backup_enabled() {
|
||||
if !only_local {
|
||||
self.remote_delete().await?;
|
||||
}
|
||||
|
||||
let dir_existed = delete_dir(&self.timeline_dir).await?;
|
||||
Ok(dir_existed)
|
||||
}
|
||||
@@ -675,11 +689,20 @@ impl Timeline {
|
||||
guard: &mut std::sync::MutexGuard<Option<RemoteDeletionReceiver>>,
|
||||
) -> RemoteDeletionReceiver {
|
||||
tracing::info!("starting remote deletion");
|
||||
let storage = self.wal_backup.get_storage().clone();
|
||||
let (result_tx, result_rx) = tokio::sync::watch::channel(None);
|
||||
let ttid = self.ttid;
|
||||
tokio::task::spawn(
|
||||
async move {
|
||||
let r = wal_backup::delete_timeline(&ttid).await;
|
||||
let r = if let Some(storage) = storage {
|
||||
wal_backup::delete_timeline(&storage, &ttid).await
|
||||
} else {
|
||||
tracing::info!(
|
||||
"skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage"
|
||||
);
|
||||
Ok(())
|
||||
};
|
||||
|
||||
if let Err(e) = &r {
|
||||
// Log error here in case nobody ever listens for our result (e.g. dropped API request)
|
||||
tracing::error!("remote deletion failed: {e}");
|
||||
@@ -1046,14 +1069,13 @@ impl WalResidentTimeline {
|
||||
|
||||
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
|
||||
let (_, persisted_state) = self.get_state().await;
|
||||
let enable_remote_read = self.conf.is_wal_backup_enabled();
|
||||
|
||||
WalReader::new(
|
||||
&self.ttid,
|
||||
self.timeline_dir.clone(),
|
||||
&persisted_state,
|
||||
start_lsn,
|
||||
enable_remote_read,
|
||||
self.wal_backup.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::fs::File;
|
||||
use tokio::io::{AsyncRead, AsyncWriteExt};
|
||||
use tracing::{debug, info, instrument, warn};
|
||||
@@ -42,6 +42,7 @@ impl Manager {
|
||||
&& next_event.is_none()
|
||||
&& self.access_service.is_empty()
|
||||
&& !self.tli_broker_active.get()
|
||||
&& self.wal_advertiser.ready_for_eviction()
|
||||
// Partial segment of current flush_lsn is uploaded up to this flush_lsn.
|
||||
&& !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded)
|
||||
// And it is the next one after the last removed. Given that local
|
||||
@@ -68,6 +69,10 @@ impl Manager {
|
||||
#[instrument(name = "evict_timeline", skip_all)]
|
||||
pub(crate) async fn evict_timeline(&mut self) -> bool {
|
||||
assert!(!self.is_offloaded);
|
||||
let Some(storage) = self.wal_backup.get_storage() else {
|
||||
warn!("no remote storage configured, skipping uneviction");
|
||||
return false;
|
||||
};
|
||||
let partial_backup_uploaded = match &self.partial_backup_uploaded {
|
||||
Some(p) => p.clone(),
|
||||
None => {
|
||||
@@ -87,7 +92,7 @@ impl Manager {
|
||||
.inc();
|
||||
});
|
||||
|
||||
if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
|
||||
if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await {
|
||||
warn!("failed to evict timeline: {:?}", e);
|
||||
return false;
|
||||
}
|
||||
@@ -102,6 +107,10 @@ impl Manager {
|
||||
#[instrument(name = "unevict_timeline", skip_all)]
|
||||
pub(crate) async fn unevict_timeline(&mut self) {
|
||||
assert!(self.is_offloaded);
|
||||
let Some(storage) = self.wal_backup.get_storage() else {
|
||||
warn!("no remote storage configured, skipping uneviction");
|
||||
return;
|
||||
};
|
||||
let partial_backup_uploaded = match &self.partial_backup_uploaded {
|
||||
Some(p) => p.clone(),
|
||||
None => {
|
||||
@@ -121,7 +130,7 @@ impl Manager {
|
||||
.inc();
|
||||
});
|
||||
|
||||
if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
|
||||
if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await {
|
||||
warn!("failed to unevict timeline: {:?}", e);
|
||||
return;
|
||||
}
|
||||
@@ -137,8 +146,12 @@ impl Manager {
|
||||
/// Ensure that content matches the remote partial backup, if local segment exists.
|
||||
/// Then change state in control file and in-memory. If `delete_offloaded_wal` is set,
|
||||
/// delete the local segment.
|
||||
async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
|
||||
compare_local_segment_with_remote(mgr, partial).await?;
|
||||
async fn do_eviction(
|
||||
mgr: &mut Manager,
|
||||
partial: &PartialRemoteSegment,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
compare_local_segment_with_remote(mgr, partial, storage).await?;
|
||||
|
||||
mgr.tli.switch_to_offloaded(partial).await?;
|
||||
// switch manager state as soon as possible
|
||||
@@ -153,12 +166,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho
|
||||
|
||||
/// Ensure that content matches the remote partial backup, if local segment exists.
|
||||
/// Then download segment to local disk and change state in control file and in-memory.
|
||||
async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
|
||||
async fn do_uneviction(
|
||||
mgr: &mut Manager,
|
||||
partial: &PartialRemoteSegment,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
// if the local segment is present, validate it
|
||||
compare_local_segment_with_remote(mgr, partial).await?;
|
||||
compare_local_segment_with_remote(mgr, partial, storage).await?;
|
||||
|
||||
// atomically download the partial segment
|
||||
redownload_partial_segment(mgr, partial).await?;
|
||||
redownload_partial_segment(mgr, partial, storage).await?;
|
||||
|
||||
mgr.tli.switch_to_present().await?;
|
||||
// switch manager state as soon as possible
|
||||
@@ -181,6 +198,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) ->
|
||||
async fn redownload_partial_segment(
|
||||
mgr: &Manager,
|
||||
partial: &PartialRemoteSegment,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp");
|
||||
let remote_segfile = remote_segment_path(mgr, partial);
|
||||
@@ -190,7 +208,7 @@ async fn redownload_partial_segment(
|
||||
remote_segfile, tmp_file
|
||||
);
|
||||
|
||||
let mut reader = wal_backup::read_object(&remote_segfile, 0).await?;
|
||||
let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?;
|
||||
let mut file = File::create(&tmp_file).await?;
|
||||
|
||||
let actual_len = tokio::io::copy(&mut reader, &mut file).await?;
|
||||
@@ -234,13 +252,16 @@ async fn redownload_partial_segment(
|
||||
async fn compare_local_segment_with_remote(
|
||||
mgr: &Manager,
|
||||
partial: &PartialRemoteSegment,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_path = local_segment_path(mgr, partial);
|
||||
|
||||
match File::open(&local_path).await {
|
||||
Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial)
|
||||
.await
|
||||
.context("validation failed"),
|
||||
Ok(mut local_file) => {
|
||||
do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage)
|
||||
.await
|
||||
.context("validation failed")
|
||||
}
|
||||
Err(_) => {
|
||||
info!(
|
||||
"local WAL file {} is not present, skipping validation",
|
||||
@@ -258,6 +279,7 @@ async fn do_validation(
|
||||
file: &mut File,
|
||||
wal_seg_size: usize,
|
||||
partial: &PartialRemoteSegment,
|
||||
storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_size = file.metadata().await?.len() as usize;
|
||||
if local_size != wal_seg_size {
|
||||
@@ -270,7 +292,7 @@ async fn do_validation(
|
||||
|
||||
let remote_segfile = remote_segment_path(mgr, partial);
|
||||
let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
|
||||
wal_backup::read_object(&remote_segfile, 0).await?;
|
||||
wal_backup::read_object(storage, &remote_segfile, 0).await?;
|
||||
|
||||
// remote segment should have bytes exactly up to `flush_lsn`
|
||||
let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size);
|
||||
|
||||
@@ -22,7 +22,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, debug, info, info_span, instrument, warn};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::control_file::{FileStorage, Storage};
|
||||
use crate::metrics::{
|
||||
MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES,
|
||||
@@ -35,8 +34,9 @@ use crate::state::TimelineState;
|
||||
use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline};
|
||||
use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard};
|
||||
use crate::timelines_set::{TimelineSetGuard, TimelinesSet};
|
||||
use crate::wal_backup::{self, WalBackupTaskHandle};
|
||||
use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle};
|
||||
use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment};
|
||||
use crate::{SafeKeeperConf, wal_advertiser};
|
||||
|
||||
pub(crate) struct StateSnapshot {
|
||||
// inmem values
|
||||
@@ -200,6 +200,8 @@ pub(crate) struct Manager {
|
||||
pub(crate) conf: SafeKeeperConf,
|
||||
pub(crate) wal_seg_size: usize,
|
||||
pub(crate) walsenders: Arc<WalSenders>,
|
||||
pub(crate) wal_backup: Arc<WalBackup>,
|
||||
pub(crate) wal_advertiser: wal_advertiser::SafekeeperTimelineHandle,
|
||||
|
||||
// current state
|
||||
pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
|
||||
@@ -238,6 +240,8 @@ pub async fn main_task(
|
||||
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
|
||||
mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
wal_advertiser: Arc<wal_advertiser::GlobalState>,
|
||||
) {
|
||||
tli.set_status(Status::Started);
|
||||
|
||||
@@ -256,6 +260,8 @@ pub async fn main_task(
|
||||
broker_active_set,
|
||||
manager_tx,
|
||||
global_rate_limiter,
|
||||
wal_backup,
|
||||
wal_advertiser,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -284,7 +290,8 @@ pub async fn main_task(
|
||||
|
||||
mgr.set_status(Status::UpdateBackup);
|
||||
let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await;
|
||||
mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot);
|
||||
|
||||
mgr.update_broker_active(is_wal_backup_required, num_computes, &state_snapshot);
|
||||
|
||||
mgr.set_status(Status::UpdateControlFile);
|
||||
mgr.update_control_file_save(&state_snapshot, &mut next_event)
|
||||
@@ -371,7 +378,7 @@ pub async fn main_task(
|
||||
mgr.tli_broker_active.set(false);
|
||||
|
||||
// shutdown background tasks
|
||||
if mgr.conf.is_wal_backup_enabled() {
|
||||
if let Some(storage) = mgr.wal_backup.get_storage() {
|
||||
if let Some(backup_task) = mgr.backup_task.take() {
|
||||
// If we fell through here, then the timeline is shutting down. This is important
|
||||
// because otherwise joining on the wal_backup handle might hang.
|
||||
@@ -379,7 +386,7 @@ pub async fn main_task(
|
||||
|
||||
backup_task.join().await;
|
||||
}
|
||||
wal_backup::update_task(&mut mgr, false, &last_state).await;
|
||||
wal_backup::update_task(&mut mgr, storage, false, &last_state).await;
|
||||
}
|
||||
|
||||
if let Some(recovery_task) = &mut mgr.recovery_task {
|
||||
@@ -415,14 +422,18 @@ impl Manager {
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
wal_advertiser: Arc<wal_advertiser::GlobalState>,
|
||||
) -> Manager {
|
||||
let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
|
||||
Manager {
|
||||
wal_seg_size: tli.get_wal_seg_size().await,
|
||||
walsenders: tli.get_walsenders().clone(),
|
||||
wal_backup,
|
||||
state_version_rx: tli.get_state_version_rx(),
|
||||
num_computes_rx: tli.get_walreceivers().get_num_rx(),
|
||||
tli_broker_active: broker_active_set.guard(tli.clone()),
|
||||
wal_advertiser: wal_advertiser.new_timeline(tli.clone()).await.unwrap(),
|
||||
last_removed_segno: 0,
|
||||
is_offloaded,
|
||||
backup_task: None,
|
||||
@@ -477,8 +488,8 @@ impl Manager {
|
||||
let is_wal_backup_required =
|
||||
wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state);
|
||||
|
||||
if self.conf.is_wal_backup_enabled() {
|
||||
wal_backup::update_task(self, is_wal_backup_required, state).await;
|
||||
if let Some(storage) = self.wal_backup.get_storage() {
|
||||
wal_backup::update_task(self, storage, is_wal_backup_required, state).await;
|
||||
}
|
||||
|
||||
// update the state in Arc<Timeline>
|
||||
@@ -489,8 +500,8 @@ impl Manager {
|
||||
is_wal_backup_required
|
||||
}
|
||||
|
||||
/// Update is_active flag and returns its value.
|
||||
fn update_is_active(
|
||||
/// Update broker is_active flag and returns its value.
|
||||
fn update_broker_active(
|
||||
&mut self,
|
||||
is_wal_backup_required: bool,
|
||||
num_computes: usize,
|
||||
@@ -500,6 +511,7 @@ impl Manager {
|
||||
|| num_computes > 0
|
||||
|| state.remote_consistent_lsn < state.commit_lsn;
|
||||
|
||||
|
||||
// update the broker timeline set
|
||||
if self.tli_broker_active.set(is_active) {
|
||||
// write log if state has changed
|
||||
@@ -624,9 +636,9 @@ impl Manager {
|
||||
/// Spawns partial WAL backup task if needed.
|
||||
async fn update_partial_backup(&mut self, state: &StateSnapshot) {
|
||||
// check if WAL backup is enabled and should be started
|
||||
if !self.conf.is_wal_backup_enabled() {
|
||||
let Some(storage) = self.wal_backup.get_storage() else {
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if self.partial_backup_task.is_some() {
|
||||
// partial backup is already running
|
||||
@@ -650,6 +662,7 @@ impl Manager {
|
||||
self.conf.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
cancel.clone(),
|
||||
storage,
|
||||
));
|
||||
self.partial_backup_task = Some((handle, cancel));
|
||||
}
|
||||
@@ -669,6 +682,10 @@ impl Manager {
|
||||
/// Reset partial backup state and remove its remote storage data. Since it
|
||||
/// might concurrently uploading something, cancel the task first.
|
||||
async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
|
||||
let Some(storage) = self.wal_backup.get_storage() else {
|
||||
anyhow::bail!("remote storage is not enabled");
|
||||
};
|
||||
|
||||
info!("resetting partial backup state");
|
||||
// Force unevict timeline if it is evicted before erasing partial backup
|
||||
// state. The intended use of this function is to drop corrupted remote
|
||||
@@ -689,7 +706,7 @@ impl Manager {
|
||||
}
|
||||
|
||||
let tli = self.wal_resident_timeline()?;
|
||||
let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
|
||||
let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await;
|
||||
// Reset might fail e.g. when cfile is already reset but s3 removal
|
||||
// failed, so set manager state to None beforehand. In any case caller
|
||||
// is expected to retry until success.
|
||||
|
||||
@@ -25,8 +25,9 @@ use crate::rate_limit::RateLimiter;
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir};
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::WalBackup;
|
||||
use crate::wal_storage::Storage;
|
||||
use crate::{SafeKeeperConf, control_file, wal_storage};
|
||||
use crate::{SafeKeeperConf, control_file, pageserver_connectivity, wal_advertiser, wal_storage};
|
||||
|
||||
// Timeline entry in the global map: either a ready timeline, or mark that it is
|
||||
// being created.
|
||||
@@ -46,16 +47,29 @@ struct GlobalTimelinesState {
|
||||
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
wal_advertisement: Arc<wal_advertiser::GlobalState>,
|
||||
pageserver_connectivity: Arc<pageserver_connectivity::GlobalState>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
}
|
||||
|
||||
impl GlobalTimelinesState {
|
||||
/// Get dependencies for a timeline constructor.
|
||||
fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
|
||||
fn get_dependencies(
|
||||
&self,
|
||||
) -> (
|
||||
Arc<SafeKeeperConf>,
|
||||
Arc<TimelinesSet>,
|
||||
RateLimiter,
|
||||
Arc<WalBackup>,
|
||||
Arc<wal_advertiser::GlobalState>,
|
||||
) {
|
||||
(
|
||||
self.conf.clone(),
|
||||
self.broker_active_set.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
self.wal_backup.clone(),
|
||||
self.wal_advertisement.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -84,14 +98,17 @@ pub struct GlobalTimelines {
|
||||
|
||||
impl GlobalTimelines {
|
||||
/// Create a new instance of the global timelines map.
|
||||
pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
|
||||
pub fn new(conf: Arc<SafeKeeperConf>, wal_backup: Arc<WalBackup>) -> Self {
|
||||
Self {
|
||||
state: Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
tombstones: HashMap::new(),
|
||||
conf,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
wal_advertisement: Arc::new(wal_advertiser::GlobalState::default()),
|
||||
pageserver_connectivity: Arc::new(pageserver_connectivity::GlobalState::default()),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
wal_backup,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -147,12 +164,13 @@ impl GlobalTimelines {
|
||||
/// just lock and unlock it for each timeline -- this function is called
|
||||
/// during init when nothing else is running, so this is fine.
|
||||
async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter) = {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
state.get_dependencies()
|
||||
};
|
||||
|
||||
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
|
||||
|
||||
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
|
||||
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
|
||||
{
|
||||
@@ -162,7 +180,7 @@ impl GlobalTimelines {
|
||||
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
|
||||
{
|
||||
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
match Timeline::load_timeline(conf.clone(), ttid) {
|
||||
match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) {
|
||||
Ok(tli) => {
|
||||
let mut shared_state = tli.write_shared_state().await;
|
||||
self.state
|
||||
@@ -175,6 +193,8 @@ impl GlobalTimelines {
|
||||
&conf,
|
||||
broker_active_set.clone(),
|
||||
partial_backup_rate_limiter.clone(),
|
||||
wal_backup.clone(),
|
||||
wal_advertiser.clone(),
|
||||
);
|
||||
}
|
||||
// If we can't load a timeline, it's most likely because of a corrupted
|
||||
@@ -212,6 +232,10 @@ impl GlobalTimelines {
|
||||
self.state.lock().unwrap().broker_active_set.clone()
|
||||
}
|
||||
|
||||
pub fn get_wal_backup(&self) -> Arc<WalBackup> {
|
||||
self.state.lock().unwrap().wal_backup.clone()
|
||||
}
|
||||
|
||||
/// Create a new timeline with the given id. If the timeline already exists, returns
|
||||
/// an existing timeline.
|
||||
pub(crate) async fn create(
|
||||
@@ -222,7 +246,7 @@ impl GlobalTimelines {
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _) = {
|
||||
let (conf, _, _, _, _) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
if let Ok(timeline) = state.get(&ttid) {
|
||||
// Timeline already exists, return it.
|
||||
@@ -267,7 +291,7 @@ impl GlobalTimelines {
|
||||
check_tombstone: bool,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// Check for existence and mark that we're creating it.
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter) = {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
match state.timelines.get(&ttid) {
|
||||
Some(GlobalMapTimeline::CreationInProgress) => {
|
||||
@@ -296,7 +320,14 @@ impl GlobalTimelines {
|
||||
};
|
||||
|
||||
// Do the actual move and reflect the result in the map.
|
||||
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
|
||||
match GlobalTimelines::install_temp_timeline(
|
||||
ttid,
|
||||
tmp_path,
|
||||
conf.clone(),
|
||||
wal_backup.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(timeline) => {
|
||||
let mut timeline_shared_state = timeline.write_shared_state().await;
|
||||
let mut state = self.state.lock().unwrap();
|
||||
@@ -314,6 +345,8 @@ impl GlobalTimelines {
|
||||
&conf,
|
||||
broker_active_set,
|
||||
partial_backup_rate_limiter,
|
||||
wal_backup,
|
||||
wal_advertiser.clone(),
|
||||
);
|
||||
drop(timeline_shared_state);
|
||||
Ok(timeline)
|
||||
@@ -336,6 +369,7 @@ impl GlobalTimelines {
|
||||
ttid: TenantTimelineId,
|
||||
tmp_path: &Utf8PathBuf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
|
||||
let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
|
||||
@@ -377,7 +411,7 @@ impl GlobalTimelines {
|
||||
// Do the move.
|
||||
durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;
|
||||
|
||||
Timeline::load_timeline(conf, ttid)
|
||||
Timeline::load_timeline(conf, ttid, wal_backup)
|
||||
}
|
||||
|
||||
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
|
||||
@@ -565,6 +599,14 @@ impl GlobalTimelines {
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
pub fn get_wal_advertiser(&self) -> Arc<wal_advertiser::GlobalState> {
|
||||
self.state.lock().unwrap().wal_advertisement.clone()
|
||||
}
|
||||
|
||||
pub fn get_pageserver_connectivity(&self) -> Arc<pageserver_connectivity::GlobalState> {
|
||||
self.state.lock().unwrap().pageserver_connectivity.clone()
|
||||
}
|
||||
|
||||
pub fn housekeeping(&self, tombstone_ttl: &Duration) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
|
||||
|
||||
201
safekeeper/src/wal_advertiser.rs
Normal file
201
safekeeper/src/wal_advertiser.rs
Normal file
@@ -0,0 +1,201 @@
|
||||
mod persistence;
|
||||
mod pageserver_connectivity;
|
||||
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::timeline::Timeline;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tracing::{Instrument, error, info, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
sync::{spsc_fold, spsc_watch},
|
||||
};
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
type Advs = HashMap<TenantTimelineId, Lsn>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct GlobalState {
|
||||
inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
|
||||
}
|
||||
|
||||
pub struct SafekeeperTimelineHandle {
|
||||
tx: tokio::sync::mpsc::Sender<Message>,
|
||||
}
|
||||
|
||||
enum Message {
|
||||
NewTimeline {
|
||||
reply: tokio::sync::oneshot::Sender<Result<(), Error>>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl GlobalState {
|
||||
pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
|
||||
let mut ret = None;
|
||||
self.inner.get_or_init(|| {
|
||||
let (tx, task_fut) = MainTask::prepare_run();
|
||||
ret = Some(task_fut);
|
||||
tx
|
||||
});
|
||||
ret.expect("must only call this method once")
|
||||
}
|
||||
|
||||
pub async fn new_timeline(
|
||||
&self,
|
||||
tli: Arc<Timeline>,
|
||||
) -> Result<SafekeeperTimelineHandle, Error> {
|
||||
let tx = self.inner.get().unwrap().clone();
|
||||
let handle = SafekeeperTimelineHandle { tx };
|
||||
let (reply, rx) = tokio::sync::oneshot::channel();
|
||||
let Ok(()) = handle.tx.send(Message::NewTimeline { reply }).await else {
|
||||
return Err(Error::Cancelled);
|
||||
};
|
||||
let Ok(res) = rx.await else {
|
||||
return Err(Error::Cancelled);
|
||||
};
|
||||
Ok(handle)
|
||||
}
|
||||
pub fn update_pageserver_attachments(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
update: safekeeper_api::models::TenantShardPageserverAttachmentChange,
|
||||
) -> anyhow::Result<()> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
impl SafekeeperTimelineHandle {
|
||||
pub fn ready_for_eviction(&self) -> bool {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
struct MainTask {
|
||||
rx: tokio::sync::mpsc::Receiver<Message>,
|
||||
world: sk_ps_discovery::World,
|
||||
senders: HashMap<utils::id::NodeId, spsc_watch::Sender<Advs>>,
|
||||
}
|
||||
|
||||
impl MainTask {
|
||||
fn prepare_run() -> (
|
||||
tokio::sync::mpsc::Sender<Message>,
|
||||
impl Future<Output = anyhow::Result<()>> + Send,
|
||||
) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
|
||||
let task = MainTask {
|
||||
rx,
|
||||
world: sk_ps_discovery::World::default(),
|
||||
senders: Default::default(),
|
||||
};
|
||||
(tx, task.task())
|
||||
}
|
||||
async fn task(mut self) -> anyhow::Result<()> {
|
||||
let mut adv_frequency = tokio::time::interval(Duration::from_secs(1));
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = adv_frequency.tick() => {
|
||||
let start = Instant::now();
|
||||
self.advertisements_iteration();
|
||||
let elapsed = start.elapsed();
|
||||
if elapsed > Duration::from_millis(10) {
|
||||
warn!(?elapsed, "advertisements iteration is slow");
|
||||
}
|
||||
},
|
||||
message = self.rx.recv() => {
|
||||
match message {
|
||||
None => anyhow::bail!("last main task sender dropped, shouldn't happen, exiting"),
|
||||
Some(_) => todo!(),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn advertisements_iteration(&mut self) {
|
||||
loop {
|
||||
let advertisements = self.world.get_commit_lsn_advertisements();
|
||||
for (node_id, mut advs) in advertisements {
|
||||
'inner: loop {
|
||||
let tx = self.senders.entry(node_id).or_insert_with(|| {
|
||||
let (tx, rx) = spsc_watch::channel();
|
||||
tokio::spawn(
|
||||
PageserverTask {
|
||||
ps_id: node_id,
|
||||
endpoint: todo!(),
|
||||
advs: rx,
|
||||
}
|
||||
.run()
|
||||
.instrument(info_span!("wal_advertiser", ps_id=%node_id)),
|
||||
);
|
||||
tx
|
||||
});
|
||||
if let Err((failed, err)) = tx.send_replace(advs) {
|
||||
self.senders.remove(&node_id);
|
||||
advs = failed;
|
||||
} else {
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
struct PageserverTask {
|
||||
ps_id: NodeId,
|
||||
advs: spsc_watch::Receiver<Advs>,
|
||||
}
|
||||
|
||||
impl PageserverTask {
|
||||
/// Cancellation: happens through last PageserverHandle being dropped.
|
||||
async fn run(mut self) {
|
||||
loop {
|
||||
let Ok(advs) = self.advs.recv().await else {
|
||||
info!("main task gone, exiting");
|
||||
return;
|
||||
};
|
||||
let res = self.run0(advs).await;
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(err) => {
|
||||
error!(?err, "error sending advertisements");
|
||||
// TODO: proper backoff?
|
||||
tokio::time::sleep(Duration::from_secs(5)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn run0(&mut self, advs: HashMap<TenantTimelineId, Lsn>) -> anyhow::Result<()> {
|
||||
use storage_broker::wal_advertisement as proto;
|
||||
use storage_broker::wal_advertisement::pageserver_client::PageserverClient;
|
||||
let stream = async_stream::stream! {
|
||||
for (tenant_timeline_id, commit_lsn) in advs {
|
||||
yield proto::CommitLsnAdvertisement {tenant_timeline_id: Some(proto::TenantTimelineId {
|
||||
tenant_id: tenant_timeline_id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: tenant_timeline_id.timeline_id.as_ref().to_owned(),
|
||||
}), commit_lsn: commit_lsn.0 };
|
||||
}
|
||||
};
|
||||
let mut client: PageserverClient<_> = PageserverClient::connect(self.endpoint.clone())
|
||||
.await
|
||||
.context("connect")?;
|
||||
let publish_stream = client
|
||||
.publish_commit_lsn_advertisements(stream)
|
||||
.await
|
||||
.context("publish stream")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
0
safekeeper/src/wal_advertiser/persistence.rs
Normal file
0
safekeeper/src/wal_advertiser/persistence.rs
Normal file
@@ -2,6 +2,7 @@ use std::cmp::min;
|
||||
use std::collections::HashSet;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo;
|
||||
use tokio::fs::File;
|
||||
use tokio::select;
|
||||
use tokio::sync::mpsc::{self, Receiver, Sender};
|
||||
use tokio::sync::{OnceCell, watch};
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required(
|
||||
/// Based on peer information determine which safekeeper should offload; if it
|
||||
/// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
|
||||
/// is running, kill it.
|
||||
pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) {
|
||||
pub(crate) async fn update_task(
|
||||
mgr: &mut Manager,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
need_backup: bool,
|
||||
state: &StateSnapshot,
|
||||
) {
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
|
||||
let elected_me = Some(mgr.conf.my_id) == offloader;
|
||||
@@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St
|
||||
return;
|
||||
};
|
||||
|
||||
let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx);
|
||||
let async_task = backup_task_main(
|
||||
resident,
|
||||
storage,
|
||||
mgr.conf.backup_parallel_jobs,
|
||||
shutdown_rx,
|
||||
);
|
||||
|
||||
let handle = if mgr.conf.current_thread_runtime {
|
||||
tokio::spawn(async_task)
|
||||
@@ -169,33 +180,31 @@ fn determine_offloader(
|
||||
}
|
||||
}
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
|
||||
|
||||
// Storage must be configured and initialized when this is called.
|
||||
fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
|
||||
REMOTE_STORAGE
|
||||
.get()
|
||||
.expect("failed to get remote storage")
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
pub struct WalBackup {
|
||||
storage: Option<Arc<GenericRemoteStorage>>,
|
||||
}
|
||||
|
||||
pub async fn init_remote_storage(conf: &SafeKeeperConf) {
|
||||
// TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
|
||||
// dependencies to all tasks instead.
|
||||
REMOTE_STORAGE
|
||||
.get_or_init(|| async {
|
||||
if let Some(conf) = conf.remote_storage.as_ref() {
|
||||
Some(
|
||||
GenericRemoteStorage::from_config(conf)
|
||||
.await
|
||||
.expect("failed to create remote storage"),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
impl WalBackup {
|
||||
/// Create a new WalBackup instance.
|
||||
pub async fn new(conf: &SafeKeeperConf) -> Result<Self> {
|
||||
if !conf.wal_backup_enabled {
|
||||
return Ok(Self { storage: None });
|
||||
}
|
||||
|
||||
match conf.remote_storage.as_ref() {
|
||||
Some(config) => {
|
||||
let storage = GenericRemoteStorage::from_config(config).await?;
|
||||
Ok(Self {
|
||||
storage: Some(Arc::new(storage)),
|
||||
})
|
||||
}
|
||||
})
|
||||
.await;
|
||||
None => Ok(Self { storage: None }),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_storage(&self) -> Option<Arc<GenericRemoteStorage>> {
|
||||
self.storage.clone()
|
||||
}
|
||||
}
|
||||
|
||||
struct WalBackupTask {
|
||||
@@ -204,12 +213,14 @@ struct WalBackupTask {
|
||||
wal_seg_size: usize,
|
||||
parallel_jobs: usize,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
/// Offload single timeline.
|
||||
#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))]
|
||||
async fn backup_task_main(
|
||||
tli: WalResidentTimeline,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
parallel_jobs: usize,
|
||||
mut shutdown_rx: Receiver<()>,
|
||||
) {
|
||||
@@ -223,6 +234,7 @@ async fn backup_task_main(
|
||||
timeline_dir: tli.get_timeline_dir(),
|
||||
timeline: tli,
|
||||
parallel_jobs,
|
||||
storage,
|
||||
};
|
||||
|
||||
// task is spinned up only when wal_seg_size already initialized
|
||||
@@ -293,6 +305,7 @@ impl WalBackupTask {
|
||||
|
||||
match backup_lsn_range(
|
||||
&self.timeline,
|
||||
self.storage.clone(),
|
||||
&mut backup_lsn,
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
@@ -322,6 +335,7 @@ impl WalBackupTask {
|
||||
|
||||
async fn backup_lsn_range(
|
||||
timeline: &WalResidentTimeline,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
backup_lsn: &mut Lsn,
|
||||
end_lsn: Lsn,
|
||||
wal_seg_size: usize,
|
||||
@@ -352,7 +366,12 @@ async fn backup_lsn_range(
|
||||
loop {
|
||||
let added_task = match iter.next() {
|
||||
Some(s) => {
|
||||
uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path));
|
||||
uploads.push_back(backup_single_segment(
|
||||
&storage,
|
||||
s,
|
||||
timeline_dir,
|
||||
remote_timeline_path,
|
||||
));
|
||||
true
|
||||
}
|
||||
None => false,
|
||||
@@ -388,6 +407,7 @@ async fn backup_lsn_range(
|
||||
}
|
||||
|
||||
async fn backup_single_segment(
|
||||
storage: &GenericRemoteStorage,
|
||||
seg: &Segment,
|
||||
timeline_dir: &Utf8Path,
|
||||
remote_timeline_path: &RemotePath,
|
||||
@@ -395,7 +415,13 @@ async fn backup_single_segment(
|
||||
let segment_file_path = seg.file_path(timeline_dir)?;
|
||||
let remote_segment_path = seg.remote_path(remote_timeline_path);
|
||||
|
||||
let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
|
||||
let res = backup_object(
|
||||
storage,
|
||||
&segment_file_path,
|
||||
&remote_segment_path,
|
||||
seg.size(),
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
BACKED_UP_SEGMENTS.inc();
|
||||
} else {
|
||||
@@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
|
||||
}
|
||||
|
||||
async fn backup_object(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_file: &Utf8Path,
|
||||
target_file: &RemotePath,
|
||||
size: usize,
|
||||
) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
|
||||
let file = File::open(&source_file)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
|
||||
@@ -475,12 +500,11 @@ async fn backup_object(
|
||||
}
|
||||
|
||||
pub(crate) async fn backup_partial_segment(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_file: &Utf8Path,
|
||||
target_file: &RemotePath,
|
||||
size: usize,
|
||||
) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
|
||||
let file = File::open(&source_file)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
|
||||
@@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment(
|
||||
}
|
||||
|
||||
pub(crate) async fn copy_partial_segment(
|
||||
storage: &GenericRemoteStorage,
|
||||
source: &RemotePath,
|
||||
destination: &RemotePath,
|
||||
) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
storage.copy_object(source, destination, &cancel).await
|
||||
}
|
||||
|
||||
pub async fn read_object(
|
||||
storage: &GenericRemoteStorage,
|
||||
file_path: &RemotePath,
|
||||
offset: u64,
|
||||
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead + Send + Sync>>> {
|
||||
let storage = REMOTE_STORAGE
|
||||
.get()
|
||||
.context("Failed to get remote storage")?
|
||||
.as_ref()
|
||||
.context("No remote storage configured")?;
|
||||
|
||||
info!("segment download about to start from remote path {file_path:?} at offset {offset}");
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
@@ -547,8 +566,10 @@ pub async fn read_object(
|
||||
|
||||
/// Delete WAL files for the given timeline. Remote storage must be configured
|
||||
/// when called.
|
||||
pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
let storage = get_configured_remote_storage();
|
||||
pub async fn delete_timeline(
|
||||
storage: &GenericRemoteStorage,
|
||||
ttid: &TenantTimelineId,
|
||||
) -> Result<()> {
|
||||
let remote_path = remote_timeline_path(ttid)?;
|
||||
|
||||
// see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
|
||||
@@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
}
|
||||
|
||||
/// Used by wal_backup_partial.
|
||||
pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
|
||||
pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> {
|
||||
let cancel = CancellationToken::new(); // not really used
|
||||
let storage = get_configured_remote_storage();
|
||||
storage.delete_objects(paths, &cancel).await
|
||||
}
|
||||
|
||||
/// Copy segments from one timeline to another. Used in copy_timeline.
|
||||
pub async fn copy_s3_segments(
|
||||
storage: &GenericRemoteStorage,
|
||||
wal_seg_size: usize,
|
||||
src_ttid: &TenantTimelineId,
|
||||
dst_ttid: &TenantTimelineId,
|
||||
@@ -634,12 +655,6 @@ pub async fn copy_s3_segments(
|
||||
) -> Result<()> {
|
||||
const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;
|
||||
|
||||
let storage = REMOTE_STORAGE
|
||||
.get()
|
||||
.expect("failed to get remote storage")
|
||||
.as_ref()
|
||||
.unwrap();
|
||||
|
||||
let remote_dst_path = remote_timeline_path(dst_ttid)?;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
@@ -19,9 +19,11 @@
|
||||
//! file. Code updates state in the control file before doing any S3 operations.
|
||||
//! This way control file stores information about all potentially existing
|
||||
//! remote partial segments and can clean them up after uploading a newer version.
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use safekeeper_api::Term;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -154,12 +156,16 @@ pub struct PartialBackup {
|
||||
conf: SafeKeeperConf,
|
||||
local_prefix: Utf8PathBuf,
|
||||
remote_timeline_path: RemotePath,
|
||||
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
state: State,
|
||||
}
|
||||
|
||||
impl PartialBackup {
|
||||
pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup {
|
||||
pub async fn new(
|
||||
tli: WalResidentTimeline,
|
||||
conf: SafeKeeperConf,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
) -> PartialBackup {
|
||||
let (_, persistent_state) = tli.get_state().await;
|
||||
let wal_seg_size = tli.get_wal_seg_size().await;
|
||||
|
||||
@@ -173,6 +179,7 @@ impl PartialBackup {
|
||||
conf,
|
||||
local_prefix,
|
||||
remote_timeline_path,
|
||||
storage,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -240,7 +247,8 @@ impl PartialBackup {
|
||||
let remote_path = prepared.remote_path(&self.remote_timeline_path);
|
||||
|
||||
// Upload first `backup_bytes` bytes of the segment to the remote storage.
|
||||
wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
|
||||
wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes)
|
||||
.await?;
|
||||
PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);
|
||||
|
||||
// We uploaded the segment, now let's verify that the data is still actual.
|
||||
@@ -326,7 +334,7 @@ impl PartialBackup {
|
||||
let remote_path = self.remote_timeline_path.join(seg);
|
||||
objects_to_delete.push(remote_path);
|
||||
}
|
||||
wal_backup::delete_objects(&objects_to_delete).await
|
||||
wal_backup::delete_objects(&self.storage, &objects_to_delete).await
|
||||
}
|
||||
|
||||
/// Delete all non-Uploaded segments from the remote storage. There should be only one
|
||||
@@ -424,6 +432,7 @@ pub async fn main_task(
|
||||
conf: SafeKeeperConf,
|
||||
limiter: RateLimiter,
|
||||
cancel: CancellationToken,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
) -> Option<PartialRemoteSegment> {
|
||||
debug!("started");
|
||||
let await_duration = conf.partial_backup_timeout;
|
||||
@@ -432,7 +441,7 @@ pub async fn main_task(
|
||||
let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
|
||||
let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();
|
||||
|
||||
let mut backup = PartialBackup::new(tli, conf).await;
|
||||
let mut backup = PartialBackup::new(tli, conf, storage).await;
|
||||
|
||||
debug!("state: {:?}", backup.state);
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion};
|
||||
use pq_proto::SystemId;
|
||||
use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
use tokio::fs::{self, File, OpenOptions, remove_file};
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||
use tracing::*;
|
||||
@@ -32,7 +33,7 @@ use crate::metrics::{
|
||||
REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure,
|
||||
};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::wal_backup::{read_object, remote_timeline_path};
|
||||
use crate::wal_backup::{WalBackup, read_object, remote_timeline_path};
|
||||
|
||||
pub trait Storage {
|
||||
// Last written LSN.
|
||||
@@ -645,7 +646,7 @@ pub struct WalReader {
|
||||
wal_segment: Option<Pin<Box<dyn AsyncRead + Send + Sync>>>,
|
||||
|
||||
// S3 will be used to read WAL if LSN is not available locally
|
||||
enable_remote_read: bool,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
|
||||
// We don't have WAL locally if LSN is less than local_start_lsn
|
||||
local_start_lsn: Lsn,
|
||||
@@ -664,7 +665,7 @@ impl WalReader {
|
||||
timeline_dir: Utf8PathBuf,
|
||||
state: &TimelinePersistentState,
|
||||
start_pos: Lsn,
|
||||
enable_remote_read: bool,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
) -> Result<Self> {
|
||||
if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
|
||||
bail!("state uninitialized, no data to read");
|
||||
@@ -693,7 +694,7 @@ impl WalReader {
|
||||
wal_seg_size: state.server.wal_seg_size as usize,
|
||||
pos: start_pos,
|
||||
wal_segment: None,
|
||||
enable_remote_read,
|
||||
wal_backup,
|
||||
local_start_lsn: state.local_start_lsn,
|
||||
timeline_start_lsn: state.timeline_start_lsn,
|
||||
pg_version: state.server.pg_version / 10000,
|
||||
@@ -812,9 +813,9 @@ impl WalReader {
|
||||
}
|
||||
|
||||
// Try to open remote file, if remote reads are enabled
|
||||
if self.enable_remote_read {
|
||||
if let Some(storage) = self.wal_backup.get_storage() {
|
||||
let remote_wal_file_path = self.remote_path.join(&wal_file_name);
|
||||
return read_object(&remote_wal_file_path, xlogoff as u64).await;
|
||||
return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await;
|
||||
}
|
||||
|
||||
bail!("WAL segment is not found")
|
||||
|
||||
@@ -27,6 +27,7 @@ parking_lot.workspace = true
|
||||
prost.workspace = true
|
||||
tonic.workspace = true
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
tokio-util.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
metrics.workspace = true
|
||||
|
||||
@@ -5,7 +5,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// easy location, but apparently interference with cachepot sometimes fails
|
||||
// the build then. Anyway, per cargo docs build script shouldn't output to
|
||||
// anywhere but $OUT_DIR.
|
||||
tonic_build::compile_protos("proto/broker.proto")
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
let protos = [
|
||||
"proto/broker.proto",
|
||||
"proto/wal_advertisement.proto",
|
||||
];
|
||||
for proto in protos {
|
||||
tonic_build::compile_protos(proto)
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -35,14 +35,14 @@ message SafekeeperTimelineInfo {
|
||||
// LSN of the last record.
|
||||
uint64 flush_lsn = 4;
|
||||
// Up to which LSN safekeeper regards its WAL as committed.
|
||||
uint64 commit_lsn = 5;
|
||||
uint64 commit_lsn = 5; // yes
|
||||
// LSN up to which safekeeper has backed WAL.
|
||||
uint64 backup_lsn = 6;
|
||||
// LSN of last checkpoint uploaded by pageserver.
|
||||
uint64 remote_consistent_lsn = 7;
|
||||
uint64 peer_horizon_lsn = 8;
|
||||
uint64 local_start_lsn = 9;
|
||||
uint64 standby_horizon = 14;
|
||||
uint64 standby_horizon = 14; // yes
|
||||
// A connection string to use for WAL receiving.
|
||||
string safekeeper_connstr = 10;
|
||||
// HTTP endpoint connection string.
|
||||
|
||||
29
storage_broker/proto/wal_advertisement.proto
Normal file
29
storage_broker/proto/wal_advertisement.proto
Normal file
@@ -0,0 +1,29 @@
|
||||
syntax = "proto3";

import "google/protobuf/empty.proto";

package wal_advertisement;

// LSN advertisement RPCs exposed under the `Pageserver` service name.
// NOTE(review): presumably served by the pageserver and called by safekeepers — confirm.
service Pageserver {
  // Client-streaming: the caller sends a stream of commit LSN advertisements
  // and receives a single empty response.
  rpc PublishCommitLsnAdvertisements(stream CommitLsnAdvertisement) returns (google.protobuf.Empty);

  // Server-streaming: the caller sends an empty request and receives a stream
  // of remote_consistent_lsn advertisements.
  rpc SubscribeRemoteConsistentLsnAdvertisements(google.protobuf.Empty) returns (stream RemoteConsistentLsnAdvertisement);
}

// One advertised commit LSN for a single tenant timeline.
message CommitLsnAdvertisement {
  TenantTimelineId tenant_timeline_id = 1;
  uint64 commit_lsn = 2;
}

// One advertised remote_consistent_lsn, identified by tenant, shard, timeline
// and generation.
// NOTE(review): unlike CommitLsnAdvertisement, the ids are inlined here rather
// than carried in a TenantTimelineId message.
message RemoteConsistentLsnAdvertisement {
  bytes tenant_id = 1;
  uint32 shard_id = 2;
  bytes timeline_id = 3;
  uint64 generation = 4;
  uint64 remote_consistent_lsn = 5;
}

// Tenant id and timeline id pair, both carried as raw bytes.
message TenantTimelineId {
  bytes tenant_id = 1;
  bytes timeline_id = 2;
}
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::Stream;
|
||||
use proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use proto::broker_service_client::BrokerServiceClient;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::Status;
|
||||
use tonic::codegen::StdError;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tonic::transport::Endpoint;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use utils::backoff::{
|
||||
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
|
||||
};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
// Code generated by protobuf.
|
||||
@@ -16,12 +20,18 @@ pub mod proto {
|
||||
tonic::include_proto!("storage_broker");
|
||||
}
|
||||
|
||||
pub mod wal_advertisement {
|
||||
#![allow(clippy::derive_partial_eq_without_eq)]
|
||||
tonic::include_proto!("wal_advertisement");
|
||||
}
|
||||
|
||||
pub mod metrics;
|
||||
|
||||
// Re-exports to avoid direct tonic dependency in user crates.
|
||||
pub use hyper::Uri;
|
||||
pub use tonic::transport::{Certificate, ClientTlsConfig};
|
||||
pub use tonic::{Code, Request, Streaming};
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
|
||||
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
|
||||
@@ -29,9 +39,199 @@ pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LIST
|
||||
pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
|
||||
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
// BrokerServiceClient charged with tonic provided Channel transport; helps to
|
||||
// avoid depending on tonic directly in user crates.
|
||||
pub type BrokerClientChannel = BrokerServiceClient<Channel>;
|
||||
/// Cloneable handle that owns a generated broker gRPC client and uses it to open
/// per-timeline update subscriptions.
#[derive(Clone)]
|
||||
pub struct TimelineUpdatesSubscriber {
|
||||
// Generated tonic client for the broker service, over a tonic `Channel`.
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
|
||||
}
|
||||
|
||||
/// Wrapper around the generated broker gRPC client.
///
/// Exists to weed out all places in the codebase that interact directly with the
/// gRPC generated code: the inner client is private and is only handed out explicitly.
|
||||
pub struct BrokerClientChannel {
|
||||
// Generated tonic client for the broker service, over a tonic `Channel`.
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
|
||||
}
|
||||
|
||||
impl BrokerClientChannel {
|
||||
pub fn into_raw_grpc_client(
|
||||
self,
|
||||
) -> proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel> {
|
||||
self.client
|
||||
}
|
||||
}
|
||||
|
||||
/// One item yielded by `TimelineUpdatesSubscriber::subscribe`.
pub struct TimelineShardUpdate {
|
||||
/// `true` when the broker message was a `SafekeeperDiscoveryResponse`,
/// `false` when it was a `SafekeeperTimelineInfo`.
pub is_discovery: bool,
|
||||
/// The update payload. `SafekeeperTimelineInfo` messages are converted into this
/// response type by `subscribe` before being yielded.
pub inner: proto::SafekeeperDiscoveryResponse,
|
||||
}
|
||||
|
||||
/// Handle for publishing safekeeper discovery requests for one timeline to the broker.
pub struct DiscoveryRequester {
|
||||
// Tenant/timeline the discovery requests are issued for.
id: ProtoTenantTimelineId,
|
||||
// Generated tonic client used to publish the request.
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
|
||||
}
|
||||
|
||||
impl TimelineUpdatesSubscriber {
|
||||
pub fn new(service_client: BrokerClientChannel) -> Self {
|
||||
Self {
|
||||
client: service_client.client.clone(),
|
||||
}
|
||||
}
|
||||
pub fn subscribe(
|
||||
&mut self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> (impl Stream<Item = TimelineShardUpdate>, DiscoveryRequester) {
|
||||
let id = ProtoTenantTimelineId {
|
||||
tenant_id: tenant_shard_id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: timeline_id.as_ref().to_owned(),
|
||||
};
|
||||
let discovery_requester = DiscoveryRequester {
|
||||
id: id.clone(),
|
||||
client: self.client.clone(),
|
||||
};
|
||||
let stream = async_stream::stream! {
|
||||
let mut attempt = 0;
|
||||
'resubscribe: loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
use proto::*;
|
||||
// subscribe to the specific timeline
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperTimelineInfo as i32,
|
||||
},
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
|
||||
},
|
||||
],
|
||||
tenant_timeline_id: Some(FilterTenantTimelineId {
|
||||
enabled: true,
|
||||
tenant_timeline_id: Some(id.clone()),
|
||||
}),
|
||||
};
|
||||
|
||||
let res = tokio::select! {
|
||||
r = self.client.subscribe_by_filter(request) => { r }
|
||||
_ = cancel.cancelled() => { return; }
|
||||
};
|
||||
let mut update_stream = match res
|
||||
{
|
||||
Ok(resp) => {
|
||||
resp.into_inner()
|
||||
}
|
||||
Err(e) => {
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
|
||||
info!(
|
||||
attempt, "failed to subscribe: {e:#}"
|
||||
);
|
||||
continue 'resubscribe;
|
||||
}
|
||||
};
|
||||
loop {
|
||||
let broker_update = tokio::select!{
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
}
|
||||
update = update_stream.message() => { update }
|
||||
};
|
||||
match broker_update {
|
||||
Ok(Some(typed_msg)) => {
|
||||
let mut is_discovery = false;
|
||||
let timeline_update = match typed_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => {
|
||||
let info = match typed_msg.safekeeper_timeline_info {
|
||||
Some(info) => info,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_timeline_info");
|
||||
continue 'resubscribe;
|
||||
}
|
||||
};
|
||||
SafekeeperDiscoveryResponse {
|
||||
safekeeper_id: info.safekeeper_id,
|
||||
tenant_timeline_id: info.tenant_timeline_id,
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
standby_horizon: info.standby_horizon,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
is_discovery = true;
|
||||
match typed_msg.safekeeper_discovery_response {
|
||||
Some(response) => response,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_discovery_response");
|
||||
continue 'resubscribe;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// unexpected message
|
||||
warn!("unexpected message from broker: {typed_msg:?}");
|
||||
continue 'resubscribe;
|
||||
}
|
||||
};
|
||||
attempt = 0; // reset backoff iff we received a valid update
|
||||
yield TimelineShardUpdate{is_discovery, inner: timeline_update };
|
||||
},
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
|
||||
// tonic's error handling doesn't provide a clear code for disconnections: we get
|
||||
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
|
||||
// => https://github.com/neondatabase/neon/issues/9562
|
||||
info!("broker disconnected: {status}");
|
||||
},
|
||||
_ => {
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
continue 'resubscribe;
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
continue 'resubscribe;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
(stream, discovery_requester)
|
||||
}
|
||||
}
|
||||
|
||||
impl DiscoveryRequester {
|
||||
pub async fn request(&mut self) {
|
||||
let request = proto::SafekeeperDiscoveryRequest {
|
||||
tenant_timeline_id: Some(self.id.clone()),
|
||||
};
|
||||
let msg = proto::TypedMessage {
|
||||
r#type: proto::MessageType::SafekeeperDiscoveryRequest as i32,
|
||||
safekeeper_timeline_info: None,
|
||||
safekeeper_discovery_request: Some(request),
|
||||
safekeeper_discovery_response: None,
|
||||
};
|
||||
|
||||
// Cancellation safety: we want to send a message to the broker, but publish_one()
|
||||
// function can get cancelled by the other select! arm. This is absolutely fine, because
|
||||
// we just want to receive broker updates and discovery is not important if we already
|
||||
// receive updates.
|
||||
//
|
||||
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
|
||||
// This is totally fine because of the reason above.
|
||||
|
||||
// This is a fire-and-forget request, we don't care about the response
|
||||
let _ = self.client.publish_one(msg).await;
|
||||
debug!("Discovery request sent to the broker");
|
||||
}
|
||||
}
|
||||
|
||||
// Create connection object configured to run TLS if schema starts with https://
|
||||
// and plain text otherwise. Connection is lazy, only endpoint sanity is
|
||||
@@ -67,19 +267,9 @@ where
|
||||
.connect_timeout(DEFAULT_CONNECT_TIMEOUT);
|
||||
// keep_alive_timeout is 20s by default on both client and server side
|
||||
let channel = tonic_endpoint.connect_lazy();
|
||||
Ok(BrokerClientChannel::new(channel))
|
||||
}
|
||||
|
||||
impl BrokerClientChannel {
|
||||
/// Create a new client to the given endpoint, but don't actually connect until the first request.
|
||||
pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error>
|
||||
where
|
||||
D: std::convert::TryInto<tonic::transport::Endpoint>,
|
||||
D::Error: Into<StdError>,
|
||||
{
|
||||
let conn = tonic::transport::Endpoint::new(dst)?.connect_lazy();
|
||||
Ok(Self::new(conn))
|
||||
}
|
||||
Ok(BrokerClientChannel {
|
||||
client: proto::broker_service_client::BrokerServiceClient::new(channel),
|
||||
})
|
||||
}
|
||||
|
||||
// parse variable length bytes from protobuf
|
||||
|
||||
@@ -15,6 +15,7 @@ testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-stream.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
@@ -69,4 +70,4 @@ http-utils = { path = "../libs/http-utils/" }
|
||||
utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
-- Reverts the sk_ps_discovery migration: drops the triggers and their trigger
-- functions on "timelines" and "tenant_shards", then the enqueue helper function,
-- then the "sk_ps_discovery" table.
-- Every statement uses IF EXISTS so that a partially applied or already reverted
-- migration can be rolled back without erroring on missing objects.

-- Triggers on "timelines"
DROP TRIGGER IF EXISTS on_timelines_UPDATE_enqueue_sk_ps_discovery ON "timelines";
DROP FUNCTION IF EXISTS on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER IF EXISTS on_timelines_DELETE_enqueue_sk_ps_discovery ON "timelines";
DROP FUNCTION IF EXISTS on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER IF EXISTS on_timelines_INSERT_enqueue_sk_ps_discovery ON "timelines";
DROP FUNCTION IF EXISTS on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn;

-- Triggers on "tenant_shards"
DROP TRIGGER IF EXISTS on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery ON "tenant_shards";
DROP FUNCTION IF EXISTS on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER IF EXISTS on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery ON "tenant_shards";
DROP FUNCTION IF EXISTS on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER IF EXISTS on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery ON "tenant_shards";
DROP FUNCTION IF EXISTS on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn;

-- Shared helper and the table itself go last, after everything that calls into them.
DROP FUNCTION IF EXISTS sk_ps_discovery_enqueue_attachment_create;
DROP TABLE IF EXISTS "sk_ps_discovery";
|
||||
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
-- Work queue of (tenant shard, pageserver generation, safekeeper) discovery entries.
-- Rows are inserted by sk_ps_discovery_enqueue_attachment_create().
CREATE TABLE "sk_ps_discovery"(
|
||||
"tenant_id" VARCHAR NOT NULL,
|
||||
"shard_number" INT4 NOT NULL,
|
||||
"shard_count" INT4 NOT NULL,
|
||||
"ps_generation" INT4 NOT NULL,
|
||||
"sk_id" INT8 NOT NULL REFERENCES "safekeepers"("id") ON DELETE CASCADE, -- more efficient than a trigger on "safekeepers"
|
||||
"intent_state" VARCHAR NOT NULL, -- attached,detached
|
||||
"ps_id" INT8 NOT NULL REFERENCES "nodes"("node_id") ON DELETE CASCADE, -- more efficient than a trigger on "nodes"
|
||||
"created_at" TIMESTAMPTZ NOT NULL,
|
||||
-- number of retries recorded so far; starts at 0
"retries" INT4 NOT NULL DEFAULT 0,
|
||||
"last_retry_at" TIMESTAMPTZ,
|
||||
"acknowledged_at" TIMESTAMPTZ,
|
||||
PRIMARY KEY("tenant_id", "shard_number", "shard_count", "ps_generation", "sk_id")
|
||||
);
|
||||
|
||||
-- Enqueues 'attached' discovery rows for every (tenant shard, safekeeper) pair of the
-- given tenant, then sends a notification on the 'sk_ps_discovery' channel.
--
-- The safekeeper set is the distinct union of sk_set and new_sk_set over the tenant's
-- timelines that are not deleted.
--
-- NOTE(review): ps_generation and ps_id are NOT NULL in sk_ps_discovery. If
-- tenant_shards.generation or tenant_shards.generation_pageserver can be NULL, this
-- INSERT raises and fails the statement that fired the trigger — confirm against the
-- tenant_shards schema.
CREATE OR REPLACE FUNCTION sk_ps_discovery_enqueue_attachment_create(ARG_TENANT_ID VARCHAR)
RETURNS VOID AS $$
BEGIN
    INSERT INTO sk_ps_discovery (
        tenant_id, shard_number, shard_count, ps_generation,
        sk_id, intent_state, ps_id, created_at
    )
    WITH intent_attachments AS (
        SELECT DISTINCT
            timelines.tenant_id,
            unnest(array_cat(sk_set, new_sk_set)) AS sk_id
        FROM timelines
        WHERE timelines.tenant_id = ARG_TENANT_ID
          AND timelines.deleted_at IS NULL
    )
    SELECT
        ts.tenant_id,
        ts.shard_number,
        ts.shard_count,
        ts.generation,
        ia.sk_id,
        'attached',
        ts.generation_pageserver,
        NOW()
    FROM tenant_shards AS ts
    INNER JOIN intent_attachments AS ia ON ts.tenant_id = ia.tenant_id
    -- the first trigger creates the attachment, all others are identical because
    -- tenant shard generations are monotonic
    ON CONFLICT DO NOTHING;

    -- Notify listeners with the affected tenant id as a JSON payload.
    PERFORM pg_notify(
        'sk_ps_discovery',
        json_build_object('tenant_id', ARG_TENANT_ID)::text
    );
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Trigger on tenant_shards table
|
||||
|
||||
-- After each row inserted into "tenant_shards", run discovery enqueueing for the
-- new row's tenant.
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn()
    RETURNS TRIGGER
    LANGUAGE plpgsql
AS $$
BEGIN
    PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
    RETURN NEW;
END;
$$;

CREATE OR REPLACE TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery
    AFTER INSERT ON "tenant_shards"
    FOR EACH ROW
    EXECUTE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
|
||||
-- After each row deleted from "tenant_shards", run discovery enqueueing for the
-- deleted row's tenant (OLD is the only row image available on DELETE).
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn()
    RETURNS TRIGGER
    LANGUAGE plpgsql
AS $$
BEGIN
    PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
    RETURN OLD;
END;
$$;

CREATE OR REPLACE TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery
    AFTER DELETE ON "tenant_shards"
    FOR EACH ROW
    EXECUTE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
|
||||
-- After each row updated in "tenant_shards", run discovery enqueueing for the
-- updated row's tenant.
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn()
    RETURNS TRIGGER
    LANGUAGE plpgsql
AS $$
BEGIN
    PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
    RETURN NEW;
END;
$$;

CREATE OR REPLACE TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery
    AFTER UPDATE ON "tenant_shards"
    FOR EACH ROW
    EXECUTE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
-- Trigger on timelines table
|
||||
|
||||
-- After each row inserted into "timelines", run discovery enqueueing for the
-- new row's tenant.
CREATE OR REPLACE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn()
    RETURNS TRIGGER
    LANGUAGE plpgsql
AS $$
BEGIN
    PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
    RETURN NEW;
END;
$$;

CREATE OR REPLACE TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery
    AFTER INSERT ON "timelines"
    FOR EACH ROW
    EXECUTE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
-- After each row deleted from "timelines", run discovery enqueueing for the
-- deleted row's tenant (OLD is the only row image available on DELETE).
CREATE OR REPLACE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn()
    RETURNS TRIGGER
    LANGUAGE plpgsql
AS $$
BEGIN
    PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
    RETURN OLD;
END;
$$;

CREATE OR REPLACE TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery
    AFTER DELETE ON "timelines"
    FOR EACH ROW
    EXECUTE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
-- After each row updated in "timelines", run discovery enqueueing for the
-- updated row's tenant.
CREATE OR REPLACE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn()
    RETURNS TRIGGER
    LANGUAGE plpgsql
AS $$
BEGIN
    PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
    RETURN NEW;
END;
$$;

CREATE OR REPLACE TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery
    AFTER UPDATE ON "timelines"
    FOR EACH ROW
    EXECUTE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
@@ -436,7 +436,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
// Validate that we can connect to the database
|
||||
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
|
||||
Persistence::await_connection(secrets.database_url.clone(), args.db_connect_timeout.into())
|
||||
.await?;
|
||||
|
||||
let persistence = Arc::new(Persistence::new(secrets.database_url).await);
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
pub(crate) mod split_state;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::ops::Add;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -15,8 +17,8 @@ use diesel_async::pooled_connection::bb8::Pool;
|
||||
use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
|
||||
use diesel_async::{AsyncPgConnection, RunQueryDsl};
|
||||
use diesel_migrations::{EmbeddedMigrations, embed_migrations};
|
||||
use futures::FutureExt;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::controller_api::{
|
||||
AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy,
|
||||
@@ -31,6 +33,7 @@ use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
|
||||
use rustls::crypto::ring;
|
||||
use scoped_futures::ScopedBoxFuture;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -74,6 +77,8 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
|
||||
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
|
||||
pub struct Persistence {
|
||||
connection_pool: Pool<AsyncPgConnection>,
|
||||
connect_tokio_postgres:
|
||||
Box<dyn Sync + Send + 'static + Fn() -> BoxFuture<'static, TokioPostgresConnectResult>>,
|
||||
}
|
||||
|
||||
/// Legacy format, for use in JSON compat objects in test environment
|
||||
@@ -135,6 +140,8 @@ pub(crate) enum DatabaseOperation {
|
||||
DeleteTimelineImport,
|
||||
ListTimelineImports,
|
||||
IsTenantImportingTimeline,
|
||||
ListSkPsDiscovery,
|
||||
UpdateSkPsDiscoveryAttempt,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
@@ -177,10 +184,11 @@ impl Persistence {
|
||||
|
||||
pub async fn new(database_url: String) -> Self {
|
||||
let mut mgr_config = ManagerConfig::default();
|
||||
mgr_config.custom_setup = Box::new(establish_connection_rustls);
|
||||
mgr_config.custom_setup =
|
||||
Box::new(|config| establish_connection_rustls_diesel(config.to_owned()));
|
||||
|
||||
let manager = AsyncDieselConnectionManager::<AsyncPgConnection>::new_with_config(
|
||||
database_url,
|
||||
database_url.clone(),
|
||||
mgr_config,
|
||||
);
|
||||
|
||||
@@ -197,20 +205,25 @@ impl Persistence {
|
||||
.await
|
||||
.expect("Could not build connection pool");
|
||||
|
||||
Self { connection_pool }
|
||||
Self {
|
||||
connection_pool,
|
||||
connect_tokio_postgres: Box::new(move || {
|
||||
establish_connection_rustls_tokio_postgres(database_url.clone())
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
|
||||
/// database and the storage controller, therefore the database might not be available right away
|
||||
pub async fn await_connection(
|
||||
database_url: &str,
|
||||
database_url: String,
|
||||
timeout: Duration,
|
||||
) -> Result<(), diesel::ConnectionError> {
|
||||
let started_at = Instant::now();
|
||||
log_postgres_connstr_info(database_url)
|
||||
log_postgres_connstr_info(&database_url)
|
||||
.map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
|
||||
loop {
|
||||
match establish_connection_rustls(database_url).await {
|
||||
match establish_connection_rustls_diesel(database_url.clone()).await {
|
||||
Ok(_) => {
|
||||
tracing::info!("Connected to database.");
|
||||
return Ok(());
|
||||
@@ -1821,6 +1834,151 @@ impl Persistence {
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn listen_sk_ps_discovery(
|
||||
&self,
|
||||
) -> DatabaseResult<
|
||||
Pin<Box<dyn Send + 'static + futures::Stream<Item = Result<TenantId, serde_json::Error>>>>,
|
||||
> {
|
||||
let (client, mut conn) = (&self.connect_tokio_postgres)().await?;
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(1);
|
||||
tokio::spawn(async move {
|
||||
let mut stream = futures::stream::poll_fn(move |cx| conn.poll_message(cx));
|
||||
while let Some(msg) = stream.next().await {
|
||||
info!(?msg, "async message");
|
||||
match msg {
|
||||
Ok(tokio_postgres::AsyncMessage::Notification(notification))
|
||||
if notification.channel() == "sk_ps_discovery" =>
|
||||
{
|
||||
let Ok(()) = tx.send(notification).await else {
|
||||
tracing::info!(
|
||||
"sk_ps_discovery notification rx dropped, stopping async notification processing"
|
||||
);
|
||||
break;
|
||||
};
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(err) => {
|
||||
tracing::error!(?err, "tokio_postgres poll_message error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
tracing::info!("sk_ps_discovery notification stream returned None, exiting");
|
||||
});
|
||||
|
||||
client
|
||||
.batch_execute("LISTEN sk_ps_discovery;")
|
||||
.await
|
||||
.expect("TODO");
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct Notification {
|
||||
tenant_id: TenantId,
|
||||
}
|
||||
Ok(Box::pin(async_stream::stream! {
|
||||
while let Some(msg) = rx.recv().await {
|
||||
let msg: Result<Notification, _> = serde_json::from_str(msg.payload());
|
||||
let msg = msg.map(|Notification { tenant_id }| tenant_id );
|
||||
yield msg;
|
||||
}
|
||||
tracing::info!("sk_ps_discovery channel closed, stopping stream");
|
||||
// keep client alive inside the returned sream object, othrwise `conn` ends as soon as we return from this function
|
||||
drop(client);
|
||||
}))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_all_sk_ps_discovery_work(
|
||||
&self,
|
||||
) -> DatabaseResult<Vec<SkPsDiscoveryPersistence>> {
|
||||
use crate::schema::sk_ps_discovery::dsl;
|
||||
self.with_measured_conn(DatabaseOperation::ListSkPsDiscovery, move |conn| {
|
||||
Box::pin(async move {
|
||||
let vec: Vec<SkPsDiscoveryPersistence> = dsl::sk_ps_discovery.load(conn).await?;
|
||||
Ok(vec)
|
||||
})
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn update_sk_ps_discovery_attempt(
|
||||
&self,
|
||||
pk: SkPsDiscoveryPersistencePk,
|
||||
intent_state: String,
|
||||
update: Result<(), ()>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::sk_ps_discovery::dsl;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateSkPsDiscoveryAttempt, move |conn| {
|
||||
let pk = pk.clone();
|
||||
let intent_state = intent_state.clone();
|
||||
Box::pin(async move {
|
||||
match update {
|
||||
Ok(()) => {
|
||||
let SkPsDiscoveryPersistencePk {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
ps_generation,
|
||||
sk_id,
|
||||
} = pk;
|
||||
let nrows = diesel::delete(dsl::sk_ps_discovery)
|
||||
// primary key
|
||||
.filter(dsl::tenant_id.eq(tenant_id))
|
||||
.filter(dsl::shard_number.eq(shard_number))
|
||||
.filter(dsl::shard_count.eq(shard_count))
|
||||
.filter(dsl::ps_generation.eq(ps_generation))
|
||||
.filter(dsl::sk_id.eq(sk_id))
|
||||
// intent_state could have changed beneath us (split brain or concurrent state gc)
|
||||
// TODO: this could also just be a globally monotonic sequence number, maybe easier to reason about?
|
||||
.filter(dsl::intent_state.eq(intent_state))
|
||||
.execute(conn)
|
||||
.await?;
|
||||
if nrows != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of deletes: {nrows}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
let SkPsDiscoveryPersistencePk {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
ps_generation,
|
||||
sk_id,
|
||||
} = pk;
|
||||
|
||||
let nrows = diesel::update(dsl::sk_ps_discovery)
|
||||
// primary key
|
||||
.filter(dsl::tenant_id.eq(tenant_id))
|
||||
.filter(dsl::shard_number.eq(shard_number))
|
||||
.filter(dsl::shard_count.eq(shard_count))
|
||||
.filter(dsl::ps_generation.eq(ps_generation))
|
||||
.filter(dsl::sk_id.eq(sk_id))
|
||||
// intent_state could have changed beneath us (split brain or concurrent state gc)
|
||||
// TODO: this could also just be a globally monotonic sequence number, maybe easier to reason about?
|
||||
.filter(dsl::intent_state.eq(intent_state))
|
||||
// action:
|
||||
.set((
|
||||
dsl::retries.eq(dsl::retries.add(1)), // XXX: in split-brain situation we would bump twice...
|
||||
dsl::last_retry_at.eq(diesel::dsl::now),
|
||||
))
|
||||
.execute(conn) // TODO: check update count?
|
||||
.await?;
|
||||
if nrows != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of updates: {nrows}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
|
||||
@@ -1909,21 +2067,40 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
|
||||
})
|
||||
}
|
||||
|
||||
fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<AsyncPgConnection>> {
|
||||
let fut = async {
|
||||
type TokioPostgresConnectResult = ConnectionResult<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<
|
||||
tokio_postgres::Socket,
|
||||
tokio_postgres_rustls::RustlsStream<tokio_postgres::Socket>,
|
||||
>,
|
||||
)>;
|
||||
|
||||
fn establish_connection_rustls_tokio_postgres(
|
||||
config: String,
|
||||
) -> BoxFuture<'static, TokioPostgresConnectResult> {
|
||||
let fut = async move {
|
||||
// We first set up the way we want rustls to work.
|
||||
let rustls_config = client_config_with_root_certs()
|
||||
.map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?;
|
||||
let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config);
|
||||
let (client, conn) = tokio_postgres::connect(config, tls)
|
||||
let (client, conn) = tokio_postgres::connect(&config, tls)
|
||||
.await
|
||||
.map_err(|e| ConnectionError::BadConnection(e.to_string()))?;
|
||||
|
||||
AsyncPgConnection::try_from_client_and_connection(client, conn).await
|
||||
Ok((client, conn))
|
||||
};
|
||||
fut.boxed()
|
||||
}
|
||||
|
||||
fn establish_connection_rustls_diesel(
|
||||
config: String,
|
||||
) -> BoxFuture<'static, ConnectionResult<AsyncPgConnection>> {
|
||||
async {
|
||||
let (client, conn) = establish_connection_rustls_tokio_postgres(config).await?;
|
||||
AsyncPgConnection::try_from_client_and_connection(client, conn).await
|
||||
}
|
||||
.boxed()
|
||||
}
|
||||
|
||||
#[cfg_attr(test, test)]
|
||||
fn test_config_debug_censors_password() {
|
||||
let has_pw =
|
||||
@@ -2386,3 +2563,61 @@ pub(crate) struct TimelineImportPersistence {
|
||||
pub(crate) timeline_id: String,
|
||||
pub(crate) shard_statuses: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Insertable, AsChangeset, Selectable, Clone, PartialEq, Eq, Hash, Debug)]
|
||||
#[diesel(table_name = crate::schema::sk_ps_discovery)]
|
||||
pub(crate) struct SkPsDiscoveryPersistencePk {
|
||||
pub(crate) tenant_id: String,
|
||||
pub(crate) shard_number: i32,
|
||||
pub(crate) shard_count: i32,
|
||||
pub(crate) ps_generation: i32,
|
||||
pub(crate) sk_id: i64,
|
||||
}
|
||||
|
||||
#[derive(Queryable, Selectable, Clone, PartialEq, Eq)]
|
||||
#[diesel(table_name = crate::schema::sk_ps_discovery)]
|
||||
pub(crate) struct SkPsDiscoveryPersistence {
|
||||
pub(crate) tenant_id: String,
|
||||
pub(crate) shard_number: i32,
|
||||
pub(crate) shard_count: i32,
|
||||
pub(crate) ps_generation: i32,
|
||||
pub(crate) sk_id: i64,
|
||||
pub(crate) intent_state: String,
|
||||
pub(crate) ps_id: i64,
|
||||
pub(crate) created_at: chrono::DateTime<chrono::Utc>,
|
||||
pub(crate) retries: i32,
|
||||
pub(crate) last_retry_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub(crate) acknowledged_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
}
|
||||
|
||||
impl SkPsDiscoveryPersistence {
|
||||
pub(crate) fn tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
|
||||
Ok(TenantShardId {
|
||||
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(self.shard_number as u8),
|
||||
shard_count: ShardCount::new(self.shard_count as u8),
|
||||
})
|
||||
}
|
||||
pub(crate) fn primary_key(&self) -> SkPsDiscoveryPersistencePk {
|
||||
let SkPsDiscoveryPersistence {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
ps_generation,
|
||||
sk_id,
|
||||
intent_state: _,
|
||||
ps_id: _,
|
||||
created_at: _,
|
||||
retries: _,
|
||||
last_retry_at: _,
|
||||
acknowledged_at: _,
|
||||
} = self;
|
||||
SkPsDiscoveryPersistencePk {
|
||||
tenant_id: tenant_id.clone(),
|
||||
shard_number: *shard_number,
|
||||
shard_count: *shard_count,
|
||||
ps_generation: *ps_generation,
|
||||
sk_id: *sk_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use safekeeper_api::models::{
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
|
||||
TenantShardPageserverAttachmentChange, TimelineCreateRequest,
|
||||
};
|
||||
use safekeeper_client::mgmt_api::{Client, Result};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::metrics::PageserverRequestLabelGroup;
|
||||
|
||||
@@ -164,4 +166,19 @@ impl SafekeeperClient {
|
||||
self.inner.utilization().await
|
||||
)
|
||||
}
|
||||
|
||||
pub async fn post_tenant_shard_pageserver_attachments(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
body: TenantShardPageserverAttachmentChange,
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"post_tenant_shard_pageserver_attachments",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
|
||||
.await
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -628,11 +628,7 @@ impl Scheduler {
|
||||
tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
|
||||
}
|
||||
|
||||
if node.attached_shard_count < expected_attached_shards_per_node {
|
||||
expected_attached_shards_per_node - node.attached_shard_count
|
||||
} else {
|
||||
0
|
||||
}
|
||||
expected_attached_shards_per_node.saturating_sub(node.attached_shard_count)
|
||||
}
|
||||
|
||||
pub(crate) fn expected_attached_shard_count(&self) -> usize {
|
||||
|
||||
@@ -60,6 +60,22 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id) {
|
||||
tenant_id -> Varchar,
|
||||
shard_number -> Int4,
|
||||
shard_count -> Int4,
|
||||
ps_generation -> Int4,
|
||||
sk_id -> Int8,
|
||||
intent_state -> Varchar,
|
||||
ps_id -> Int8,
|
||||
created_at -> Timestamptz,
|
||||
retries -> Int4,
|
||||
last_retry_at -> Nullable<Timestamptz>,
|
||||
acknowledged_at -> Nullable<Timestamptz>,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
tenant_shards (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
@@ -100,12 +116,16 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::joinable!(sk_ps_discovery -> nodes (ps_id));
|
||||
diesel::joinable!(sk_ps_discovery -> safekeepers (sk_id));
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(
|
||||
controllers,
|
||||
metadata_health,
|
||||
nodes,
|
||||
safekeeper_timeline_pending_ops,
|
||||
safekeepers,
|
||||
sk_ps_discovery,
|
||||
tenant_shards,
|
||||
timeline_imports,
|
||||
timelines,
|
||||
|
||||
@@ -2,6 +2,7 @@ pub mod chaos_injector;
|
||||
mod context_iterator;
|
||||
pub(crate) mod safekeeper_reconciler;
|
||||
mod safekeeper_service;
|
||||
mod sk_ps_discovery;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -1192,6 +1193,16 @@ impl Service {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn run_sk_ps_discovery(self: &Arc<Self>) {
|
||||
self.startup_complete.clone().wait().await;
|
||||
sk_ps_discovery::run(
|
||||
self.clone(),
|
||||
self.http_client.clone(), /* TODO this client is configured to openf resh TCP connection each time, very inefficient */
|
||||
).await;
|
||||
}
|
||||
|
||||
/// Heartbeat all storage nodes once in a while.
|
||||
#[instrument(skip_all)]
|
||||
async fn spawn_heartbeat_driver(&self) {
|
||||
@@ -1797,7 +1808,7 @@ impl Service {
|
||||
reconcilers_gate: Gate::default(),
|
||||
tenant_op_locks: Default::default(),
|
||||
node_op_locks: Default::default(),
|
||||
http_client,
|
||||
http_client: http_client.clone(),
|
||||
step_down_barrier: Default::default(),
|
||||
});
|
||||
|
||||
@@ -1865,6 +1876,15 @@ impl Service {
|
||||
}
|
||||
});
|
||||
|
||||
tokio::task::spawn({
|
||||
let this = this.clone();
|
||||
let startup_complete = startup_complete.clone();
|
||||
async move {
|
||||
startup_complete.wait().await;
|
||||
this.run_sk_ps_discovery().await
|
||||
}
|
||||
});
|
||||
|
||||
tokio::task::spawn({
|
||||
let this = this.clone();
|
||||
let startup_complete = startup_complete.clone();
|
||||
|
||||
@@ -647,6 +647,11 @@ impl Service {
|
||||
sk.describe_response()
|
||||
}
|
||||
|
||||
pub(crate) fn get_safekeeper_object(&self, node_id: i64) -> Option<Safekeeper> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked.safekeepers.get(&NodeId(node_id as u64)).cloned()
|
||||
}
|
||||
|
||||
pub(crate) async fn upsert_safekeeper(
|
||||
self: &Arc<Service>,
|
||||
record: crate::persistence::SafekeeperUpsert,
|
||||
|
||||
266
storage_controller/src/service/sk_ps_discovery.rs
Normal file
266
storage_controller/src/service/sk_ps_discovery.rs
Normal file
@@ -0,0 +1,266 @@
|
||||
use std::{
|
||||
collections::{HashMap, hash_map},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::{StreamExt, stream::FuturesUnordered};
|
||||
use safekeeper_api::models::{
|
||||
TenantShardPageserverAttachment, TenantShardPageserverAttachmentChange,
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, Span, error, info, info_span};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
logging::SecretString,
|
||||
shard::ShardIndex,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
heartbeater::SafekeeperState,
|
||||
persistence::{Persistence, SkPsDiscoveryPersistence},
|
||||
};
|
||||
|
||||
use super::Service;
|
||||
|
||||
struct Actor {
|
||||
service: Arc<Service>,
|
||||
persistence: Arc<Persistence>,
|
||||
http_client: reqwest::Client,
|
||||
}
|
||||
|
||||
pub async fn run(service: Arc<Service>, http_client: reqwest::Client) {
|
||||
let actor = Actor {
|
||||
persistence: service.persistence.clone(),
|
||||
service,
|
||||
http_client, // XXX: build our own client instead of getting Service's client; we probably want idle conn to each sk
|
||||
};
|
||||
actor.run().await;
|
||||
}
|
||||
|
||||
impl Actor {
|
||||
async fn run(mut self) {
|
||||
loop {
|
||||
match self.run0().await {
|
||||
Ok(()) => {
|
||||
info!("sk_ps_discovery actor exiting after shutdown signal observed");
|
||||
return;
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
?err,
|
||||
"sk_ps_discovery actor encountered an error, restarting after backoff"
|
||||
);
|
||||
// TODO: proper backoff
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn run0(&mut self) -> anyhow::Result<()> {
|
||||
let mut subscription = self
|
||||
.persistence
|
||||
.listen_sk_ps_discovery()
|
||||
.await
|
||||
.context("listen to sk_ps_discovery")?;
|
||||
|
||||
let mut sync_full_ticker = tokio::time::interval(std::time::Duration::from_secs(5));
|
||||
|
||||
struct Task {
|
||||
work: SkPsDiscoveryPersistence,
|
||||
cancel: CancellationToken,
|
||||
join_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
let mut tasks = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased; // control messages have higher priority, the periodic full tick, then subscriptions.
|
||||
_ = sync_full_ticker.tick() => {
|
||||
info!("rebuild");
|
||||
}
|
||||
maybe_res = subscription.next() => {
|
||||
match maybe_res {
|
||||
None => {
|
||||
anyhow::bail!("subscription should never end");
|
||||
}
|
||||
Some(Ok(tenant_id)) => {
|
||||
let tenant_id: TenantId = tenant_id;
|
||||
info!(?tenant_id, "notify for tenant_id");
|
||||
// for now, just also rebuild everything
|
||||
}
|
||||
Some(Err(err)) => {
|
||||
let err: serde_json::Error = err;
|
||||
anyhow::bail!("incorrect notification format: {err:?}"); // FIXME repeat message in error so it can be debugged ?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// get list of tasks from database
|
||||
let mut new_tasks = self
|
||||
.persistence
|
||||
.get_all_sk_ps_discovery_work()
|
||||
.await
|
||||
.context("get_all_sk_ps_discovery_work")?
|
||||
.into_iter()
|
||||
.map(|work: SkPsDiscoveryPersistence| {
|
||||
(
|
||||
work.primary_key(),
|
||||
Task {
|
||||
work,
|
||||
cancel: CancellationToken::new(),
|
||||
join_handle: None,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
// Carry over ongoing tasks
|
||||
let mut cancelled_wait = FuturesUnordered::new();
|
||||
for (
|
||||
task_key,
|
||||
Task {
|
||||
work: ongoing_persistence,
|
||||
cancel,
|
||||
join_handle,
|
||||
},
|
||||
) in tasks.drain()
|
||||
{
|
||||
match new_tasks.entry(task_key) {
|
||||
hash_map::Entry::Occupied(mut planned) => {
|
||||
let Task {
|
||||
work: planned_persistence,
|
||||
cancel: planned_cancel,
|
||||
join_handle: planned_jh,
|
||||
} = planned.get_mut();
|
||||
assert!(planned_jh.is_none());
|
||||
if *planned_persistence == ongoing_persistence {
|
||||
*planned_jh = join_handle;
|
||||
*planned_cancel = cancel;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(_) => (),
|
||||
}
|
||||
cancel.cancel();
|
||||
cancelled_wait.push(async move {
|
||||
if let Some(jh) = join_handle {
|
||||
let _ = jh.await;
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(_) = cancelled_wait.next().await {}
|
||||
tasks = new_tasks;
|
||||
|
||||
// Kick off new tasks
|
||||
for (key, task) in tasks.iter_mut() {
|
||||
if task.join_handle.is_none() {
|
||||
task.join_handle = Some(tokio::spawn(
|
||||
DeliveryAttempt {
|
||||
cancel: task.cancel.clone(),
|
||||
persistence: self.persistence.clone(),
|
||||
service: self.service.clone(),
|
||||
http_client: self.http_client.clone(),
|
||||
work: task.work.clone(),
|
||||
}
|
||||
.run()
|
||||
.instrument({
|
||||
let span = info_span!(parent: None, "sk_ps_discovery_delivery", ?key);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct DeliveryAttempt {
|
||||
cancel: CancellationToken,
|
||||
persistence: Arc<Persistence>,
|
||||
service: Arc<super::Service>,
|
||||
http_client: reqwest::Client,
|
||||
work: SkPsDiscoveryPersistence,
|
||||
}
|
||||
|
||||
impl DeliveryAttempt {
|
||||
pub async fn run(self) {
|
||||
let res = self.run0().await;
|
||||
if self.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
if let Err(ref err) = res {
|
||||
error!(?err, "attempt failed");
|
||||
}
|
||||
let res = self
|
||||
.persistence
|
||||
.update_sk_ps_discovery_attempt(
|
||||
self.work.primary_key(),
|
||||
self.work.intent_state.clone(),
|
||||
res.map_err(|_| ()),
|
||||
)
|
||||
.await;
|
||||
if let Err(ref err) = res {
|
||||
error!(?err, "persistence of attempt result failed");
|
||||
}
|
||||
}
|
||||
async fn run0(&self) -> anyhow::Result<()> {
|
||||
let Some(sk) = self.service.get_safekeeper_object(self.work.sk_id) else {
|
||||
anyhow::bail!("safekeeper object does not exist");
|
||||
};
|
||||
|
||||
match sk.availability() {
|
||||
SafekeeperState::Available { .. } => (),
|
||||
SafekeeperState::Offline => {
|
||||
anyhow::bail!("safekeeper is offline");
|
||||
}
|
||||
}
|
||||
|
||||
let body = {
|
||||
let val = TenantShardPageserverAttachment {
|
||||
shard_id: ShardIndex {
|
||||
shard_number: utils::shard::ShardNumber(self.work.shard_number as u8),
|
||||
shard_count: utils::shard::ShardCount(self.work.shard_count as u8),
|
||||
},
|
||||
ps_id: NodeId(self.work.ps_id as u64),
|
||||
generation: Generation::new(self.work.ps_generation as u32),
|
||||
};
|
||||
match self.work.intent_state.as_str() {
|
||||
"attached" => TenantShardPageserverAttachmentChange::Attach { field1: val },
|
||||
"detached" => TenantShardPageserverAttachmentChange::Detach(val),
|
||||
x => anyhow::bail!("unknown intent state {x:?}"),
|
||||
}
|
||||
};
|
||||
let tenant_shard_id = self.work.tenant_shard_id()?;
|
||||
sk.with_client_retries(
|
||||
|client| {
|
||||
let body = body.clone();
|
||||
async move {
|
||||
client
|
||||
.post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
|
||||
.await
|
||||
}
|
||||
},
|
||||
&self.http_client,
|
||||
&self
|
||||
.service
|
||||
.config
|
||||
.safekeeper_jwt_token
|
||||
.clone()
|
||||
.map(SecretString::from),
|
||||
1,
|
||||
3,
|
||||
Duration::from_secs(1),
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -103,7 +103,7 @@ class AbstractNeonCli:
|
||||
else:
|
||||
stdout = ""
|
||||
|
||||
log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
|
||||
log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}")
|
||||
raise
|
||||
|
||||
indent = " "
|
||||
|
||||
@@ -510,7 +510,7 @@ def list_elegible_layers(
|
||||
except KeyError:
|
||||
# Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
|
||||
# matches what's on disk.
|
||||
log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
|
||||
log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
|
||||
raise
|
||||
|
||||
return list(c for c in candidates if is_visible(c))
|
||||
@@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
except:
|
||||
# On assertion failures, log some details to help with debugging
|
||||
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
|
||||
log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
|
||||
log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
|
||||
raise
|
||||
|
||||
# Scrub the remote storage
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 4cca6f8083...55c0d45abe
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: daa81cffcf...de7640f55d
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: 15710a76b7...0bf96bd6d7
2
vendor/postgres-v17
vendored
2
vendor/postgres-v17
vendored
Submodule vendor/postgres-v17 updated: e5374b7299...8be779fd3a
8
vendor/revisions.json
vendored
8
vendor/revisions.json
vendored
@@ -1,18 +1,18 @@
|
||||
{
|
||||
"v17": [
|
||||
"17.5",
|
||||
"e5374b72997b0afc8374137674e873f7a558120a"
|
||||
"8be779fd3ab9e87206da96a7e4842ef1abf04f44"
|
||||
],
|
||||
"v16": [
|
||||
"16.9",
|
||||
"15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc"
|
||||
"0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198"
|
||||
],
|
||||
"v15": [
|
||||
"15.13",
|
||||
"daa81cffcf063c54b29a9aabdb6604625f675ad0"
|
||||
"de7640f55da07512834d5cc40c4b3fb376b5f04f"
|
||||
],
|
||||
"v14": [
|
||||
"14.18",
|
||||
"4cca6f8083483dda9e12eae292cf788d45bd561f"
|
||||
"55c0d45abe6467c02084c2192bca117eda6ce1e7"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -60,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
|
||||
libc = { version = "0.2", features = ["extra_traits", "use_std"] }
|
||||
log = { version = "0.4", default-features = false, features = ["std"] }
|
||||
memchr = { version = "2" }
|
||||
nix = { version = "0.26" }
|
||||
nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" }
|
||||
nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] }
|
||||
nom = { version = "7" }
|
||||
num = { version = "0.4" }
|
||||
num-bigint = { version = "0.4" }
|
||||
|
||||
Reference in New Issue
Block a user