mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-26 17:40:37 +00:00
Compare commits
42 Commits
release-86
...
problame/b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a80790e0c4 | ||
|
|
906a963351 | ||
|
|
9e7556bef2 | ||
|
|
4f4214eea3 | ||
|
|
3dffbda428 | ||
|
|
04fb256e0f | ||
|
|
02fe35831c | ||
|
|
5f7bc3ce60 | ||
|
|
e40b1c79fa | ||
|
|
2f416267dc | ||
|
|
d52d560f16 | ||
|
|
de908a7b0d | ||
|
|
687fb25d41 | ||
|
|
eaa91291ae | ||
|
|
fc9f38dd2d | ||
|
|
c689110ad6 | ||
|
|
ba6abe203d | ||
|
|
d16e024d49 | ||
|
|
a4b9335b73 | ||
|
|
2cea7a7838 | ||
|
|
e1c1aa74fe | ||
|
|
ee775a24a0 | ||
|
|
428f532f08 | ||
|
|
5efb0d8072 | ||
|
|
36ba2b8e44 | ||
|
|
1de0f41403 | ||
|
|
4d2f27a33f | ||
|
|
88a3c9e7fd | ||
|
|
df36b9aa62 | ||
|
|
18a43eeab3 | ||
|
|
39039d1be7 | ||
|
|
9ee75ceee6 | ||
|
|
f5210a367d | ||
|
|
f36520eb94 | ||
|
|
afa35eea87 | ||
|
|
8eb853b731 | ||
|
|
a95015d967 | ||
|
|
3836ee8539 | ||
|
|
a6bd4a3be6 | ||
|
|
24d96e4372 | ||
|
|
29ea89b61d | ||
|
|
322e742e4c |
116
Cargo.lock
generated
116
Cargo.lock
generated
@@ -3898,16 +3898,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.4.1"
|
||||
@@ -4192,12 +4182,6 @@ version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "p256"
|
||||
version = "0.11.1"
|
||||
@@ -4302,7 +4286,6 @@ dependencies = [
|
||||
"enumset",
|
||||
"fail",
|
||||
"futures",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"http-utils",
|
||||
@@ -5255,7 +5238,6 @@ dependencies = [
|
||||
"tracing-log",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
"tracing-test",
|
||||
"tracing-utils",
|
||||
"try-lock",
|
||||
"typed-json",
|
||||
@@ -6082,8 +6064,10 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"sk_ps_discovery",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"storage_controller_client",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
@@ -6095,6 +6079,7 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
@@ -6590,6 +6575,76 @@ version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||
|
||||
[[package]]
|
||||
name = "sk_ps_discovery"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
"camino-tempfile",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"desim",
|
||||
"env_logger",
|
||||
"fail",
|
||||
"futures",
|
||||
"hex",
|
||||
"http 1.1.0",
|
||||
"http-utils",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"jsonwebtoken",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"parking_lot 0.12.1",
|
||||
"pem",
|
||||
"postgres-protocol",
|
||||
"postgres_backend",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"pq_proto",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rustls 0.23.18",
|
||||
"safekeeper_api",
|
||||
"safekeeper_client",
|
||||
"scopeguard",
|
||||
"sd-notify",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smallvec",
|
||||
"storage_broker",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
"tikv-jemallocator",
|
||||
"tokio",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"utils",
|
||||
"wal_decoder",
|
||||
"walproposer",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.8"
|
||||
@@ -6696,6 +6751,7 @@ dependencies = [
|
||||
"rustls 0.23.18",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
"tracing",
|
||||
@@ -6708,6 +6764,7 @@ name = "storage_controller"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"bytes",
|
||||
"camino",
|
||||
"chrono",
|
||||
@@ -7706,7 +7763,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -7720,27 +7776,6 @@ dependencies = [
|
||||
"tracing-serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-test"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
|
||||
dependencies = [
|
||||
"tracing-core",
|
||||
"tracing-subscriber",
|
||||
"tracing-test-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-test-macro"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.100",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-utils"
|
||||
version = "0.1.0"
|
||||
@@ -8593,7 +8628,6 @@ dependencies = [
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
"uuid",
|
||||
"zeroize",
|
||||
|
||||
@@ -43,7 +43,7 @@ members = [
|
||||
"libs/proxy/postgres-protocol2",
|
||||
"libs/proxy/postgres-types2",
|
||||
"libs/proxy/tokio-postgres2",
|
||||
"endpoint_storage",
|
||||
"endpoint_storage", "libs/sk_ps_discovery",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -262,6 +262,7 @@ pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
safekeeper_client = { path = "./safekeeper/client" }
|
||||
sk_ps_discovery = { path = "./libs/sk_ps_discovery" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_controller_client = { path = "./storage_controller/client" }
|
||||
|
||||
@@ -582,38 +582,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "online_advisor-build"
|
||||
# compile online_advisor extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS online_advisor-src
|
||||
ARG PG_VERSION
|
||||
|
||||
# online_advisor supports all Postgres version starting from PG14, but prior to PG17 has to be included in preload_shared_libraries
|
||||
# last release 1.0 - May 15, 2025
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION:?}" in \
|
||||
"v17") \
|
||||
;; \
|
||||
*) \
|
||||
echo "skipping the version of online_advistor for $PG_VERSION" && exit 0 \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
|
||||
echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
|
||||
mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM pg-build AS online_advisor-build
|
||||
COPY --from=online_advisor-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src/
|
||||
RUN if [ -d online_advisor-src ]; then \
|
||||
cd online_advisor-src && \
|
||||
make -j install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
|
||||
fi
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_hashids-build"
|
||||
@@ -1680,7 +1648,6 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1856,7 +1823,6 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
|
||||
COPY --from=pg_graphql-src /ext-src/ /ext-src/
|
||||
#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
|
||||
COPY --from=hypopg-src /ext-src/ /ext-src/
|
||||
COPY --from=online_advisor-src /ext-src/ /ext-src/
|
||||
COPY --from=pg_hashids-src /ext-src/ /ext-src/
|
||||
COPY --from=rum-src /ext-src/ /ext-src/
|
||||
COPY --from=pgtap-src /ext-src/ /ext-src/
|
||||
|
||||
@@ -213,10 +213,8 @@ impl Escaping for PgIdent {
|
||||
|
||||
// Find the first suitable tag that is not present in the string.
|
||||
// Postgres' max role/DB name length is 63 bytes, so even in the
|
||||
// worst case it won't take long. Outer tag is always `tag + "x"`,
|
||||
// so if `tag` is not present in the string, `outer_tag` is not
|
||||
// present in the string either.
|
||||
while self.contains(&tag.to_string()) {
|
||||
// worst case it won't take long.
|
||||
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
|
||||
tag += "x";
|
||||
outer_tag = tag.clone() + "x";
|
||||
}
|
||||
|
||||
@@ -27,40 +27,6 @@ fn get_rsyslog_pid() -> Option<String> {
|
||||
}
|
||||
}
|
||||
|
||||
fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
|
||||
const MAX_WAIT: Duration = Duration::from_secs(5);
|
||||
const INITIAL_SLEEP: Duration = Duration::from_millis(2);
|
||||
|
||||
let mut sleep_duration = INITIAL_SLEEP;
|
||||
let start = std::time::Instant::now();
|
||||
let mut attempts = 1;
|
||||
|
||||
for attempt in 1.. {
|
||||
attempts = attempt;
|
||||
match get_rsyslog_pid() {
|
||||
Some(pid) => return Ok(pid),
|
||||
None => {
|
||||
if start.elapsed() >= MAX_WAIT {
|
||||
break;
|
||||
}
|
||||
info!(
|
||||
"rsyslogd is not running, attempt {}. Sleeping for {} ms",
|
||||
attempt,
|
||||
sleep_duration.as_millis()
|
||||
);
|
||||
std::thread::sleep(sleep_duration);
|
||||
sleep_duration *= 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(anyhow::anyhow!(
|
||||
"rsyslogd is not running after waiting for {} seconds and {} attempts",
|
||||
attempts,
|
||||
start.elapsed().as_secs()
|
||||
))
|
||||
}
|
||||
|
||||
// Restart rsyslogd to apply the new configuration.
|
||||
// This is necessary, because there is no other way to reload the rsyslog configuration.
|
||||
//
|
||||
@@ -70,14 +36,14 @@ fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
|
||||
// TODO: test it properly
|
||||
//
|
||||
fn restart_rsyslog() -> Result<()> {
|
||||
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
|
||||
info!("rsyslogd is running with pid: {}, restart it", old_pid);
|
||||
|
||||
// kill it to restart
|
||||
let _ = Command::new("pkill")
|
||||
.arg("rsyslogd")
|
||||
.output()
|
||||
.context("Failed to restart rsyslogd")?;
|
||||
|
||||
// ensure rsyslogd is running
|
||||
wait_for_rsyslog_pid()?;
|
||||
.context("Failed to stop rsyslogd")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -165,11 +131,15 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Nothing to configure
|
||||
// When new config is empty we can simply remove the configuration file.
|
||||
if new_config.is_empty() {
|
||||
// When the configuration is removed, PostgreSQL will stop sending data
|
||||
// to the files watched by rsyslog, so restarting rsyslog is more effort
|
||||
// than just ignoring this change.
|
||||
info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
|
||||
match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
|
||||
Ok(_) => {}
|
||||
Err(err) if err.kind() == ErrorKind::NotFound => {}
|
||||
Err(err) => return Err(err.into()),
|
||||
}
|
||||
restart_rsyslog()?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
|
||||
@@ -71,14 +71,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
("name$$$", ("$x$name$$$$x$", "xx")),
|
||||
("name$$$$", ("$x$name$$$$$x$", "xx")),
|
||||
("name$x$", ("$xx$name$x$$xx$", "xxx")),
|
||||
("x", ("$xx$x$xx$", "xxx")),
|
||||
("xx", ("$xxx$xx$xxx$", "xxxx")),
|
||||
("$x", ("$xx$$x$xx$", "xxx")),
|
||||
("x$", ("$xx$x$$xx$", "xxx")),
|
||||
("$x$", ("$xx$$x$$xx$", "xxx")),
|
||||
("xx$", ("$xxx$xx$$xxx$", "xxxx")),
|
||||
("$xx", ("$xxx$$xx$xxx$", "xxxx")),
|
||||
("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
|
||||
];
|
||||
|
||||
for (input, expected) in test_cases {
|
||||
|
||||
@@ -546,16 +546,6 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Falied to parse 'sampling_ratio'")?,
|
||||
relsize_snapshot_cache_capacity: settings
|
||||
.remove("relsize snapshot cache capacity")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?,
|
||||
basebackup_cache_enabled: settings
|
||||
.remove("basebackup_cache_enabled")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'basebackup_cache_enabled' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
|
||||
@@ -462,8 +462,6 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
|
||||
if var(REAL_S3_ENV).is_ok() {
|
||||
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
assert!(body.contains("process_threads"));
|
||||
}
|
||||
|
||||
|
||||
@@ -183,8 +183,6 @@ pub struct ConfigToml {
|
||||
pub enable_tls_page_service_api: bool,
|
||||
pub dev_mode: bool,
|
||||
pub timeline_import_config: TimelineImportConfig,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub basebackup_cache_config: Option<BasebackupCacheConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -237,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
|
||||
ScatteredLsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(tag = "mode", rename_all = "kebab-case")]
|
||||
pub enum GetVectoredConcurrentIo {
|
||||
/// The read path is fully sequential: layers are visited
|
||||
@@ -310,26 +308,6 @@ pub struct TimelineImportConfig {
|
||||
pub import_job_checkpoint_threshold: NonZeroUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct BasebackupCacheConfig {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub cleanup_period: Duration,
|
||||
// FIXME: Support max_size_bytes.
|
||||
// pub max_size_bytes: usize,
|
||||
pub max_size_entries: i64,
|
||||
}
|
||||
|
||||
impl Default for BasebackupCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
cleanup_period: Duration::from_secs(60),
|
||||
// max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
|
||||
max_size_entries: 1000,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod statvfs {
|
||||
pub mod mock {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -513,14 +491,6 @@ pub struct TenantConfigToml {
|
||||
/// Tenant level performance sampling ratio override. Controls the ratio of get page requests
|
||||
/// that will get perf sampling for the tenant.
|
||||
pub sampling_ratio: Option<Ratio>,
|
||||
|
||||
/// Capacity of relsize snapshot cache (used by replicas).
|
||||
pub relsize_snapshot_cache_capacity: usize,
|
||||
|
||||
/// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests.
|
||||
// FIXME: Remove skip_serializing_if when the feature is stable.
|
||||
#[serde(skip_serializing_if = "std::ops::Not::not")]
|
||||
pub basebackup_cache_enabled: bool,
|
||||
}
|
||||
|
||||
pub mod defaults {
|
||||
@@ -694,7 +664,6 @@ impl Default for ConfigToml {
|
||||
import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
|
||||
import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
|
||||
},
|
||||
basebackup_cache_config: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -761,7 +730,6 @@ pub mod tenant_conf_defaults {
|
||||
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
|
||||
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
|
||||
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
|
||||
pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
|
||||
}
|
||||
|
||||
impl Default for TenantConfigToml {
|
||||
@@ -819,8 +787,6 @@ impl Default for TenantConfigToml {
|
||||
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
|
||||
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
|
||||
sampling_ratio: None,
|
||||
relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
|
||||
basebackup_cache_enabled: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -630,10 +630,6 @@ pub struct TenantConfigPatch {
|
||||
pub gc_compaction_ratio_percent: FieldPatch<u64>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub sampling_ratio: FieldPatch<Option<Ratio>>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
|
||||
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
|
||||
pub basebackup_cache_enabled: FieldPatch<bool>,
|
||||
}
|
||||
|
||||
/// Like [`crate::config::TenantConfigToml`], but preserves the information
|
||||
@@ -763,12 +759,6 @@ pub struct TenantConfig {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub sampling_ratio: Option<Option<Ratio>>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub relsize_snapshot_cache_capacity: Option<usize>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub basebackup_cache_enabled: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfig {
|
||||
@@ -814,8 +804,6 @@ impl TenantConfig {
|
||||
mut gc_compaction_initial_threshold_kb,
|
||||
mut gc_compaction_ratio_percent,
|
||||
mut sampling_ratio,
|
||||
mut relsize_snapshot_cache_capacity,
|
||||
mut basebackup_cache_enabled,
|
||||
} = self;
|
||||
|
||||
patch.checkpoint_distance.apply(&mut checkpoint_distance);
|
||||
@@ -917,12 +905,6 @@ impl TenantConfig {
|
||||
.gc_compaction_ratio_percent
|
||||
.apply(&mut gc_compaction_ratio_percent);
|
||||
patch.sampling_ratio.apply(&mut sampling_ratio);
|
||||
patch
|
||||
.relsize_snapshot_cache_capacity
|
||||
.apply(&mut relsize_snapshot_cache_capacity);
|
||||
patch
|
||||
.basebackup_cache_enabled
|
||||
.apply(&mut basebackup_cache_enabled);
|
||||
|
||||
Ok(Self {
|
||||
checkpoint_distance,
|
||||
@@ -962,8 +944,6 @@ impl TenantConfig {
|
||||
gc_compaction_initial_threshold_kb,
|
||||
gc_compaction_ratio_percent,
|
||||
sampling_ratio,
|
||||
relsize_snapshot_cache_capacity,
|
||||
basebackup_cache_enabled,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1072,12 +1052,6 @@ impl TenantConfig {
|
||||
.gc_compaction_ratio_percent
|
||||
.unwrap_or(global_conf.gc_compaction_ratio_percent),
|
||||
sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
|
||||
relsize_snapshot_cache_capacity: self
|
||||
.relsize_snapshot_cache_capacity
|
||||
.unwrap_or(global_conf.relsize_snapshot_cache_capacity),
|
||||
basebackup_cache_enabled: self
|
||||
.basebackup_cache_enabled
|
||||
.unwrap_or(global_conf.basebackup_cache_enabled),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -86,27 +86,6 @@ pub struct DbError {
|
||||
}
|
||||
|
||||
impl DbError {
|
||||
pub fn new_test_error(code: SqlState, message: String) -> Self {
|
||||
DbError {
|
||||
severity: "ERROR".to_string(),
|
||||
parsed_severity: Some(Severity::Error),
|
||||
code,
|
||||
message,
|
||||
detail: None,
|
||||
hint: None,
|
||||
position: None,
|
||||
where_: None,
|
||||
schema: None,
|
||||
table: None,
|
||||
column: None,
|
||||
datatype: None,
|
||||
constraint: None,
|
||||
file: None,
|
||||
line: None,
|
||||
routine: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
|
||||
let mut severity = None;
|
||||
let mut parsed_severity = None;
|
||||
|
||||
@@ -6,9 +6,11 @@ use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::TimestampTz;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::time::Instant;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pageserver_feedback::PageserverFeedback;
|
||||
use utils::shard::ShardIndex;
|
||||
|
||||
use crate::membership::Configuration;
|
||||
use crate::{ServerInfo, Term};
|
||||
@@ -309,3 +311,29 @@ pub struct PullTimelineResponse {
|
||||
pub safekeeper_host: Option<String>,
|
||||
// TODO: add more fields?
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
#[serde(tag = "action")]
|
||||
pub enum TenantShardPageserverAttachmentChange {
|
||||
Attach(TenantShardPageserverAttachment),
|
||||
Detach(TenantShardPageserverAttachment),
|
||||
}
|
||||
|
||||
impl TenantShardPageserverAttachmentChange {
|
||||
pub fn attachment(&self) -> &TenantShardPageserverAttachment {
|
||||
match self {
|
||||
TenantShardPageserverAttachmentChange::Attach(a) => a,
|
||||
TenantShardPageserverAttachmentChange::Detach(a) => a,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct TenantShardPageserverAttachment {
|
||||
pub shard_id: ShardIndex,
|
||||
pub generation: Generation,
|
||||
pub ps_id: NodeId,
|
||||
// TODO: avoid transmitting this with every request.
|
||||
// How nice things could be if there were simple DNS records for ps-$node_id.$cell.$region.$cloud.neon.tech
|
||||
pub ps_hostname: String, // TODO: some type safety
|
||||
}
|
||||
|
||||
81
libs/sk_ps_discovery/Cargo.toml
Normal file
81
libs/sk_ps_discovery/Cargo.toml
Normal file
@@ -0,0 +1,81 @@
|
||||
[package]
|
||||
name = "sk_ps_discovery"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
workspace_hack.workspace = true
|
||||
|
||||
async-stream.workspace = true
|
||||
anyhow.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
chrono.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
crc32c.workspace = true
|
||||
fail.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
http.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
futures.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
pprof.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["json"] }
|
||||
rustls.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
smallvec.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util = { workspace = true }
|
||||
tonic = { workspace = true }
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
metrics.workspace = true
|
||||
pem.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
safekeeper_client.workspace = true
|
||||
sha2.workspace = true
|
||||
sd-notify.workspace = true
|
||||
storage_broker.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
wal_decoder.workspace = true
|
||||
env_logger.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
itertools.workspace = true
|
||||
walproposer.workspace = true
|
||||
rand.workspace = true
|
||||
desim.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
|
||||
[[bench]]
|
||||
name = "bench"
|
||||
harness = false
|
||||
97
libs/sk_ps_discovery/benches/bench.rs
Normal file
97
libs/sk_ps_discovery/benches/bench.rs
Normal file
@@ -0,0 +1,97 @@
|
||||
//! WAL ingestion benchmarks.
|
||||
|
||||
use std::time::Instant;
|
||||
|
||||
use criterion::{Criterion, criterion_group, criterion_main};
|
||||
use pprof::criterion::{Output, PProfProfiler};
|
||||
use sk_ps_discovery::{
|
||||
AttachmentUpdate, RemoteConsistentLsnAdv, TenantShardAttachmentId, TimelineAttachmentId,
|
||||
};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::ShardIndex,
|
||||
};
|
||||
|
||||
/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
#[unsafe(export_name = "malloc_conf")]
|
||||
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
|
||||
|
||||
// Register benchmarks with Criterion.
|
||||
criterion_group!(
|
||||
name = benches;
|
||||
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
|
||||
targets = bench_simple,
|
||||
);
|
||||
criterion_main!(benches);
|
||||
|
||||
fn bench_simple(c: &mut Criterion) {
|
||||
let mut g = c.benchmark_group("simple");
|
||||
|
||||
// setup
|
||||
let mut world = sk_ps_discovery::World::default();
|
||||
|
||||
// Simplified view: lots of unsharded tenants with one timeline each
|
||||
let n_pageservers = 20;
|
||||
let n_tenant_shards_per_pageserver = 2000;
|
||||
for ps_id in 1..=n_pageservers {
|
||||
for _ in ..n_tenant_shards_per_pageserver {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
for generation in 10..=11 {
|
||||
let tenant_shard_attachment_id = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(generation),
|
||||
};
|
||||
let timeline_attachment = TimelineAttachmentId {
|
||||
tenant_timeline_id: TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(generation),
|
||||
};
|
||||
world.update_attachment(AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action: sk_ps_discovery::AttachmentUpdateAction::Attach {
|
||||
ps_id: NodeId(ps_id),
|
||||
},
|
||||
});
|
||||
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
|
||||
remote_consistent_lsn: Lsn(23),
|
||||
attachment: timeline_attachment,
|
||||
});
|
||||
}
|
||||
world.handle_commit_lsn_advancement(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
Lsn(42),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// setup done
|
||||
let world = world;
|
||||
g.bench_function("get_commit_lsn_advertisements", |bencher| {
|
||||
bencher.iter_custom(|iters| {
|
||||
let started = Instant::now();
|
||||
|
||||
for _ in 0..iters {
|
||||
criterion::black_box(world.get_commit_lsn_advertisements());
|
||||
}
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
elapsed
|
||||
});
|
||||
});
|
||||
|
||||
g.finish();
|
||||
}
|
||||
515
libs/sk_ps_discovery/src/lib.rs
Normal file
515
libs/sk_ps_discovery/src/lib.rs
Normal file
@@ -0,0 +1,515 @@
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet, HashMap, HashSet, btree_map, hash_map},
|
||||
ops::RangeInclusive,
|
||||
};
|
||||
|
||||
use tracing::{info, warn};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
merge_join,
|
||||
shard::ShardIndex,
|
||||
};
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct World {
|
||||
attachments: BTreeMap<TenantShardAttachmentId, NodeId>,
|
||||
attachment_count: HashMap<TenantId, u16>,
|
||||
nodes_timelines: HashMap<NodeId, HashMap<TenantTimelineId, u16>>, // u16 is a refcount from each timeline attachment id
|
||||
// continously maintained aggregate for efficient decisionmaking on quiescing;
|
||||
// quiesced timelines are always caught up
|
||||
// can quiesce one == attachment_count (TODO: this requires enforcing foreign key relationship between attachments and remote_consistent_lsn)
|
||||
caught_up_count: HashMap<TenantTimelineId, u16>,
|
||||
|
||||
// BEGIN quiescing/active split
|
||||
quiesced_timelines: BTreeMap<TenantTimelineId, Lsn>,
|
||||
// ^
|
||||
// either a timeline is in quiesced_timelines
|
||||
// or it is below
|
||||
// v
|
||||
commit_lsns: BTreeMap<TenantTimelineId, Lsn>,
|
||||
remote_consistent_lsns: BTreeMap<TimelineAttachmentId, Lsn>,
|
||||
// END quiescing/active split
|
||||
|
||||
// other fields
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
|
||||
pub struct TenantShardAttachmentId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_id: ShardIndex,
|
||||
pub generation: Generation,
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
|
||||
pub struct TimelineAttachmentId {
|
||||
pub tenant_timeline_id: TenantTimelineId,
|
||||
pub shard_id: ShardIndex,
|
||||
pub generation: Generation,
|
||||
}
|
||||
|
||||
pub struct AttachmentUpdate {
|
||||
pub tenant_shard_attachment_id: TenantShardAttachmentId,
|
||||
pub action: AttachmentUpdateAction,
|
||||
}
|
||||
|
||||
pub enum AttachmentUpdateAction {
|
||||
Attach { ps_id: NodeId },
|
||||
Detach,
|
||||
}
|
||||
|
||||
pub struct RemoteConsistentLsnAdv {
|
||||
pub attachment: TimelineAttachmentId,
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl World {
|
||||
fn check_invariants(&self) {
|
||||
if !cfg!(debug_assertions) {
|
||||
return;
|
||||
}
|
||||
|
||||
// caught_up_count maintenance
|
||||
{
|
||||
for (tenant_timeline_id, caught_up_count) in
|
||||
self.caught_up_count.iter().map(|(k, v)| (*k, *v))
|
||||
{
|
||||
let attachment_count = *self
|
||||
.attachment_count
|
||||
.get(&tenant_timeline_id.tenant_id)
|
||||
.unwrap();
|
||||
assert!(caught_up_count <= attachment_count);
|
||||
if caught_up_count == attachment_count {
|
||||
self.quiesced_timelines.contains_key(&tenant_timeline_id);
|
||||
// remote_consistent_lsn and commit_lsns is empty, checked by "quiescing XOR ..." below
|
||||
} else {
|
||||
let commit_lsn = self.commit_lsns[&&tenant_timeline_id];
|
||||
let mut validate_caught_up = 0;
|
||||
let mut validate_not_caught_up = 0;
|
||||
for (_, r_c_lsn) in self
|
||||
.remote_consistent_lsns
|
||||
.range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
|
||||
.map(|(k, v)| (*k, *v))
|
||||
{
|
||||
if r_c_lsn == commit_lsn {
|
||||
validate_caught_up += 1;
|
||||
} else {
|
||||
assert!(r_c_lsn < commit_lsn);
|
||||
validate_not_caught_up += 1;
|
||||
}
|
||||
}
|
||||
assert_eq!(validate_caught_up, caught_up_count);
|
||||
assert_eq!(
|
||||
validate_caught_up + validate_not_caught_up,
|
||||
attachment_count
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// quiescing XOR ...
|
||||
{
|
||||
let quiesced_timelines: HashSet<TenantTimelineId> =
|
||||
self.quiesced_timelines.keys().cloned().collect();
|
||||
let commit_lsn_timelines: HashSet<TenantTimelineId> =
|
||||
self.commit_lsns.keys().cloned().collect();
|
||||
let remote_consistent_lsn_timelines: HashSet<TenantTimelineId> = self
|
||||
.remote_consistent_lsns
|
||||
.keys()
|
||||
.map(|tlaid: &TimelineAttachmentId| tlaid.tenant_timeline_id)
|
||||
.collect();
|
||||
#[rustfmt::skip]
|
||||
assert_eq!(0, quiesced_timelines.intersection(&commit_lsn_timelines).count());
|
||||
#[rustfmt::skip]
|
||||
assert_eq!(0, quiesced_timelines.intersection(&remote_consistent_lsn_timelines).count());
|
||||
}
|
||||
|
||||
// nodes_timelines maintenance
|
||||
{
|
||||
let mut expect: HashMap<NodeId, HashMap<TenantTimelineId, u16>> = HashMap::new();
|
||||
let all_ttids: BTreeSet<TenantTimelineId> = self
|
||||
.quiesced_timelines
|
||||
.keys()
|
||||
.cloned()
|
||||
.chain(
|
||||
self.remote_consistent_lsns
|
||||
.keys()
|
||||
.cloned()
|
||||
.map(|tlaid| tlaid.tenant_timeline_id),
|
||||
)
|
||||
.collect();
|
||||
for ttid in all_ttids {
|
||||
for (_, node_id) in self
|
||||
.attachments
|
||||
.range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
|
||||
.map(|(k, v)| (*k, *v))
|
||||
{
|
||||
let expect = expect.entry(node_id).or_default();
|
||||
let refcount = expect.entry(ttid).or_default();
|
||||
*refcount += 1;
|
||||
}
|
||||
}
|
||||
assert_eq!(expect, self.nodes_timelines);
|
||||
}
|
||||
}
|
||||
pub fn update_attachment(&mut self, upd: AttachmentUpdate) {
|
||||
self.check_invariants();
|
||||
use AttachmentUpdateAction::*;
|
||||
use btree_map::Entry::*;
|
||||
let AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action,
|
||||
} = upd;
|
||||
match (action, self.attachments.entry(tenant_shard_attachment_id)) {
|
||||
(Attach { ps_id }, Occupied(e)) if *e.get() == ps_id => {
|
||||
info!("attachment is already known")
|
||||
}
|
||||
(Attach { ps_id }, Occupied(e)) => {
|
||||
warn!(current_node=%e.get(), proposed_node=%ps_id, "ignoring update that moves attachment to a different pageserver");
|
||||
}
|
||||
(Attach { ps_id }, Vacant(e)) => {
|
||||
e.insert(ps_id);
|
||||
// Keep attachmount_count up to date
|
||||
let attachment_count = self
|
||||
.attachment_count
|
||||
.entry(tenant_shard_attachment_id.tenant_id)
|
||||
.or_default();
|
||||
*attachment_count += attachment_count.checked_add(1).unwrap();
|
||||
// Keep nodes_timelines up to date
|
||||
let nodes_timelines = self.nodes_timelines.entry(ps_id).or_default();
|
||||
for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
|
||||
tenant_shard_attachment_id.tenant_id,
|
||||
)) {
|
||||
let refcount = nodes_timelines.entry(*ttid).or_default();
|
||||
*refcount = refcount.checked_add(1).unwrap();
|
||||
}
|
||||
if nodes_timelines.is_empty() {
|
||||
self.nodes_timelines.remove(&ps_id);
|
||||
}
|
||||
// New shards may start at an older LSN than where we quiesced => activate all quiesced timelines.
|
||||
let activate_range =
|
||||
TenantTimelineId::tenant_range(tenant_shard_attachment_id.tenant_id);
|
||||
let activate: HashSet<TenantTimelineId> = self
|
||||
.quiesced_timelines
|
||||
.range(activate_range)
|
||||
.map(|(ttid, _quiesced_lsn)| *ttid)
|
||||
.collect();
|
||||
for tenant_timeline_id in activate {
|
||||
self.activate_timeline(tenant_timeline_id);
|
||||
}
|
||||
}
|
||||
(Detach, Occupied(e)) => {
|
||||
let ps_id = e.remove();
|
||||
// Keep attachment count up to date
|
||||
let attachment_count = self
|
||||
.attachment_count
|
||||
.get_mut(&tenant_shard_attachment_id.tenant_id)
|
||||
.expect("attachment action initializes the hasmap entry");
|
||||
*attachment_count = attachment_count.checked_sub(1).unwrap();
|
||||
// Keep nodes_timelines up to date
|
||||
let nodes_timelines = self
|
||||
.nodes_timelines
|
||||
.get_mut(&ps_id)
|
||||
.expect("attachment action initializes hashmap entry");
|
||||
for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
|
||||
tenant_shard_attachment_id.tenant_id,
|
||||
)) {
|
||||
let refcount = nodes_timelines.entry(*ttid).or_default();
|
||||
*refcount = refcount.checked_sub(1).unwrap();
|
||||
}
|
||||
}
|
||||
(Detach, Vacant(_)) => {
|
||||
info!("detachment is already known");
|
||||
}
|
||||
}
|
||||
self.check_invariants();
|
||||
}
|
||||
pub fn handle_remote_consistent_lsn_advertisement(&mut self, adv: RemoteConsistentLsnAdv) {
|
||||
self.check_invariants();
|
||||
let RemoteConsistentLsnAdv {
|
||||
attachment,
|
||||
remote_consistent_lsn,
|
||||
} = adv;
|
||||
|
||||
match self.remote_consistent_lsns.entry(attachment) {
|
||||
btree_map::Entry::Occupied(mut occupied_entry) => {
|
||||
let current = occupied_entry.get_mut();
|
||||
use std::cmp::Ordering::*;
|
||||
match (*current).cmp(&remote_consistent_lsn) {
|
||||
Less => {
|
||||
*current = remote_consistent_lsn;
|
||||
let caught_up_count = self
|
||||
.caught_up_count
|
||||
.get_mut(&attachment.tenant_timeline_id)
|
||||
.unwrap();
|
||||
*caught_up_count = caught_up_count.checked_add(1).unwrap();
|
||||
if *caught_up_count
|
||||
== self.attachment_count[&attachment.tenant_timeline_id.tenant_id]
|
||||
{
|
||||
self.quiesce_timeline(attachment.tenant_timeline_id);
|
||||
}
|
||||
}
|
||||
Equal => {
|
||||
info!("ignoring no-op update, likely duplicate delivery");
|
||||
}
|
||||
Greater => {
|
||||
warn!(
|
||||
"ignoring advertisement because remote_consistent_lsn is moving backwards"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
btree_map::Entry::Vacant(_) => {
|
||||
let ttid = attachment.tenant_timeline_id;
|
||||
match self.quiesced_timelines.get(&ttid).cloned() {
|
||||
Some(quiesced_lsn) if quiesced_lsn == remote_consistent_lsn => {
|
||||
info!("ignoring no-op update for quiesced timeline");
|
||||
}
|
||||
Some(_) => {
|
||||
self.activate_timeline(ttid);
|
||||
// recurse one level, guarnateed to hit `Occupied` case above
|
||||
self.handle_remote_consistent_lsn_advertisement(adv);
|
||||
}
|
||||
None => {
|
||||
info!("ignoring advertisement because timeline is not known");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.check_invariants();
|
||||
}
|
||||
pub fn handle_commit_lsn_advancement(&mut self, ttid: TenantTimelineId, update: Lsn) {
|
||||
self.check_invariants();
|
||||
match self.commit_lsns.entry(ttid) {
|
||||
btree_map::Entry::Occupied(mut entry) => {
|
||||
let current = entry.get_mut();
|
||||
use std::cmp::Ordering::*;
|
||||
match (*current).cmp(&update) {
|
||||
Less => {
|
||||
*current = update;
|
||||
// We never allow remote_consistent_lsn to be ahead of commit_lsn.
|
||||
// Therefore, it is safe to say nothing is caught up anymore.
|
||||
let caught_up_count = self.caught_up_count.get_mut(&ttid).unwrap();
|
||||
*caught_up_count = 0;
|
||||
}
|
||||
Equal => {
|
||||
// This code runs in safekeeper impl, no reason why there would be duplicate delivery.
|
||||
warn!("ignoring no-op update; why is this happening?");
|
||||
}
|
||||
Greater => {
|
||||
panic!(
|
||||
"proposed commit_lsn would move it backwards: current={} update={}",
|
||||
current, update
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
btree_map::Entry::Vacant(entry) => {
|
||||
match self.quiesced_timelines.get(&ttid).cloned() {
|
||||
Some(quiesced_lsn) if quiesced_lsn == update => {
|
||||
info!("ignoring no-op update for quiesced timeline");
|
||||
}
|
||||
Some(_) => {
|
||||
self.activate_timeline(ttid);
|
||||
// recurse one level, guarnateed to hit `Occupied` case above
|
||||
self.handle_commit_lsn_advancement(ttid, update);
|
||||
}
|
||||
None => {
|
||||
info!("first time hearing about this timeline, initializing");
|
||||
entry.insert(update);
|
||||
let replaced = self.caught_up_count.insert(ttid, 0);
|
||||
// only commit_lsn advancement makes timelines known to world
|
||||
assert_eq!(None, replaced);
|
||||
for (attachment, node_id) in self
|
||||
.attachments
|
||||
.range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
|
||||
{
|
||||
let replaced = self.remote_consistent_lsns.insert(
|
||||
attachment.timeline_attachment_id(ttid.timeline_id),
|
||||
Lsn(0),
|
||||
);
|
||||
// only commit_lsn advancement makes timelines known to World
|
||||
assert_eq!(None, replaced);
|
||||
|
||||
let nodes_timelines = self.nodes_timelines.entry(*node_id).or_default();
|
||||
let refcount = nodes_timelines.entry(ttid).or_default();
|
||||
*refcount = refcount.checked_add(1).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.check_invariants();
|
||||
}
|
||||
|
||||
pub fn get_commit_lsn_advertisements(&self) -> HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> {
|
||||
let mut commit_lsn_advertisements_by_node: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> =
|
||||
HashMap::with_capacity(self.nodes_timelines.len());
|
||||
let commit_lsns_iter = self.commit_lsns.iter().map(|(k, v)| (*k, *v));
|
||||
let attachments_iter = self.attachments.iter().map(|(k, v)| (*k, *v));
|
||||
let remote_consistent_lsns_iter = self.remote_consistent_lsns.iter().map(|(k, v)| (*k, *v));
|
||||
|
||||
let join = merge_join::inner_equi_join_with_merge_strategy(
|
||||
commit_lsns_iter,
|
||||
attachments_iter,
|
||||
|(tenant_timeline_id, _)| tenant_timeline_id.tenant_id,
|
||||
|(shard_attachment_id, _)| shard_attachment_id.tenant_id,
|
||||
);
|
||||
let join = merge_join::left_equi_join_with_merge_strategy(
|
||||
join,
|
||||
remote_consistent_lsns_iter,
|
||||
|((ttid, _), _)| ttid.tenant_id,
|
||||
|(tlaid, _)| tlaid.tenant_timeline_id.tenant_id,
|
||||
);
|
||||
for ((c, a), r) in join {
|
||||
let (tenant_timeline_id, commit_lsn): (TenantTimelineId, Lsn) = c;
|
||||
let (_, node_id): (TenantShardAttachmentId, NodeId) = a;
|
||||
match r {
|
||||
// TODO: can > ever happen?
|
||||
Some((_, remote_consistent_lsn)) if remote_consistent_lsn >= commit_lsn => {
|
||||
// this timeline shard attachment is already caught up
|
||||
continue;
|
||||
}
|
||||
Some(_) | None => {
|
||||
// need to advertise
|
||||
// -> fallthrough
|
||||
}
|
||||
};
|
||||
// DISTINCT node_id, array_agg(DISTINCT tenant_shard_id )
|
||||
let for_node = commit_lsn_advertisements_by_node
|
||||
.entry(node_id)
|
||||
.or_insert_with(|| HashMap::with_capacity(self.nodes_timelines[&node_id].len()));
|
||||
match for_node.entry(tenant_timeline_id) {
|
||||
hash_map::Entry::Vacant(vacant_entry) => {
|
||||
vacant_entry.insert(commit_lsn);
|
||||
}
|
||||
hash_map::Entry::Occupied(occupied_entry) => {
|
||||
assert_eq!(*occupied_entry.get(), commit_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
commit_lsn_advertisements_by_node
|
||||
}
|
||||
|
||||
fn activate_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
|
||||
let quiesced_lsn = self
|
||||
.quiesced_timelines
|
||||
.remove(&tenant_timeline_id)
|
||||
.expect("must call this function only on quiesced tenant_timeline_id");
|
||||
let replaced = self.commit_lsns.insert(tenant_timeline_id, quiesced_lsn);
|
||||
assert_eq!(None, replaced);
|
||||
let reconstruct_remote_consistent_lsn_entries = self
|
||||
.attachments
|
||||
.range(TenantShardAttachmentId::tenant_range(
|
||||
tenant_timeline_id.tenant_id,
|
||||
))
|
||||
.map(|(k, _)| *k)
|
||||
.map(|tenant_shard_attachment_id| {
|
||||
(
|
||||
tenant_shard_attachment_id
|
||||
.timeline_attachment_id(tenant_timeline_id.timeline_id),
|
||||
quiesced_lsn,
|
||||
)
|
||||
});
|
||||
for (key, value) in reconstruct_remote_consistent_lsn_entries {
|
||||
let replaced = self.remote_consistent_lsns.insert(key, value);
|
||||
assert_eq!(None, replaced);
|
||||
}
|
||||
}
|
||||
|
||||
fn quiesce_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
|
||||
self.check_invariants();
|
||||
if self.quiesced_timelines.contains_key(&tenant_timeline_id) {
|
||||
panic!("only call this function on active timelines");
|
||||
}
|
||||
let quiesced_lsn = self
|
||||
.commit_lsns
|
||||
.remove(&tenant_timeline_id)
|
||||
.expect("inconsistent: we checked it's not in quiesced_timelines, so, must be active");
|
||||
let caught_up_count = self
|
||||
.caught_up_count
|
||||
.remove(&tenant_timeline_id)
|
||||
.expect("inconsistent: we checked it's not in quiesced_timleines, so, must be active");
|
||||
let mut remove_remote_consistent_lsns = Vec::new();
|
||||
for (k, remote_consistent_lsn) in self
|
||||
.remote_consistent_lsns
|
||||
.range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
|
||||
{
|
||||
assert_eq!(*remote_consistent_lsn, quiesced_lsn);
|
||||
remove_remote_consistent_lsns.push(*k);
|
||||
}
|
||||
assert_eq!(
|
||||
caught_up_count,
|
||||
u16::try_from(remove_remote_consistent_lsns.len()).unwrap()
|
||||
);
|
||||
for k in remove_remote_consistent_lsns {
|
||||
let removed = self.remote_consistent_lsns.remove(&k);
|
||||
assert!(removed.is_some(), "we just added");
|
||||
}
|
||||
let replaced = self
|
||||
.quiesced_timelines
|
||||
.insert(tenant_timeline_id, quiesced_lsn);
|
||||
assert_eq!(None, replaced); // we checked at function entry
|
||||
self.check_invariants();
|
||||
}
|
||||
}
|
||||
|
||||
impl TimelineAttachmentId {
|
||||
pub fn timeline_range(ttid: TenantTimelineId) -> RangeInclusive<Self> {
|
||||
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
|
||||
let generation_range: RangeInclusive<_> = Generation::RANGE;
|
||||
RangeInclusive::new(
|
||||
TimelineAttachmentId {
|
||||
tenant_timeline_id: ttid,
|
||||
shard_id: *shard_index_range.start(),
|
||||
generation: *generation_range.start(),
|
||||
},
|
||||
TimelineAttachmentId {
|
||||
tenant_timeline_id: ttid,
|
||||
shard_id: *shard_index_range.end(),
|
||||
generation: *generation_range.end(),
|
||||
},
|
||||
)
|
||||
}
|
||||
pub fn tenant_shard_attachment_id(self) -> TenantShardAttachmentId {
|
||||
TenantShardAttachmentId {
|
||||
tenant_id: self.tenant_timeline_id.tenant_id,
|
||||
shard_id: self.shard_id,
|
||||
generation: self.generation,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantShardAttachmentId {
|
||||
pub fn timeline_attachment_id(self, timeline_id: TimelineId) -> TimelineAttachmentId {
|
||||
TimelineAttachmentId {
|
||||
tenant_timeline_id: TenantTimelineId {
|
||||
tenant_id: self.tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
shard_id: self.shard_id,
|
||||
generation: self.generation,
|
||||
}
|
||||
}
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
|
||||
let generation_range: RangeInclusive<_> = Generation::RANGE;
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_id: *shard_index_range.start(),
|
||||
generation: *generation_range.start(),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_id: *shard_index_range.end(),
|
||||
generation: *generation_range.end(),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
224
libs/sk_ps_discovery/src/tests.rs
Normal file
224
libs/sk_ps_discovery/src/tests.rs
Normal file
@@ -0,0 +1,224 @@
|
||||
use utils::{id::TenantId, logging};
|
||||
|
||||
use super::*;
|
||||
use crate::World;
|
||||
|
||||
#[track_caller]
|
||||
fn validate_advertisements(
|
||||
actual: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>>,
|
||||
expect: Vec<(NodeId, Vec<(TenantTimelineId, Lsn)>)>,
|
||||
) {
|
||||
let expect: HashMap<_, _> = expect
|
||||
.into_iter()
|
||||
.map(|(node_id, innermap)| (node_id, innermap.into_iter().collect()))
|
||||
.collect();
|
||||
assert_eq!(actual, expect);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() {
|
||||
let mut world = World::default();
|
||||
|
||||
let tenant_id = TenantId::from_array([0xff; 16]);
|
||||
let timeline_id = TimelineId::from_array([1; 16]);
|
||||
let timeline2 = TimelineId::from_array([2; 16]);
|
||||
|
||||
let attachment1 = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(2),
|
||||
};
|
||||
let attachment2 = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(3),
|
||||
};
|
||||
|
||||
let ps1 = NodeId(0x100);
|
||||
|
||||
// Out of order; in happy path, commit_lsn advances first, but let's test the
|
||||
// case where safekeeper doesn't know about the attachments yet first, before
|
||||
// we extend the case to the happy path.
|
||||
|
||||
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
|
||||
attachment: attachment1.timeline_attachment_id(timeline_id),
|
||||
remote_consistent_lsn: Lsn(0x23),
|
||||
});
|
||||
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
|
||||
attachment: attachment2.timeline_attachment_id(timeline_id),
|
||||
remote_consistent_lsn: Lsn(0x42),
|
||||
});
|
||||
// SK authoritative info on which advertisements ought exist is still empty
|
||||
assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());
|
||||
world.update_attachment(AttachmentUpdate {
|
||||
tenant_shard_attachment_id: attachment1,
|
||||
action: AttachmentUpdateAction::Attach { ps_id: ps1 },
|
||||
});
|
||||
// We have not inserted any commit_lsn info yet, so, still no advs expected
|
||||
assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());
|
||||
// insert commit_lsn info for different timeline
|
||||
world.handle_commit_lsn_advancement(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id: timeline2,
|
||||
},
|
||||
Lsn(0x66),
|
||||
);
|
||||
// Advs should still be empty
|
||||
validate_advertisements(
|
||||
world.get_commit_lsn_advertisements(),
|
||||
vec![(
|
||||
ps1,
|
||||
vec![(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id: timeline2,
|
||||
},
|
||||
Lsn(0x66),
|
||||
)],
|
||||
)],
|
||||
);
|
||||
|
||||
// Ok, out of order part tested. Now Safekeeper learns about the attachments.
|
||||
|
||||
// insert commit_lsn info for the timeline we have remote_consistent_lsn info for
|
||||
world.handle_commit_lsn_advancement(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
Lsn(0x55),
|
||||
);
|
||||
dbg!(&world);
|
||||
// Now advertisements to attachment1 will be sent out, but attachment2 is still not known, so, no advertisements to it.
|
||||
validate_advertisements(
|
||||
world.get_commit_lsn_advertisements(),
|
||||
vec![(
|
||||
ps1,
|
||||
vec![(
|
||||
TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
Lsn(0x55),
|
||||
)],
|
||||
)],
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn advertisement_for_new_timeline() {
|
||||
let mut world = World::default();
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let ttid = TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let tenant_shard_attachment_id = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(2),
|
||||
};
|
||||
|
||||
let ps_id = NodeId(0x100);
|
||||
|
||||
world.update_attachment(AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action: AttachmentUpdateAction::Attach { ps_id },
|
||||
});
|
||||
world.handle_commit_lsn_advancement(ttid, Lsn(23));
|
||||
|
||||
let advs = world.get_commit_lsn_advertisements();
|
||||
validate_advertisements(advs, vec![(ps_id, vec![(ttid, Lsn(23))])]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn quiescing_timeline_catchup() {
|
||||
let _guard = logging::init(
|
||||
logging::LogFormat::Test,
|
||||
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
logging::Output::Stdout,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let mut world = World::default();
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let ttid = TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let tenant_shard_attachment_id = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(2),
|
||||
};
|
||||
|
||||
let ps_id = NodeId(0x100);
|
||||
|
||||
world.update_attachment(AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action: AttachmentUpdateAction::Attach { ps_id },
|
||||
});
|
||||
world.handle_commit_lsn_advancement(ttid, Lsn(23));
|
||||
|
||||
assert!(world.quiesced_timelines.is_empty());
|
||||
|
||||
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
|
||||
attachment: tenant_shard_attachment_id.timeline_attachment_id(timeline_id),
|
||||
remote_consistent_lsn: Lsn(23),
|
||||
});
|
||||
|
||||
assert!(world.quiesced_timelines.contains_key(&ttid));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn nodes_timelines() {
|
||||
let mut world = World::default();
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::from_array([0x1; 16]);
|
||||
let ttid = TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let tenant_shard_attachment_id = TenantShardAttachmentId {
|
||||
tenant_id,
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(2),
|
||||
};
|
||||
|
||||
let ps_id = NodeId(0x100);
|
||||
|
||||
world.update_attachment(AttachmentUpdate {
|
||||
tenant_shard_attachment_id,
|
||||
action: AttachmentUpdateAction::Attach { ps_id },
|
||||
});
|
||||
|
||||
assert!(world.nodes_timelines.get(&ps_id).is_none());
|
||||
|
||||
world.handle_commit_lsn_advancement(ttid, Lsn(0x23));
|
||||
|
||||
assert_eq!(world.nodes_timelines[&ps_id].len(), 1);
|
||||
|
||||
let timeline2 = TimelineId::from_array([0x2; 16]);
|
||||
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
|
||||
attachment: TimelineAttachmentId {
|
||||
tenant_timeline_id: TenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id: timeline2,
|
||||
},
|
||||
shard_id: ShardIndex::unsharded(),
|
||||
generation: Generation::Valid(2),
|
||||
},
|
||||
remote_consistent_lsn: Lsn(0x42),
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: need more tests, esp for the removal path
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::fmt::Debug;
|
||||
use std::{fmt::Debug, ops::RangeInclusive};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -25,7 +25,9 @@ pub enum Generation {
|
||||
/// scenarios where pageservers might otherwise issue conflicting writes to
|
||||
/// remote storage
|
||||
impl Generation {
|
||||
pub const MIN: Self = Self::None;
|
||||
pub const MAX: Self = Self::Valid(u32::MAX);
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
|
||||
/// Create a new Generation that represents a legacy key format with
|
||||
/// no generation suffix
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::fmt;
|
||||
use std::num::ParseIntError;
|
||||
use std::ops::RangeInclusive;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -320,6 +321,19 @@ impl TenantTimelineId {
|
||||
pub fn empty() -> Self {
|
||||
Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16]))
|
||||
}
|
||||
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
timeline_id: TimelineId::from_array([u8::MIN; 16]),
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
timeline_id: TimelineId::from_array([u8::MAX; 16]),
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantTimelineId {
|
||||
|
||||
@@ -95,6 +95,9 @@ pub mod guard_arc_swap;
|
||||
|
||||
pub mod elapsed_accum;
|
||||
|
||||
pub mod merge_join;
|
||||
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod linux_socket_ioctl;
|
||||
|
||||
|
||||
164
libs/utils/src/merge_join.rs
Normal file
164
libs/utils/src/merge_join.rs
Normal file
@@ -0,0 +1,164 @@
|
||||
pub fn inner_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
|
||||
l: L,
|
||||
r: R,
|
||||
key_l: FL,
|
||||
key_r: FR,
|
||||
) -> impl Iterator<Item = (LI, RI)>
|
||||
where
|
||||
L: Iterator<Item = LI>, // + Sorted
|
||||
R: Iterator<Item = RI>, // + Sorted
|
||||
FL: 'static + Fn(&LI) -> K,
|
||||
FR: 'static + Fn(&RI) -> K,
|
||||
LI: Copy,
|
||||
RI: Copy,
|
||||
K: PartialEq + Eq + Ord,
|
||||
{
|
||||
let mut l = l.map(move |i| (i, key_l(&i))).peekable();
|
||||
let mut r = r.map(move |i| (i, key_r(&i))).peekable();
|
||||
std::iter::from_fn(move || {
|
||||
loop {
|
||||
match (l.peek(), r.peek()) {
|
||||
(Some((_, lk)), Some((_, rk))) if lk < rk => {
|
||||
drop(l.next());
|
||||
continue;
|
||||
}
|
||||
(Some((_, lk)), Some((_, rk))) if lk > rk => {
|
||||
drop(r.next());
|
||||
continue;
|
||||
}
|
||||
(Some((lv, lk)), Some((_, rk))) => {
|
||||
assert!(lk == rk);
|
||||
let (rv, _) = r.next().unwrap();
|
||||
return Some((lv.clone(), rv));
|
||||
}
|
||||
(None, None) | (None, Some(_)) | (Some(_), None) => return None,
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn left_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
|
||||
l: L,
|
||||
r: R,
|
||||
key_l: FL,
|
||||
key_r: FR,
|
||||
) -> impl Iterator<Item = (LI, Option<RI>)>
|
||||
where
|
||||
L: Iterator<Item = LI>, // + Sorted
|
||||
R: Iterator<Item = RI>, // + Sorted
|
||||
FL: 'static + Fn(&LI) -> K,
|
||||
FR: 'static + Fn(&RI) -> K,
|
||||
LI: Copy,
|
||||
RI: Copy,
|
||||
K: PartialEq + Eq + Ord,
|
||||
{
|
||||
let mut l = l.map(move |i| (i, key_l(&i))).peekable();
|
||||
let mut r = r.map(move |i| (i, key_r(&i))).peekable();
|
||||
let mut l_had_match = false;
|
||||
std::iter::from_fn(move || {
|
||||
loop {
|
||||
match (l.peek(), r.peek()) {
|
||||
(Some((_, lk)), Some((_, rk))) if lk < rk => {
|
||||
let (lv, _) = l.next().unwrap();
|
||||
if l_had_match {
|
||||
l_had_match = false;
|
||||
continue;
|
||||
} else {
|
||||
return Some((lv, None));
|
||||
}
|
||||
}
|
||||
(Some((_, _)), None) => {
|
||||
let (lv, _) = l.next().unwrap();
|
||||
if l_had_match {
|
||||
l_had_match = false;
|
||||
continue;
|
||||
} else {
|
||||
return Some((lv, None));
|
||||
}
|
||||
}
|
||||
(Some((_, lk)), Some((_, rk))) if lk > rk => {
|
||||
drop(r.next());
|
||||
continue;
|
||||
}
|
||||
(Some((lv, lk)), Some((_, rk))) => {
|
||||
l_had_match = true;
|
||||
assert!(lk == rk);
|
||||
let (rv, _) = r.next().unwrap();
|
||||
return Some((lv.clone(), Some(rv)));
|
||||
}
|
||||
(None, None) | (None, Some(_)) => return None,
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
#[test]
|
||||
fn inner_equi_basic() {
|
||||
let l = vec![b"a", b"c"];
|
||||
let r = vec![b"aa", b"ad", b"ba", b"bb", b"ca", b"cb", b"cd", b"dd"];
|
||||
|
||||
let res: Vec<_> = super::inner_equi_join_with_merge_strategy(
|
||||
l.into_iter(),
|
||||
r.into_iter(),
|
||||
|l| &l[0..1],
|
||||
|r| &r[0..1],
|
||||
)
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
vec![
|
||||
(b"a", b"aa"),
|
||||
(b"a", b"ad"),
|
||||
(b"c", b"ca"),
|
||||
(b"c", b"cb"),
|
||||
(b"c", b"cd"),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn left_equi_basic() {
|
||||
/*
|
||||
create table aleft (id text, aleft text);
|
||||
create table aright (id text, aright text);
|
||||
insert into aleft values ('a', 'a'), ('b', 'b');
|
||||
insert into aright values ('a', 'aa'), ('a', 'ab'), ('c', 'cd');
|
||||
select * from aleft left join aright using ("id");
|
||||
*/
|
||||
|
||||
let l = vec![b"a", b"b"];
|
||||
let r = vec![b"aa", b"ab", b"cd"];
|
||||
|
||||
let res: Vec<_> = super::left_equi_join_with_merge_strategy(
|
||||
l.into_iter(),
|
||||
r.into_iter(),
|
||||
|l| &l[0..1],
|
||||
|r| &r[0..1],
|
||||
)
|
||||
.collect();
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
vec![(b"a", Some(b"aa")), (b"a", Some(b"ab")), (b"b", None)]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn left_equi_basic_2() {
|
||||
let l = vec![b"b"];
|
||||
let r = vec![b"aa", b"ab", b"bb"];
|
||||
|
||||
let res: Vec<_> = super::left_equi_join_with_merge_strategy(
|
||||
l.into_iter(),
|
||||
r.into_iter(),
|
||||
|l| &l[0..1],
|
||||
|r| &r[0..1],
|
||||
)
|
||||
.collect();
|
||||
|
||||
assert_eq!(res, vec![(b"b", Some(b"bb"))])
|
||||
}
|
||||
}
|
||||
@@ -52,6 +52,7 @@ pub struct TenantShardId {
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const MIN: Self = Self(0);
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
|
||||
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
|
||||
/// legacy format for TenantShardId that excludes the shard suffix", also known
|
||||
@@ -85,7 +86,9 @@ impl ShardCount {
|
||||
}
|
||||
|
||||
impl ShardNumber {
|
||||
pub const MIN: Self = Self(0);
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
@@ -100,16 +103,17 @@ impl TenantShardId {
|
||||
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
|
||||
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
|
||||
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
|
||||
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
|
||||
RangeInclusive::new(
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber(0),
|
||||
shard_count: ShardCount(0),
|
||||
shard_number: shard_index_range.start().shard_number,
|
||||
shard_count: shard_index_range.start().shard_count,
|
||||
},
|
||||
Self {
|
||||
tenant_id,
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
shard_number: shard_index_range.end().shard_number,
|
||||
shard_count: shard_index_range.end().shard_count,
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -241,6 +245,16 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub const MIN: Self = ShardIndex {
|
||||
shard_number: ShardNumber::MIN,
|
||||
shard_count: ShardCount::MIN,
|
||||
};
|
||||
pub const MAX: Self = ShardIndex {
|
||||
shard_number: ShardNumber::MAX,
|
||||
shard_count: ShardCount::MAX,
|
||||
};
|
||||
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
|
||||
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
shard_number: number,
|
||||
|
||||
@@ -4,3 +4,5 @@ pub mod duplex;
|
||||
pub mod gate;
|
||||
|
||||
pub mod spsc_fold;
|
||||
|
||||
pub mod spsc_watch;
|
||||
|
||||
@@ -56,7 +56,7 @@ impl<T: Send> Sender<T> {
|
||||
/// # Panics
|
||||
///
|
||||
/// If `try_fold` panics, any subsequent call to `send` panic.
|
||||
pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), SendError>
|
||||
pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), (T, SendError)>
|
||||
where
|
||||
F: Fn(&mut T, T) -> Result<(), T>,
|
||||
{
|
||||
@@ -104,7 +104,9 @@ impl<T: Send> Sender<T> {
|
||||
}
|
||||
Poll::Pending
|
||||
}
|
||||
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
|
||||
State::ReceiverGone => {
|
||||
Poll::Ready(Err((value.take().unwrap(), SendError::ReceiverGone)))
|
||||
}
|
||||
State::SenderGone(_)
|
||||
| State::AllGone
|
||||
| State::SenderDropping
|
||||
|
||||
55
libs/utils/src/sync/spsc_watch.rs
Normal file
55
libs/utils/src/sync/spsc_watch.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
//! watch is probably not the right word, because we do take out
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::sync::spsc_fold;
|
||||
|
||||
pub fn channel<T: Send>() -> (Sender<T>, Receiver<T>) {
|
||||
let (tx, rx) = spsc_fold::channel();
|
||||
let cancel = CancellationToken::new();
|
||||
(
|
||||
Sender {
|
||||
tx,
|
||||
_cancel: cancel.clone().drop_guard(),
|
||||
},
|
||||
Receiver { rx, cancel },
|
||||
)
|
||||
}
|
||||
|
||||
pub struct Sender<T> {
|
||||
tx: spsc_fold::Sender<T>,
|
||||
_cancel: tokio_util::sync::DropGuard,
|
||||
}
|
||||
|
||||
pub struct Receiver<T> {
|
||||
rx: spsc_fold::Receiver<T>,
|
||||
cancel: CancellationToken,
|
||||
}
|
||||
|
||||
impl<T: Send> Sender<T> {
|
||||
pub fn send_replace(&mut self, value: T) -> Result<(), (T, spsc_fold::SendError)> {
|
||||
poll_ready(self.tx.send(value, |old, new| {
|
||||
*old = new;
|
||||
Ok(())
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Send> Receiver<T> {
|
||||
pub async fn recv(&mut self) -> Result<T, spsc_fold::RecvError> {
|
||||
self.rx.recv().await
|
||||
}
|
||||
pub async fn cancelled(&mut self) {
|
||||
self.cancel.cancelled().await
|
||||
}
|
||||
}
|
||||
|
||||
fn poll_ready<F: Future<Output = O>, O>(f: F) -> O {
|
||||
futures::executor::block_on(async move {
|
||||
let f = std::pin::pin!(f);
|
||||
match futures::poll!(f) {
|
||||
std::task::Poll::Ready(r) => r,
|
||||
std::task::Poll::Pending => unreachable!("expecting future to always return Ready"),
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -30,7 +30,6 @@ crc32c.workspace = true
|
||||
either.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
hashlink.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
|
||||
@@ -1,13 +1,7 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Generates Rust code from .proto Protobuf schemas, along with a binary file
|
||||
/// descriptor set for Protobuf schema reflection.
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let out_dir = PathBuf::from(env::var("OUT_DIR")?);
|
||||
// Generates Rust code from .proto Protobuf schemas.
|
||||
tonic_build::configure()
|
||||
.bytes(["."])
|
||||
.file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
|
||||
.compile_protos(&["proto/page_service.proto"], &["proto"])
|
||||
.map_err(|err| err.into())
|
||||
}
|
||||
|
||||
@@ -11,19 +11,6 @@
|
||||
// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
|
||||
// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
|
||||
//
|
||||
// The service can be accessed via e.g. grpcurl:
|
||||
//
|
||||
// ```
|
||||
// grpcurl \
|
||||
// -plaintext \
|
||||
// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
|
||||
// -H "neon-shard-id: 0b10" \
|
||||
// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
|
||||
// -H "authorization: Bearer $JWT" \
|
||||
// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
|
||||
// localhost:51051 page_api.PageService/CheckRelExists
|
||||
// ```
|
||||
//
|
||||
// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
|
||||
// However, this will require reconnecting when changing modes.
|
||||
//
|
||||
@@ -33,7 +20,7 @@
|
||||
// - Compression
|
||||
|
||||
syntax = "proto3";
|
||||
package page_api;
|
||||
package page_service;
|
||||
|
||||
service PageService {
|
||||
// Returns whether a relation exists.
|
||||
|
||||
@@ -7,12 +7,7 @@
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod proto {
|
||||
tonic::include_proto!("page_api");
|
||||
|
||||
/// File descriptor set for Protobuf schema reflection. This allows using
|
||||
/// e.g. grpcurl with the API.
|
||||
pub const FILE_DESCRIPTOR_SET: &[u8] =
|
||||
tonic::include_file_descriptor_set!("page_api_descriptor");
|
||||
tonic::include_proto!("page_service");
|
||||
|
||||
pub use page_service_client::PageServiceClient;
|
||||
pub use page_service_server::{PageService, PageServiceServer};
|
||||
|
||||
@@ -144,7 +144,7 @@ where
|
||||
replica,
|
||||
ctx,
|
||||
io_concurrency: IoConcurrency::spawn_from_conf(
|
||||
timeline.conf.get_vectored_concurrent_io,
|
||||
timeline.conf,
|
||||
timeline
|
||||
.gate
|
||||
.enter()
|
||||
@@ -343,7 +343,7 @@ where
|
||||
// Gather non-relational files from object storage pages.
|
||||
let slru_partitions = self
|
||||
.timeline
|
||||
.get_slru_keyspace(Version::at(self.lsn), self.ctx)
|
||||
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.partition(
|
||||
self.timeline.get_shard_identity(),
|
||||
@@ -378,7 +378,7 @@ where
|
||||
// Otherwise only include init forks of unlogged relations.
|
||||
let rels = self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
for &rel in rels.iter() {
|
||||
// Send init fork as main fork to provide well formed empty
|
||||
@@ -517,7 +517,7 @@ where
|
||||
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_rel_size(src, Version::at(self.lsn), self.ctx)
|
||||
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
@@ -577,7 +577,7 @@ where
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx)
|
||||
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?;
|
||||
|
||||
if img.len()
|
||||
@@ -631,7 +631,7 @@ where
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
|
||||
@@ -1,518 +0,0 @@
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
|
||||
use tokio::{
|
||||
io::{AsyncWriteExt, BufWriter},
|
||||
sync::mpsc::{UnboundedReceiver, UnboundedSender},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
shard::TenantShardId,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
basebackup::send_basebackup_tarball,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
|
||||
task_mgr::TaskKind,
|
||||
tenant::{
|
||||
Timeline,
|
||||
mgr::{TenantManager, TenantSlot},
|
||||
},
|
||||
};
|
||||
|
||||
pub struct BasebackupPrepareRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
|
||||
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
|
||||
|
||||
type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
|
||||
type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
|
||||
|
||||
/// BasebackupCache stores cached basebackup archives for timelines on local disk.
|
||||
///
|
||||
/// The main purpose of this cache is to speed up the startup process of compute nodes
|
||||
/// after scaling to zero.
|
||||
/// Thus, the basebackup is stored only for the latest LSN of the timeline and with
|
||||
/// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none).
|
||||
///
|
||||
/// The cache receives prepare requests through the `BasebackupPrepareSender` channel,
|
||||
/// generates a basebackup from the timeline in the background, and stores it on disk.
|
||||
///
|
||||
/// Basebackup requests are pretty rare. We expect ~thousands of entries in the cache
|
||||
/// and ~1 RPS for get requests.
|
||||
pub struct BasebackupCache {
|
||||
data_dir: Utf8PathBuf,
|
||||
config: BasebackupCacheConfig,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
remove_entry_sender: BasebackupRemoveEntrySender,
|
||||
|
||||
entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
|
||||
|
||||
cancel: CancellationToken,
|
||||
|
||||
read_hit_count: GenericCounter<AtomicU64>,
|
||||
read_miss_count: GenericCounter<AtomicU64>,
|
||||
read_err_count: GenericCounter<AtomicU64>,
|
||||
|
||||
prepare_ok_count: GenericCounter<AtomicU64>,
|
||||
prepare_skip_count: GenericCounter<AtomicU64>,
|
||||
prepare_err_count: GenericCounter<AtomicU64>,
|
||||
}
|
||||
|
||||
impl BasebackupCache {
|
||||
/// Creates a BasebackupCache and spawns the background task.
|
||||
/// The initialization of the cache is performed in the background and does not
|
||||
/// block the caller. The cache will return `None` for any get requests until
|
||||
/// initialization is complete.
|
||||
pub fn spawn(
|
||||
runtime_handle: &tokio::runtime::Handle,
|
||||
data_dir: Utf8PathBuf,
|
||||
config: Option<BasebackupCacheConfig>,
|
||||
prepare_receiver: BasebackupPrepareReceiver,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let enabled = config.is_some();
|
||||
|
||||
let cache = Arc::new(BasebackupCache {
|
||||
data_dir,
|
||||
config: config.unwrap_or_default(),
|
||||
tenant_manager,
|
||||
remove_entry_sender,
|
||||
|
||||
entries: std::sync::Mutex::new(HashMap::new()),
|
||||
|
||||
cancel,
|
||||
|
||||
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
|
||||
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
|
||||
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
|
||||
|
||||
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
|
||||
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
|
||||
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
|
||||
});
|
||||
|
||||
if enabled {
|
||||
runtime_handle.spawn(
|
||||
cache
|
||||
.clone()
|
||||
.background(prepare_receiver, remove_entry_receiver),
|
||||
);
|
||||
}
|
||||
|
||||
cache
|
||||
}
|
||||
|
||||
/// Gets a basebackup entry from the cache.
|
||||
/// If the entry is found, opens a file with the basebackup archive and returns it.
|
||||
/// The open file descriptor will prevent the file system from deleting the file
|
||||
/// even if the entry is removed from the cache in the background.
|
||||
pub async fn get(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Option<tokio::fs::File> {
|
||||
// Fast path. Check if the entry exists using the in-memory state.
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
|
||||
self.read_miss_count.inc();
|
||||
return None;
|
||||
}
|
||||
|
||||
let path = self.entry_path(tenant_id, timeline_id, lsn);
|
||||
|
||||
match tokio::fs::File::open(path).await {
|
||||
Ok(file) => {
|
||||
self.read_hit_count.inc();
|
||||
Some(file)
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// We may end up here if the basebackup was concurrently removed by the cleanup task.
|
||||
self.read_miss_count.inc();
|
||||
} else {
|
||||
self.read_err_count.inc();
|
||||
tracing::warn!("Unexpected error opening basebackup cache file: {:?}", e);
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Private methods.
|
||||
|
||||
fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
|
||||
// The default format for LSN is 0/ABCDEF.
|
||||
// The backslash is not filename friendly, so serialize it as plain hex.
|
||||
let lsn = lsn.0;
|
||||
format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz")
|
||||
}
|
||||
|
||||
fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf {
|
||||
self.data_dir
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
fn entry_tmp_path(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Utf8PathBuf {
|
||||
self.data_dir
|
||||
.join("tmp")
|
||||
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
|
||||
let parts: Vec<&str> = filename
|
||||
.strip_prefix("basebackup_")?
|
||||
.strip_suffix(".tar.gz")?
|
||||
.split('_')
|
||||
.collect();
|
||||
if parts.len() != 3 {
|
||||
return None;
|
||||
}
|
||||
let tenant_id = parts[0].parse::<TenantId>().ok()?;
|
||||
let timeline_id = parts[1].parse::<TimelineId>().ok()?;
|
||||
let lsn = Lsn(u64::from_str_radix(parts[2], 16).ok()?);
|
||||
|
||||
Some((tenant_id, timeline_id, lsn))
|
||||
}
|
||||
|
||||
async fn cleanup(&self) -> anyhow::Result<()> {
|
||||
// Cleanup tmp directory.
|
||||
let tmp_dir = self.data_dir.join("tmp");
|
||||
let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
|
||||
while let Some(dir_entry) = tmp_dir.next_entry().await? {
|
||||
if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
|
||||
tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove outdated entries.
|
||||
let entries_old = self.entries.lock().unwrap().clone();
|
||||
let mut entries_new = HashMap::new();
|
||||
for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
continue;
|
||||
}
|
||||
let TenantSlot::Attached(tenant) = tenant_slot else {
|
||||
continue;
|
||||
};
|
||||
let tenant_id = tenant_shard_id.tenant_id;
|
||||
|
||||
for timeline in tenant.list_timelines() {
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
|
||||
if let Some(&entry_lsn) = entries_old.get(&tti) {
|
||||
if timeline.get_last_record_lsn() <= entry_lsn {
|
||||
entries_new.insert(tti, entry_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (&tti, &lsn) in entries_old.iter() {
|
||||
if !entries_new.contains_key(&tti) {
|
||||
self.remove_entry_sender
|
||||
.send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
|
||||
*self.entries.lock().unwrap() = entries_new;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn on_startup(&self) -> anyhow::Result<()> {
|
||||
// Create data_dir and tmp directory if they do not exist.
|
||||
tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to create basebackup cache data_dir {:?}: {:?}",
|
||||
self.data_dir,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
// Read existing entries from the data_dir and add them to in-memory state.
|
||||
let mut entries = HashMap::new();
|
||||
let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
|
||||
while let Some(dir_entry) = dir.next_entry().await? {
|
||||
let filename = dir_entry.file_name();
|
||||
|
||||
if filename == "tmp" {
|
||||
// Skip the tmp directory.
|
||||
continue;
|
||||
}
|
||||
|
||||
let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
|
||||
let Some((tenant_id, timeline_id, lsn)) = parsed else {
|
||||
tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
|
||||
continue;
|
||||
};
|
||||
|
||||
let tti = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
|
||||
use std::collections::hash_map::Entry::*;
|
||||
|
||||
match entries.entry(tti) {
|
||||
Occupied(mut entry) => {
|
||||
let entry_lsn = *entry.get();
|
||||
// Leave only the latest entry, remove the old one.
|
||||
if lsn < entry_lsn {
|
||||
self.remove_entry_sender.send(self.entry_path(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
))?;
|
||||
} else if lsn > entry_lsn {
|
||||
self.remove_entry_sender.send(self.entry_path(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
entry_lsn,
|
||||
))?;
|
||||
entry.insert(lsn);
|
||||
} else {
|
||||
// Two different filenames parsed to the same timline_id and LSN.
|
||||
// Should never happen.
|
||||
return Err(anyhow::anyhow!(
|
||||
"Duplicate basebackup cache entry with the same LSN: {:?}",
|
||||
filename
|
||||
));
|
||||
}
|
||||
}
|
||||
Vacant(entry) => {
|
||||
entry.insert(lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
|
||||
*self.entries.lock().unwrap() = entries;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn background(
|
||||
self: Arc<Self>,
|
||||
mut prepare_receiver: BasebackupPrepareReceiver,
|
||||
mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
|
||||
) {
|
||||
// Panic in the background is a safe fallback.
|
||||
// It will drop receivers and the cache will be effectively disabled.
|
||||
self.on_startup()
|
||||
.await
|
||||
.expect("Failed to initialize basebackup cache");
|
||||
|
||||
let mut cleanup_ticker = tokio::time::interval(self.config.cleanup_period);
|
||||
cleanup_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
Some(req) = prepare_receiver.recv() => {
|
||||
if let Err(err) = self.prepare_basebackup(
|
||||
req.tenant_shard_id,
|
||||
req.timeline_id,
|
||||
req.lsn,
|
||||
).await {
|
||||
tracing::info!("Failed to prepare basebackup: {:#}", err);
|
||||
self.prepare_err_count.inc();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
Some(req) = remove_entry_receiver.recv() => {
|
||||
if let Err(e) = tokio::fs::remove_file(req).await {
|
||||
tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
|
||||
}
|
||||
}
|
||||
_ = cleanup_ticker.tick() => {
|
||||
self.cleanup().await.unwrap_or_else(|e| {
|
||||
tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
|
||||
});
|
||||
}
|
||||
_ = self.cancel.cancelled() => {
|
||||
tracing::info!("BasebackupCache background task cancelled");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Prepare a basebackup for the given timeline.
|
||||
///
|
||||
/// If the basebackup already exists with a higher LSN or the timeline already
|
||||
/// has a higher last_record_lsn, skip the preparation.
|
||||
///
|
||||
/// The basebackup is prepared in a temporary directory and then moved to the final
|
||||
/// location to make the operation atomic.
|
||||
async fn prepare_basebackup(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
req_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
tracing::info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Preparing basebackup for timeline",
|
||||
);
|
||||
|
||||
let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
{
|
||||
let entries = self.entries.lock().unwrap();
|
||||
if let Some(&entry_lsn) = entries.get(&tti) {
|
||||
if entry_lsn >= req_lsn {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
%entry_lsn,
|
||||
"Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if entries.len() as i64 >= self.config.max_size_entries {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
"Basebackup cache is full, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
let tenant = self
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
|
||||
let tenant_state = tenant.current_state();
|
||||
if tenant_state != TenantState::Active {
|
||||
anyhow::bail!(
|
||||
"Tenant {} is not active, current state: {:?}",
|
||||
tenant_shard_id.tenant_id,
|
||||
tenant_state
|
||||
)
|
||||
}
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn > req_lsn {
|
||||
tracing::info!(
|
||||
%timeline_id,
|
||||
%req_lsn,
|
||||
%last_record_lsn,
|
||||
"Timeline has a higher LSN than the requested one, skipping basebackup",
|
||||
);
|
||||
self.prepare_skip_count.inc();
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
|
||||
let res = self
|
||||
.prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
|
||||
.await;
|
||||
|
||||
if let Err(err) = res {
|
||||
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
|
||||
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
|
||||
match tokio::fs::remove_file(&entry_tmp_path).await {
|
||||
Ok(_) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => {
|
||||
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
|
||||
}
|
||||
}
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
// Move the tmp file to the final location atomically.
|
||||
let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
|
||||
tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
|
||||
|
||||
let mut entries = self.entries.lock().unwrap();
|
||||
if let Some(old_lsn) = entries.insert(tti, req_lsn) {
|
||||
// Remove the old entry if it exists.
|
||||
self.remove_entry_sender
|
||||
.send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
|
||||
.unwrap();
|
||||
}
|
||||
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
|
||||
|
||||
self.prepare_ok_count.inc();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Prepares a basebackup in a temporary file.
|
||||
async fn prepare_basebackup_tmp(
|
||||
&self,
|
||||
emptry_tmp_path: &Utf8Path,
|
||||
timeline: &Arc<Timeline>,
|
||||
req_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
|
||||
let ctx = ctx.with_scope_timeline(timeline);
|
||||
|
||||
let file = tokio::fs::File::create(emptry_tmp_path).await?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
&mut writer,
|
||||
// Level::Best because compression is not on the hot path of basebackup requests.
|
||||
// The decompression is almost not affected by the compression level.
|
||||
async_compression::Level::Best,
|
||||
);
|
||||
|
||||
// We may receive a request before the WAL record is applied to the timeline.
|
||||
// Wait for the requested LSN to be applied.
|
||||
timeline
|
||||
.wait_lsn(
|
||||
req_lsn,
|
||||
crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache,
|
||||
crate::tenant::timeline::WaitLsnTimeout::Default,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
send_basebackup_tarball(
|
||||
&mut encoder,
|
||||
timeline,
|
||||
Some(req_lsn),
|
||||
None,
|
||||
false,
|
||||
false,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
encoder.shutdown().await?;
|
||||
writer.flush().await?;
|
||||
writer.into_inner().sync_all().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,6 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
|
||||
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
|
||||
use metrics::set_build_info_metric;
|
||||
use nix::sys::socket::{setsockopt, sockopt};
|
||||
use pageserver::basebackup_cache::BasebackupCache;
|
||||
use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
|
||||
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
|
||||
use pageserver::deletion_queue::DeletionQueue;
|
||||
@@ -424,11 +423,14 @@ fn start_pageserver(
|
||||
.map(storage_broker::Certificate::from_pem),
|
||||
);
|
||||
// Note: we do not attempt connecting here (but validate endpoints sanity).
|
||||
storage_broker::connect(
|
||||
let service_client = storage_broker::connect(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
tls_config,
|
||||
)
|
||||
)?;
|
||||
anyhow::Ok(storage_broker::TimelineUpdatesSubscriber::new(
|
||||
service_client,
|
||||
))
|
||||
})
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -542,8 +544,6 @@ fn start_pageserver(
|
||||
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
|
||||
|
||||
// Scan the local 'tenants/' directory and start loading the tenants
|
||||
let (basebackup_prepare_sender, basebackup_prepare_receiver) =
|
||||
tokio::sync::mpsc::unbounded_channel();
|
||||
let deletion_queue_client = deletion_queue.new_client();
|
||||
let background_purges = mgr::BackgroundPurges::default();
|
||||
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
|
||||
@@ -554,22 +554,12 @@ fn start_pageserver(
|
||||
remote_storage: remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
},
|
||||
order,
|
||||
shutdown_pageserver.clone(),
|
||||
))?;
|
||||
let tenant_manager = Arc::new(tenant_manager);
|
||||
|
||||
let basebackup_cache = BasebackupCache::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
conf.basebackup_cache_dir(),
|
||||
conf.basebackup_cache_config.clone(),
|
||||
basebackup_prepare_receiver,
|
||||
Arc::clone(&tenant_manager),
|
||||
shutdown_pageserver.child_token(),
|
||||
);
|
||||
|
||||
BACKGROUND_RUNTIME.spawn({
|
||||
let shutdown_pageserver = shutdown_pageserver.clone();
|
||||
let drive_init = async move {
|
||||
@@ -776,7 +766,6 @@ fn start_pageserver(
|
||||
} else {
|
||||
None
|
||||
},
|
||||
basebackup_cache,
|
||||
);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
|
||||
@@ -232,8 +232,6 @@ pub struct PageServerConf {
|
||||
pub dev_mode: bool,
|
||||
|
||||
pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
|
||||
|
||||
pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -263,10 +261,6 @@ impl PageServerConf {
|
||||
self.workdir.join("metadata.json")
|
||||
}
|
||||
|
||||
pub fn basebackup_cache_dir(&self) -> Utf8PathBuf {
|
||||
self.workdir.join("basebackup_cache")
|
||||
}
|
||||
|
||||
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
|
||||
// Encode a version in the filename, so that if we ever switch away from JSON we can
|
||||
// increment this.
|
||||
@@ -413,7 +407,6 @@ impl PageServerConf {
|
||||
enable_tls_page_service_api,
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -468,7 +461,6 @@ impl PageServerConf {
|
||||
enable_tls_page_service_api,
|
||||
dev_mode,
|
||||
timeline_import_config,
|
||||
basebackup_cache_config,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
@@ -552,23 +544,6 @@ impl PageServerConf {
|
||||
ratio.numerator, ratio.denominator
|
||||
)
|
||||
);
|
||||
|
||||
let url = Url::parse(&tracing_config.export_config.endpoint)
|
||||
.map_err(anyhow::Error::msg)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"tracing endpoint URL is invalid : {}",
|
||||
tracing_config.export_config.endpoint
|
||||
)
|
||||
})?;
|
||||
|
||||
ensure!(
|
||||
url.scheme() == "http" || url.scheme() == "https",
|
||||
format!(
|
||||
"tracing endpoint URL must start with http:// or https://: {}",
|
||||
tracing_config.export_config.endpoint
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
|
||||
@@ -685,25 +660,4 @@ mod tests {
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect("parse_and_validate");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_config_tracing_endpoint_is_invalid() {
|
||||
let input = r#"
|
||||
control_plane_api = "http://localhost:6666"
|
||||
|
||||
[tracing]
|
||||
|
||||
sampling_ratio = { numerator = 1, denominator = 0 }
|
||||
|
||||
[tracing.export_config]
|
||||
endpoint = "localhost:4317"
|
||||
protocol = "http-binary"
|
||||
timeout = "1ms"
|
||||
"#;
|
||||
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
|
||||
.expect("config has valid fields");
|
||||
let workdir = Utf8PathBuf::from("/nonexistent");
|
||||
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
|
||||
.expect_err("parse_and_validate should fail for endpoint without scheme");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,25 +18,12 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
// management.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
|
||||
pub(super) enum Name {
|
||||
/// Timeline last_record_lsn, absolute.
|
||||
/// Timeline last_record_lsn, absolute
|
||||
#[serde(rename = "written_size")]
|
||||
WrittenSize,
|
||||
/// Timeline last_record_lsn, incremental
|
||||
#[serde(rename = "written_data_bytes_delta")]
|
||||
WrittenSizeDelta,
|
||||
/// Written bytes only on this timeline (not including ancestors):
|
||||
/// written_size - ancestor_lsn
|
||||
///
|
||||
/// On the root branch, this is equivalent to `written_size`.
|
||||
#[serde(rename = "written_size_since_parent")]
|
||||
WrittenSizeSinceParent,
|
||||
/// PITR history size only on this timeline (not including ancestors):
|
||||
/// last_record_lsn - max(pitr_cutoff, ancestor_lsn).
|
||||
///
|
||||
/// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed
|
||||
/// the PITR cutoff yet. 0 if PITR is disabled.
|
||||
#[serde(rename = "pitr_history_size_since_parent")]
|
||||
PitrHistorySizeSinceParent,
|
||||
/// Timeline logical size
|
||||
#[serde(rename = "timeline_logical_size")]
|
||||
LogicalSize,
|
||||
@@ -170,32 +157,6 @@ impl MetricsKey {
|
||||
.incremental_values()
|
||||
}
|
||||
|
||||
/// `written_size` - `ancestor_lsn`.
|
||||
const fn written_size_since_parent(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: Name::WrittenSizeSinceParent,
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// `written_size` - max(`pitr_cutoff`, `ancestor_lsn`).
|
||||
const fn pitr_history_size_since_parent(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> AbsoluteValueFactory {
|
||||
MetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline_id),
|
||||
metric: Name::PitrHistorySizeSinceParent,
|
||||
}
|
||||
.absolute_values()
|
||||
}
|
||||
|
||||
/// Exact [`Timeline::get_current_logical_size`].
|
||||
///
|
||||
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
|
||||
@@ -373,13 +334,7 @@ impl TenantSnapshot {
|
||||
struct TimelineSnapshot {
|
||||
loaded_at: (Lsn, SystemTime),
|
||||
last_record_lsn: Lsn,
|
||||
ancestor_lsn: Lsn,
|
||||
current_exact_logical_size: Option<u64>,
|
||||
/// Whether PITR is enabled (pitr_interval > 0).
|
||||
pitr_enabled: bool,
|
||||
/// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately
|
||||
/// Some(last_record_lsn), but may lag behind it since it's computed periodically.
|
||||
pitr_cutoff: Option<Lsn>,
|
||||
}
|
||||
|
||||
impl TimelineSnapshot {
|
||||
@@ -399,9 +354,6 @@ impl TimelineSnapshot {
|
||||
} else {
|
||||
let loaded_at = t.loaded_at;
|
||||
let last_record_lsn = t.get_last_record_lsn();
|
||||
let ancestor_lsn = t.get_ancestor_lsn();
|
||||
let pitr_enabled = !t.get_pitr_interval().is_zero();
|
||||
let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time;
|
||||
|
||||
let current_exact_logical_size = {
|
||||
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
|
||||
@@ -421,10 +373,7 @@ impl TimelineSnapshot {
|
||||
Ok(Some(TimelineSnapshot {
|
||||
loaded_at,
|
||||
last_record_lsn,
|
||||
ancestor_lsn,
|
||||
current_exact_logical_size,
|
||||
pitr_enabled,
|
||||
pitr_cutoff,
|
||||
}))
|
||||
}
|
||||
}
|
||||
@@ -475,8 +424,6 @@ impl TimelineSnapshot {
|
||||
|
||||
let up_to = now;
|
||||
|
||||
let written_size_last = written_size_now.value.max(prev.1); // don't regress
|
||||
|
||||
if let Some(delta) = written_size_now.value.checked_sub(prev.1) {
|
||||
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
|
||||
// written_size_delta
|
||||
@@ -494,27 +441,6 @@ impl TimelineSnapshot {
|
||||
});
|
||||
}
|
||||
|
||||
// Compute the branch-local written size.
|
||||
let written_size_since_parent_key =
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id);
|
||||
metrics.push(
|
||||
written_size_since_parent_key
|
||||
.at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)),
|
||||
);
|
||||
|
||||
// Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the
|
||||
// PITR cutoff. 0 if PITR is disabled.
|
||||
let pitr_history_size_since_parent_key =
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id);
|
||||
if !self.pitr_enabled {
|
||||
metrics.push(pitr_history_size_since_parent_key.at(now, 0));
|
||||
} else if let Some(pitr_cutoff) = self.pitr_cutoff {
|
||||
metrics.push(pitr_history_size_since_parent_key.at(
|
||||
now,
|
||||
written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0),
|
||||
));
|
||||
}
|
||||
|
||||
{
|
||||
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
|
||||
let current_or_previous = self
|
||||
|
||||
@@ -12,17 +12,12 @@ fn startup_collected_timeline_metrics_before_advancing() {
|
||||
let cache = HashMap::new();
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let pitr_cutoff = Lsn(0x11000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
let logical_size = 0x42000;
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: Some(logical_size),
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: Some(pitr_cutoff),
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
let now = DateTime::<Utc>::from(SystemTime::now());
|
||||
@@ -38,11 +33,7 @@ fn startup_collected_timeline_metrics_before_advancing() {
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -58,9 +49,7 @@ fn startup_collected_timeline_metrics_second_round() {
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let pitr_cutoff = Lsn(0x11000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
let logical_size = 0x42000;
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
|
||||
@@ -70,10 +59,7 @@ fn startup_collected_timeline_metrics_second_round() {
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: Some(logical_size),
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: Some(pitr_cutoff),
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
@@ -83,11 +69,7 @@ fn startup_collected_timeline_metrics_second_round() {
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -104,9 +86,7 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let before = DateTime::<Utc>::from(before);
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let pitr_cutoff = Lsn(0x11000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
let logical_size = 0x42000;
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
@@ -123,10 +103,7 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: Some(logical_size),
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: Some(pitr_cutoff),
|
||||
current_exact_logical_size: Some(0x42000),
|
||||
};
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
@@ -136,18 +113,16 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/// Tests that written sizes do not regress across restarts.
|
||||
#[test]
|
||||
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
// it can happen that we lose the inmemorylayer but have previously sent metrics and we
|
||||
// should never go backwards
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
@@ -165,10 +140,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (Lsn(50), at_restart),
|
||||
last_record_lsn: Lsn(50),
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: None,
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: Some(Lsn(20)),
|
||||
};
|
||||
|
||||
let mut cache = HashMap::from([
|
||||
@@ -197,8 +169,6 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80),
|
||||
]
|
||||
);
|
||||
|
||||
@@ -213,157 +183,6 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/// Tests that written sizes do not regress across restarts, even on child branches.
|
||||
#[test]
|
||||
fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [later, now, at_restart] = time_backwards();
|
||||
|
||||
// FIXME: tests would be so much easier if we did not need to juggle back and forth
|
||||
// SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let later = DateTime::<Utc>::from(later);
|
||||
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
|
||||
let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
|
||||
let before_restart = DateTime::<Utc>::from(before_restart);
|
||||
let way_before = DateTime::<Utc>::from(way_before);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (Lsn(50), at_restart),
|
||||
last_record_lsn: Lsn(50),
|
||||
ancestor_lsn: Lsn(40),
|
||||
current_exact_logical_size: None,
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: Some(Lsn(20)),
|
||||
};
|
||||
|
||||
let mut cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before_restart, 100)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until(
|
||||
way_before,
|
||||
before_restart,
|
||||
// not taken into account, but the timestamps are important
|
||||
999_999_999,
|
||||
)
|
||||
.to_kv_pair(),
|
||||
]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
|
||||
before_restart,
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
|
||||
]
|
||||
);
|
||||
|
||||
// now if we cache these metrics, and re-run while "still in recovery"
|
||||
cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
|
||||
|
||||
// "still in recovery", because our snapshot did not change
|
||||
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/// Tests that written sizes do not regress across restarts, even on child branches and
|
||||
/// with a PITR cutoff after the branch point.
|
||||
#[test]
|
||||
fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let [later, now, at_restart] = time_backwards();
|
||||
|
||||
// FIXME: tests would be so much easier if we did not need to juggle back and forth
|
||||
// SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
let later = DateTime::<Utc>::from(later);
|
||||
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
|
||||
let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
|
||||
let before_restart = DateTime::<Utc>::from(before_restart);
|
||||
let way_before = DateTime::<Utc>::from(way_before);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (Lsn(50), at_restart),
|
||||
last_record_lsn: Lsn(50),
|
||||
ancestor_lsn: Lsn(30),
|
||||
current_exact_logical_size: None,
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: Some(Lsn(40)),
|
||||
};
|
||||
|
||||
let mut cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before_restart, 100)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until(
|
||||
way_before,
|
||||
before_restart,
|
||||
// not taken into account, but the timestamps are important
|
||||
999_999_999,
|
||||
)
|
||||
.to_kv_pair(),
|
||||
]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
|
||||
before_restart,
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
|
||||
]
|
||||
);
|
||||
|
||||
// now if we cache these metrics, and re-run while "still in recovery"
|
||||
cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
|
||||
|
||||
// "still in recovery", because our snapshot did not change
|
||||
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
|
||||
]
|
||||
);
|
||||
}
|
||||
@@ -382,10 +201,7 @@ fn post_restart_current_exact_logical_size_uses_cached() {
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (Lsn(50), at_restart),
|
||||
last_record_lsn: Lsn(50),
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: None,
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: None,
|
||||
};
|
||||
|
||||
let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
|
||||
@@ -470,101 +286,16 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
|
||||
times
|
||||
}
|
||||
|
||||
/// Tests that disabled PITR history does not yield any history size, even when the PITR cutoff
|
||||
/// indicates otherwise.
|
||||
#[test]
|
||||
fn pitr_disabled_yields_no_history_size() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::new();
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let pitr_cutoff = Lsn(0x11000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: None,
|
||||
pitr_enabled: false,
|
||||
pitr_cutoff: Some(pitr_cutoff),
|
||||
};
|
||||
|
||||
let now = DateTime::<Utc>::from(SystemTime::now());
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
|
||||
snap.loaded_at.1.into(),
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/// Tests that uninitialized PITR cutoff does not emit any history size metric at all.
|
||||
#[test]
|
||||
fn pitr_uninitialized_does_not_emit_history_size() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::new();
|
||||
|
||||
let initdb_lsn = Lsn(0x10000);
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, SystemTime::now()),
|
||||
last_record_lsn: disk_consistent_lsn,
|
||||
ancestor_lsn: Lsn(0),
|
||||
current_exact_logical_size: None,
|
||||
pitr_enabled: true,
|
||||
pitr_cutoff: None,
|
||||
};
|
||||
|
||||
let now = DateTime::<Utc>::from(SystemTime::now());
|
||||
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
&[
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
|
||||
snap.loaded_at.1.into(),
|
||||
now,
|
||||
0
|
||||
),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
|
||||
.at(now, disk_consistent_lsn.0),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
pub(crate) const fn metric_examples_old(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [RawMetric; 7] {
|
||||
) -> [RawMetric; 5] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until_old_format(before, now, 0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
|
||||
@@ -576,12 +307,10 @@ pub(crate) const fn metric_examples(
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [NewRawMetric; 7] {
|
||||
) -> [NewRawMetric; 5] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
|
||||
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at(now, 1),
|
||||
|
||||
@@ -513,14 +513,6 @@ mod tests {
|
||||
line!(),
|
||||
r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
|
||||
@@ -568,7 +560,7 @@ mod tests {
|
||||
assert_eq!(upgraded_samples, new_samples);
|
||||
}
|
||||
|
||||
fn metric_samples_old() -> [RawMetric; 7] {
|
||||
fn metric_samples_old() -> [RawMetric; 5] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
@@ -580,7 +572,7 @@ mod tests {
|
||||
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
|
||||
}
|
||||
|
||||
fn metric_samples() -> [NewRawMetric; 7] {
|
||||
fn metric_samples() -> [NewRawMetric; 5] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ pub struct State {
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
allowlist_routes: &'static [&'static str],
|
||||
remote_storage: GenericRemoteStorage,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
@@ -114,7 +114,7 @@ impl State {
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
secondary_controller: SecondaryController,
|
||||
@@ -449,7 +449,7 @@ async fn build_timeline_info_common(
|
||||
// Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
|
||||
// actually trimmed data to), which can pass each other when PITR is changed.
|
||||
let min_readable_lsn = std::cmp::max(
|
||||
timeline.get_gc_cutoff_lsn().unwrap_or_default(),
|
||||
timeline.get_gc_cutoff_lsn(),
|
||||
*timeline.get_applied_gc_cutoff_lsn(),
|
||||
);
|
||||
|
||||
@@ -3199,7 +3199,7 @@ async fn list_aux_files(
|
||||
.await?;
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
state.conf.get_vectored_concurrent_io,
|
||||
state.conf,
|
||||
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
|
||||
);
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
mod auth;
|
||||
pub mod basebackup;
|
||||
pub mod basebackup_cache;
|
||||
pub mod config;
|
||||
pub mod consumption_metrics;
|
||||
pub mod context;
|
||||
|
||||
@@ -843,50 +843,23 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_relsize_latest_cache_entries",
|
||||
"Number of entries in the latest relation size cache",
|
||||
"pageserver_relsize_cache_entries",
|
||||
"Number of entries in the relation size cache",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_relsize_latest_cache_hits",
|
||||
"Latest relation size cache hits",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_relsize_latest_cache_misses",
|
||||
"Relation size latest cache misses",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
|
||||
register_uint_gauge!(
|
||||
"pageserver_relsize_snapshot_cache_entries",
|
||||
"Number of entries in the pitr relation size cache",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_relsize_snapshot_cache_hits",
|
||||
"Pitr relation size cache hits",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_relsize_snapshot_cache_misses",
|
||||
"Relation size snapshot cache misses",
|
||||
"pageserver_relsize_cache_misses",
|
||||
"Relation size cache misses",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1066,15 +1039,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
|
||||
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_offloaded_timelines",
|
||||
"Number of offloaded timelines of a tenant",
|
||||
&["tenant_id", "shard_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_offloaded_timelines metric")
|
||||
});
|
||||
|
||||
pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_eviction_iteration_duration_seconds_global",
|
||||
@@ -3560,14 +3524,11 @@ impl TimelineMetrics {
|
||||
}
|
||||
|
||||
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
let tid = tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = tenant_shard_id.shard_slug().to_string();
|
||||
|
||||
// Only shard zero deals in synthetic sizes
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
let tid = tenant_shard_id.tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
}
|
||||
let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);
|
||||
|
||||
tenant_throttling::remove_tenant_metrics(tenant_shard_id);
|
||||
|
||||
@@ -4359,42 +4320,6 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
|
||||
.set(u64::try_from(num_threads.get()).unwrap());
|
||||
}
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_READ: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_basebackup_cache_read_total",
|
||||
"Number of read accesses to the basebackup cache grouped by hit/miss/error",
|
||||
&["result"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_basebackup_cache_prepare_total",
|
||||
"Number of prepare requests processed by the basebackup cache grouped by ok/skip/error",
|
||||
&["result"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_basebackup_cache_entries_total",
|
||||
"Number of entries in the basebackup cache"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// FIXME: Support basebackup cache size metrics.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_basebackup_cache_size_bytes",
|
||||
"Total size of all basebackup cache entries on disk in bytes"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_config_ignored_items",
|
||||
|
||||
@@ -9,6 +9,7 @@ use std::sync::Arc;
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{io, str};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use anyhow::{Context, bail};
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
@@ -17,7 +18,7 @@ use itertools::Itertools;
|
||||
use jsonwebtoken::TokenData;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{
|
||||
GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
|
||||
};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
@@ -51,10 +52,8 @@ use utils::simple_rcu::RcuReadGuard;
|
||||
use utils::sync::gate::{Gate, GateGuard};
|
||||
use utils::sync::spsc_fold;
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::basebackup_cache::BasebackupCache;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
@@ -63,7 +62,7 @@ use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
|
||||
SmgrOpTimer, TimelineMetrics,
|
||||
};
|
||||
use crate::pgdatadir_mapping::{LsnRange, Version};
|
||||
use crate::pgdatadir_mapping::Version;
|
||||
use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
|
||||
@@ -108,7 +107,6 @@ pub fn spawn(
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
let libpq_ctx = RequestContext::todo_child(
|
||||
@@ -130,7 +128,6 @@ pub fn spawn(
|
||||
conf.pg_auth_type,
|
||||
tls_config,
|
||||
conf.page_service_pipelining.clone(),
|
||||
basebackup_cache,
|
||||
libpq_ctx,
|
||||
cancel.clone(),
|
||||
)
|
||||
@@ -189,7 +186,6 @@ pub async fn libpq_listener_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
listener_ctx: RequestContext,
|
||||
listener_cancel: CancellationToken,
|
||||
) -> Connections {
|
||||
@@ -233,7 +229,6 @@ pub async fn libpq_listener_main(
|
||||
auth_type,
|
||||
tls_config.clone(),
|
||||
pipelining_config.clone(),
|
||||
Arc::clone(&basebackup_cache),
|
||||
connection_ctx,
|
||||
connections_cancel.child_token(),
|
||||
gate_guard,
|
||||
@@ -276,7 +271,6 @@ async fn page_service_conn_main(
|
||||
auth_type: AuthType,
|
||||
tls_config: Option<Arc<rustls::ServerConfig>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
@@ -337,12 +331,11 @@ async fn page_service_conn_main(
|
||||
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
|
||||
// and create the per-query context in process_query ourselves.
|
||||
let mut conn_handler = PageServerHandler::new(
|
||||
conf,
|
||||
tenant_manager,
|
||||
auth,
|
||||
pipelining_config,
|
||||
conf.get_vectored_concurrent_io,
|
||||
perf_span_fields,
|
||||
basebackup_cache,
|
||||
connection_ctx,
|
||||
cancel.clone(),
|
||||
gate_guard,
|
||||
@@ -378,6 +371,7 @@ async fn page_service_conn_main(
|
||||
}
|
||||
|
||||
struct PageServerHandler {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
claims: Option<Claims>,
|
||||
|
||||
@@ -395,9 +389,6 @@ struct PageServerHandler {
|
||||
timeline_handles: Option<TimelineHandles>,
|
||||
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
|
||||
gate_guard: GateGuard,
|
||||
}
|
||||
@@ -651,7 +642,7 @@ impl std::fmt::Display for BatchedPageStreamError {
|
||||
struct BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest,
|
||||
timer: SmgrOpTimer,
|
||||
lsn_range: LsnRange,
|
||||
effective_request_lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
}
|
||||
|
||||
@@ -773,12 +764,12 @@ impl BatchedFeMessage {
|
||||
match batching_strategy {
|
||||
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
|
||||
if let Some(last_in_batch) = accum_pages.last() {
|
||||
if last_in_batch.lsn_range.effective_lsn
|
||||
!= this_pages[0].lsn_range.effective_lsn
|
||||
if last_in_batch.effective_request_lsn
|
||||
!= this_pages[0].effective_request_lsn
|
||||
{
|
||||
trace!(
|
||||
accum_lsn = %last_in_batch.lsn_range.effective_lsn,
|
||||
this_lsn = %this_pages[0].lsn_range.effective_lsn,
|
||||
accum_lsn = %last_in_batch.effective_request_lsn,
|
||||
this_lsn = %this_pages[0].effective_request_lsn,
|
||||
"stopping batching because LSN changed"
|
||||
);
|
||||
|
||||
@@ -793,15 +784,15 @@ impl BatchedFeMessage {
|
||||
let same_page_different_lsn = accum_pages.iter().any(|batched| {
|
||||
batched.req.rel == this_pages[0].req.rel
|
||||
&& batched.req.blkno == this_pages[0].req.blkno
|
||||
&& batched.lsn_range.effective_lsn
|
||||
!= this_pages[0].lsn_range.effective_lsn
|
||||
&& batched.effective_request_lsn
|
||||
!= this_pages[0].effective_request_lsn
|
||||
});
|
||||
|
||||
if same_page_different_lsn {
|
||||
trace!(
|
||||
rel=%this_pages[0].req.rel,
|
||||
blkno=%this_pages[0].req.blkno,
|
||||
lsn=%this_pages[0].lsn_range.effective_lsn,
|
||||
lsn=%this_pages[0].effective_request_lsn,
|
||||
"stopping batching because same page was requested at different LSNs"
|
||||
);
|
||||
|
||||
@@ -853,17 +844,17 @@ impl BatchedFeMessage {
|
||||
impl PageServerHandler {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
get_vectored_concurrent_io: GetVectoredConcurrentIo,
|
||||
perf_span_fields: ConnectionPerfSpanFields,
|
||||
basebackup_cache: Arc<BasebackupCache>,
|
||||
connection_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
gate_guard: GateGuard,
|
||||
) -> Self {
|
||||
PageServerHandler {
|
||||
conf,
|
||||
auth,
|
||||
claims: None,
|
||||
connection_ctx,
|
||||
@@ -871,8 +862,6 @@ impl PageServerHandler {
|
||||
timeline_handles: Some(TimelineHandles::new(tenant_manager)),
|
||||
cancel,
|
||||
pipelining_config,
|
||||
get_vectored_concurrent_io,
|
||||
basebackup_cache,
|
||||
gate_guard,
|
||||
}
|
||||
}
|
||||
@@ -1169,7 +1158,7 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
// We're holding the Handle
|
||||
let effective_lsn = match Self::effective_request_lsn(
|
||||
let effective_request_lsn = match Self::effective_request_lsn(
|
||||
&shard,
|
||||
shard.get_last_record_lsn(),
|
||||
req.hdr.request_lsn,
|
||||
@@ -1188,10 +1177,7 @@ impl PageServerHandler {
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest {
|
||||
req,
|
||||
timer,
|
||||
lsn_range: LsnRange {
|
||||
effective_lsn,
|
||||
request_lsn: req.hdr.request_lsn
|
||||
},
|
||||
effective_request_lsn,
|
||||
ctx,
|
||||
}],
|
||||
// The executor grabs the batch when it becomes idle.
|
||||
@@ -1637,7 +1623,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.get_vectored_concurrent_io,
|
||||
self.conf,
|
||||
match self.gate_guard.try_clone() {
|
||||
Ok(guard) => guard,
|
||||
Err(_) => {
|
||||
@@ -2141,14 +2127,7 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
let exists = timeline
|
||||
.get_rel_exists(
|
||||
req.rel,
|
||||
Version::LsnRange(LsnRange {
|
||||
effective_lsn: lsn,
|
||||
request_lsn: req.hdr.request_lsn,
|
||||
}),
|
||||
ctx,
|
||||
)
|
||||
.get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
@@ -2175,14 +2154,7 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
let n_blocks = timeline
|
||||
.get_rel_size(
|
||||
req.rel,
|
||||
Version::LsnRange(LsnRange {
|
||||
effective_lsn: lsn,
|
||||
request_lsn: req.hdr.request_lsn,
|
||||
}),
|
||||
ctx,
|
||||
)
|
||||
.get_rel_size(req.rel, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
@@ -2209,15 +2181,7 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
|
||||
let total_blocks = timeline
|
||||
.get_db_size(
|
||||
DEFAULTTABLESPACE_OID,
|
||||
req.dbnode,
|
||||
Version::LsnRange(LsnRange {
|
||||
effective_lsn: lsn,
|
||||
request_lsn: req.hdr.request_lsn,
|
||||
}),
|
||||
ctx,
|
||||
)
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
@@ -2250,7 +2214,7 @@ impl PageServerHandler {
|
||||
// Ignore error (trace buffer may be full or tracer may have disconnected).
|
||||
_ = page_trace.try_send(PageTraceEvent {
|
||||
key,
|
||||
effective_lsn: batch.lsn_range.effective_lsn,
|
||||
effective_lsn: batch.effective_request_lsn,
|
||||
time,
|
||||
});
|
||||
}
|
||||
@@ -2265,7 +2229,7 @@ impl PageServerHandler {
|
||||
perf_instrument = true;
|
||||
}
|
||||
|
||||
req.lsn_range.effective_lsn
|
||||
req.effective_request_lsn
|
||||
})
|
||||
.max()
|
||||
.expect("batch is never empty");
|
||||
@@ -2319,7 +2283,7 @@ impl PageServerHandler {
|
||||
(
|
||||
&p.req.rel,
|
||||
&p.req.blkno,
|
||||
p.lsn_range,
|
||||
p.effective_request_lsn,
|
||||
p.ctx.attached_child(),
|
||||
)
|
||||
}),
|
||||
@@ -2504,8 +2468,6 @@ impl PageServerHandler {
|
||||
.map_err(QueryError::Disconnected)?;
|
||||
self.flush_cancellable(pgb, &self.cancel).await?;
|
||||
|
||||
let mut from_cache = false;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||
if full_backup {
|
||||
@@ -2523,33 +2485,7 @@ impl PageServerHandler {
|
||||
.map_err(map_basebackup_error)?;
|
||||
} else {
|
||||
let mut writer = BufWriter::new(pgb.copyout_writer());
|
||||
|
||||
let cached = {
|
||||
// Basebackup is cached only for this combination of parameters.
|
||||
if timeline.is_basebackup_cache_enabled()
|
||||
&& gzip
|
||||
&& lsn.is_some()
|
||||
&& prev_lsn.is_none()
|
||||
{
|
||||
self.basebackup_cache
|
||||
.get(tenant_id, timeline_id, lsn.unwrap())
|
||||
.await
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(mut cached) = cached {
|
||||
from_cache = true;
|
||||
tokio::io::copy(&mut cached, &mut writer)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
map_basebackup_error(BasebackupError::Client(
|
||||
e,
|
||||
"handle_basebackup_request,cached,copy",
|
||||
))
|
||||
})?;
|
||||
} else if gzip {
|
||||
if gzip {
|
||||
let mut encoder = GzipEncoder::with_quality(
|
||||
&mut writer,
|
||||
// NOTE using fast compression because it's on the critical path
|
||||
@@ -2608,7 +2544,6 @@ impl PageServerHandler {
|
||||
info!(
|
||||
lsn_await_millis = lsn_awaited_after.as_millis(),
|
||||
basebackup_millis = basebackup_after.as_millis(),
|
||||
%from_cache,
|
||||
"basebackup complete"
|
||||
);
|
||||
|
||||
|
||||
@@ -43,9 +43,7 @@ use crate::aux_file;
|
||||
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::{
|
||||
RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
|
||||
RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
|
||||
RELSIZE_SNAPSHOT_CACHE_MISSES,
|
||||
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
|
||||
};
|
||||
use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
@@ -92,28 +90,6 @@ pub enum LsnForTimestamp {
|
||||
NoData(Lsn),
|
||||
}
|
||||
|
||||
/// Each request to page server contains LSN range: `not_modified_since..request_lsn`.
|
||||
/// See comments libs/pageserver_api/src/models.rs.
|
||||
/// Based on this range and `last_record_lsn` PS calculates `effective_lsn`.
|
||||
/// But to distinguish requests from primary and replicas we need also to pass `request_lsn`.
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct LsnRange {
|
||||
pub effective_lsn: Lsn,
|
||||
pub request_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl LsnRange {
|
||||
pub fn at(lsn: Lsn) -> LsnRange {
|
||||
LsnRange {
|
||||
effective_lsn: lsn,
|
||||
request_lsn: lsn,
|
||||
}
|
||||
}
|
||||
pub fn is_latest(&self) -> bool {
|
||||
self.request_lsn == Lsn::MAX
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum CalculateLogicalSizeError {
|
||||
#[error("cancelled")]
|
||||
@@ -226,13 +202,13 @@ impl Timeline {
|
||||
io_concurrency: IoConcurrency,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
match version {
|
||||
Version::LsnRange(lsns) => {
|
||||
Version::Lsn(effective_lsn) => {
|
||||
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
|
||||
let res = self
|
||||
.get_rel_page_at_lsn_batched(
|
||||
pages
|
||||
.iter()
|
||||
.map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
|
||||
pages.iter().map(|(tag, blknum)| {
|
||||
(tag, blknum, effective_lsn, ctx.attached_child())
|
||||
}),
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
)
|
||||
@@ -270,7 +246,7 @@ impl Timeline {
|
||||
/// The ordering of the returned vec corresponds to the ordering of `pages`.
|
||||
pub(crate) async fn get_rel_page_at_lsn_batched(
|
||||
&self,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<Result<Bytes, PageReconstructError>> {
|
||||
@@ -289,7 +265,7 @@ impl Timeline {
|
||||
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
|
||||
HashMap::with_capacity(pages.len());
|
||||
|
||||
for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
|
||||
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
|
||||
if tag.relnode == 0 {
|
||||
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
@@ -298,7 +274,7 @@ impl Timeline {
|
||||
slots_filled += 1;
|
||||
continue;
|
||||
}
|
||||
let lsn = lsns.effective_lsn;
|
||||
|
||||
let nblocks = {
|
||||
let ctx = RequestContextBuilder::from(&ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
@@ -313,7 +289,7 @@ impl Timeline {
|
||||
.attached_child();
|
||||
|
||||
match self
|
||||
.get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
|
||||
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
|
||||
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
|
||||
.await
|
||||
{
|
||||
@@ -494,7 +470,7 @@ impl Timeline {
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, version) {
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
@@ -512,7 +488,7 @@ impl Timeline {
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
self.update_cached_rel_size(tag, version, nblocks);
|
||||
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
|
||||
|
||||
Ok(nblocks)
|
||||
}
|
||||
@@ -534,7 +510,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
|
||||
return Ok(true);
|
||||
}
|
||||
// then check if the database was already initialized.
|
||||
@@ -610,7 +586,7 @@ impl Timeline {
|
||||
// scan directory listing (new), merge with the old results
|
||||
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
@@ -656,7 +632,7 @@ impl Timeline {
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
assert!(self.tenant_shard_id.is_shard_zero());
|
||||
let n_blocks = self
|
||||
.get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
|
||||
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(
|
||||
@@ -669,7 +645,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
@@ -891,11 +867,11 @@ impl Timeline {
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
|
||||
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
|
||||
.await?;
|
||||
|
||||
let keyspace = KeySpace::single(
|
||||
@@ -909,7 +885,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| PageReconstructError::Cancelled)?,
|
||||
@@ -1161,7 +1137,7 @@ impl Timeline {
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
|
||||
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?
|
||||
{
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -1236,7 +1212,7 @@ impl Timeline {
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, Version::at(lsn), ctx)
|
||||
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
@@ -1353,75 +1329,59 @@ impl Timeline {
|
||||
Ok((dense_keyspace, sparse_keyspace))
|
||||
}
|
||||
|
||||
/// Get cached size of relation. There are two caches: one for primary updates, it captures the latest state of
|
||||
/// of the timeline and snapshot cache, which key includes LSN and so can be used by replicas to get relation size
|
||||
/// at the particular LSN (snapshot).
|
||||
pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option<BlockNumber> {
|
||||
let lsn = version.get_lsn();
|
||||
{
|
||||
let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
RELSIZE_LATEST_CACHE_HITS.inc();
|
||||
return Some(*nblocks);
|
||||
}
|
||||
RELSIZE_CACHE_MISSES_OLD.inc();
|
||||
/// Get cached size of relation if it not updated after specified LSN
|
||||
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
|
||||
let rel_size_cache = self.rel_size_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
RELSIZE_CACHE_HITS.inc();
|
||||
return Some(*nblocks);
|
||||
}
|
||||
RELSIZE_CACHE_MISSES_OLD.inc();
|
||||
}
|
||||
{
|
||||
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
|
||||
if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
|
||||
RELSIZE_SNAPSHOT_CACHE_HITS.inc();
|
||||
return Some(*nblock);
|
||||
}
|
||||
}
|
||||
if version.is_latest() {
|
||||
RELSIZE_LATEST_CACHE_MISSES.inc();
|
||||
} else {
|
||||
RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
|
||||
}
|
||||
RELSIZE_CACHE_MISSES.inc();
|
||||
None
|
||||
}
|
||||
|
||||
/// Update cached relation size if there is no more recent update
|
||||
pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
|
||||
let lsn = version.get_lsn();
|
||||
if version.is_latest() {
|
||||
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
|
||||
match rel_size_cache.entry(tag) {
|
||||
hash_map::Entry::Occupied(mut entry) => {
|
||||
let cached_lsn = entry.get_mut();
|
||||
if lsn >= cached_lsn.0 {
|
||||
*cached_lsn = (lsn, nblocks);
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert((lsn, nblocks));
|
||||
RELSIZE_LATEST_CACHE_ENTRIES.inc();
|
||||
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
|
||||
if lsn < rel_size_cache.complete_as_of {
|
||||
// Do not cache old values. It's safe to cache the size on read, as long as
|
||||
// the read was at an LSN since we started the WAL ingestion. Reasoning: we
|
||||
// never evict values from the cache, so if the relation size changed after
|
||||
// 'lsn', the new value is already in the cache.
|
||||
return;
|
||||
}
|
||||
|
||||
match rel_size_cache.map.entry(tag) {
|
||||
hash_map::Entry::Occupied(mut entry) => {
|
||||
let cached_lsn = entry.get_mut();
|
||||
if lsn >= cached_lsn.0 {
|
||||
*cached_lsn = (lsn, nblocks);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
|
||||
if rel_size_cache.capacity() != 0 {
|
||||
rel_size_cache.insert((lsn, tag), nblocks);
|
||||
RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert((lsn, nblocks));
|
||||
RELSIZE_CACHE_ENTRIES.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Store cached relation size
|
||||
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
|
||||
if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
|
||||
RELSIZE_LATEST_CACHE_ENTRIES.inc();
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
|
||||
RELSIZE_CACHE_ENTRIES.inc();
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove cached relation size
|
||||
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
|
||||
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
|
||||
if rel_size_cache.remove(tag).is_some() {
|
||||
RELSIZE_LATEST_CACHE_ENTRIES.dec();
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
if rel_size_cache.map.remove(tag).is_some() {
|
||||
RELSIZE_CACHE_ENTRIES.dec();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1625,10 +1585,7 @@ impl DatadirModification<'_> {
|
||||
// check the cache too. This is because eagerly checking the cache results in
|
||||
// less work overall and 10% better performance. It's more work on cache miss
|
||||
// but cache miss is rare.
|
||||
if let Some(nblocks) = self
|
||||
.tline
|
||||
.get_cached_rel_size(&rel, Version::Modified(self))
|
||||
{
|
||||
if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
|
||||
Ok(nblocks)
|
||||
} else if !self
|
||||
.tline
|
||||
@@ -2710,7 +2667,7 @@ pub struct DatadirModificationStats {
|
||||
/// timeline to not miss the latest updates.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum Version<'a> {
|
||||
LsnRange(LsnRange),
|
||||
Lsn(Lsn),
|
||||
Modified(&'a DatadirModification<'a>),
|
||||
}
|
||||
|
||||
@@ -2722,7 +2679,7 @@ impl Version<'_> {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
match self {
|
||||
Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await,
|
||||
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
|
||||
Version::Modified(modification) => modification.get(key, ctx).await,
|
||||
}
|
||||
}
|
||||
@@ -2744,26 +2701,12 @@ impl Version<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_latest(&self) -> bool {
|
||||
fn get_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
Version::LsnRange(lsns) => lsns.is_latest(),
|
||||
Version::Modified(_) => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_lsn(&self) -> Lsn {
|
||||
match self {
|
||||
Version::LsnRange(lsns) => lsns.effective_lsn,
|
||||
Version::Lsn(lsn) => *lsn,
|
||||
Version::Modified(modification) => modification.lsn,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn at(lsn: Lsn) -> Self {
|
||||
Version::LsnRange(LsnRange {
|
||||
effective_lsn: lsn,
|
||||
request_lsn: lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
//--- Metadata structs stored in key-value pairs in the repository.
|
||||
|
||||
@@ -380,10 +380,6 @@ pub enum TaskKind {
|
||||
DetachAncestor,
|
||||
|
||||
ImportPgdata,
|
||||
|
||||
/// Background task of [`crate::basebackup_cache::BasebackupCache`].
|
||||
/// Prepares basebackups and clears outdated entries.
|
||||
BasebackupCache,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -48,7 +48,6 @@ use remote_timeline_client::{
|
||||
download_tenant_manifest,
|
||||
};
|
||||
use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
|
||||
use timeline::import_pgdata::ImportingTimeline;
|
||||
use timeline::offload::{OffloadError, offload_timeline};
|
||||
@@ -78,7 +77,6 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
|
||||
use self::timeline::{
|
||||
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
|
||||
};
|
||||
use crate::basebackup_cache::BasebackupPrepareSender;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context;
|
||||
use crate::context::RequestContextBuilder;
|
||||
@@ -87,8 +85,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
|
||||
use crate::l0_flush::L0FlushGlobalState;
|
||||
use crate::metrics::{
|
||||
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
|
||||
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
|
||||
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
|
||||
TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
|
||||
};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
@@ -154,11 +152,10 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
/// as the shared remote storage client and process initialization state.
|
||||
#[derive(Clone)]
|
||||
pub struct TenantSharedResources {
|
||||
pub broker_client: storage_broker::BrokerClientChannel,
|
||||
pub broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
pub remote_storage: GenericRemoteStorage,
|
||||
pub deletion_queue_client: DeletionQueueClient,
|
||||
pub l0_flush_global_state: L0FlushGlobalState,
|
||||
pub basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
}
|
||||
|
||||
/// A [`TenantShard`] is really an _attached_ tenant. The configuration
|
||||
@@ -319,15 +316,12 @@ pub struct TenantShard {
|
||||
gc_cs: tokio::sync::Mutex<()>,
|
||||
walredo_mgr: Option<Arc<WalRedoManager>>,
|
||||
|
||||
/// Provides access to timeline data sitting in the remote storage.
|
||||
// provides access to timeline data sitting in the remote storage
|
||||
pub(crate) remote_storage: GenericRemoteStorage,
|
||||
|
||||
/// Access to global deletion queue for when this tenant wants to schedule a deletion.
|
||||
// Access to global deletion queue for when this tenant wants to schedule a deletion
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
|
||||
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
@@ -1291,7 +1285,6 @@ impl TenantShard {
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
} = resources;
|
||||
|
||||
let attach_mode = attached_conf.location.attach_mode;
|
||||
@@ -1307,7 +1300,6 @@ impl TenantShard {
|
||||
remote_storage.clone(),
|
||||
deletion_queue_client,
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
));
|
||||
|
||||
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
||||
@@ -2114,7 +2106,7 @@ impl TenantShard {
|
||||
async fn unoffload_timeline(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
@@ -2249,7 +2241,7 @@ impl TenantShard {
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
new_state: TimelineArchivalState,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), TimelineArchivalError> {
|
||||
info!("setting timeline archival config");
|
||||
@@ -2578,7 +2570,7 @@ impl TenantShard {
|
||||
pub(crate) async fn create_timeline(
|
||||
self: &Arc<TenantShard>,
|
||||
params: CreateTimelineParams,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
if !self.is_active() {
|
||||
@@ -3306,7 +3298,7 @@ impl TenantShard {
|
||||
/// to delay background jobs. Background jobs can be started right away when None is given.
|
||||
fn activate(
|
||||
self: &Arc<Self>,
|
||||
broker_client: BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
@@ -3355,13 +3347,6 @@ impl TenantShard {
|
||||
activated_timelines += 1;
|
||||
}
|
||||
|
||||
let tid = self.tenant_shard_id.tenant_id.to_string();
|
||||
let shard_id = self.tenant_shard_id.shard_slug().to_string();
|
||||
let offloaded_timeline_count = timelines_offloaded_accessor.len();
|
||||
TENANT_OFFLOADED_TIMELINES
|
||||
.with_label_values(&[&tid, &shard_id])
|
||||
.set(offloaded_timeline_count as u64);
|
||||
|
||||
self.state.send_modify(move |current_state| {
|
||||
assert!(
|
||||
matches!(current_state, TenantState::Activating(_)),
|
||||
@@ -4246,7 +4231,6 @@ impl TenantShard {
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
l0_flush_global_state: L0FlushGlobalState,
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
) -> TenantShard {
|
||||
assert!(!attached_conf.location.generation.is_none());
|
||||
|
||||
@@ -4350,7 +4334,6 @@ impl TenantShard {
|
||||
ongoing_timeline_detach: std::sync::Mutex::default(),
|
||||
gc_block: Default::default(),
|
||||
l0_flush_global_state,
|
||||
basebackup_prepare_sender,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4603,7 +4586,7 @@ impl TenantShard {
|
||||
|
||||
target.cutoffs = GcCutoffs {
|
||||
space: space_cutoff,
|
||||
time: None,
|
||||
time: Lsn::INVALID,
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -4687,7 +4670,7 @@ impl TenantShard {
|
||||
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
|
||||
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
|
||||
target.within_ancestor_pitr =
|
||||
Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time;
|
||||
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4700,15 +4683,13 @@ impl TenantShard {
|
||||
} else {
|
||||
0
|
||||
});
|
||||
if let Some(time_cutoff) = target.cutoffs.time {
|
||||
timeline.metrics.pitr_history_size.set(
|
||||
timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(time_cutoff)
|
||||
.unwrap_or_default()
|
||||
.0,
|
||||
);
|
||||
}
|
||||
timeline.metrics.pitr_history_size.set(
|
||||
timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(target.cutoffs.time)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0,
|
||||
);
|
||||
|
||||
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
@@ -4717,8 +4698,8 @@ impl TenantShard {
|
||||
let original_cutoffs = target.cutoffs.clone();
|
||||
// GC cutoffs should never go back
|
||||
target.cutoffs = GcCutoffs {
|
||||
space: cutoffs.space.max(original_cutoffs.space),
|
||||
time: cutoffs.time.max(original_cutoffs.time),
|
||||
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
|
||||
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5270,7 +5251,6 @@ impl TenantShard {
|
||||
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
|
||||
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
|
||||
l0_flush_global_state: self.l0_flush_global_state.clone(),
|
||||
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5579,14 +5559,6 @@ impl TenantShard {
|
||||
}
|
||||
}
|
||||
|
||||
// Update metrics
|
||||
let tid = self.tenant_shard_id.to_string();
|
||||
let shard_id = self.tenant_shard_id.shard_slug().to_string();
|
||||
let set_key = &[tid.as_str(), shard_id.as_str()][..];
|
||||
TENANT_OFFLOADED_TIMELINES
|
||||
.with_label_values(set_key)
|
||||
.set(manifest.offloaded_timelines.len() as u64);
|
||||
|
||||
// Upload the manifest. Remote storage does no retries internally, so retry here.
|
||||
match backoff::retry(
|
||||
|| async {
|
||||
@@ -5853,8 +5825,6 @@ pub(crate) mod harness {
|
||||
) -> anyhow::Result<Arc<TenantShard>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
|
||||
let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
let tenant = Arc::new(TenantShard::new(
|
||||
TenantState::Attaching,
|
||||
self.conf,
|
||||
@@ -5872,7 +5842,6 @@ pub(crate) mod harness {
|
||||
self.deletion_queue.new_client(),
|
||||
// TODO: ideally we should run all unit tests with both configs
|
||||
L0FlushGlobalState::new(L0FlushConfig::default()),
|
||||
basebackup_requst_sender,
|
||||
));
|
||||
|
||||
let preload = tenant
|
||||
@@ -8626,10 +8595,8 @@ mod tests {
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<Bytes>, GetVectoredError> {
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
tline.conf.get_vectored_concurrent_io,
|
||||
tline.gate.enter().unwrap(),
|
||||
);
|
||||
let io_concurrency =
|
||||
IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
|
||||
let mut res = tline
|
||||
@@ -8967,7 +8934,7 @@ mod tests {
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Some(Lsn(0x30));
|
||||
guard.cutoffs.time = Lsn(0x30);
|
||||
guard.cutoffs.space = Lsn(0x30);
|
||||
}
|
||||
|
||||
@@ -9075,7 +9042,7 @@ mod tests {
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Some(Lsn(0x40));
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
guard.cutoffs.space = Lsn(0x40);
|
||||
}
|
||||
tline
|
||||
@@ -9493,7 +9460,7 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -9577,7 +9544,7 @@ mod tests {
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Some(Lsn(0x40));
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
guard.cutoffs.space = Lsn(0x40);
|
||||
}
|
||||
tline
|
||||
@@ -10048,7 +10015,7 @@ mod tests {
|
||||
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -10111,7 +10078,7 @@ mod tests {
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time.unwrap_or_default()
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
@@ -10189,7 +10156,7 @@ mod tests {
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Some(Lsn(0x38));
|
||||
guard.cutoffs.time = Lsn(0x38);
|
||||
guard.cutoffs.space = Lsn(0x38);
|
||||
}
|
||||
tline
|
||||
@@ -10297,7 +10264,7 @@ mod tests {
|
||||
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -10360,7 +10327,7 @@ mod tests {
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time.unwrap_or_default()
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
@@ -10546,7 +10513,7 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x10)),
|
||||
time: Lsn(0x10),
|
||||
space: Lsn(0x10),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -10566,7 +10533,7 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x50)),
|
||||
time: Lsn(0x50),
|
||||
space: Lsn(0x50),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -11287,7 +11254,7 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -11676,7 +11643,7 @@ mod tests {
|
||||
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -11739,7 +11706,7 @@ mod tests {
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time.unwrap_or_default()
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
@@ -11928,7 +11895,7 @@ mod tests {
|
||||
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
@@ -11991,7 +11958,7 @@ mod tests {
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time.unwrap_or_default()
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
@@ -12254,7 +12221,7 @@ mod tests {
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Some(Lsn(0x30)),
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
|
||||
@@ -235,7 +235,7 @@ pub(super) async fn gather_inputs(
|
||||
// than our internal space cutoff. This means that if someone drops a database and waits for their
|
||||
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
|
||||
// the space cutoff.
|
||||
let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None
|
||||
let mut next_pitr_cutoff = gc_info.cutoffs.time;
|
||||
|
||||
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
|
||||
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
|
||||
|
||||
@@ -31,7 +31,6 @@ pub use inmemory_layer::InMemoryLayer;
|
||||
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
|
||||
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
|
||||
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
|
||||
use pageserver_api::config::GetVectoredConcurrentIo;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
@@ -44,6 +43,7 @@ use self::inmemory_layer::InMemoryLayerFileId;
|
||||
use super::PageReconstructError;
|
||||
use super::layer_map::InMemoryLayerDesc;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
@@ -318,10 +318,11 @@ impl IoConcurrency {
|
||||
}
|
||||
|
||||
pub(crate) fn spawn_from_conf(
|
||||
conf: GetVectoredConcurrentIo,
|
||||
conf: &'static PageServerConf,
|
||||
gate_guard: GateGuard,
|
||||
) -> IoConcurrency {
|
||||
let selected = match conf {
|
||||
use pageserver_api::config::GetVectoredConcurrentIo;
|
||||
let selected = match conf.get_vectored_concurrent_io {
|
||||
GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
|
||||
GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
|
||||
};
|
||||
|
||||
@@ -63,28 +63,7 @@ pub struct InMemoryLayer {
|
||||
|
||||
opened_at: Instant,
|
||||
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The [`IndexEntry`] is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
///
|
||||
/// We use a separate lock for the index to reduce the critical section
|
||||
/// during which reads cannot be planned.
|
||||
///
|
||||
/// If you need access to both the index and the underlying file at the same time,
|
||||
/// respect the following locking order to avoid deadlocks:
|
||||
/// 1. [`InMemoryLayer::inner`]
|
||||
/// 2. [`InMemoryLayer::index`]
|
||||
///
|
||||
/// Note that the file backing [`InMemoryLayer::inner`] is append-only,
|
||||
/// so it is not necessary to hold simultaneous locks on index.
|
||||
/// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
|
||||
/// In particular:
|
||||
/// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
|
||||
/// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
|
||||
index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
|
||||
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once,
|
||||
/// and `index` (see rationale there).
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once.
|
||||
/// All other changing parts are in `inner`, and protected by a mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
@@ -102,6 +81,11 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The [`IndexEntry`] is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
|
||||
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
@@ -121,7 +105,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
|
||||
trailing_ones
|
||||
};
|
||||
|
||||
/// See [`InMemoryLayer::index`].
|
||||
/// See [`InMemoryLayerInner::index`].
|
||||
///
|
||||
/// For memory efficiency, the data is packed into a u64.
|
||||
///
|
||||
@@ -441,7 +425,7 @@ impl InMemoryLayer {
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.attached_child();
|
||||
|
||||
let index = self.index.read().await;
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
struct ValueRead {
|
||||
entry_lsn: Lsn,
|
||||
@@ -451,7 +435,10 @@ impl InMemoryLayer {
|
||||
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
|
||||
for (key, vec_map) in inner
|
||||
.index
|
||||
.range(range.start.to_compact()..range.end.to_compact())
|
||||
{
|
||||
let key = Key::from_compact(*key);
|
||||
let slice = vec_map.slice_range(lsn_range.clone());
|
||||
|
||||
@@ -479,7 +466,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
|
||||
drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
|
||||
let read_from = Arc::clone(self);
|
||||
let read_ctx = ctx.attached_child();
|
||||
reconstruct_state
|
||||
@@ -586,8 +573,8 @@ impl InMemoryLayer {
|
||||
start_lsn,
|
||||
end_lsn: OnceLock::new(),
|
||||
opened_at: Instant::now(),
|
||||
index: RwLock::new(BTreeMap::new()),
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
index: BTreeMap::new(),
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
@@ -605,39 +592,31 @@ impl InMemoryLayer {
|
||||
serialized_batch: SerializedValueBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let (base_offset, metadata) = {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let base_offset = inner.file.len();
|
||||
let base_offset = inner.file.len();
|
||||
|
||||
let SerializedValueBatch {
|
||||
raw,
|
||||
metadata,
|
||||
max_lsn: _,
|
||||
len: _,
|
||||
} = serialized_batch;
|
||||
let SerializedValueBatch {
|
||||
raw,
|
||||
metadata,
|
||||
max_lsn: _,
|
||||
len: _,
|
||||
} = serialized_batch;
|
||||
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
// also IndexEntry and higher levels in
|
||||
//the code don't allow the file to grow that large
|
||||
.unwrap();
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
(base_offset, metadata)
|
||||
};
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
// also IndexEntry and higher levels in
|
||||
//the code don't allow the file to grow that large
|
||||
.unwrap();
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
// Update the index with the new entries
|
||||
let mut index = self.index.write().await;
|
||||
|
||||
for meta in metadata {
|
||||
let SerializedValueMeta {
|
||||
key,
|
||||
@@ -660,7 +639,7 @@ impl InMemoryLayer {
|
||||
will_init,
|
||||
})?;
|
||||
|
||||
let vec_map = index.entry(key).or_default();
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
|
||||
if old.is_some() {
|
||||
// This should not break anything, but is unexpected: ingestion code aims to filter out
|
||||
@@ -679,6 +658,8 @@ impl InMemoryLayer {
|
||||
);
|
||||
}
|
||||
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -699,18 +680,6 @@ impl InMemoryLayer {
|
||||
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
///
|
||||
/// A note on locking:
|
||||
/// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
|
||||
/// writes while freezing the layer. This is enforced at a higher level via
|
||||
/// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
|
||||
/// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
|
||||
/// Timeline::write_lock for its lifetime. The rolling is handled in
|
||||
/// [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
|
||||
/// so can't be called from different threads.
|
||||
/// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
|
||||
/// This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
|
||||
/// hence there can be no concurrent writes
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
assert!(
|
||||
self.start_lsn < end_lsn,
|
||||
@@ -731,8 +700,8 @@ impl InMemoryLayer {
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
{
|
||||
let index = self.index.read().await;
|
||||
for vec_map in index.values() {
|
||||
let inner = self.inner.write().await;
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
@@ -755,11 +724,14 @@ impl InMemoryLayer {
|
||||
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. See the comment on
|
||||
// [`InMemoryLayer::freeze`] to understand how locking between the append path
|
||||
// and layer flushing works.
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
// though: another thread might have grabbed a reference to this layer
|
||||
// in `get_layer_for_write' just before the checkpointer called
|
||||
// `freeze`, and then `write_to_disk` on it. When the thread gets the
|
||||
// lock, it will see that it's not writeable anymore and retry, but it
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().await;
|
||||
let index = self.index.read().await;
|
||||
|
||||
use l0_flush::Inner;
|
||||
let _concurrency_permit = match l0_flush_global_state {
|
||||
@@ -771,9 +743,13 @@ impl InMemoryLayer {
|
||||
let key_count = if let Some(key_range) = key_range {
|
||||
let key_range = key_range.start.to_compact()..key_range.end.to_compact();
|
||||
|
||||
index.iter().filter(|(k, _)| key_range.contains(k)).count()
|
||||
inner
|
||||
.index
|
||||
.iter()
|
||||
.filter(|(k, _)| key_range.contains(k))
|
||||
.count()
|
||||
} else {
|
||||
index.len()
|
||||
inner.index.len()
|
||||
};
|
||||
if key_count == 0 {
|
||||
return Ok(None);
|
||||
@@ -796,7 +772,7 @@ impl InMemoryLayer {
|
||||
let file_contents = inner.file.load_to_io_buf(ctx).await?;
|
||||
let file_contents = file_contents.freeze();
|
||||
|
||||
for (key, vec_map) in index.iter() {
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, entry) in vec_map
|
||||
.as_slice()
|
||||
|
||||
@@ -14,7 +14,6 @@ pub mod span;
|
||||
pub mod uninit;
|
||||
mod walreceiver;
|
||||
|
||||
use hashlink::LruCache;
|
||||
use std::array;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::btree_map::Entry;
|
||||
@@ -24,6 +23,8 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use anyhow::{Context, Result, anyhow, bail, ensure};
|
||||
use arc_swap::{ArcSwap, ArcSwapOption};
|
||||
use bytes::Bytes;
|
||||
@@ -60,7 +61,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
|
||||
use rand::Rng;
|
||||
use remote_storage::DownloadError;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::sync::{Notify, oneshot, watch};
|
||||
@@ -92,12 +92,10 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
|
||||
use super::tasks::log_compaction_error;
|
||||
use super::upload_queue::NotInitialized;
|
||||
use super::{
|
||||
AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
|
||||
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
|
||||
debug_assert_current_span_has_tenant_and_timeline_id,
|
||||
};
|
||||
use crate::PERF_TRACE_TARGET;
|
||||
use crate::aux_file::AuxFileSizeEstimator;
|
||||
use crate::basebackup_cache::BasebackupPrepareRequest;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{
|
||||
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
|
||||
@@ -131,7 +129,6 @@ use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use crate::walingest::WalLagCooldown;
|
||||
use crate::walredo::RedoAttemptType;
|
||||
use crate::{ZERO_PAGE, task_mgr, walredo};
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
@@ -197,7 +194,16 @@ pub struct TimelineResources {
|
||||
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
|
||||
pub l0_compaction_trigger: Arc<Notify>,
|
||||
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
|
||||
pub basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
}
|
||||
|
||||
/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
|
||||
/// ingestion considerably, because WAL ingestion needs to check on most records if the record
|
||||
/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end
|
||||
/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the
|
||||
/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
|
||||
pub(crate) struct RelSizeCache {
|
||||
pub(crate) complete_as_of: Lsn,
|
||||
pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
@@ -358,8 +364,7 @@ pub struct Timeline {
|
||||
pub walreceiver: Mutex<Option<WalReceiver>>,
|
||||
|
||||
/// Relation size cache
|
||||
pub(crate) rel_size_latest_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
pub(crate) rel_size_snapshot_cache: Mutex<LruCache<(Lsn, RelTag), BlockNumber>>,
|
||||
pub(crate) rel_size_cache: RwLock<RelSizeCache>,
|
||||
|
||||
download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
|
||||
|
||||
@@ -441,9 +446,6 @@ pub struct Timeline {
|
||||
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
|
||||
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore,
|
||||
|
||||
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
|
||||
basebackup_prepare_sender: BasebackupPrepareSender,
|
||||
}
|
||||
|
||||
pub(crate) enum PreviousHeatmap {
|
||||
@@ -534,24 +536,29 @@ impl GcInfo {
|
||||
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
|
||||
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
|
||||
/// between time-based and space-based retention for observability and consumption metrics purposes.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct GcCutoffs {
|
||||
/// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
|
||||
/// history we must keep to retain a specified number of bytes of WAL.
|
||||
pub(crate) space: Lsn,
|
||||
|
||||
/// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates
|
||||
/// how much history we must keep to enable reading back at least the PITR interval duration.
|
||||
///
|
||||
/// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield
|
||||
/// Some(last_record_lsn).
|
||||
pub(crate) time: Option<Lsn>,
|
||||
/// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much
|
||||
/// history we must keep to enable reading back at least the PITR interval duration.
|
||||
pub(crate) time: Lsn,
|
||||
}
|
||||
|
||||
impl Default for GcCutoffs {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
space: Lsn::INVALID,
|
||||
time: Lsn::INVALID,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GcCutoffs {
|
||||
fn select_min(&self) -> Lsn {
|
||||
// NB: if we haven't computed the PITR cutoff yet, we can't GC anything.
|
||||
self.space.min(self.time.unwrap_or_default())
|
||||
std::cmp::min(self.space, self.time)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1033,7 +1040,6 @@ pub(crate) enum WaitLsnWaiter<'a> {
|
||||
Tenant,
|
||||
PageService,
|
||||
HttpEndpoint,
|
||||
BaseBackupCache,
|
||||
}
|
||||
|
||||
/// Argument to [`Timeline::shutdown`].
|
||||
@@ -1089,14 +1095,11 @@ impl Timeline {
|
||||
/// Get the bytes written since the PITR cutoff on this branch, and
|
||||
/// whether this branch's ancestor_lsn is within its parent's PITR.
|
||||
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
|
||||
// TODO: for backwards compatibility, we return the full history back to 0 when the PITR
|
||||
// cutoff has not yet been initialized. This should return None instead, but this is exposed
|
||||
// in external HTTP APIs and callers may not handle a null value.
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let history = self
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(gc_info.cutoffs.time.unwrap_or_default())
|
||||
.unwrap_or_default()
|
||||
.checked_sub(gc_info.cutoffs.time)
|
||||
.unwrap_or(Lsn(0))
|
||||
.0;
|
||||
(history, gc_info.within_ancestor_pitr)
|
||||
}
|
||||
@@ -1106,10 +1109,9 @@ impl Timeline {
|
||||
self.applied_gc_cutoff_lsn.read()
|
||||
}
|
||||
|
||||
/// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed
|
||||
/// to read (based on configured PITR), even if physically we have more history. Returns None
|
||||
/// if the PITR cutoff has not yet been initialized.
|
||||
pub(crate) fn get_gc_cutoff_lsn(&self) -> Option<Lsn> {
|
||||
/// Read timeline's planned GC cutoff: this is the logical end of history that users
|
||||
/// are allowed to read (based on configured PITR), even if physically we have more history.
|
||||
pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
|
||||
self.gc_info.read().unwrap().cutoffs.time
|
||||
}
|
||||
|
||||
@@ -1560,8 +1562,7 @@ impl Timeline {
|
||||
}
|
||||
WaitLsnWaiter::Tenant
|
||||
| WaitLsnWaiter::PageService
|
||||
| WaitLsnWaiter::HttpEndpoint
|
||||
| WaitLsnWaiter::BaseBackupCache => unreachable!(
|
||||
| WaitLsnWaiter::HttpEndpoint => unreachable!(
|
||||
"tenant or page_service context are not expected to have task kind {:?}",
|
||||
ctx.task_kind()
|
||||
),
|
||||
@@ -2078,7 +2079,7 @@ impl Timeline {
|
||||
pub(crate) fn activate(
|
||||
self: &Arc<Self>,
|
||||
parent: Arc<crate::tenant::TenantShard>,
|
||||
broker_client: BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
@@ -2466,41 +2467,6 @@ impl Timeline {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn is_basebackup_cache_enabled(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.basebackup_cache_enabled
|
||||
.unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
|
||||
}
|
||||
|
||||
/// Prepare basebackup for the given LSN and store it in the basebackup cache.
|
||||
/// The method is asynchronous and returns immediately.
|
||||
/// The actual basebackup preparation is performed in the background
|
||||
/// by the basebackup cache on a best-effort basis.
|
||||
pub(crate) fn prepare_basebackup(&self, lsn: Lsn) {
|
||||
if !self.is_basebackup_cache_enabled() {
|
||||
return;
|
||||
}
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// In theory we should never get here, but just in case check it.
|
||||
// Preparing basebackup doesn't make sense for shards other than shard zero.
|
||||
return;
|
||||
}
|
||||
|
||||
let res = self
|
||||
.basebackup_prepare_sender
|
||||
.send(BasebackupPrepareRequest {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
timeline_id: self.timeline_id,
|
||||
lsn,
|
||||
});
|
||||
if let Err(e) = res {
|
||||
// May happen during shutdown, it's not critical.
|
||||
info!("Failed to send shutdown checkpoint: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of times we will compute partition within a checkpoint distance.
|
||||
@@ -2578,13 +2544,6 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
|
||||
}
|
||||
|
||||
pub(crate) fn get_pitr_interval(&self) -> Duration {
|
||||
let tenant_conf = &self.tenant_conf.load().tenant_conf;
|
||||
tenant_conf
|
||||
.pitr_interval
|
||||
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
|
||||
}
|
||||
|
||||
fn get_compaction_period(&self) -> Duration {
|
||||
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
|
||||
tenant_conf
|
||||
@@ -2860,13 +2819,6 @@ impl Timeline {
|
||||
|
||||
self.remote_client.update_config(&new_conf.location);
|
||||
|
||||
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
|
||||
if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
|
||||
if new_capacity != rel_size_cache.capacity() {
|
||||
rel_size_cache.set_capacity(new_capacity);
|
||||
}
|
||||
}
|
||||
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -2925,14 +2877,6 @@ impl Timeline {
|
||||
ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
|
||||
}
|
||||
|
||||
let relsize_snapshot_cache_capacity = {
|
||||
let loaded_tenant_conf = tenant_conf.load();
|
||||
loaded_tenant_conf
|
||||
.tenant_conf
|
||||
.relsize_snapshot_cache_capacity
|
||||
.unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
|
||||
};
|
||||
|
||||
Arc::new_cyclic(|myself| {
|
||||
let metrics = Arc::new(TimelineMetrics::new(
|
||||
&tenant_shard_id,
|
||||
@@ -3024,8 +2968,10 @@ impl Timeline {
|
||||
last_image_layer_creation_check_instant: Mutex::new(None),
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_latest_cache: RwLock::new(HashMap::new()),
|
||||
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
|
||||
rel_size_cache: RwLock::new(RelSizeCache {
|
||||
complete_as_of: disk_consistent_lsn,
|
||||
map: HashMap::new(),
|
||||
}),
|
||||
|
||||
download_all_remote_layers_task_info: RwLock::new(None),
|
||||
|
||||
@@ -3070,8 +3016,6 @@ impl Timeline {
|
||||
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
|
||||
|
||||
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
|
||||
|
||||
basebackup_prepare_sender: resources.basebackup_prepare_sender,
|
||||
};
|
||||
|
||||
result.repartition_threshold =
|
||||
@@ -3169,7 +3113,7 @@ impl Timeline {
|
||||
fn launch_wal_receiver(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestContext,
|
||||
broker_client: BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
) {
|
||||
info!(
|
||||
"launching WAL receiver for timeline {} of tenant {}",
|
||||
@@ -3585,7 +3529,7 @@ impl Timeline {
|
||||
};
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self_ref.conf.get_vectored_concurrent_io,
|
||||
self_ref.conf,
|
||||
self_ref
|
||||
.gate
|
||||
.enter()
|
||||
@@ -5614,7 +5558,7 @@ impl Timeline {
|
||||
});
|
||||
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
self.conf.get_vectored_concurrent_io,
|
||||
self.conf,
|
||||
self.gate
|
||||
.enter()
|
||||
.map_err(|_| CreateImageLayersError::Cancelled)?,
|
||||
@@ -6285,12 +6229,14 @@ impl Timeline {
|
||||
|
||||
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
|
||||
|
||||
if cfg!(test) && pitr == Duration::ZERO {
|
||||
if cfg!(test) {
|
||||
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
|
||||
return Ok(GcCutoffs {
|
||||
time: Some(self.get_last_record_lsn()),
|
||||
space: space_cutoff,
|
||||
});
|
||||
if pitr == Duration::ZERO {
|
||||
return Ok(GcCutoffs {
|
||||
time: self.get_last_record_lsn(),
|
||||
space: space_cutoff,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate a time-based limit on how much to retain:
|
||||
@@ -6304,14 +6250,14 @@ impl Timeline {
|
||||
// PITR is not set. Retain the size-based limit, or the default time retention,
|
||||
// whichever requires less data.
|
||||
GcCutoffs {
|
||||
time: Some(self.get_last_record_lsn()),
|
||||
time: self.get_last_record_lsn(),
|
||||
space: std::cmp::max(time_cutoff, space_cutoff),
|
||||
}
|
||||
}
|
||||
(Duration::ZERO, None) => {
|
||||
// PITR is not set, and time lookup failed
|
||||
GcCutoffs {
|
||||
time: Some(self.get_last_record_lsn()),
|
||||
time: self.get_last_record_lsn(),
|
||||
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
@@ -6319,7 +6265,7 @@ impl Timeline {
|
||||
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
|
||||
// cannot advance beyond what was already GC'd, and respect space-based retention
|
||||
GcCutoffs {
|
||||
time: Some(*self.get_applied_gc_cutoff_lsn()),
|
||||
time: *self.get_applied_gc_cutoff_lsn(),
|
||||
space: space_cutoff,
|
||||
}
|
||||
}
|
||||
@@ -6327,7 +6273,7 @@ impl Timeline {
|
||||
// PITR interval is set and we looked up timestamp successfully. Ignore
|
||||
// size based retention and make time cutoff authoritative
|
||||
GcCutoffs {
|
||||
time: Some(time_cutoff),
|
||||
time: time_cutoff,
|
||||
space: time_cutoff,
|
||||
}
|
||||
}
|
||||
@@ -6380,7 +6326,7 @@ impl Timeline {
|
||||
)
|
||||
};
|
||||
|
||||
let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
|
||||
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
|
||||
let standby_horizon = self.standby_horizon.load();
|
||||
// Hold GC for the standby, but as a safety guard do it only within some
|
||||
// reasonable lag.
|
||||
@@ -6429,7 +6375,7 @@ impl Timeline {
|
||||
async fn gc_timeline(
|
||||
&self,
|
||||
space_cutoff: Lsn,
|
||||
time_cutoff: Option<Lsn>, // None if uninitialized
|
||||
time_cutoff: Lsn,
|
||||
retain_lsns: Vec<Lsn>,
|
||||
max_lsn_with_valid_lease: Option<Lsn>,
|
||||
new_gc_cutoff: Lsn,
|
||||
@@ -6448,12 +6394,6 @@ impl Timeline {
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let Some(time_cutoff) = time_cutoff else {
|
||||
// The GC cutoff should have been computed by now, but let's be defensive.
|
||||
info!("Nothing to GC: time_cutoff not yet computed");
|
||||
return Ok(result);
|
||||
};
|
||||
|
||||
// We need to ensure that no one tries to read page versions or create
|
||||
// branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
|
||||
// for details. This will block until the old value is no longer in use.
|
||||
|
||||
@@ -1526,7 +1526,7 @@ impl Timeline {
|
||||
info!(
|
||||
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
|
||||
checked {layers_checked}/{layers_total} layers \
|
||||
(latest_gc_cutoff={} pitr_cutoff={:?})",
|
||||
(latest_gc_cutoff={} pitr_cutoff={})",
|
||||
layers_to_rewrite.len(),
|
||||
drop_layers.len(),
|
||||
*latest_gc_cutoff,
|
||||
|
||||
@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
|
||||
"removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
|
||||
);
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
detached.conf.get_vectored_concurrent_io,
|
||||
detached.conf,
|
||||
detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
|
||||
);
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
|
||||
@@ -161,7 +161,7 @@ impl<'t> UninitializedTimeline<'t> {
|
||||
tenant: Arc<TenantShard>,
|
||||
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
|
||||
base_lsn: Lsn,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
self.write(|raw_timeline| async move {
|
||||
|
||||
@@ -28,7 +28,6 @@ use std::num::NonZeroU64;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -70,7 +69,7 @@ impl WalReceiver {
|
||||
pub fn start(
|
||||
timeline: Arc<Timeline>,
|
||||
conf: WalReceiverConf,
|
||||
mut broker_client: BrokerClientChannel,
|
||||
mut broker_client: storage_broker::TimelineUpdatesSubscriber,
|
||||
ctx: &RequestContext,
|
||||
) -> Self {
|
||||
let tenant_shard_id = timeline.tenant_shard_id;
|
||||
|
||||
@@ -17,19 +17,12 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use storage_broker::proto::{
|
||||
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
|
||||
SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription,
|
||||
TypedMessage,
|
||||
};
|
||||
use storage_broker::{BrokerClientChannel, Code, Streaming};
|
||||
use storage_broker::proto::SafekeeperDiscoveryResponse;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::backoff::{
|
||||
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
|
||||
};
|
||||
use utils::id::{NodeId, TenantTimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
use utils::postgres_client::{
|
||||
@@ -56,7 +49,7 @@ pub(crate) struct Cancelled;
|
||||
///
|
||||
/// Not cancellation-safe. Use `cancel` token to request cancellation.
|
||||
pub(super) async fn connection_manager_loop_step(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
broker_client: &mut storage_broker::TimelineUpdatesSubscriber,
|
||||
connection_manager_state: &mut ConnectionManagerState,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
@@ -81,11 +74,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
WALRECEIVER_ACTIVE_MANAGERS.dec();
|
||||
}
|
||||
|
||||
let id = TenantTimelineId {
|
||||
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
|
||||
timeline_id: connection_manager_state.timeline.timeline_id,
|
||||
};
|
||||
|
||||
let mut timeline_state_updates = connection_manager_state
|
||||
.timeline
|
||||
.subscribe_for_state_updates();
|
||||
@@ -101,7 +89,12 @@ pub(super) async fn connection_manager_loop_step(
|
||||
// Subscribe to the broker updates. Stream shares underlying TCP connection
|
||||
// with other streams on this client (other connection managers). When
|
||||
// object goes out of scope, stream finishes in drop() automatically.
|
||||
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
|
||||
let (timeline_updates, mut discovery_requester) = broker_client.subscribe(
|
||||
connection_manager_state.timeline.tenant_shard_id,
|
||||
connection_manager_state.timeline.timeline_id,
|
||||
cancel,
|
||||
);
|
||||
let mut timeline_updates = Box::pin(timeline_updates);
|
||||
debug!("Subscribed for broker timeline updates");
|
||||
|
||||
loop {
|
||||
@@ -155,29 +148,10 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
},
|
||||
|
||||
// Got a new update from the broker
|
||||
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
|
||||
match broker_update {
|
||||
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
|
||||
// tonic's error handling doesn't provide a clear code for disconnections: we get
|
||||
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
|
||||
// => https://github.com/neondatabase/neon/issues/9562
|
||||
info!("broker disconnected: {status}");
|
||||
},
|
||||
_ => {
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
// Got a new update from the broker.
|
||||
// The stream ends with None if and only if `cancel` is cancelled.
|
||||
Some(timeline_update) = timeline_updates.next() => {
|
||||
connection_manager_state.register_timeline_update(timeline_update)
|
||||
},
|
||||
|
||||
new_event = async {
|
||||
@@ -258,32 +232,11 @@ pub(super) async fn connection_manager_loop_step(
|
||||
tokio::time::sleep(next_discovery_ts - now).await;
|
||||
}
|
||||
|
||||
let tenant_timeline_id = Some(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
});
|
||||
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
|
||||
let msg = TypedMessage {
|
||||
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
|
||||
safekeeper_timeline_info: None,
|
||||
safekeeper_discovery_request: Some(request),
|
||||
safekeeper_discovery_response: None,
|
||||
};
|
||||
info!("No active connection and no candidates, sending discovery request to the broker");
|
||||
discovery_requester.request().await;
|
||||
|
||||
last_discovery_ts = Some(std::time::Instant::now());
|
||||
info!("No active connection and no candidates, sending discovery request to the broker");
|
||||
|
||||
// Cancellation safety: we want to send a message to the broker, but publish_one()
|
||||
// function can get cancelled by the other select! arm. This is absolutely fine, because
|
||||
// we just want to receive broker updates and discovery is not important if we already
|
||||
// receive updates.
|
||||
//
|
||||
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
|
||||
// This is totally fine because of the reason above.
|
||||
|
||||
// This is a fire-and-forget request, we don't care about the response
|
||||
let _ = broker_client.publish_one(msg).await;
|
||||
debug!("Discovery request sent to the broker");
|
||||
None
|
||||
} => {}
|
||||
}
|
||||
@@ -298,63 +251,6 @@ pub(super) async fn connection_manager_loop_step(
|
||||
}
|
||||
}
|
||||
|
||||
/// Endlessly try to subscribe for broker updates for a given timeline.
|
||||
async fn subscribe_for_timeline_updates(
|
||||
broker_client: &mut BrokerClientChannel,
|
||||
id: TenantTimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Streaming<TypedMessage>, Cancelled> {
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
// subscribe to the specific timeline
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperTimelineInfo as i32,
|
||||
},
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
|
||||
},
|
||||
],
|
||||
tenant_timeline_id: Some(FilterTenantTimelineId {
|
||||
enabled: true,
|
||||
tenant_timeline_id: Some(ProtoTenantTimelineId {
|
||||
tenant_id: id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: id.timeline_id.as_ref().to_owned(),
|
||||
}),
|
||||
}),
|
||||
};
|
||||
|
||||
match {
|
||||
tokio::select! {
|
||||
r = broker_client.subscribe_by_filter(request) => { r }
|
||||
_ = cancel.cancelled() => { return Err(Cancelled); }
|
||||
}
|
||||
} {
|
||||
Ok(resp) => {
|
||||
return Ok(resp.into_inner());
|
||||
}
|
||||
Err(e) => {
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
|
||||
info!(
|
||||
"Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1;
|
||||
const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0;
|
||||
const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
|
||||
@@ -695,44 +591,14 @@ impl ConnectionManagerState {
|
||||
}
|
||||
|
||||
/// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
|
||||
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
|
||||
let mut is_discovery = false;
|
||||
let timeline_update = match typed_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => {
|
||||
let info = match typed_msg.safekeeper_timeline_info {
|
||||
Some(info) => info,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_timeline_info");
|
||||
return;
|
||||
}
|
||||
};
|
||||
SafekeeperDiscoveryResponse {
|
||||
safekeeper_id: info.safekeeper_id,
|
||||
tenant_timeline_id: info.tenant_timeline_id,
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
standby_horizon: info.standby_horizon,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
is_discovery = true;
|
||||
match typed_msg.safekeeper_discovery_response {
|
||||
Some(response) => response,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_discovery_response");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// unexpected message
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
fn register_timeline_update(&mut self, timeline_update: storage_broker::TimelineShardUpdate) {
|
||||
WALRECEIVER_BROKER_UPDATES.inc();
|
||||
|
||||
let storage_broker::TimelineShardUpdate {
|
||||
is_discovery,
|
||||
inner: timeline_update,
|
||||
} = timeline_update;
|
||||
|
||||
trace!(
|
||||
"safekeeper info update: standby_horizon(cutoff)={}",
|
||||
timeline_update.standby_horizon
|
||||
@@ -1013,7 +879,7 @@ impl ConnectionManagerState {
|
||||
shard_stripe_size,
|
||||
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
|
||||
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
|
||||
availability_zone: self.conf.availability_zone.as_deref()
|
||||
availability_zone: self.conf.availability_zone.as_deref(),
|
||||
};
|
||||
|
||||
match wal_stream_connection_config(connection_conf_args) {
|
||||
|
||||
@@ -1316,10 +1316,6 @@ impl WalIngest {
|
||||
}
|
||||
});
|
||||
|
||||
if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
|
||||
modification.tline.prepare_basebackup(lsn);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1688,31 +1684,31 @@ mod tests {
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
|
||||
.await?,
|
||||
false
|
||||
);
|
||||
assert!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
|
||||
.await
|
||||
.is_err()
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
|
||||
.await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
|
||||
.await?,
|
||||
1
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
|
||||
.await?,
|
||||
3
|
||||
);
|
||||
@@ -1723,7 +1719,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Version::at(Lsn(0x20)),
|
||||
Version::Lsn(Lsn(0x20)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1737,7 +1733,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Version::at(Lsn(0x30)),
|
||||
Version::Lsn(Lsn(0x30)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1751,7 +1747,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Version::at(Lsn(0x40)),
|
||||
Version::Lsn(Lsn(0x40)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1764,7 +1760,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
1,
|
||||
Version::at(Lsn(0x40)),
|
||||
Version::Lsn(Lsn(0x40)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1778,7 +1774,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Version::at(Lsn(0x50)),
|
||||
Version::Lsn(Lsn(0x50)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1791,7 +1787,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
1,
|
||||
Version::at(Lsn(0x50)),
|
||||
Version::Lsn(Lsn(0x50)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1804,7 +1800,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
2,
|
||||
Version::at(Lsn(0x50)),
|
||||
Version::Lsn(Lsn(0x50)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1824,7 +1820,7 @@ mod tests {
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
|
||||
.await?,
|
||||
2
|
||||
);
|
||||
@@ -1833,7 +1829,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Version::at(Lsn(0x60)),
|
||||
Version::Lsn(Lsn(0x60)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1846,7 +1842,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
1,
|
||||
Version::at(Lsn(0x60)),
|
||||
Version::Lsn(Lsn(0x60)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1858,7 +1854,7 @@ mod tests {
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
|
||||
.await?,
|
||||
3
|
||||
);
|
||||
@@ -1867,7 +1863,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
2,
|
||||
Version::at(Lsn(0x50)),
|
||||
Version::Lsn(Lsn(0x50)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1884,7 +1880,7 @@ mod tests {
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
|
||||
.await?,
|
||||
0
|
||||
);
|
||||
@@ -1897,7 +1893,7 @@ mod tests {
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
|
||||
.await?,
|
||||
2
|
||||
);
|
||||
@@ -1906,7 +1902,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
0,
|
||||
Version::at(Lsn(0x70)),
|
||||
Version::Lsn(Lsn(0x70)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1919,7 +1915,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
1,
|
||||
Version::at(Lsn(0x70)),
|
||||
Version::Lsn(Lsn(0x70)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1936,7 +1932,7 @@ mod tests {
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
|
||||
.await?,
|
||||
1501
|
||||
);
|
||||
@@ -1946,7 +1942,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
blk,
|
||||
Version::at(Lsn(0x80)),
|
||||
Version::Lsn(Lsn(0x80)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1960,7 +1956,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
1500,
|
||||
Version::at(Lsn(0x80)),
|
||||
Version::Lsn(Lsn(0x80)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -1994,13 +1990,13 @@ mod tests {
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
|
||||
.await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
|
||||
.await?,
|
||||
1
|
||||
);
|
||||
@@ -2015,7 +2011,7 @@ mod tests {
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
|
||||
.await?,
|
||||
false
|
||||
);
|
||||
@@ -2033,13 +2029,13 @@ mod tests {
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
|
||||
.await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
|
||||
.await?,
|
||||
1
|
||||
);
|
||||
@@ -2081,26 +2077,26 @@ mod tests {
|
||||
// The relation was created at LSN 20, not visible at LSN 1 yet.
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
|
||||
.await?,
|
||||
false
|
||||
);
|
||||
assert!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
|
||||
.await
|
||||
.is_err()
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
|
||||
.await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
|
||||
.await?,
|
||||
relsize
|
||||
);
|
||||
@@ -2114,7 +2110,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
blkno,
|
||||
Version::at(lsn),
|
||||
Version::Lsn(lsn),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -2135,7 +2131,7 @@ mod tests {
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
|
||||
.await?,
|
||||
1
|
||||
);
|
||||
@@ -2148,7 +2144,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
blkno,
|
||||
Version::at(Lsn(0x60)),
|
||||
Version::Lsn(Lsn(0x60)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -2161,7 +2157,7 @@ mod tests {
|
||||
// should still see all blocks with older LSN
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
|
||||
.await?,
|
||||
relsize
|
||||
);
|
||||
@@ -2173,7 +2169,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
blkno,
|
||||
Version::at(Lsn(0x50)),
|
||||
Version::Lsn(Lsn(0x50)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -2197,13 +2193,13 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
|
||||
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
|
||||
.await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
|
||||
.await?,
|
||||
relsize
|
||||
);
|
||||
@@ -2216,7 +2212,7 @@ mod tests {
|
||||
.get_rel_page_at_lsn(
|
||||
TESTREL_A,
|
||||
blkno,
|
||||
Version::at(Lsn(0x80)),
|
||||
Version::Lsn(Lsn(0x80)),
|
||||
&ctx,
|
||||
io_concurrency.clone()
|
||||
)
|
||||
@@ -2254,7 +2250,7 @@ mod tests {
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
|
||||
.await?,
|
||||
RELSEG_SIZE + 1
|
||||
);
|
||||
@@ -2268,7 +2264,7 @@ mod tests {
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
|
||||
.await?,
|
||||
RELSEG_SIZE
|
||||
);
|
||||
@@ -2283,7 +2279,7 @@ mod tests {
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
|
||||
.await?,
|
||||
RELSEG_SIZE - 1
|
||||
);
|
||||
@@ -2301,7 +2297,7 @@ mod tests {
|
||||
m.commit(&ctx).await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
|
||||
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
|
||||
.await?,
|
||||
size as BlockNumber
|
||||
);
|
||||
|
||||
17
poetry.lock
generated
17
poetry.lock
generated
@@ -3170,24 +3170,19 @@ pbr = "*"
|
||||
|
||||
[[package]]
|
||||
name = "setuptools"
|
||||
version = "78.1.1"
|
||||
version = "70.0.0"
|
||||
description = "Easily download, build, install, upgrade, and uninstall Python packages"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"},
|
||||
{file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"},
|
||||
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
|
||||
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
|
||||
core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
|
||||
cover = ["pytest-cov"]
|
||||
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
|
||||
enabler = ["pytest-enabler (>=2.2)"]
|
||||
test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
|
||||
type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
|
||||
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
|
||||
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
|
||||
|
||||
[[package]]
|
||||
name = "six"
|
||||
|
||||
@@ -127,4 +127,3 @@ rstest.workspace = true
|
||||
walkdir.workspace = true
|
||||
rand_distr = "0.4"
|
||||
tokio-postgres.workspace = true
|
||||
tracing-test = "0.2"
|
||||
@@ -80,22 +80,10 @@ impl std::fmt::Display for Backend<'_, ()> {
|
||||
.field(&endpoint.url())
|
||||
.finish(),
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
ControlPlaneClient::PostgresMock(endpoint) => {
|
||||
let url = endpoint.url();
|
||||
match url::Url::parse(url) {
|
||||
Ok(mut url) => {
|
||||
let _ = url.set_password(Some("_redacted_"));
|
||||
let url = url.as_str();
|
||||
fmt.debug_tuple("ControlPlane::PostgresMock")
|
||||
.field(&url)
|
||||
.finish()
|
||||
}
|
||||
Err(_) => fmt
|
||||
.debug_tuple("ControlPlane::PostgresMock")
|
||||
.field(&url)
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
ControlPlaneClient::PostgresMock(endpoint) => fmt
|
||||
.debug_tuple("ControlPlane::PostgresMock")
|
||||
.field(&endpoint.url())
|
||||
.finish(),
|
||||
#[cfg(test)]
|
||||
ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
|
||||
},
|
||||
|
||||
@@ -1,13 +1,9 @@
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
use std::env;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, ensure};
|
||||
use arc_swap::ArcSwapOption;
|
||||
use futures::future::Either;
|
||||
@@ -39,8 +35,6 @@ use crate::scram::threadpool::ThreadPool;
|
||||
use crate::serverless::GlobalConnPoolOptions;
|
||||
use crate::serverless::cancel_set::CancelSet;
|
||||
use crate::tls::client_config::compute_client_config_with_root_certs;
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
use crate::url::ApiUrl;
|
||||
use crate::{auth, control_plane, http, serverless, usage_metrics};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -167,11 +161,8 @@ struct ProxyCliArgs {
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Cancellation channel size (max queue size for redis kv client)
|
||||
#[clap(long, default_value_t = 1024)]
|
||||
#[clap(long, default_value = "1024")]
|
||||
cancellation_ch_size: usize,
|
||||
/// Cancellation ops batch size for redis
|
||||
#[clap(long, default_value_t = 8)]
|
||||
cancellation_batch_size: usize,
|
||||
/// cache for `allowed_ips` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
allowed_ips_cache: String,
|
||||
@@ -551,12 +542,7 @@ pub async fn run() -> anyhow::Result<()> {
|
||||
if let Some(mut redis_kv_client) = redis_kv_client {
|
||||
maintenance_tasks.spawn(async move {
|
||||
redis_kv_client.try_connect().await?;
|
||||
handle_cancel_messages(
|
||||
&mut redis_kv_client,
|
||||
rx_cancel,
|
||||
args.cancellation_batch_size,
|
||||
)
|
||||
.await?;
|
||||
handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
|
||||
|
||||
drop(redis_kv_client);
|
||||
|
||||
@@ -783,13 +769,7 @@ fn build_auth_backend(
|
||||
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
AuthBackendType::Postgres => {
|
||||
let mut url: ApiUrl = args.auth_endpoint.parse()?;
|
||||
if url.password().is_none() {
|
||||
let password = env::var("PGPASSWORD")
|
||||
.with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
|
||||
url.set_password(Some(&password))
|
||||
.expect("Failed to set password");
|
||||
}
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let api = control_plane::client::mock::MockControlPlane::new(
|
||||
url,
|
||||
!args.is_private_access_proxy,
|
||||
|
||||
@@ -30,6 +30,8 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
|
||||
type IpSubnetKey = IpNet;
|
||||
|
||||
const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
|
||||
const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
|
||||
const BATCH_SIZE: usize = 8;
|
||||
|
||||
// Message types for sending through mpsc channel
|
||||
pub enum CancelKeyOp {
|
||||
@@ -229,13 +231,12 @@ impl CancelReplyOp {
|
||||
pub async fn handle_cancel_messages(
|
||||
client: &mut RedisKVClient,
|
||||
mut rx: mpsc::Receiver<CancelKeyOp>,
|
||||
batch_size: usize,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut batch = Vec::with_capacity(batch_size);
|
||||
let mut pipeline = Pipeline::with_capacity(batch_size);
|
||||
let mut batch = Vec::with_capacity(BATCH_SIZE);
|
||||
let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
|
||||
|
||||
loop {
|
||||
if rx.recv_many(&mut batch, batch_size).await == 0 {
|
||||
if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
|
||||
warn!("shutting down cancellation queue");
|
||||
break Ok(());
|
||||
}
|
||||
@@ -366,7 +367,8 @@ impl CancellationHandler {
|
||||
return Err(CancelError::InternalError);
|
||||
};
|
||||
|
||||
tx.try_send(op)
|
||||
tx.send_timeout(op, REDIS_SEND_TIMEOUT)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::warn!("failed to send GetCancelData for {key}: {e}");
|
||||
})
|
||||
@@ -568,7 +570,7 @@ impl Session {
|
||||
}
|
||||
|
||||
// Send the store key op to the cancellation handler and set TTL for the key
|
||||
pub(crate) fn write_cancel_key(
|
||||
pub(crate) async fn write_cancel_key(
|
||||
&self,
|
||||
cancel_closure: CancelClosure,
|
||||
) -> Result<(), CancelError> {
|
||||
@@ -594,14 +596,14 @@ impl Session {
|
||||
expire: CANCEL_KEY_TTL,
|
||||
};
|
||||
|
||||
let _ = tx.try_send(op).map_err(|e| {
|
||||
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
|
||||
let key = self.key;
|
||||
tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
|
||||
pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
|
||||
let Some(tx) = &self.cancellation_handler.tx else {
|
||||
tracing::warn!("cancellation handler is not available");
|
||||
return Err(CancelError::InternalError);
|
||||
@@ -617,7 +619,7 @@ impl Session {
|
||||
.guard(RedisMsgKind::HDel),
|
||||
};
|
||||
|
||||
let _ = tx.try_send(op).map_err(|e| {
|
||||
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
|
||||
let key = self.key;
|
||||
tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
|
||||
});
|
||||
|
||||
@@ -244,7 +244,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session = cancellation_handler_clone.get_key();
|
||||
|
||||
session.write_cancel_key(node.cancel_closure.clone())?;
|
||||
session
|
||||
.write_cancel_key(node.cancel_closure.clone())
|
||||
.await?;
|
||||
|
||||
prepare_client_connection(&node, *session.key(), &mut stream).await?;
|
||||
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
use std::cell::RefCell;
|
||||
use std::cell::{Cell, RefCell};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::hash::BuildHasher;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::{env, io};
|
||||
use std::{array, env, fmt, io};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use indexmap::IndexSet;
|
||||
use opentelemetry::trace::TraceContextExt;
|
||||
use scopeguard::defer;
|
||||
use serde::ser::{SerializeMap, Serializer};
|
||||
use tracing::subscriber::Interest;
|
||||
use tracing::{Event, Metadata, Span, Subscriber, callsite, span};
|
||||
@@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields};
|
||||
use tracing_subscriber::layer::{Context, Layer};
|
||||
use tracing_subscriber::prelude::*;
|
||||
use tracing_subscriber::registry::{LookupSpan, SpanRef};
|
||||
use try_lock::TryLock;
|
||||
|
||||
/// Initialize logging and OpenTelemetry tracing and exporter.
|
||||
///
|
||||
@@ -52,7 +55,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
StderrWriter {
|
||||
stderr: std::io::stderr(),
|
||||
},
|
||||
&["request_id", "session_id", "conn_id"],
|
||||
["request_id", "session_id", "conn_id"],
|
||||
))
|
||||
} else {
|
||||
None
|
||||
@@ -180,65 +183,50 @@ impl Clock for RealClock {
|
||||
/// Name of the field used by tracing crate to store the event message.
|
||||
const MESSAGE_FIELD: &str = "message";
|
||||
|
||||
/// Tracing used to enforce that spans/events have no more than 32 fields.
|
||||
/// It seems this is no longer the case, but it's still documented in some places.
|
||||
/// Generally, we shouldn't expect more than 32 fields anyway, so we can try and
|
||||
/// rely on it for some (minor) performance gains.
|
||||
const MAX_TRACING_FIELDS: usize = 32;
|
||||
|
||||
thread_local! {
|
||||
/// Protects against deadlocks and double panics during log writing.
|
||||
/// The current panic handler will use tracing to log panic information.
|
||||
static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
|
||||
/// Thread-local instance with per-thread buffer for log writing.
|
||||
static EVENT_FORMATTER: RefCell<EventFormatter> = const { RefCell::new(EventFormatter::new()) };
|
||||
static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
|
||||
/// Cached OS thread ID.
|
||||
static THREAD_ID: u64 = gettid::gettid();
|
||||
}
|
||||
|
||||
/// Map for values fixed at callsite registration.
|
||||
// We use papaya here because registration rarely happens post-startup.
|
||||
// papaya is good for read-heavy workloads.
|
||||
//
|
||||
// We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy,
|
||||
// since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy.
|
||||
type CallsiteMap<T> =
|
||||
papaya::HashMap<callsite::Identifier, T, std::hash::BuildHasherDefault<rustc_hash::FxHasher>>;
|
||||
|
||||
/// Implements tracing layer to handle events specific to logging.
|
||||
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
|
||||
struct JsonLoggingLayer<C: Clock, W: MakeWriter, const F: usize> {
|
||||
clock: C,
|
||||
skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
|
||||
callsite_ids: papaya::HashMap<callsite::Identifier, CallsiteId>,
|
||||
writer: W,
|
||||
|
||||
/// tracks which fields of each **event** are duplicates
|
||||
skipped_field_indices: CallsiteMap<SkippedFieldIndices>,
|
||||
|
||||
span_info: CallsiteMap<CallsiteSpanInfo>,
|
||||
|
||||
/// Fields we want to keep track of in a separate json object.
|
||||
extract_fields: &'static [&'static str],
|
||||
// We use a const generic and arrays to bypass one heap allocation.
|
||||
extract_fields: IndexSet<&'static str>,
|
||||
_marker: std::marker::PhantomData<[&'static str; F]>,
|
||||
}
|
||||
|
||||
impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
|
||||
fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self {
|
||||
impl<C: Clock, W: MakeWriter, const F: usize> JsonLoggingLayer<C, W, F> {
|
||||
fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self {
|
||||
JsonLoggingLayer {
|
||||
clock,
|
||||
skipped_field_indices: CallsiteMap::default(),
|
||||
span_info: CallsiteMap::default(),
|
||||
skipped_field_indices: papaya::HashMap::default(),
|
||||
callsite_ids: papaya::HashMap::default(),
|
||||
writer,
|
||||
extract_fields,
|
||||
extract_fields: IndexSet::from_iter(extract_fields),
|
||||
_marker: std::marker::PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo {
|
||||
self.span_info
|
||||
fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId {
|
||||
*self
|
||||
.callsite_ids
|
||||
.pin()
|
||||
.get_or_insert_with(metadata.callsite(), || {
|
||||
CallsiteSpanInfo::new(metadata, self.extract_fields)
|
||||
})
|
||||
.clone()
|
||||
.get_or_insert_with(cs, CallsiteId::next)
|
||||
}
|
||||
}
|
||||
|
||||
impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
|
||||
impl<S, C: Clock + 'static, W: MakeWriter + 'static, const F: usize> Layer<S>
|
||||
for JsonLoggingLayer<C, W, F>
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
{
|
||||
@@ -249,25 +237,35 @@ where
|
||||
// early, before OTel machinery, and add as event extension.
|
||||
let now = self.clock.now();
|
||||
|
||||
let res: io::Result<()> = EVENT_FORMATTER.with(|f| {
|
||||
let mut borrow = f.try_borrow_mut();
|
||||
let formatter = match borrow.as_deref_mut() {
|
||||
Ok(formatter) => formatter,
|
||||
// If the thread local formatter is borrowed,
|
||||
// then we likely hit an edge case were we panicked during formatting.
|
||||
// We allow the logging to proceed with an uncached formatter.
|
||||
Err(_) => &mut EventFormatter::new(),
|
||||
};
|
||||
let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
|
||||
if entered.get() {
|
||||
let mut formatter = EventFormatter::new();
|
||||
formatter.format::<S, F>(
|
||||
now,
|
||||
event,
|
||||
&ctx,
|
||||
&self.skipped_field_indices,
|
||||
&self.callsite_ids,
|
||||
&self.extract_fields,
|
||||
)?;
|
||||
self.writer.make_writer().write_all(formatter.buffer())
|
||||
} else {
|
||||
entered.set(true);
|
||||
defer!(entered.set(false););
|
||||
|
||||
formatter.reset();
|
||||
formatter.format(
|
||||
now,
|
||||
event,
|
||||
&ctx,
|
||||
&self.skipped_field_indices,
|
||||
self.extract_fields,
|
||||
)?;
|
||||
self.writer.make_writer().write_all(formatter.buffer())
|
||||
EVENT_FORMATTER.with_borrow_mut(move |formatter| {
|
||||
formatter.reset();
|
||||
formatter.format::<S, F>(
|
||||
now,
|
||||
event,
|
||||
&ctx,
|
||||
&self.skipped_field_indices,
|
||||
&self.callsite_ids,
|
||||
&self.extract_fields,
|
||||
)?;
|
||||
self.writer.make_writer().write_all(formatter.buffer())
|
||||
})
|
||||
}
|
||||
});
|
||||
|
||||
// In case logging fails we generate a simpler JSON object.
|
||||
@@ -289,48 +287,50 @@ where
|
||||
/// Registers a SpanFields instance as span extension.
|
||||
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
|
||||
let span = ctx.span(id).expect("span must exist");
|
||||
let fields = SpanFields::default();
|
||||
fields.record_fields(attrs);
|
||||
|
||||
let mut fields = SpanFields::new(self.span_info(span.metadata()));
|
||||
attrs.record(&mut fields);
|
||||
// This could deadlock when there's a panic somewhere in the tracing
|
||||
// event handling and a read or write guard is still held. This includes
|
||||
// the OTel subscriber.
|
||||
let mut exts = span.extensions_mut();
|
||||
|
||||
// This is a new span: the extensions should not be locked
|
||||
// unless some layer spawned a thread to process this span.
|
||||
// I don't think any layers do that.
|
||||
span.extensions_mut().insert(fields);
|
||||
exts.insert(fields);
|
||||
}
|
||||
|
||||
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
|
||||
let span = ctx.span(id).expect("span must exist");
|
||||
|
||||
// assumption: `on_record` is rarely called.
|
||||
// assumption: a span being updated by one thread,
|
||||
// and formatted by another thread is even rarer.
|
||||
let mut ext = span.extensions_mut();
|
||||
if let Some(fields) = ext.get_mut::<SpanFields>() {
|
||||
values.record(fields);
|
||||
let ext = span.extensions();
|
||||
if let Some(data) = ext.get::<SpanFields>() {
|
||||
data.record_fields(values);
|
||||
}
|
||||
}
|
||||
|
||||
/// Called (lazily) roughly once per event/span instance. We quickly check
|
||||
/// for duplicate field names and record duplicates as skippable. Last field wins.
|
||||
/// Called (lazily) whenever a new log call is executed. We quickly check
|
||||
/// for duplicate field names and record duplicates as skippable. Last one
|
||||
/// wins.
|
||||
fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
|
||||
debug_assert!(
|
||||
metadata.fields().len() <= MAX_TRACING_FIELDS,
|
||||
"callsite {metadata:?} has too many fields."
|
||||
);
|
||||
|
||||
if !metadata.is_event() {
|
||||
// register the span info.
|
||||
self.span_info(metadata);
|
||||
self.callsite_id(metadata.callsite());
|
||||
// Must not be never because we wouldn't get trace and span data.
|
||||
return Interest::always();
|
||||
}
|
||||
|
||||
let mut field_indices = SkippedFieldIndices::default();
|
||||
let mut seen_fields = HashMap::new();
|
||||
let mut seen_fields = HashMap::<&'static str, usize>::new();
|
||||
for field in metadata.fields() {
|
||||
if let Some(old_index) = seen_fields.insert(field.name(), field.index()) {
|
||||
field_indices.set(old_index);
|
||||
use std::collections::hash_map::Entry;
|
||||
match seen_fields.entry(field.name()) {
|
||||
Entry::Vacant(entry) => {
|
||||
// field not seen yet
|
||||
entry.insert(field.index());
|
||||
}
|
||||
Entry::Occupied(mut entry) => {
|
||||
// replace currently stored index
|
||||
let old_index = entry.insert(field.index());
|
||||
// ... and append it to list of skippable indices
|
||||
field_indices.push(old_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -344,113 +344,110 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Any span info that is fixed to a particular callsite. Not variable between span instances.
|
||||
#[derive(Clone)]
|
||||
struct CallsiteSpanInfo {
|
||||
/// index of each field to extract. usize::MAX if not found.
|
||||
extract: Arc<[usize]>,
|
||||
#[derive(Copy, Clone, Debug, Default)]
|
||||
#[repr(transparent)]
|
||||
struct CallsiteId(u32);
|
||||
|
||||
/// tracks the fixed "callsite ID" for each span.
|
||||
/// note: this is not stable between runs.
|
||||
normalized_name: Arc<str>,
|
||||
}
|
||||
|
||||
impl CallsiteSpanInfo {
|
||||
fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self {
|
||||
impl CallsiteId {
|
||||
#[inline]
|
||||
fn next() -> Self {
|
||||
// Start at 1 to reserve 0 for default.
|
||||
static COUNTER: AtomicU32 = AtomicU32::new(1);
|
||||
CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
|
||||
let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();
|
||||
|
||||
// get all the indices of span fields we want to focus
|
||||
let extract = extract_fields
|
||||
.iter()
|
||||
// use rposition, since we want last match wins.
|
||||
.map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX))
|
||||
.collect();
|
||||
|
||||
// normalized_name is unique for each callsite, but it is not
|
||||
// unified across separate proxy instances.
|
||||
// todo: can we do better here?
|
||||
let cid = COUNTER.fetch_add(1, Ordering::Relaxed);
|
||||
let normalized_name = format!("{}#{cid}", metadata.name()).into();
|
||||
|
||||
Self {
|
||||
extract,
|
||||
normalized_name,
|
||||
}
|
||||
impl fmt::Display for CallsiteId {
|
||||
#[inline]
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
/// Stores span field values recorded during the spans lifetime.
|
||||
#[derive(Default)]
|
||||
struct SpanFields {
|
||||
values: [serde_json::Value; MAX_TRACING_FIELDS],
|
||||
|
||||
/// cached span info so we can avoid extra hashmap lookups in the hot path.
|
||||
span_info: CallsiteSpanInfo,
|
||||
// TODO: Switch to custom enum with lasso::Spur for Strings?
|
||||
fields: papaya::HashMap<&'static str, serde_json::Value>,
|
||||
}
|
||||
|
||||
impl SpanFields {
|
||||
fn new(span_info: CallsiteSpanInfo) -> Self {
|
||||
Self {
|
||||
span_info,
|
||||
values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS],
|
||||
}
|
||||
#[inline]
|
||||
fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
|
||||
fields.record(&mut SpanFieldsRecorder {
|
||||
fields: self.fields.pin(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
impl tracing::field::Visit for SpanFields {
|
||||
/// Implements a tracing field visitor to convert and store values.
|
||||
struct SpanFieldsRecorder<'m, S, G> {
|
||||
fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
|
||||
}
|
||||
|
||||
impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
|
||||
#[inline]
|
||||
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
|
||||
if let Ok(value) = i64::try_from(value) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
} else {
|
||||
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value}")));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
|
||||
if let Ok(value) = u64::try_from(value) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
} else {
|
||||
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value}")));
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
|
||||
self.values[field.index()] = serde_json::Value::from(value);
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(value));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
|
||||
self.values[field.index()] = serde_json::Value::from(format!("{value:?}"));
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value:?}")));
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -459,33 +456,38 @@ impl tracing::field::Visit for SpanFields {
|
||||
field: &tracing::field::Field,
|
||||
value: &(dyn std::error::Error + 'static),
|
||||
) {
|
||||
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
|
||||
self.fields
|
||||
.insert(field.name(), serde_json::Value::from(format!("{value}")));
|
||||
}
|
||||
}
|
||||
|
||||
/// List of field indices skipped during logging. Can list duplicate fields or
|
||||
/// metafields not meant to be logged.
|
||||
#[derive(Copy, Clone, Default)]
|
||||
#[derive(Clone, Default)]
|
||||
struct SkippedFieldIndices {
|
||||
// 32-bits is large enough for `MAX_TRACING_FIELDS`
|
||||
bits: u32,
|
||||
bits: u64,
|
||||
}
|
||||
|
||||
impl SkippedFieldIndices {
|
||||
#[inline]
|
||||
fn is_empty(self) -> bool {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.bits == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn set(&mut self, index: usize) {
|
||||
debug_assert!(index <= 32, "index out of bounds of 32-bit set");
|
||||
self.bits |= 1 << index;
|
||||
fn push(&mut self, index: usize) {
|
||||
self.bits |= 1u64
|
||||
.checked_shl(index as u32)
|
||||
.expect("field index too large");
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn contains(self, index: usize) -> bool {
|
||||
self.bits & (1 << index) != 0
|
||||
fn contains(&self, index: usize) -> bool {
|
||||
self.bits
|
||||
& 1u64
|
||||
.checked_shl(index as u32)
|
||||
.expect("field index too large")
|
||||
!= 0
|
||||
}
|
||||
}
|
||||
|
||||
@@ -497,7 +499,7 @@ struct EventFormatter {
|
||||
|
||||
impl EventFormatter {
|
||||
#[inline]
|
||||
const fn new() -> Self {
|
||||
fn new() -> Self {
|
||||
EventFormatter {
|
||||
logline_buffer: Vec::new(),
|
||||
}
|
||||
@@ -513,13 +515,14 @@ impl EventFormatter {
|
||||
self.logline_buffer.clear();
|
||||
}
|
||||
|
||||
fn format<S>(
|
||||
fn format<S, const F: usize>(
|
||||
&mut self,
|
||||
now: DateTime<Utc>,
|
||||
event: &Event<'_>,
|
||||
ctx: &Context<'_, S>,
|
||||
skipped_field_indices: &CallsiteMap<SkippedFieldIndices>,
|
||||
extract_fields: &'static [&'static str],
|
||||
skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
|
||||
callsite_ids: &papaya::HashMap<callsite::Identifier, CallsiteId>,
|
||||
extract_fields: &IndexSet<&'static str>,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
@@ -530,11 +533,8 @@ impl EventFormatter {
|
||||
let normalized_meta = event.normalized_metadata();
|
||||
let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());
|
||||
|
||||
let skipped_field_indices = skipped_field_indices
|
||||
.pin()
|
||||
.get(&meta.callsite())
|
||||
.copied()
|
||||
.unwrap_or_default();
|
||||
let skipped_field_indices = skipped_field_indices.pin();
|
||||
let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
|
||||
|
||||
let mut serialize = || {
|
||||
let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
|
||||
@@ -565,11 +565,9 @@ impl EventFormatter {
|
||||
}
|
||||
|
||||
let spans = SerializableSpans {
|
||||
// collect all spans from parent to root.
|
||||
spans: ctx
|
||||
.event_span(event)
|
||||
.map_or(vec![], |parent| parent.scope().collect()),
|
||||
extracted: ExtractedSpanFields::new(extract_fields),
|
||||
ctx,
|
||||
callsite_ids,
|
||||
extract: ExtractedSpanFields::<'_, F>::new(extract_fields),
|
||||
};
|
||||
serializer.serialize_entry("spans", &spans)?;
|
||||
|
||||
@@ -622,9 +620,9 @@ impl EventFormatter {
|
||||
}
|
||||
}
|
||||
|
||||
if spans.extracted.has_values() {
|
||||
if spans.extract.has_values() {
|
||||
// TODO: add fields from event, too?
|
||||
serializer.serialize_entry("extract", &spans.extracted)?;
|
||||
serializer.serialize_entry("extract", &spans.extract)?;
|
||||
}
|
||||
|
||||
serializer.end()
|
||||
@@ -637,15 +635,15 @@ impl EventFormatter {
|
||||
}
|
||||
|
||||
/// Extracts the message field that's mixed will other fields.
|
||||
struct MessageFieldExtractor<S: serde::ser::SerializeMap> {
|
||||
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
|
||||
serializer: S,
|
||||
skipped_field_indices: SkippedFieldIndices,
|
||||
skipped_field_indices: Option<&'a SkippedFieldIndices>,
|
||||
state: Option<Result<(), S::Error>>,
|
||||
}
|
||||
|
||||
impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
|
||||
impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
|
||||
#[inline]
|
||||
fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
|
||||
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
|
||||
Self {
|
||||
serializer,
|
||||
skipped_field_indices,
|
||||
@@ -667,11 +665,13 @@ impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
|
||||
fn accept_field(&self, field: &tracing::field::Field) -> bool {
|
||||
self.state.is_none()
|
||||
&& field.name() == MESSAGE_FIELD
|
||||
&& !self.skipped_field_indices.contains(field.index())
|
||||
&& !self
|
||||
.skipped_field_indices
|
||||
.is_some_and(|i| i.contains(field.index()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<S> {
|
||||
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
|
||||
#[inline]
|
||||
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
|
||||
if self.accept_field(field) {
|
||||
@@ -751,14 +751,14 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtracto
|
||||
/// can be skipped.
|
||||
// This is entirely optional and only cosmetic, though maybe helps a
|
||||
// bit during log parsing in dashboards when there's no field with empty object.
|
||||
struct FieldsPresent(pub bool, SkippedFieldIndices);
|
||||
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
|
||||
|
||||
// Even though some methods have an overhead (error, bytes) it is assumed the
|
||||
// compiler won't include this since we ignore the value entirely.
|
||||
impl tracing::field::Visit for FieldsPresent {
|
||||
impl tracing::field::Visit for FieldsPresent<'_> {
|
||||
#[inline]
|
||||
fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
|
||||
if !self.1.contains(field.index())
|
||||
if !self.1.is_some_and(|i| i.contains(field.index()))
|
||||
&& field.name() != MESSAGE_FIELD
|
||||
&& !field.name().starts_with("log.")
|
||||
{
|
||||
@@ -768,7 +768,10 @@ impl tracing::field::Visit for FieldsPresent {
|
||||
}
|
||||
|
||||
/// Serializes the fields directly supplied with a log event.
|
||||
struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices);
|
||||
struct SerializableEventFields<'a, 'event>(
|
||||
&'a tracing::Event<'event>,
|
||||
Option<&'a SkippedFieldIndices>,
|
||||
);
|
||||
|
||||
impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
@@ -785,15 +788,15 @@ impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
|
||||
}
|
||||
|
||||
/// A tracing field visitor that skips the message field.
|
||||
struct MessageFieldSkipper<S: serde::ser::SerializeMap> {
|
||||
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
|
||||
serializer: S,
|
||||
skipped_field_indices: SkippedFieldIndices,
|
||||
skipped_field_indices: Option<&'a SkippedFieldIndices>,
|
||||
state: Result<(), S::Error>,
|
||||
}
|
||||
|
||||
impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
|
||||
impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
|
||||
#[inline]
|
||||
fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
|
||||
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
|
||||
Self {
|
||||
serializer,
|
||||
skipped_field_indices,
|
||||
@@ -806,7 +809,9 @@ impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
|
||||
self.state.is_ok()
|
||||
&& field.name() != MESSAGE_FIELD
|
||||
&& !field.name().starts_with("log.")
|
||||
&& !self.skipped_field_indices.contains(field.index())
|
||||
&& !self
|
||||
.skipped_field_indices
|
||||
.is_some_and(|i| i.contains(field.index()))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -816,7 +821,7 @@ impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<S> {
|
||||
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
|
||||
#[inline]
|
||||
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
|
||||
if self.accept_field(field) {
|
||||
@@ -900,17 +905,18 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
|
||||
/// with the span names as keys. To prevent collision we append a numberic value
|
||||
/// to the name. Also, collects any span fields we're interested in. Last one
|
||||
/// wins.
|
||||
struct SerializableSpans<'ctx, S>
|
||||
struct SerializableSpans<'a, 'ctx, Span, const F: usize>
|
||||
where
|
||||
S: for<'lookup> LookupSpan<'lookup>,
|
||||
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
|
||||
{
|
||||
spans: Vec<SpanRef<'ctx, S>>,
|
||||
extracted: ExtractedSpanFields,
|
||||
ctx: &'a Context<'ctx, Span>,
|
||||
callsite_ids: &'a papaya::HashMap<callsite::Identifier, CallsiteId>,
|
||||
extract: ExtractedSpanFields<'a, F>,
|
||||
}
|
||||
|
||||
impl<S> serde::ser::Serialize for SerializableSpans<'_, S>
|
||||
impl<Span, const F: usize> serde::ser::Serialize for SerializableSpans<'_, '_, Span, F>
|
||||
where
|
||||
S: for<'lookup> LookupSpan<'lookup>,
|
||||
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
|
||||
{
|
||||
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
|
||||
where
|
||||
@@ -918,22 +924,25 @@ where
|
||||
{
|
||||
let mut serializer = serializer.serialize_map(None)?;
|
||||
|
||||
for span in self.spans.iter().rev() {
|
||||
let ext = span.extensions();
|
||||
if let Some(leaf_span) = self.ctx.lookup_current() {
|
||||
for span in leaf_span.scope().from_root() {
|
||||
// Append a numeric callsite ID to the span name to keep the name unique
|
||||
// in the JSON object.
|
||||
let cid = self
|
||||
.callsite_ids
|
||||
.pin()
|
||||
.get(&span.metadata().callsite())
|
||||
.copied()
|
||||
.unwrap_or_default();
|
||||
|
||||
// all spans should have this extension.
|
||||
let Some(fields) = ext.get() else { continue };
|
||||
// Loki turns the # into an underscore during field name concatenation.
|
||||
serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?;
|
||||
|
||||
self.extracted.layer_span(fields);
|
||||
|
||||
let SpanFields { values, span_info } = fields;
|
||||
serializer.serialize_entry(
|
||||
&*span_info.normalized_name,
|
||||
&SerializableSpanFields {
|
||||
fields: span.metadata().fields(),
|
||||
values,
|
||||
},
|
||||
)?;
|
||||
serializer.serialize_value(&SerializableSpanFields {
|
||||
span: &span,
|
||||
extract: &self.extract,
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
serializer.end()
|
||||
@@ -941,77 +950,80 @@ where
|
||||
}
|
||||
|
||||
/// Serializes the span fields as object.
|
||||
struct SerializableSpanFields<'span> {
|
||||
fields: &'span tracing::field::FieldSet,
|
||||
values: &'span [serde_json::Value; MAX_TRACING_FIELDS],
|
||||
struct SerializableSpanFields<'a, 'span, Span, const F: usize>
|
||||
where
|
||||
Span: for<'lookup> LookupSpan<'lookup>,
|
||||
{
|
||||
span: &'a SpanRef<'span, Span>,
|
||||
extract: &'a ExtractedSpanFields<'a, F>,
|
||||
}
|
||||
|
||||
impl serde::ser::Serialize for SerializableSpanFields<'_> {
|
||||
impl<Span, const F: usize> serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F>
|
||||
where
|
||||
Span: for<'lookup> LookupSpan<'lookup>,
|
||||
{
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
{
|
||||
let mut serializer = serializer.serialize_map(None)?;
|
||||
|
||||
for (field, value) in std::iter::zip(self.fields, self.values) {
|
||||
if value.is_null() {
|
||||
continue;
|
||||
let ext = self.span.extensions();
|
||||
if let Some(data) = ext.get::<SpanFields>() {
|
||||
for (name, value) in &data.fields.pin() {
|
||||
serializer.serialize_entry(name, value)?;
|
||||
// TODO: replace clone with reference, if possible.
|
||||
self.extract.set(name, value.clone());
|
||||
}
|
||||
serializer.serialize_entry(field.name(), value)?;
|
||||
}
|
||||
|
||||
serializer.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct ExtractedSpanFields {
|
||||
names: &'static [&'static str],
|
||||
values: RefCell<Vec<serde_json::Value>>,
|
||||
struct ExtractedSpanFields<'a, const F: usize> {
|
||||
names: &'a IndexSet<&'static str>,
|
||||
// TODO: replace TryLock with something local thread and interior mutability.
|
||||
// serde API doesn't let us use `mut`.
|
||||
values: TryLock<([Option<serde_json::Value>; F], bool)>,
|
||||
}
|
||||
|
||||
impl ExtractedSpanFields {
|
||||
fn new(names: &'static [&'static str]) -> Self {
|
||||
impl<'a, const F: usize> ExtractedSpanFields<'a, F> {
|
||||
fn new(names: &'a IndexSet<&'static str>) -> Self {
|
||||
ExtractedSpanFields {
|
||||
names,
|
||||
values: RefCell::new(vec![serde_json::Value::Null; names.len()]),
|
||||
values: TryLock::new((array::from_fn(|_| Option::default()), false)),
|
||||
}
|
||||
}
|
||||
|
||||
fn layer_span(&self, fields: &SpanFields) {
|
||||
let mut v = self.values.borrow_mut();
|
||||
let SpanFields { values, span_info } = fields;
|
||||
|
||||
// extract the fields
|
||||
for (i, &j) in span_info.extract.iter().enumerate() {
|
||||
let Some(value) = values.get(j) else { continue };
|
||||
|
||||
if !value.is_null() {
|
||||
// TODO: replace clone with reference, if possible.
|
||||
v[i] = value.clone();
|
||||
}
|
||||
#[inline]
|
||||
fn set(&self, name: &'static str, value: serde_json::Value) {
|
||||
if let Some((index, _)) = self.names.get_full(name) {
|
||||
let mut fields = self.values.try_lock().expect("thread-local use");
|
||||
fields.0[index] = Some(value);
|
||||
fields.1 = true;
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn has_values(&self) -> bool {
|
||||
self.values.borrow().iter().any(|v| !v.is_null())
|
||||
self.values.try_lock().expect("thread-local use").1
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::ser::Serialize for ExtractedSpanFields {
|
||||
impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::ser::Serializer,
|
||||
{
|
||||
let mut serializer = serializer.serialize_map(None)?;
|
||||
|
||||
let values = self.values.borrow();
|
||||
for (key, value) in std::iter::zip(self.names, &*values) {
|
||||
if value.is_null() {
|
||||
continue;
|
||||
let values = self.values.try_lock().expect("thread-local use");
|
||||
for (i, value) in values.0.iter().enumerate() {
|
||||
if let Some(value) = value {
|
||||
let key = self.names[i];
|
||||
serializer.serialize_entry(key, value)?;
|
||||
}
|
||||
|
||||
serializer.serialize_entry(key, value)?;
|
||||
}
|
||||
|
||||
serializer.end()
|
||||
@@ -1020,6 +1032,7 @@ impl serde::ser::Serialize for ExtractedSpanFields {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use assert_json_diff::assert_json_eq;
|
||||
@@ -1068,9 +1081,10 @@ mod tests {
|
||||
let log_layer = JsonLoggingLayer {
|
||||
clock: clock.clone(),
|
||||
skipped_field_indices: papaya::HashMap::default(),
|
||||
span_info: papaya::HashMap::default(),
|
||||
callsite_ids: papaya::HashMap::default(),
|
||||
writer: buffer.clone(),
|
||||
extract_fields: &["x"],
|
||||
extract_fields: IndexSet::from_iter(["x"]),
|
||||
_marker: PhantomData::<[&'static str; 1]>,
|
||||
};
|
||||
|
||||
let registry = tracing_subscriber::Registry::default().with(log_layer);
|
||||
|
||||
@@ -383,7 +383,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session = cancellation_handler_clone.get_key();
|
||||
|
||||
session.write_cancel_key(node.cancel_closure.clone())?;
|
||||
session
|
||||
.write_cancel_key(node.cancel_closure.clone())
|
||||
.await?;
|
||||
|
||||
prepare_client_connection(&node, *session.key(), &mut stream).await?;
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
|
||||
tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
|
||||
}
|
||||
|
||||
drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
|
||||
drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
@@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
|
||||
use postgres_client::error::SqlState;
|
||||
// Here are errors that happens after the user successfully authenticated to the database.
|
||||
// TODO: there are pgbouncer errors that should be retried, but they are not listed here.
|
||||
let non_retriable_pg_errors = matches!(
|
||||
!matches!(
|
||||
self.code(),
|
||||
&SqlState::TOO_MANY_CONNECTIONS
|
||||
| &SqlState::OUT_OF_MEMORY
|
||||
@@ -56,20 +56,8 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
|
||||
| &SqlState::T_R_SERIALIZATION_FAILURE
|
||||
| &SqlState::INVALID_CATALOG_NAME
|
||||
| &SqlState::INVALID_SCHEMA_NAME
|
||||
| &SqlState::INVALID_PARAMETER_VALUE,
|
||||
);
|
||||
if non_retriable_pg_errors {
|
||||
return false;
|
||||
}
|
||||
// PGBouncer errors that should not trigger a wake_compute retry.
|
||||
if self.code() == &SqlState::PROTOCOL_VIOLATION {
|
||||
// Source for the error message:
|
||||
// https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
|
||||
return !self
|
||||
.message()
|
||||
.contains("no more connections allowed (max_client_conn)");
|
||||
}
|
||||
true
|
||||
| &SqlState::INVALID_PARAMETER_VALUE
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -122,55 +110,3 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati
|
||||
.base_delay
|
||||
.mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::ShouldRetryWakeCompute;
|
||||
use postgres_client::error::{DbError, SqlState};
|
||||
|
||||
#[test]
|
||||
fn should_retry_wake_compute_for_db_error() {
|
||||
// These SQLStates should NOT trigger a wake_compute retry.
|
||||
let non_retry_states = [
|
||||
SqlState::TOO_MANY_CONNECTIONS,
|
||||
SqlState::OUT_OF_MEMORY,
|
||||
SqlState::SYNTAX_ERROR,
|
||||
SqlState::T_R_SERIALIZATION_FAILURE,
|
||||
SqlState::INVALID_CATALOG_NAME,
|
||||
SqlState::INVALID_SCHEMA_NAME,
|
||||
SqlState::INVALID_PARAMETER_VALUE,
|
||||
];
|
||||
for state in non_retry_states {
|
||||
let err = DbError::new_test_error(state.clone(), "oops".to_string());
|
||||
assert!(
|
||||
!err.should_retry_wake_compute(),
|
||||
"State {state:?} unexpectedly retried"
|
||||
);
|
||||
}
|
||||
|
||||
// Errors coming from pgbouncer should not trigger a wake_compute retry
|
||||
let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
|
||||
for error in non_retry_pgbouncer_errors {
|
||||
let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
|
||||
assert!(
|
||||
!err.should_retry_wake_compute(),
|
||||
"PGBouncer error {error:?} unexpectedly retried"
|
||||
);
|
||||
}
|
||||
|
||||
// These SQLStates should trigger a wake_compute retry.
|
||||
let retry_states = [
|
||||
SqlState::CONNECTION_FAILURE,
|
||||
SqlState::CONNECTION_EXCEPTION,
|
||||
SqlState::CONNECTION_DOES_NOT_EXIST,
|
||||
SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
|
||||
];
|
||||
for state in retry_states {
|
||||
let err = DbError::new_test_error(state.clone(), "oops".to_string());
|
||||
assert!(
|
||||
err.should_retry_wake_compute(),
|
||||
"State {state:?} unexpectedly skipped retry"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@ use rstest::rstest;
|
||||
use rustls::crypto::ring;
|
||||
use rustls::pki_types;
|
||||
use tokio::io::DuplexStream;
|
||||
use tracing_test::traced_test;
|
||||
|
||||
use super::connect_compute::ConnectMechanism;
|
||||
use super::retry::CouldRetry;
|
||||
@@ -382,14 +381,8 @@ enum ConnectAction {
|
||||
WakeFail,
|
||||
WakeRetry,
|
||||
Connect,
|
||||
// connect_once -> Err, could_retry = true, should_retry_wake_compute = true
|
||||
Retry,
|
||||
// connect_once -> Err, could_retry = true, should_retry_wake_compute = false
|
||||
RetryNoWake,
|
||||
// connect_once -> Err, could_retry = false, should_retry_wake_compute = true
|
||||
Fail,
|
||||
// connect_once -> Err, could_retry = false, should_retry_wake_compute = false
|
||||
FailNoWake,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -431,7 +424,6 @@ struct TestConnection;
|
||||
#[derive(Debug)]
|
||||
struct TestConnectError {
|
||||
retryable: bool,
|
||||
wakeable: bool,
|
||||
kind: crate::error::ErrorKind,
|
||||
}
|
||||
|
||||
@@ -456,7 +448,7 @@ impl CouldRetry for TestConnectError {
|
||||
}
|
||||
impl ShouldRetryWakeCompute for TestConnectError {
|
||||
fn should_retry_wake_compute(&self) -> bool {
|
||||
self.wakeable
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -479,22 +471,10 @@ impl ConnectMechanism for TestConnectMechanism {
|
||||
ConnectAction::Connect => Ok(TestConnection),
|
||||
ConnectAction::Retry => Err(TestConnectError {
|
||||
retryable: true,
|
||||
wakeable: true,
|
||||
kind: ErrorKind::Compute,
|
||||
}),
|
||||
ConnectAction::RetryNoWake => Err(TestConnectError {
|
||||
retryable: true,
|
||||
wakeable: false,
|
||||
kind: ErrorKind::Compute,
|
||||
}),
|
||||
ConnectAction::Fail => Err(TestConnectError {
|
||||
retryable: false,
|
||||
wakeable: true,
|
||||
kind: ErrorKind::Compute,
|
||||
}),
|
||||
ConnectAction::FailNoWake => Err(TestConnectError {
|
||||
retryable: false,
|
||||
wakeable: false,
|
||||
kind: ErrorKind::Compute,
|
||||
}),
|
||||
x => panic!("expecting action {x:?}, connect is called instead"),
|
||||
@@ -729,92 +709,3 @@ async fn wake_non_retry() {
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[traced_test]
|
||||
async fn fail_but_wake_invalidates_cache() {
|
||||
let ctx = RequestContext::test();
|
||||
let mech = TestConnectMechanism::new(vec![
|
||||
ConnectAction::Wake,
|
||||
ConnectAction::Fail,
|
||||
ConnectAction::Wake,
|
||||
ConnectAction::Connect,
|
||||
]);
|
||||
let user = helper_create_connect_info(&mech);
|
||||
let cfg = config();
|
||||
|
||||
connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(logs_contain(
|
||||
"invalidating stalled compute node info cache entry"
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[traced_test]
|
||||
async fn fail_no_wake_skips_cache_invalidation() {
|
||||
let ctx = RequestContext::test();
|
||||
let mech = TestConnectMechanism::new(vec![
|
||||
ConnectAction::Wake,
|
||||
ConnectAction::FailNoWake,
|
||||
ConnectAction::Connect,
|
||||
]);
|
||||
let user = helper_create_connect_info(&mech);
|
||||
let cfg = config();
|
||||
|
||||
connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(!logs_contain(
|
||||
"invalidating stalled compute node info cache entry"
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[traced_test]
|
||||
async fn retry_but_wake_invalidates_cache() {
|
||||
let _ = env_logger::try_init();
|
||||
use ConnectAction::*;
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
// Wake → Retry (retryable + wakeable) → Wake → Connect
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
let cfg = config();
|
||||
|
||||
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
|
||||
// Because Retry has wakeable=true, we should see invalidate_cache
|
||||
assert!(logs_contain(
|
||||
"invalidating stalled compute node info cache entry"
|
||||
));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[traced_test]
|
||||
async fn retry_no_wake_skips_invalidation() {
|
||||
let _ = env_logger::try_init();
|
||||
use ConnectAction::*;
|
||||
|
||||
let ctx = RequestContext::test();
|
||||
// Wake → RetryNoWake (retryable + NOT wakeable)
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
let cfg = config();
|
||||
|
||||
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
|
||||
// Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache
|
||||
assert!(!logs_contain(
|
||||
"invalidating stalled compute node info cache entry"
|
||||
));
|
||||
}
|
||||
|
||||
@@ -13,19 +13,22 @@ pub(crate) struct Pbkdf2 {
|
||||
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
|
||||
impl Pbkdf2 {
|
||||
pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
|
||||
// key the HMAC and derive the first block in-place
|
||||
let mut hmac =
|
||||
let hmac =
|
||||
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
|
||||
hmac.update(salt);
|
||||
hmac.update(&1u32.to_be_bytes());
|
||||
let init_block = hmac.finalize_reset().into_bytes();
|
||||
|
||||
let prev = hmac
|
||||
.clone()
|
||||
.chain_update(salt)
|
||||
.chain_update(1u32.to_be_bytes())
|
||||
.finalize()
|
||||
.into_bytes();
|
||||
|
||||
Self {
|
||||
hmac,
|
||||
// one iteration spent above
|
||||
// one consumed for the hash above
|
||||
iterations: iterations - 1,
|
||||
hi: init_block,
|
||||
prev: init_block,
|
||||
hi: prev,
|
||||
prev,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,17 +44,14 @@ impl Pbkdf2 {
|
||||
iterations,
|
||||
} = self;
|
||||
|
||||
// only do up to 4096 iterations per turn for fairness
|
||||
// only do 4096 iterations per turn before sharing the thread for fairness
|
||||
let n = (*iterations).clamp(0, 4096);
|
||||
for _ in 0..n {
|
||||
hmac.update(prev);
|
||||
let block = hmac.finalize_reset().into_bytes();
|
||||
*prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
|
||||
|
||||
for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) {
|
||||
*hi_byte ^= b;
|
||||
for (hi, prev) in hi.iter_mut().zip(*prev) {
|
||||
*hi ^= prev;
|
||||
}
|
||||
|
||||
*prev = block;
|
||||
}
|
||||
|
||||
*iterations -= n;
|
||||
|
||||
@@ -43,12 +43,6 @@ impl std::ops::Deref for ApiUrl {
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for ApiUrl {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for ApiUrl {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
self.0.fmt(f)
|
||||
|
||||
@@ -52,6 +52,7 @@ tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
tokio-util = { workspace = true }
|
||||
tonic = { workspace = true }
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
metrics.workspace = true
|
||||
@@ -62,9 +63,11 @@ pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
safekeeper_client.workspace = true
|
||||
sk_ps_discovery.workspace = true
|
||||
sha2.workspace = true
|
||||
sd-notify.workspace = true
|
||||
storage_broker.workspace = true
|
||||
storage_controller_client.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
http-utils.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -8,11 +8,12 @@ use std::error::Error as _;
|
||||
use http_utils::error::HttpErrorBody;
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use safekeeper_api::models::{
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
|
||||
TimelineStatus,
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
|
||||
TenantShardPageserverAttachmentChange, TimelineCreateRequest, TimelineStatus,
|
||||
};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Client {
|
||||
@@ -189,6 +190,20 @@ impl Client {
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn post_tenant_shard_pageserver_attachments(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
body: TenantShardPageserverAttachmentChange,
|
||||
) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/pageserver_attachments",
|
||||
tenant_shard_id.tenant_id,
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
let resp = self.post(uri, body).await?;
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
async fn post<B: serde::Serialize, U: IntoUrl>(
|
||||
&self,
|
||||
uri: U,
|
||||
|
||||
@@ -25,7 +25,7 @@ use safekeeper::defaults::{
|
||||
use safekeeper::wal_backup::WalBackup;
|
||||
use safekeeper::{
|
||||
BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
|
||||
WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
|
||||
WAL_ADVERTISER_RUNTIME, WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
|
||||
};
|
||||
use sd_notify::NotifyState;
|
||||
use storage_broker::{DEFAULT_ENDPOINT, Uri};
|
||||
@@ -626,6 +626,30 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
.map(|res| ("broker main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(broker_task_handle));
|
||||
|
||||
let ps_connectivity_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| HTTP_RUNTIME.handle())
|
||||
.spawn(
|
||||
global_timelines
|
||||
.get_pageserver_connectivity()
|
||||
.task_main()
|
||||
.instrument(info_span!("pageserver_connectivity")),
|
||||
)
|
||||
.map(|res| ("pageserver connectivity".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(ps_connectivity_handle));
|
||||
|
||||
let wal_advertiser_task_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_ADVERTISER_RUNTIME.handle())
|
||||
.spawn(
|
||||
global_timelines
|
||||
.get_wal_advertiser()
|
||||
.task_main()
|
||||
.instrument(info_span!("wal_advertiser_main")),
|
||||
)
|
||||
.map(|res| ("wal advertiser task handle".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_advertiser_task_handle));
|
||||
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
|
||||
// TODO: update tokio-stream, convert to real async Stream with
|
||||
|
||||
@@ -50,7 +50,8 @@ async fn push_loop(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
make_tls_config(&conf),
|
||||
)?;
|
||||
)?
|
||||
.into_raw_grpc_client();
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
|
||||
let outbound = async_stream::stream! {
|
||||
@@ -97,7 +98,8 @@ async fn pull_loop(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
make_tls_config(&conf),
|
||||
)?;
|
||||
)?
|
||||
.into_raw_grpc_client();
|
||||
|
||||
// TODO: subscribe only to local timelines instead of all
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
@@ -153,7 +155,8 @@ async fn discover_loop(
|
||||
conf.broker_endpoint.clone(),
|
||||
conf.broker_keepalive_interval,
|
||||
make_tls_config(&conf),
|
||||
)?;
|
||||
)?
|
||||
.into_raw_grpc_client();
|
||||
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![TypeSubscription {
|
||||
|
||||
@@ -67,6 +67,19 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
})
|
||||
}
|
||||
|
||||
async fn post_tenant_pageserver_attachments(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let body: models::TenantShardPageserverAttachmentChange = json_request(&mut request).await?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let wal_advertiser = global_timelines.get_wal_advertiser();
|
||||
wal_advertiser
|
||||
.update_pageserver_attachments(tenant_id, body)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Deactivates all timelines for the tenant and removes its data directory.
|
||||
/// See `timeline_delete_handler`.
|
||||
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -718,6 +731,9 @@ pub fn make_router(
|
||||
})
|
||||
})
|
||||
.get("/v1/utilization", |r| request_span(r, utilization_handler))
|
||||
.post("/v1/tenant/:tenant_id/pageserver_attachments", |r| {
|
||||
request_span(r, post_tenant_pageserver_attachments)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_handler)
|
||||
})
|
||||
|
||||
@@ -38,11 +38,13 @@ pub mod timeline_eviction;
|
||||
pub mod timeline_guard;
|
||||
pub mod timeline_manager;
|
||||
pub mod timelines_set;
|
||||
pub mod wal_advertiser;
|
||||
pub mod wal_backup;
|
||||
pub mod wal_backup_partial;
|
||||
pub mod wal_reader_stream;
|
||||
pub mod wal_service;
|
||||
pub mod wal_storage;
|
||||
pub(crate) mod pageserver_connectivity;
|
||||
|
||||
#[cfg(any(test, feature = "benchmarking"))]
|
||||
pub mod test_utils;
|
||||
@@ -123,6 +125,7 @@ pub struct SafeKeeperConf {
|
||||
pub ssl_ca_certs: Vec<Pem>,
|
||||
pub use_https_safekeeper_api: bool,
|
||||
pub enable_tls_wal_service_api: bool,
|
||||
pub storage_controller_api: Option<Uri>,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -168,6 +171,7 @@ impl SafeKeeperConf {
|
||||
ssl_ca_certs: Vec::new(),
|
||||
use_https_safekeeper_api: false,
|
||||
enable_tls_wal_service_api: false,
|
||||
storage_controller_api: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -198,6 +202,14 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_ADVERTISER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("wal advertiser worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL backup worker")
|
||||
|
||||
117
safekeeper/src/pageserver_connectivity.rs
Normal file
117
safekeeper/src/pageserver_connectivity.rs
Normal file
@@ -0,0 +1,117 @@
|
||||
use desim::world::Node;
|
||||
use hyper::Uri;
|
||||
use pageserver_api::controller_api;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::timeline::Timeline;
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, hash_map},
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tracing::{Instrument, error, info, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
sync::{spsc_fold, spsc_watch},
|
||||
};
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
type Advs = HashMap<TenantTimelineId, Lsn>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct GlobalState {
|
||||
inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
|
||||
}
|
||||
|
||||
enum Message {
|
||||
Resolve {
|
||||
ps_id: NodeId,
|
||||
reply: tokio::sync::oneshot::Sender<tokio::sync::watch::Receiver<hyper::Uri>>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl GlobalState {
|
||||
pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
|
||||
let mut ret = None;
|
||||
self.inner.get_or_init(|| {
|
||||
let (tx, task_fut) = MainTask::prepare_run();
|
||||
ret = Some(task_fut);
|
||||
tx
|
||||
});
|
||||
ret.expect("must only call this method once")
|
||||
}
|
||||
}
|
||||
|
||||
struct MainTask {
|
||||
rx: tokio::sync::mpsc::Receiver<Message>,
|
||||
}
|
||||
|
||||
impl MainTask {
|
||||
fn prepare_run() -> (
|
||||
tokio::sync::mpsc::Sender<Message>,
|
||||
impl Future<Output = anyhow::Result<()>> + Send,
|
||||
) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
|
||||
let task = MainTask { rx };
|
||||
(tx, task.task())
|
||||
}
|
||||
async fn task(mut self) -> anyhow::Result<()> {
|
||||
// TODO: persistence
|
||||
|
||||
let storcon_client = todo!();
|
||||
|
||||
let mut resolution: HashMap<NodeId, tokio::sync::watch::Sender<hyper::Uri>> =
|
||||
HashMap::new();
|
||||
|
||||
while let Some(rx) = self.rx.recv().await {
|
||||
match rx {
|
||||
Message::Resolve { ps_id, reply } => match resolution.entry(ps_id) {
|
||||
hash_map::Entry::Occupied(e) => {}
|
||||
hash_map::Entry::Vacant(e) => {
|
||||
tokio::spawn(
|
||||
ResolutionTask { ps_id, storcon_client }.run()
|
||||
)
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ResolutionTask {
|
||||
ps_id: NodeId,
|
||||
storcon_client: storage_controller_client::control_api::Client,
|
||||
}
|
||||
|
||||
impl ResolutionTask {
|
||||
pub async fn run(self) -> Result<Uri, Error> {
|
||||
loop {
|
||||
// XXX: well-defined upcall API?
|
||||
let res = self
|
||||
.storcon_client
|
||||
.dispatch(
|
||||
reqwest::Method::GET,
|
||||
format!("control/v1/node/{}", self.node_id),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
let node: NodeDescribeResponse = match res {
|
||||
Ok(res) => res,
|
||||
Err(err) => {
|
||||
warn!("storcon upcall failed")
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -117,6 +117,7 @@ impl Env {
|
||||
Arc::new(TimelinesSet::default()), // ignored for now
|
||||
RateLimiter::new(0, 0),
|
||||
wal_backup,
|
||||
todo!(),
|
||||
);
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -39,7 +39,9 @@ use crate::wal_backup;
|
||||
use crate::wal_backup::{WalBackup, remote_timeline_path};
|
||||
use crate::wal_backup_partial::PartialRemoteSegment;
|
||||
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
|
||||
use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage};
|
||||
use crate::{
|
||||
SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_advertiser, wal_storage,
|
||||
};
|
||||
|
||||
fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
|
||||
PeerInfo {
|
||||
@@ -547,6 +549,7 @@ impl Timeline {
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
partial_backup_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
wal_advertiser: Arc<wal_advertiser::GlobalState>,
|
||||
) {
|
||||
let (tx, rx) = self.manager_ctl.bootstrap_manager();
|
||||
|
||||
@@ -570,6 +573,7 @@ impl Timeline {
|
||||
rx,
|
||||
partial_backup_rate_limiter,
|
||||
wal_backup,
|
||||
wal_advertiser,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -42,6 +42,7 @@ impl Manager {
|
||||
&& next_event.is_none()
|
||||
&& self.access_service.is_empty()
|
||||
&& !self.tli_broker_active.get()
|
||||
&& self.wal_advertiser.ready_for_eviction()
|
||||
// Partial segment of current flush_lsn is uploaded up to this flush_lsn.
|
||||
&& !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded)
|
||||
// And it is the next one after the last removed. Given that local
|
||||
|
||||
@@ -22,7 +22,6 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, debug, info, info_span, instrument, warn};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::control_file::{FileStorage, Storage};
|
||||
use crate::metrics::{
|
||||
MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES,
|
||||
@@ -37,6 +36,7 @@ use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard};
|
||||
use crate::timelines_set::{TimelineSetGuard, TimelinesSet};
|
||||
use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle};
|
||||
use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment};
|
||||
use crate::{SafeKeeperConf, wal_advertiser};
|
||||
|
||||
pub(crate) struct StateSnapshot {
|
||||
// inmem values
|
||||
@@ -201,6 +201,7 @@ pub(crate) struct Manager {
|
||||
pub(crate) wal_seg_size: usize,
|
||||
pub(crate) walsenders: Arc<WalSenders>,
|
||||
pub(crate) wal_backup: Arc<WalBackup>,
|
||||
pub(crate) wal_advertiser: wal_advertiser::SafekeeperTimelineHandle,
|
||||
|
||||
// current state
|
||||
pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
|
||||
@@ -240,6 +241,7 @@ pub async fn main_task(
|
||||
mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
wal_advertiser: Arc<wal_advertiser::GlobalState>,
|
||||
) {
|
||||
tli.set_status(Status::Started);
|
||||
|
||||
@@ -259,6 +261,7 @@ pub async fn main_task(
|
||||
manager_tx,
|
||||
global_rate_limiter,
|
||||
wal_backup,
|
||||
wal_advertiser,
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -287,7 +290,8 @@ pub async fn main_task(
|
||||
|
||||
mgr.set_status(Status::UpdateBackup);
|
||||
let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await;
|
||||
mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot);
|
||||
|
||||
mgr.update_broker_active(is_wal_backup_required, num_computes, &state_snapshot);
|
||||
|
||||
mgr.set_status(Status::UpdateControlFile);
|
||||
mgr.update_control_file_save(&state_snapshot, &mut next_event)
|
||||
@@ -419,6 +423,7 @@ impl Manager {
|
||||
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
wal_advertiser: Arc<wal_advertiser::GlobalState>,
|
||||
) -> Manager {
|
||||
let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
|
||||
Manager {
|
||||
@@ -428,6 +433,7 @@ impl Manager {
|
||||
state_version_rx: tli.get_state_version_rx(),
|
||||
num_computes_rx: tli.get_walreceivers().get_num_rx(),
|
||||
tli_broker_active: broker_active_set.guard(tli.clone()),
|
||||
wal_advertiser: wal_advertiser.new_timeline(tli.clone()).await.unwrap(),
|
||||
last_removed_segno: 0,
|
||||
is_offloaded,
|
||||
backup_task: None,
|
||||
@@ -494,8 +500,8 @@ impl Manager {
|
||||
is_wal_backup_required
|
||||
}
|
||||
|
||||
/// Update is_active flag and returns its value.
|
||||
fn update_is_active(
|
||||
/// Update broker is_active flag and returns its value.
|
||||
fn update_broker_active(
|
||||
&mut self,
|
||||
is_wal_backup_required: bool,
|
||||
num_computes: usize,
|
||||
@@ -505,6 +511,7 @@ impl Manager {
|
||||
|| num_computes > 0
|
||||
|| state.remote_consistent_lsn < state.commit_lsn;
|
||||
|
||||
|
||||
// update the broker timeline set
|
||||
if self.tli_broker_active.set(is_active) {
|
||||
// write log if state has changed
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_t
|
||||
use crate::timelines_set::TimelinesSet;
|
||||
use crate::wal_backup::WalBackup;
|
||||
use crate::wal_storage::Storage;
|
||||
use crate::{SafeKeeperConf, control_file, wal_storage};
|
||||
use crate::{SafeKeeperConf, control_file, pageserver_connectivity, wal_advertiser, wal_storage};
|
||||
|
||||
// Timeline entry in the global map: either a ready timeline, or mark that it is
|
||||
// being created.
|
||||
@@ -47,6 +47,8 @@ struct GlobalTimelinesState {
|
||||
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
wal_advertisement: Arc<wal_advertiser::GlobalState>,
|
||||
pageserver_connectivity: Arc<pageserver_connectivity::GlobalState>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
wal_backup: Arc<WalBackup>,
|
||||
}
|
||||
@@ -60,12 +62,14 @@ impl GlobalTimelinesState {
|
||||
Arc<TimelinesSet>,
|
||||
RateLimiter,
|
||||
Arc<WalBackup>,
|
||||
Arc<wal_advertiser::GlobalState>,
|
||||
) {
|
||||
(
|
||||
self.conf.clone(),
|
||||
self.broker_active_set.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
self.wal_backup.clone(),
|
||||
self.wal_advertisement.clone(),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -101,6 +105,8 @@ impl GlobalTimelines {
|
||||
tombstones: HashMap::new(),
|
||||
conf,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
wal_advertisement: Arc::new(wal_advertiser::GlobalState::default()),
|
||||
pageserver_connectivity: Arc::new(pageserver_connectivity::GlobalState::default()),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
wal_backup,
|
||||
}),
|
||||
@@ -158,12 +164,13 @@ impl GlobalTimelines {
|
||||
/// just lock and unlock it for each timeline -- this function is called
|
||||
/// during init when nothing else is running, so this is fine.
|
||||
async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
state.get_dependencies()
|
||||
};
|
||||
|
||||
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
|
||||
|
||||
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
|
||||
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
|
||||
{
|
||||
@@ -187,6 +194,7 @@ impl GlobalTimelines {
|
||||
broker_active_set.clone(),
|
||||
partial_backup_rate_limiter.clone(),
|
||||
wal_backup.clone(),
|
||||
wal_advertiser.clone(),
|
||||
);
|
||||
}
|
||||
// If we can't load a timeline, it's most likely because of a corrupted
|
||||
@@ -238,7 +246,7 @@ impl GlobalTimelines {
|
||||
start_lsn: Lsn,
|
||||
commit_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _, _) = {
|
||||
let (conf, _, _, _, _) = {
|
||||
let state = self.state.lock().unwrap();
|
||||
if let Ok(timeline) = state.get(&ttid) {
|
||||
// Timeline already exists, return it.
|
||||
@@ -283,7 +291,7 @@ impl GlobalTimelines {
|
||||
check_tombstone: bool,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// Check for existence and mark that we're creating it.
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
match state.timelines.get(&ttid) {
|
||||
Some(GlobalMapTimeline::CreationInProgress) => {
|
||||
@@ -338,6 +346,7 @@ impl GlobalTimelines {
|
||||
broker_active_set,
|
||||
partial_backup_rate_limiter,
|
||||
wal_backup,
|
||||
wal_advertiser.clone(),
|
||||
);
|
||||
drop(timeline_shared_state);
|
||||
Ok(timeline)
|
||||
@@ -590,6 +599,14 @@ impl GlobalTimelines {
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
pub fn get_wal_advertiser(&self) -> Arc<wal_advertiser::GlobalState> {
|
||||
self.state.lock().unwrap().wal_advertisement.clone()
|
||||
}
|
||||
|
||||
pub fn get_pageserver_connectivity(&self) -> Arc<pageserver_connectivity::GlobalState> {
|
||||
self.state.lock().unwrap().pageserver_connectivity.clone()
|
||||
}
|
||||
|
||||
pub fn housekeeping(&self, tombstone_ttl: &Duration) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
|
||||
|
||||
201
safekeeper/src/wal_advertiser.rs
Normal file
201
safekeeper/src/wal_advertiser.rs
Normal file
@@ -0,0 +1,201 @@
|
||||
mod persistence;
|
||||
mod pageserver_connectivity;
|
||||
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::timeline::Timeline;
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, Mutex},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tracing::{Instrument, error, info, info_span, warn};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
sync::{spsc_fold, spsc_watch},
|
||||
};
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
type Advs = HashMap<TenantTimelineId, Lsn>;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct GlobalState {
|
||||
inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
|
||||
}
|
||||
|
||||
pub struct SafekeeperTimelineHandle {
|
||||
tx: tokio::sync::mpsc::Sender<Message>,
|
||||
}
|
||||
|
||||
enum Message {
|
||||
NewTimeline {
|
||||
reply: tokio::sync::oneshot::Sender<Result<(), Error>>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl GlobalState {
|
||||
pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
|
||||
let mut ret = None;
|
||||
self.inner.get_or_init(|| {
|
||||
let (tx, task_fut) = MainTask::prepare_run();
|
||||
ret = Some(task_fut);
|
||||
tx
|
||||
});
|
||||
ret.expect("must only call this method once")
|
||||
}
|
||||
|
||||
pub async fn new_timeline(
|
||||
&self,
|
||||
tli: Arc<Timeline>,
|
||||
) -> Result<SafekeeperTimelineHandle, Error> {
|
||||
let tx = self.inner.get().unwrap().clone();
|
||||
let handle = SafekeeperTimelineHandle { tx };
|
||||
let (reply, rx) = tokio::sync::oneshot::channel();
|
||||
let Ok(()) = handle.tx.send(Message::NewTimeline { reply }).await else {
|
||||
return Err(Error::Cancelled);
|
||||
};
|
||||
let Ok(res) = rx.await else {
|
||||
return Err(Error::Cancelled);
|
||||
};
|
||||
Ok(handle)
|
||||
}
|
||||
pub fn update_pageserver_attachments(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
update: safekeeper_api::models::TenantShardPageserverAttachmentChange,
|
||||
) -> anyhow::Result<()> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
impl SafekeeperTimelineHandle {
|
||||
pub fn ready_for_eviction(&self) -> bool {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
struct MainTask {
|
||||
rx: tokio::sync::mpsc::Receiver<Message>,
|
||||
world: sk_ps_discovery::World,
|
||||
senders: HashMap<utils::id::NodeId, spsc_watch::Sender<Advs>>,
|
||||
}
|
||||
|
||||
impl MainTask {
|
||||
fn prepare_run() -> (
|
||||
tokio::sync::mpsc::Sender<Message>,
|
||||
impl Future<Output = anyhow::Result<()>> + Send,
|
||||
) {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
|
||||
let task = MainTask {
|
||||
rx,
|
||||
world: sk_ps_discovery::World::default(),
|
||||
senders: Default::default(),
|
||||
};
|
||||
(tx, task.task())
|
||||
}
|
||||
async fn task(mut self) -> anyhow::Result<()> {
|
||||
let mut adv_frequency = tokio::time::interval(Duration::from_secs(1));
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = adv_frequency.tick() => {
|
||||
let start = Instant::now();
|
||||
self.advertisements_iteration();
|
||||
let elapsed = start.elapsed();
|
||||
if elapsed > Duration::from_millis(10) {
|
||||
warn!(?elapsed, "advertisements iteration is slow");
|
||||
}
|
||||
},
|
||||
message = self.rx.recv() => {
|
||||
match message {
|
||||
None => anyhow::bail!("last main task sender dropped, shouldn't happen, exiting"),
|
||||
Some(_) => todo!(),
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn advertisements_iteration(&mut self) {
|
||||
loop {
|
||||
let advertisements = self.world.get_commit_lsn_advertisements();
|
||||
for (node_id, mut advs) in advertisements {
|
||||
'inner: loop {
|
||||
let tx = self.senders.entry(node_id).or_insert_with(|| {
|
||||
let (tx, rx) = spsc_watch::channel();
|
||||
tokio::spawn(
|
||||
PageserverTask {
|
||||
ps_id: node_id,
|
||||
endpoint: todo!(),
|
||||
advs: rx,
|
||||
}
|
||||
.run()
|
||||
.instrument(info_span!("wal_advertiser", ps_id=%node_id)),
|
||||
);
|
||||
tx
|
||||
});
|
||||
if let Err((failed, err)) = tx.send_replace(advs) {
|
||||
self.senders.remove(&node_id);
|
||||
advs = failed;
|
||||
} else {
|
||||
break 'inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
struct PageserverTask {
|
||||
ps_id: NodeId,
|
||||
advs: spsc_watch::Receiver<Advs>,
|
||||
}
|
||||
|
||||
impl PageserverTask {
|
||||
/// Cancellation: happens through last PageserverHandle being dropped.
|
||||
async fn run(mut self) {
|
||||
loop {
|
||||
let Ok(advs) = self.advs.recv().await else {
|
||||
info!("main task gone, exiting");
|
||||
return;
|
||||
};
|
||||
let res = self.run0(advs).await;
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(err) => {
|
||||
error!(?err, "error sending advertisements");
|
||||
// TODO: proper backoff?
|
||||
tokio::time::sleep(Duration::from_secs(5)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
async fn run0(&mut self, advs: HashMap<TenantTimelineId, Lsn>) -> anyhow::Result<()> {
|
||||
use storage_broker::wal_advertisement as proto;
|
||||
use storage_broker::wal_advertisement::pageserver_client::PageserverClient;
|
||||
let stream = async_stream::stream! {
|
||||
for (tenant_timeline_id, commit_lsn) in advs {
|
||||
yield proto::CommitLsnAdvertisement {tenant_timeline_id: Some(proto::TenantTimelineId {
|
||||
tenant_id: tenant_timeline_id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: tenant_timeline_id.timeline_id.as_ref().to_owned(),
|
||||
}), commit_lsn: commit_lsn.0 };
|
||||
}
|
||||
};
|
||||
let mut client: PageserverClient<_> = PageserverClient::connect(self.endpoint.clone())
|
||||
.await
|
||||
.context("connect")?;
|
||||
let publish_stream = client
|
||||
.publish_commit_lsn_advertisements(stream)
|
||||
.await
|
||||
.context("publish stream")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
0
safekeeper/src/wal_advertiser/persistence.rs
Normal file
0
safekeeper/src/wal_advertiser/persistence.rs
Normal file
@@ -27,6 +27,7 @@ parking_lot.workspace = true
|
||||
prost.workspace = true
|
||||
tonic.workspace = true
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
tokio-util.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
metrics.workspace = true
|
||||
|
||||
@@ -5,7 +5,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// easy location, but apparently interference with cachepot sometimes fails
|
||||
// the build then. Anyway, per cargo docs build script shouldn't output to
|
||||
// anywhere but $OUT_DIR.
|
||||
tonic_build::compile_protos("proto/broker.proto")
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
let protos = [
|
||||
"proto/broker.proto",
|
||||
"proto/wal_advertisement.proto",
|
||||
];
|
||||
for proto in protos {
|
||||
tonic_build::compile_protos(proto)
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -35,14 +35,14 @@ message SafekeeperTimelineInfo {
|
||||
// LSN of the last record.
|
||||
uint64 flush_lsn = 4;
|
||||
// Up to which LSN safekeeper regards its WAL as committed.
|
||||
uint64 commit_lsn = 5;
|
||||
uint64 commit_lsn = 5; // yes
|
||||
// LSN up to which safekeeper has backed WAL.
|
||||
uint64 backup_lsn = 6;
|
||||
// LSN of last checkpoint uploaded by pageserver.
|
||||
uint64 remote_consistent_lsn = 7;
|
||||
uint64 peer_horizon_lsn = 8;
|
||||
uint64 local_start_lsn = 9;
|
||||
uint64 standby_horizon = 14;
|
||||
uint64 standby_horizon = 14; // yes
|
||||
// A connection string to use for WAL receiving.
|
||||
string safekeeper_connstr = 10;
|
||||
// HTTP endpoint connection string.
|
||||
|
||||
29
storage_broker/proto/wal_advertisement.proto
Normal file
29
storage_broker/proto/wal_advertisement.proto
Normal file
@@ -0,0 +1,29 @@
|
||||
syntax = "proto3";
|
||||
|
||||
import "google/protobuf/empty.proto";
|
||||
|
||||
package wal_advertisement;
|
||||
|
||||
service Pageserver {
|
||||
rpc PublishCommitLsnAdvertisements(stream CommitLsnAdvertisement) returns (google.protobuf.Empty) {};
|
||||
rpc SubscribeRemoteConsistentLsnAdvertisements(google.protobuf.Empty) returns (stream RemoteConsistentLsnAdvertisement) {};
|
||||
}
|
||||
|
||||
message CommitLsnAdvertisement {
|
||||
TenantTimelineId tenant_timeline_id = 1;
|
||||
uint64 commit_lsn = 2;
|
||||
}
|
||||
|
||||
message RemoteConsistentLsnAdvertisement {
|
||||
bytes tenant_id = 1;
|
||||
uint32 shard_id = 2;
|
||||
bytes timeline_id = 3;
|
||||
uint64 generation = 4;
|
||||
uint64 remote_consistent_lsn = 5;
|
||||
}
|
||||
|
||||
message TenantTimelineId {
|
||||
bytes tenant_id = 1;
|
||||
bytes timeline_id = 2;
|
||||
}
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use futures::Stream;
|
||||
use proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use proto::broker_service_client::BrokerServiceClient;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tonic::Status;
|
||||
use tonic::codegen::StdError;
|
||||
use tonic::transport::{Channel, Endpoint};
|
||||
use tonic::transport::Endpoint;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use utils::backoff::{
|
||||
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
|
||||
};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
// Code generated by protobuf.
|
||||
@@ -16,12 +20,18 @@ pub mod proto {
|
||||
tonic::include_proto!("storage_broker");
|
||||
}
|
||||
|
||||
pub mod wal_advertisement {
|
||||
#![allow(clippy::derive_partial_eq_without_eq)]
|
||||
tonic::include_proto!("wal_advertisement");
|
||||
}
|
||||
|
||||
pub mod metrics;
|
||||
|
||||
// Re-exports to avoid direct tonic dependency in user crates.
|
||||
pub use hyper::Uri;
|
||||
pub use tonic::transport::{Certificate, ClientTlsConfig};
|
||||
pub use tonic::{Code, Request, Streaming};
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
|
||||
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
|
||||
@@ -29,9 +39,199 @@ pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LIST
|
||||
pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
|
||||
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
// BrokerServiceClient charged with tonic provided Channel transport; helps to
|
||||
// avoid depending on tonic directly in user crates.
|
||||
pub type BrokerClientChannel = BrokerServiceClient<Channel>;
|
||||
#[derive(Clone)]
|
||||
pub struct TimelineUpdatesSubscriber {
|
||||
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
|
||||
}
|
||||
|
||||
/// Wrapper type to weed out all places in the codebase that interact directly with the gRPC generated code.
|
||||
pub struct BrokerClientChannel {
|
||||
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
|
||||
}
|
||||
|
||||
impl BrokerClientChannel {
|
||||
pub fn into_raw_grpc_client(
|
||||
self,
|
||||
) -> proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel> {
|
||||
self.client
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TimelineShardUpdate {
|
||||
pub is_discovery: bool,
|
||||
pub inner: proto::SafekeeperDiscoveryResponse,
|
||||
}
|
||||
|
||||
pub struct DiscoveryRequester {
|
||||
id: ProtoTenantTimelineId,
|
||||
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
|
||||
}
|
||||
|
||||
impl TimelineUpdatesSubscriber {
|
||||
pub fn new(service_client: BrokerClientChannel) -> Self {
|
||||
Self {
|
||||
client: service_client.client.clone(),
|
||||
}
|
||||
}
|
||||
pub fn subscribe(
|
||||
&mut self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
cancel: &CancellationToken,
|
||||
) -> (impl Stream<Item = TimelineShardUpdate>, DiscoveryRequester) {
|
||||
let id = ProtoTenantTimelineId {
|
||||
tenant_id: tenant_shard_id.tenant_id.as_ref().to_owned(),
|
||||
timeline_id: timeline_id.as_ref().to_owned(),
|
||||
};
|
||||
let discovery_requester = DiscoveryRequester {
|
||||
id: id.clone(),
|
||||
client: self.client.clone(),
|
||||
};
|
||||
let stream = async_stream::stream! {
|
||||
let mut attempt = 0;
|
||||
'resubscribe: loop {
|
||||
exponential_backoff(
|
||||
attempt,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
attempt += 1;
|
||||
|
||||
use proto::*;
|
||||
// subscribe to the specific timeline
|
||||
let request = SubscribeByFilterRequest {
|
||||
types: vec![
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperTimelineInfo as i32,
|
||||
},
|
||||
TypeSubscription {
|
||||
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
|
||||
},
|
||||
],
|
||||
tenant_timeline_id: Some(FilterTenantTimelineId {
|
||||
enabled: true,
|
||||
tenant_timeline_id: Some(id.clone()),
|
||||
}),
|
||||
};
|
||||
|
||||
let res = tokio::select! {
|
||||
r = self.client.subscribe_by_filter(request) => { r }
|
||||
_ = cancel.cancelled() => { return; }
|
||||
};
|
||||
let mut update_stream = match res
|
||||
{
|
||||
Ok(resp) => {
|
||||
resp.into_inner()
|
||||
}
|
||||
Err(e) => {
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
|
||||
info!(
|
||||
attempt, "failed to subscribe: {e:#}"
|
||||
);
|
||||
continue 'resubscribe;
|
||||
}
|
||||
};
|
||||
loop {
|
||||
let broker_update = tokio::select!{
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
}
|
||||
update = update_stream.message() => { update }
|
||||
};
|
||||
match broker_update {
|
||||
Ok(Some(typed_msg)) => {
|
||||
let mut is_discovery = false;
|
||||
let timeline_update = match typed_msg.r#type() {
|
||||
MessageType::SafekeeperTimelineInfo => {
|
||||
let info = match typed_msg.safekeeper_timeline_info {
|
||||
Some(info) => info,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_timeline_info");
|
||||
continue 'resubscribe;
|
||||
}
|
||||
};
|
||||
SafekeeperDiscoveryResponse {
|
||||
safekeeper_id: info.safekeeper_id,
|
||||
tenant_timeline_id: info.tenant_timeline_id,
|
||||
commit_lsn: info.commit_lsn,
|
||||
safekeeper_connstr: info.safekeeper_connstr,
|
||||
availability_zone: info.availability_zone,
|
||||
standby_horizon: info.standby_horizon,
|
||||
}
|
||||
}
|
||||
MessageType::SafekeeperDiscoveryResponse => {
|
||||
is_discovery = true;
|
||||
match typed_msg.safekeeper_discovery_response {
|
||||
Some(response) => response,
|
||||
None => {
|
||||
warn!("bad proto message from broker: no safekeeper_discovery_response");
|
||||
continue 'resubscribe;
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// unexpected message
|
||||
warn!("unexpected message from broker: {typed_msg:?}");
|
||||
continue 'resubscribe;
|
||||
}
|
||||
};
|
||||
attempt = 0; // reset backoff iff we received a valid update
|
||||
yield TimelineShardUpdate{is_discovery, inner: timeline_update };
|
||||
},
|
||||
Err(status) => {
|
||||
match status.code() {
|
||||
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
|
||||
// tonic's error handling doesn't provide a clear code for disconnections: we get
|
||||
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
|
||||
// => https://github.com/neondatabase/neon/issues/9562
|
||||
info!("broker disconnected: {status}");
|
||||
},
|
||||
_ => {
|
||||
warn!("broker subscription failed: {status}");
|
||||
}
|
||||
}
|
||||
continue 'resubscribe;
|
||||
}
|
||||
Ok(None) => {
|
||||
error!("broker subscription stream ended"); // can't happen
|
||||
continue 'resubscribe;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
(stream, discovery_requester)
|
||||
}
|
||||
}
|
||||
|
||||
impl DiscoveryRequester {
|
||||
pub async fn request(&mut self) {
|
||||
let request = proto::SafekeeperDiscoveryRequest {
|
||||
tenant_timeline_id: Some(self.id.clone()),
|
||||
};
|
||||
let msg = proto::TypedMessage {
|
||||
r#type: proto::MessageType::SafekeeperDiscoveryRequest as i32,
|
||||
safekeeper_timeline_info: None,
|
||||
safekeeper_discovery_request: Some(request),
|
||||
safekeeper_discovery_response: None,
|
||||
};
|
||||
|
||||
// Cancellation safety: we want to send a message to the broker, but publish_one()
|
||||
// function can get cancelled by the other select! arm. This is absolutely fine, because
|
||||
// we just want to receive broker updates and discovery is not important if we already
|
||||
// receive updates.
|
||||
//
|
||||
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
|
||||
// This is totally fine because of the reason above.
|
||||
|
||||
// This is a fire-and-forget request, we don't care about the response
|
||||
let _ = self.client.publish_one(msg).await;
|
||||
debug!("Discovery request sent to the broker");
|
||||
}
|
||||
}
|
||||
|
||||
// Create connection object configured to run TLS if schema starts with https://
|
||||
// and plain text otherwise. Connection is lazy, only endpoint sanity is
|
||||
@@ -67,19 +267,9 @@ where
|
||||
.connect_timeout(DEFAULT_CONNECT_TIMEOUT);
|
||||
// keep_alive_timeout is 20s by default on both client and server side
|
||||
let channel = tonic_endpoint.connect_lazy();
|
||||
Ok(BrokerClientChannel::new(channel))
|
||||
}
|
||||
|
||||
impl BrokerClientChannel {
|
||||
/// Create a new client to the given endpoint, but don't actually connect until the first request.
|
||||
pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error>
|
||||
where
|
||||
D: std::convert::TryInto<tonic::transport::Endpoint>,
|
||||
D::Error: Into<StdError>,
|
||||
{
|
||||
let conn = tonic::transport::Endpoint::new(dst)?.connect_lazy();
|
||||
Ok(Self::new(conn))
|
||||
}
|
||||
Ok(BrokerClientChannel {
|
||||
client: proto::broker_service_client::BrokerServiceClient::new(channel),
|
||||
})
|
||||
}
|
||||
|
||||
// parse variable length bytes from protobuf
|
||||
|
||||
@@ -15,6 +15,7 @@ testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-stream.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
chrono.workspace = true
|
||||
@@ -69,4 +70,4 @@ http-utils = { path = "../libs/http-utils/" }
|
||||
utils = { path = "../libs/utils/" }
|
||||
metrics = { path = "../libs/metrics/" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
DROP TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery on "timelines";
|
||||
DROP FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn;
|
||||
DROP TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery on "timelines";
|
||||
DROP FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn;
|
||||
DROP TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery on "timelines";
|
||||
DROP FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn;
|
||||
DROP TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery on "tenant_shards";
|
||||
DROP FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn;
|
||||
DROP TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery on "tenant_shards";
|
||||
DROP FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn;
|
||||
DROP TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery on "tenant_shards";
|
||||
DROP FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn;
|
||||
DROP FUNCTION IF EXISTS sk_ps_discovery_enqueue_attachment_create;
|
||||
DROP TABLE "sk_ps_discovery";
|
||||
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
CREATE TABLE "sk_ps_discovery"(
|
||||
"tenant_id" VARCHAR NOT NULL,
|
||||
"shard_number" INT4 NOT NULL,
|
||||
"shard_count" INT4 NOT NULL,
|
||||
"ps_generation" INT4 NOT NULL,
|
||||
"sk_id" INT8 NOT NULL REFERENCES "safekeepers"("id") ON DELETE CASCADE, -- more efficient that trigger on "safekeepers"
|
||||
"intent_state" VARCHAR NOT NULL, -- attached,detached
|
||||
"ps_id" INT8 NOT NULL REFERENCES "nodes"("node_id") ON DELETE CASCADE, -- more efficient that trigger on "nodes"
|
||||
"created_at" TIMESTAMPTZ NOT NULL,
|
||||
"retries" INT4 NOT NULL DEFAULT 0,
|
||||
"last_retry_at" TIMESTAMPTZ,
|
||||
"acknowledged_at" TIMESTAMPTZ,
|
||||
PRIMARY KEY("tenant_id", "shard_number", "shard_count", "ps_generation", "sk_id")
|
||||
);
|
||||
|
||||
CREATE OR REPLACE FUNCTION sk_ps_discovery_enqueue_attachment_create(ARG_TENANT_ID VARCHAR)
|
||||
RETURNS VOID AS $$
|
||||
BEGIN
|
||||
INSERT INTO sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id, intent_state, ps_id, created_at)
|
||||
WITH intent_attachments AS (
|
||||
SELECT DISTINCT tenant_id,unnest(array_cat(sk_set, new_sk_set)) as sk_id FROM timelines
|
||||
WHERE
|
||||
tenant_id = ARG_TENANT_ID
|
||||
AND
|
||||
timelines.deleted_at IS NULL
|
||||
)
|
||||
SELECT tenant_shards.tenant_id, tenant_shards.shard_number, tenant_shards.shard_count,
|
||||
tenant_shards.generation, intent_attachments.sk_id, 'attached', tenant_shards.generation_pageserver, NOW()
|
||||
FROM tenant_shards
|
||||
INNER JOIN intent_attachments ON tenant_shards.tenant_id = intent_attachments.tenant_id
|
||||
ON CONFLICT DO NOTHING; -- the first trigger creates the attachment, all others are identical because tenant shard generations are monotonic
|
||||
|
||||
PERFORM pg_notify('sk_ps_discovery', json_build_object(
|
||||
'tenant_id', ARG_TENANT_ID
|
||||
)::text);
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Trigger on tenant_shards table
|
||||
|
||||
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
CREATE OR REPLACE TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery
|
||||
AFTER INSERT
|
||||
ON "tenant_shards"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
|
||||
RETURN OLD;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
CREATE OR REPLACE TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery
|
||||
AFTER DELETE
|
||||
ON "tenant_shards"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
|
||||
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
CREATE OR REPLACE TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery
|
||||
AFTER UPDATE
|
||||
ON "tenant_shards"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
-- Trigger on timelines table
|
||||
|
||||
CREATE OR REPLACE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
CREATE OR REPLACE TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery
|
||||
AFTER INSERT
|
||||
ON "timelines"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
CREATE OR REPLACE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
|
||||
RETURN OLD;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
CREATE OR REPLACE TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery
|
||||
AFTER DELETE
|
||||
ON "timelines"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
CREATE OR REPLACE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
CREATE OR REPLACE TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery
|
||||
AFTER UPDATE
|
||||
ON "timelines"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn();
|
||||
|
||||
@@ -436,7 +436,8 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
// Validate that we can connect to the database
|
||||
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
|
||||
Persistence::await_connection(secrets.database_url.clone(), args.db_connect_timeout.into())
|
||||
.await?;
|
||||
|
||||
let persistence = Arc::new(Persistence::new(secrets.database_url).await);
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
pub(crate) mod split_state;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::ops::Add;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -15,8 +17,8 @@ use diesel_async::pooled_connection::bb8::Pool;
|
||||
use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
|
||||
use diesel_async::{AsyncPgConnection, RunQueryDsl};
|
||||
use diesel_migrations::{EmbeddedMigrations, embed_migrations};
|
||||
use futures::FutureExt;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::controller_api::{
|
||||
AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy,
|
||||
@@ -31,6 +33,7 @@ use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
|
||||
use rustls::crypto::ring;
|
||||
use scoped_futures::ScopedBoxFuture;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -74,6 +77,8 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
|
||||
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
|
||||
pub struct Persistence {
|
||||
connection_pool: Pool<AsyncPgConnection>,
|
||||
connect_tokio_postgres:
|
||||
Box<dyn Sync + Send + 'static + Fn() -> BoxFuture<'static, TokioPostgresConnectResult>>,
|
||||
}
|
||||
|
||||
/// Legacy format, for use in JSON compat objects in test environment
|
||||
@@ -135,6 +140,8 @@ pub(crate) enum DatabaseOperation {
|
||||
DeleteTimelineImport,
|
||||
ListTimelineImports,
|
||||
IsTenantImportingTimeline,
|
||||
ListSkPsDiscovery,
|
||||
UpdateSkPsDiscoveryAttempt,
|
||||
}
|
||||
|
||||
#[must_use]
|
||||
@@ -177,10 +184,11 @@ impl Persistence {
|
||||
|
||||
pub async fn new(database_url: String) -> Self {
|
||||
let mut mgr_config = ManagerConfig::default();
|
||||
mgr_config.custom_setup = Box::new(establish_connection_rustls);
|
||||
mgr_config.custom_setup =
|
||||
Box::new(|config| establish_connection_rustls_diesel(config.to_owned()));
|
||||
|
||||
let manager = AsyncDieselConnectionManager::<AsyncPgConnection>::new_with_config(
|
||||
database_url,
|
||||
database_url.clone(),
|
||||
mgr_config,
|
||||
);
|
||||
|
||||
@@ -197,20 +205,25 @@ impl Persistence {
|
||||
.await
|
||||
.expect("Could not build connection pool");
|
||||
|
||||
Self { connection_pool }
|
||||
Self {
|
||||
connection_pool,
|
||||
connect_tokio_postgres: Box::new(move || {
|
||||
establish_connection_rustls_tokio_postgres(database_url.clone())
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
|
||||
/// database and the storage controller, therefore the database might not be available right away
|
||||
pub async fn await_connection(
|
||||
database_url: &str,
|
||||
database_url: String,
|
||||
timeout: Duration,
|
||||
) -> Result<(), diesel::ConnectionError> {
|
||||
let started_at = Instant::now();
|
||||
log_postgres_connstr_info(database_url)
|
||||
log_postgres_connstr_info(&database_url)
|
||||
.map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
|
||||
loop {
|
||||
match establish_connection_rustls(database_url).await {
|
||||
match establish_connection_rustls_diesel(database_url.clone()).await {
|
||||
Ok(_) => {
|
||||
tracing::info!("Connected to database.");
|
||||
return Ok(());
|
||||
@@ -1821,6 +1834,151 @@ impl Persistence {
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn listen_sk_ps_discovery(
|
||||
&self,
|
||||
) -> DatabaseResult<
|
||||
Pin<Box<dyn Send + 'static + futures::Stream<Item = Result<TenantId, serde_json::Error>>>>,
|
||||
> {
|
||||
let (client, mut conn) = (&self.connect_tokio_postgres)().await?;
|
||||
|
||||
let (tx, mut rx) = tokio::sync::mpsc::channel(1);
|
||||
tokio::spawn(async move {
|
||||
let mut stream = futures::stream::poll_fn(move |cx| conn.poll_message(cx));
|
||||
while let Some(msg) = stream.next().await {
|
||||
info!(?msg, "async message");
|
||||
match msg {
|
||||
Ok(tokio_postgres::AsyncMessage::Notification(notification))
|
||||
if notification.channel() == "sk_ps_discovery" =>
|
||||
{
|
||||
let Ok(()) = tx.send(notification).await else {
|
||||
tracing::info!(
|
||||
"sk_ps_discovery notification rx dropped, stopping async notification processing"
|
||||
);
|
||||
break;
|
||||
};
|
||||
}
|
||||
Ok(_) => {}
|
||||
Err(err) => {
|
||||
tracing::error!(?err, "tokio_postgres poll_message error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
tracing::info!("sk_ps_discovery notification stream returned None, exiting");
|
||||
});
|
||||
|
||||
client
|
||||
.batch_execute("LISTEN sk_ps_discovery;")
|
||||
.await
|
||||
.expect("TODO");
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct Notification {
|
||||
tenant_id: TenantId,
|
||||
}
|
||||
Ok(Box::pin(async_stream::stream! {
|
||||
while let Some(msg) = rx.recv().await {
|
||||
let msg: Result<Notification, _> = serde_json::from_str(msg.payload());
|
||||
let msg = msg.map(|Notification { tenant_id }| tenant_id );
|
||||
yield msg;
|
||||
}
|
||||
tracing::info!("sk_ps_discovery channel closed, stopping stream");
|
||||
// keep client alive inside the returned sream object, othrwise `conn` ends as soon as we return from this function
|
||||
drop(client);
|
||||
}))
|
||||
}
|
||||
|
||||
pub(crate) async fn get_all_sk_ps_discovery_work(
|
||||
&self,
|
||||
) -> DatabaseResult<Vec<SkPsDiscoveryPersistence>> {
|
||||
use crate::schema::sk_ps_discovery::dsl;
|
||||
self.with_measured_conn(DatabaseOperation::ListSkPsDiscovery, move |conn| {
|
||||
Box::pin(async move {
|
||||
let vec: Vec<SkPsDiscoveryPersistence> = dsl::sk_ps_discovery.load(conn).await?;
|
||||
Ok(vec)
|
||||
})
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn update_sk_ps_discovery_attempt(
|
||||
&self,
|
||||
pk: SkPsDiscoveryPersistencePk,
|
||||
intent_state: String,
|
||||
update: Result<(), ()>,
|
||||
) -> DatabaseResult<()> {
|
||||
use crate::schema::sk_ps_discovery::dsl;
|
||||
|
||||
self.with_measured_conn(DatabaseOperation::UpdateSkPsDiscoveryAttempt, move |conn| {
|
||||
let pk = pk.clone();
|
||||
let intent_state = intent_state.clone();
|
||||
Box::pin(async move {
|
||||
match update {
|
||||
Ok(()) => {
|
||||
let SkPsDiscoveryPersistencePk {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
ps_generation,
|
||||
sk_id,
|
||||
} = pk;
|
||||
let nrows = diesel::delete(dsl::sk_ps_discovery)
|
||||
// primary key
|
||||
.filter(dsl::tenant_id.eq(tenant_id))
|
||||
.filter(dsl::shard_number.eq(shard_number))
|
||||
.filter(dsl::shard_count.eq(shard_count))
|
||||
.filter(dsl::ps_generation.eq(ps_generation))
|
||||
.filter(dsl::sk_id.eq(sk_id))
|
||||
// intent_state could have changed beneath us (split brain or concurrent state gc)
|
||||
// TODO: this could also just be a globally monotonic sequence number, maybe easier to reason about?
|
||||
.filter(dsl::intent_state.eq(intent_state))
|
||||
.execute(conn)
|
||||
.await?;
|
||||
if nrows != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of deletes: {nrows}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
let SkPsDiscoveryPersistencePk {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
ps_generation,
|
||||
sk_id,
|
||||
} = pk;
|
||||
|
||||
let nrows = diesel::update(dsl::sk_ps_discovery)
|
||||
// primary key
|
||||
.filter(dsl::tenant_id.eq(tenant_id))
|
||||
.filter(dsl::shard_number.eq(shard_number))
|
||||
.filter(dsl::shard_count.eq(shard_count))
|
||||
.filter(dsl::ps_generation.eq(ps_generation))
|
||||
.filter(dsl::sk_id.eq(sk_id))
|
||||
// intent_state could have changed beneath us (split brain or concurrent state gc)
|
||||
// TODO: this could also just be a globally monotonic sequence number, maybe easier to reason about?
|
||||
.filter(dsl::intent_state.eq(intent_state))
|
||||
// action:
|
||||
.set((
|
||||
dsl::retries.eq(dsl::retries.add(1)), // XXX: in split-brain situation we would bump twice...
|
||||
dsl::last_retry_at.eq(diesel::dsl::now),
|
||||
))
|
||||
.execute(conn) // TODO: check update count?
|
||||
.await?;
|
||||
if nrows != 1 {
|
||||
return Err(DatabaseError::Logical(format!(
|
||||
"unexpected number of updates: {nrows}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
})
|
||||
})
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
|
||||
@@ -1909,21 +2067,40 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
|
||||
})
|
||||
}
|
||||
|
||||
fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<AsyncPgConnection>> {
|
||||
let fut = async {
|
||||
type TokioPostgresConnectResult = ConnectionResult<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<
|
||||
tokio_postgres::Socket,
|
||||
tokio_postgres_rustls::RustlsStream<tokio_postgres::Socket>,
|
||||
>,
|
||||
)>;
|
||||
|
||||
fn establish_connection_rustls_tokio_postgres(
|
||||
config: String,
|
||||
) -> BoxFuture<'static, TokioPostgresConnectResult> {
|
||||
let fut = async move {
|
||||
// We first set up the way we want rustls to work.
|
||||
let rustls_config = client_config_with_root_certs()
|
||||
.map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?;
|
||||
let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config);
|
||||
let (client, conn) = tokio_postgres::connect(config, tls)
|
||||
let (client, conn) = tokio_postgres::connect(&config, tls)
|
||||
.await
|
||||
.map_err(|e| ConnectionError::BadConnection(e.to_string()))?;
|
||||
|
||||
AsyncPgConnection::try_from_client_and_connection(client, conn).await
|
||||
Ok((client, conn))
|
||||
};
|
||||
fut.boxed()
|
||||
}
|
||||
|
||||
fn establish_connection_rustls_diesel(
|
||||
config: String,
|
||||
) -> BoxFuture<'static, ConnectionResult<AsyncPgConnection>> {
|
||||
async {
|
||||
let (client, conn) = establish_connection_rustls_tokio_postgres(config).await?;
|
||||
AsyncPgConnection::try_from_client_and_connection(client, conn).await
|
||||
}
|
||||
.boxed()
|
||||
}
|
||||
|
||||
#[cfg_attr(test, test)]
|
||||
fn test_config_debug_censors_password() {
|
||||
let has_pw =
|
||||
@@ -2386,3 +2563,61 @@ pub(crate) struct TimelineImportPersistence {
|
||||
pub(crate) timeline_id: String,
|
||||
pub(crate) shard_statuses: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Insertable, AsChangeset, Selectable, Clone, PartialEq, Eq, Hash, Debug)]
|
||||
#[diesel(table_name = crate::schema::sk_ps_discovery)]
|
||||
pub(crate) struct SkPsDiscoveryPersistencePk {
|
||||
pub(crate) tenant_id: String,
|
||||
pub(crate) shard_number: i32,
|
||||
pub(crate) shard_count: i32,
|
||||
pub(crate) ps_generation: i32,
|
||||
pub(crate) sk_id: i64,
|
||||
}
|
||||
|
||||
#[derive(Queryable, Selectable, Clone, PartialEq, Eq)]
|
||||
#[diesel(table_name = crate::schema::sk_ps_discovery)]
|
||||
pub(crate) struct SkPsDiscoveryPersistence {
|
||||
pub(crate) tenant_id: String,
|
||||
pub(crate) shard_number: i32,
|
||||
pub(crate) shard_count: i32,
|
||||
pub(crate) ps_generation: i32,
|
||||
pub(crate) sk_id: i64,
|
||||
pub(crate) intent_state: String,
|
||||
pub(crate) ps_id: i64,
|
||||
pub(crate) created_at: chrono::DateTime<chrono::Utc>,
|
||||
pub(crate) retries: i32,
|
||||
pub(crate) last_retry_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub(crate) acknowledged_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
}
|
||||
|
||||
impl SkPsDiscoveryPersistence {
|
||||
pub(crate) fn tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
|
||||
Ok(TenantShardId {
|
||||
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
|
||||
shard_number: ShardNumber(self.shard_number as u8),
|
||||
shard_count: ShardCount::new(self.shard_count as u8),
|
||||
})
|
||||
}
|
||||
pub(crate) fn primary_key(&self) -> SkPsDiscoveryPersistencePk {
|
||||
let SkPsDiscoveryPersistence {
|
||||
tenant_id,
|
||||
shard_number,
|
||||
shard_count,
|
||||
ps_generation,
|
||||
sk_id,
|
||||
intent_state: _,
|
||||
ps_id: _,
|
||||
created_at: _,
|
||||
retries: _,
|
||||
last_retry_at: _,
|
||||
acknowledged_at: _,
|
||||
} = self;
|
||||
SkPsDiscoveryPersistencePk {
|
||||
tenant_id: tenant_id.clone(),
|
||||
shard_number: *shard_number,
|
||||
shard_count: *shard_count,
|
||||
ps_generation: *ps_generation,
|
||||
sk_id: *sk_id,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use safekeeper_api::models::{
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
|
||||
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
|
||||
TenantShardPageserverAttachmentChange, TimelineCreateRequest,
|
||||
};
|
||||
use safekeeper_client::mgmt_api::{Client, Result};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::logging::SecretString;
|
||||
use utils::shard::TenantShardId;
|
||||
|
||||
use crate::metrics::PageserverRequestLabelGroup;
|
||||
|
||||
@@ -164,4 +166,19 @@ impl SafekeeperClient {
|
||||
self.inner.utilization().await
|
||||
)
|
||||
}
|
||||
|
||||
pub async fn post_tenant_shard_pageserver_attachments(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
body: TenantShardPageserverAttachmentChange,
|
||||
) -> Result<()> {
|
||||
measured_request!(
|
||||
"post_tenant_shard_pageserver_attachments",
|
||||
crate::metrics::Method::Post,
|
||||
&self.node_id_label,
|
||||
self.inner
|
||||
.post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
|
||||
.await
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,6 +60,22 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id) {
|
||||
tenant_id -> Varchar,
|
||||
shard_number -> Int4,
|
||||
shard_count -> Int4,
|
||||
ps_generation -> Int4,
|
||||
sk_id -> Int8,
|
||||
intent_state -> Varchar,
|
||||
ps_id -> Int8,
|
||||
created_at -> Timestamptz,
|
||||
retries -> Int4,
|
||||
last_retry_at -> Nullable<Timestamptz>,
|
||||
acknowledged_at -> Nullable<Timestamptz>,
|
||||
}
|
||||
}
|
||||
|
||||
diesel::table! {
|
||||
tenant_shards (tenant_id, shard_number, shard_count) {
|
||||
tenant_id -> Varchar,
|
||||
@@ -100,12 +116,16 @@ diesel::table! {
|
||||
}
|
||||
}
|
||||
|
||||
diesel::joinable!(sk_ps_discovery -> nodes (ps_id));
|
||||
diesel::joinable!(sk_ps_discovery -> safekeepers (sk_id));
|
||||
|
||||
diesel::allow_tables_to_appear_in_same_query!(
|
||||
controllers,
|
||||
metadata_health,
|
||||
nodes,
|
||||
safekeeper_timeline_pending_ops,
|
||||
safekeepers,
|
||||
sk_ps_discovery,
|
||||
tenant_shards,
|
||||
timeline_imports,
|
||||
timelines,
|
||||
|
||||
@@ -2,6 +2,7 @@ pub mod chaos_injector;
|
||||
mod context_iterator;
|
||||
pub(crate) mod safekeeper_reconciler;
|
||||
mod safekeeper_service;
|
||||
mod sk_ps_discovery;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
@@ -1192,6 +1193,16 @@ impl Service {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn run_sk_ps_discovery(self: &Arc<Self>) {
|
||||
self.startup_complete.clone().wait().await;
|
||||
sk_ps_discovery::run(
|
||||
self.clone(),
|
||||
self.http_client.clone(), /* TODO this client is configured to openf resh TCP connection each time, very inefficient */
|
||||
).await;
|
||||
}
|
||||
|
||||
/// Heartbeat all storage nodes once in a while.
|
||||
#[instrument(skip_all)]
|
||||
async fn spawn_heartbeat_driver(&self) {
|
||||
@@ -1797,7 +1808,7 @@ impl Service {
|
||||
reconcilers_gate: Gate::default(),
|
||||
tenant_op_locks: Default::default(),
|
||||
node_op_locks: Default::default(),
|
||||
http_client,
|
||||
http_client: http_client.clone(),
|
||||
step_down_barrier: Default::default(),
|
||||
});
|
||||
|
||||
@@ -1865,6 +1876,15 @@ impl Service {
|
||||
}
|
||||
});
|
||||
|
||||
tokio::task::spawn({
|
||||
let this = this.clone();
|
||||
let startup_complete = startup_complete.clone();
|
||||
async move {
|
||||
startup_complete.wait().await;
|
||||
this.run_sk_ps_discovery().await
|
||||
}
|
||||
});
|
||||
|
||||
tokio::task::spawn({
|
||||
let this = this.clone();
|
||||
let startup_complete = startup_complete.clone();
|
||||
|
||||
@@ -647,6 +647,11 @@ impl Service {
|
||||
sk.describe_response()
|
||||
}
|
||||
|
||||
pub(crate) fn get_safekeeper_object(&self, node_id: i64) -> Option<Safekeeper> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
locked.safekeepers.get(&NodeId(node_id as u64)).cloned()
|
||||
}
|
||||
|
||||
pub(crate) async fn upsert_safekeeper(
|
||||
self: &Arc<Service>,
|
||||
record: crate::persistence::SafekeeperUpsert,
|
||||
|
||||
266
storage_controller/src/service/sk_ps_discovery.rs
Normal file
266
storage_controller/src/service/sk_ps_discovery.rs
Normal file
@@ -0,0 +1,266 @@
|
||||
use std::{
|
||||
collections::{HashMap, hash_map},
|
||||
sync::Arc,
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::{StreamExt, stream::FuturesUnordered};
|
||||
use safekeeper_api::models::{
|
||||
TenantShardPageserverAttachment, TenantShardPageserverAttachmentChange,
|
||||
};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{Instrument, Span, error, info, info_span};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
id::{NodeId, TenantId},
|
||||
logging::SecretString,
|
||||
shard::ShardIndex,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
heartbeater::SafekeeperState,
|
||||
persistence::{Persistence, SkPsDiscoveryPersistence},
|
||||
};
|
||||
|
||||
use super::Service;
|
||||
|
||||
struct Actor {
|
||||
service: Arc<Service>,
|
||||
persistence: Arc<Persistence>,
|
||||
http_client: reqwest::Client,
|
||||
}
|
||||
|
||||
pub async fn run(service: Arc<Service>, http_client: reqwest::Client) {
|
||||
let actor = Actor {
|
||||
persistence: service.persistence.clone(),
|
||||
service,
|
||||
http_client, // XXX: build our own client instead of getting Service's client; we probably want idle conn to each sk
|
||||
};
|
||||
actor.run().await;
|
||||
}
|
||||
|
||||
impl Actor {
|
||||
async fn run(mut self) {
|
||||
loop {
|
||||
match self.run0().await {
|
||||
Ok(()) => {
|
||||
info!("sk_ps_discovery actor exiting after shutdown signal observed");
|
||||
return;
|
||||
}
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
?err,
|
||||
"sk_ps_discovery actor encountered an error, restarting after backoff"
|
||||
);
|
||||
// TODO: proper backoff
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn run0(&mut self) -> anyhow::Result<()> {
|
||||
let mut subscription = self
|
||||
.persistence
|
||||
.listen_sk_ps_discovery()
|
||||
.await
|
||||
.context("listen to sk_ps_discovery")?;
|
||||
|
||||
let mut sync_full_ticker = tokio::time::interval(std::time::Duration::from_secs(5));
|
||||
|
||||
struct Task {
|
||||
work: SkPsDiscoveryPersistence,
|
||||
cancel: CancellationToken,
|
||||
join_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
let mut tasks = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased; // control messages have higher priority, the periodic full tick, then subscriptions.
|
||||
_ = sync_full_ticker.tick() => {
|
||||
info!("rebuild");
|
||||
}
|
||||
maybe_res = subscription.next() => {
|
||||
match maybe_res {
|
||||
None => {
|
||||
anyhow::bail!("subscription should never end");
|
||||
}
|
||||
Some(Ok(tenant_id)) => {
|
||||
let tenant_id: TenantId = tenant_id;
|
||||
info!(?tenant_id, "notify for tenant_id");
|
||||
// for now, just also rebuild everything
|
||||
}
|
||||
Some(Err(err)) => {
|
||||
let err: serde_json::Error = err;
|
||||
anyhow::bail!("incorrect notification format: {err:?}"); // FIXME repeat message in error so it can be debugged ?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// get list of tasks from database
|
||||
let mut new_tasks = self
|
||||
.persistence
|
||||
.get_all_sk_ps_discovery_work()
|
||||
.await
|
||||
.context("get_all_sk_ps_discovery_work")?
|
||||
.into_iter()
|
||||
.map(|work: SkPsDiscoveryPersistence| {
|
||||
(
|
||||
work.primary_key(),
|
||||
Task {
|
||||
work,
|
||||
cancel: CancellationToken::new(),
|
||||
join_handle: None,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
// Carry over ongoing tasks
|
||||
let mut cancelled_wait = FuturesUnordered::new();
|
||||
for (
|
||||
task_key,
|
||||
Task {
|
||||
work: ongoing_persistence,
|
||||
cancel,
|
||||
join_handle,
|
||||
},
|
||||
) in tasks.drain()
|
||||
{
|
||||
match new_tasks.entry(task_key) {
|
||||
hash_map::Entry::Occupied(mut planned) => {
|
||||
let Task {
|
||||
work: planned_persistence,
|
||||
cancel: planned_cancel,
|
||||
join_handle: planned_jh,
|
||||
} = planned.get_mut();
|
||||
assert!(planned_jh.is_none());
|
||||
if *planned_persistence == ongoing_persistence {
|
||||
*planned_jh = join_handle;
|
||||
*planned_cancel = cancel;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(_) => (),
|
||||
}
|
||||
cancel.cancel();
|
||||
cancelled_wait.push(async move {
|
||||
if let Some(jh) = join_handle {
|
||||
let _ = jh.await;
|
||||
}
|
||||
});
|
||||
}
|
||||
while let Some(_) = cancelled_wait.next().await {}
|
||||
tasks = new_tasks;
|
||||
|
||||
// Kick off new tasks
|
||||
for (key, task) in tasks.iter_mut() {
|
||||
if task.join_handle.is_none() {
|
||||
task.join_handle = Some(tokio::spawn(
|
||||
DeliveryAttempt {
|
||||
cancel: task.cancel.clone(),
|
||||
persistence: self.persistence.clone(),
|
||||
service: self.service.clone(),
|
||||
http_client: self.http_client.clone(),
|
||||
work: task.work.clone(),
|
||||
}
|
||||
.run()
|
||||
.instrument({
|
||||
let span = info_span!(parent: None, "sk_ps_discovery_delivery", ?key);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct DeliveryAttempt {
|
||||
cancel: CancellationToken,
|
||||
persistence: Arc<Persistence>,
|
||||
service: Arc<super::Service>,
|
||||
http_client: reqwest::Client,
|
||||
work: SkPsDiscoveryPersistence,
|
||||
}
|
||||
|
||||
impl DeliveryAttempt {
|
||||
pub async fn run(self) {
|
||||
let res = self.run0().await;
|
||||
if self.cancel.is_cancelled() {
|
||||
return;
|
||||
}
|
||||
if let Err(ref err) = res {
|
||||
error!(?err, "attempt failed");
|
||||
}
|
||||
let res = self
|
||||
.persistence
|
||||
.update_sk_ps_discovery_attempt(
|
||||
self.work.primary_key(),
|
||||
self.work.intent_state.clone(),
|
||||
res.map_err(|_| ()),
|
||||
)
|
||||
.await;
|
||||
if let Err(ref err) = res {
|
||||
error!(?err, "persistence of attempt result failed");
|
||||
}
|
||||
}
|
||||
async fn run0(&self) -> anyhow::Result<()> {
|
||||
let Some(sk) = self.service.get_safekeeper_object(self.work.sk_id) else {
|
||||
anyhow::bail!("safekeeper object does not exist");
|
||||
};
|
||||
|
||||
match sk.availability() {
|
||||
SafekeeperState::Available { .. } => (),
|
||||
SafekeeperState::Offline => {
|
||||
anyhow::bail!("safekeeper is offline");
|
||||
}
|
||||
}
|
||||
|
||||
let body = {
|
||||
let val = TenantShardPageserverAttachment {
|
||||
shard_id: ShardIndex {
|
||||
shard_number: utils::shard::ShardNumber(self.work.shard_number as u8),
|
||||
shard_count: utils::shard::ShardCount(self.work.shard_count as u8),
|
||||
},
|
||||
ps_id: NodeId(self.work.ps_id as u64),
|
||||
generation: Generation::new(self.work.ps_generation as u32),
|
||||
};
|
||||
match self.work.intent_state.as_str() {
|
||||
"attached" => TenantShardPageserverAttachmentChange::Attach { field1: val },
|
||||
"detached" => TenantShardPageserverAttachmentChange::Detach(val),
|
||||
x => anyhow::bail!("unknown intent state {x:?}"),
|
||||
}
|
||||
};
|
||||
let tenant_shard_id = self.work.tenant_shard_id()?;
|
||||
sk.with_client_retries(
|
||||
|client| {
|
||||
let body = body.clone();
|
||||
async move {
|
||||
client
|
||||
.post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
|
||||
.await
|
||||
}
|
||||
},
|
||||
&self.http_client,
|
||||
&self
|
||||
.service
|
||||
.config
|
||||
.safekeeper_jwt_token
|
||||
.clone()
|
||||
.map(SecretString::from),
|
||||
1,
|
||||
3,
|
||||
Duration::from_secs(1),
|
||||
&self.cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -184,7 +184,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
"pageserver_evictions_with_low_residence_duration_total",
|
||||
"pageserver_aux_file_estimated_size",
|
||||
"pageserver_valid_lsn_lease_count",
|
||||
"pageserver_tenant_offloaded_timelines",
|
||||
counter("pageserver_tenant_throttling_count_accounted_start"),
|
||||
counter("pageserver_tenant_throttling_count_accounted_finish"),
|
||||
counter("pageserver_tenant_throttling_wait_usecs_sum"),
|
||||
|
||||
@@ -187,7 +187,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
|
||||
},
|
||||
"rel_size_v2_enabled": True,
|
||||
"relsize_snapshot_cache_capacity": 10000,
|
||||
"gc_compaction_enabled": True,
|
||||
"gc_compaction_verification": False,
|
||||
"gc_compaction_initial_threshold_kb": 1024000,
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Simple test for basebackup cache.
|
||||
1. Check that we always hit the cache after compute restart.
|
||||
2. Check that we eventually delete old basebackup files, but not the latest one.
|
||||
3. Check that we delete basebackup file for timeline with active compute.
|
||||
"""
|
||||
|
||||
neon_env_builder.pageserver_config_override = """
|
||||
tenant_config = { basebackup_cache_enabled = true }
|
||||
basebackup_cache_config = { cleanup_period = '1s' }
|
||||
"""
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
ep = env.endpoints.create("main")
|
||||
ps = env.pageserver
|
||||
ps_http = ps.http_client()
|
||||
|
||||
# 1. Check that we always hit the cache after compute restart.
|
||||
for i in range(3):
|
||||
ep.start()
|
||||
ep.stop()
|
||||
|
||||
def check_metrics(i=i):
|
||||
metrics = ps_http.get_metrics()
|
||||
# Never miss.
|
||||
# The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests.
|
||||
# All other requests should be a hit
|
||||
assert (
|
||||
metrics.query_one(
|
||||
"pageserver_basebackup_cache_read_total", {"result": "miss"}
|
||||
).value
|
||||
== 0
|
||||
)
|
||||
# All but the first requests are hits.
|
||||
assert (
|
||||
metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value
|
||||
== i
|
||||
)
|
||||
# Every compute shut down should trigger a prepare reuest.
|
||||
assert (
|
||||
metrics.query_one(
|
||||
"pageserver_basebackup_cache_prepare_total", {"result": "ok"}
|
||||
).value
|
||||
== i + 1
|
||||
)
|
||||
|
||||
wait_until(check_metrics)
|
||||
|
||||
# 2. Check that we eventually delete old basebackup files, but not the latest one.
|
||||
def check_bb_file_count():
|
||||
bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir())
|
||||
# tmp dir + 1 basebackup file.
|
||||
assert len(bb_files) == 2
|
||||
|
||||
wait_until(check_bb_file_count)
|
||||
|
||||
# 3. Check that we delete basebackup file for timeline with active compute.
|
||||
ep.start()
|
||||
ep.safe_psql("create table t1 as select generate_series(1, 10) as n")
|
||||
|
||||
def check_bb_dir_empty():
|
||||
bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir())
|
||||
# only tmp dir.
|
||||
assert len(bb_files) == 1
|
||||
|
||||
wait_until(check_bb_dir_empty)
|
||||
@@ -19,16 +19,6 @@ TEST_ROLE_NAMES = [
|
||||
{"name": "role$"},
|
||||
{"name": "role$$"},
|
||||
{"name": "role$x$"},
|
||||
{"name": "x"},
|
||||
{"name": "xx"},
|
||||
{"name": "$x"},
|
||||
{"name": "x$"},
|
||||
{"name": "$x$"},
|
||||
{"name": "xx$"},
|
||||
{"name": "$xx"},
|
||||
{"name": "$xx$"},
|
||||
# 63 bytes is the limit for role/DB names in Postgres
|
||||
{"name": "x" * 63},
|
||||
]
|
||||
|
||||
TEST_DB_NAMES = [
|
||||
@@ -84,43 +74,6 @@ TEST_DB_NAMES = [
|
||||
"name": "db name$x$",
|
||||
"owner": "role$x$",
|
||||
},
|
||||
{
|
||||
"name": "x",
|
||||
"owner": "x",
|
||||
},
|
||||
{
|
||||
"name": "xx",
|
||||
"owner": "xx",
|
||||
},
|
||||
{
|
||||
"name": "$x",
|
||||
"owner": "$x",
|
||||
},
|
||||
{
|
||||
"name": "x$",
|
||||
"owner": "x$",
|
||||
},
|
||||
{
|
||||
"name": "$x$",
|
||||
"owner": "$x$",
|
||||
},
|
||||
{
|
||||
"name": "xx$",
|
||||
"owner": "xx$",
|
||||
},
|
||||
{
|
||||
"name": "$xx",
|
||||
"owner": "$xx",
|
||||
},
|
||||
{
|
||||
"name": "$xx$",
|
||||
"owner": "$xx$",
|
||||
},
|
||||
# 63 bytes is the limit for role/DB names in Postgres
|
||||
{
|
||||
"name": "x" * 63,
|
||||
"owner": "x" * 63,
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -193,10 +146,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test that compute_ctl can create and work with databases and roles
|
||||
with special characters (whitespaces, %, tabs, etc.) in the name.
|
||||
Also use `drop_subscriptions_before_start: true`. We do not actually
|
||||
have any subscriptions in this test, so it should be no-op, but it
|
||||
i) simulates the case when we create a second dev branch together with
|
||||
a new project creation, and ii) just generally stresses more code paths.
|
||||
"""
|
||||
env = neon_simple_env
|
||||
|
||||
@@ -210,7 +159,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
|
||||
**{
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"drop_subscriptions_before_start": True,
|
||||
"cluster": {
|
||||
"roles": TEST_ROLE_NAMES,
|
||||
"databases": TEST_DB_NAMES,
|
||||
@@ -254,7 +202,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
|
||||
**{
|
||||
"spec": {
|
||||
"skip_pg_catalog_updates": False,
|
||||
"drop_subscriptions_before_start": True,
|
||||
"cluster": {
|
||||
"roles": [],
|
||||
"databases": [],
|
||||
|
||||
@@ -508,9 +508,6 @@ PER_METRIC_VERIFIERS = {
|
||||
"remote_storage_size": CannotVerifyAnything,
|
||||
"written_size": WrittenDataVerifier,
|
||||
"written_data_bytes_delta": WrittenDataDeltaVerifier,
|
||||
"written_size_since_parent": WrittenDataVerifier, # same as written_size on root
|
||||
"pitr_cutoff": CannotVerifyAnything,
|
||||
"pitr_history_size_since_parent": WrittenDataVerifier, # same as written_size on root w/o GC
|
||||
"timeline_logical_size": CannotVerifyAnything,
|
||||
"synthetic_storage_size": SyntheticSizeVerifier,
|
||||
}
|
||||
|
||||
@@ -27,9 +27,8 @@ from contextlib import closing
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup
|
||||
from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.utils import query_scalar, skip_on_postgres, wait_until
|
||||
|
||||
@@ -696,110 +695,3 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
|
||||
with secondary.cursor() as secondary_cur:
|
||||
secondary_cur.execute("select count(*) from t")
|
||||
assert secondary_cur.fetchone() == (n_restarts,)
|
||||
|
||||
|
||||
def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin):
|
||||
env = neon_simple_env
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
sql = """
|
||||
CREATE TABLE CHAR_TBL(f1 char(4));
|
||||
CREATE TABLE FLOAT8_TBL(f1 float8);
|
||||
CREATE TABLE INT2_TBL(f1 int2);
|
||||
CREATE TABLE INT4_TBL(f1 int4);
|
||||
CREATE TABLE INT8_TBL(q1 int8, q2 int8);
|
||||
CREATE TABLE POINT_TBL(f1 point);
|
||||
CREATE TABLE TEXT_TBL (f1 text);
|
||||
CREATE TABLE VARCHAR_TBL(f1 varchar(4));
|
||||
CREATE TABLE onek (unique1 int4);
|
||||
CREATE TABLE onek2 AS SELECT * FROM onek;
|
||||
CREATE TABLE tenk1 (unique1 int4);
|
||||
CREATE TABLE tenk2 AS SELECT * FROM tenk1;
|
||||
CREATE TABLE person (name text, age int4,location point);
|
||||
CREATE TABLE emp (salary int4, manager name) INHERITS (person);
|
||||
CREATE TABLE student (gpa float8) INHERITS (person);
|
||||
CREATE TABLE stud_emp ( percent int4) INHERITS (emp, student);
|
||||
CREATE TABLE road (name text,thepath path);
|
||||
CREATE TABLE ihighway () INHERITS (road);
|
||||
CREATE TABLE shighway(surface text) INHERITS (road);
|
||||
CREATE TABLE BOOLTBL3 (d text, b bool, o int);
|
||||
CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool);
|
||||
DROP TABLE BOOLTBL3;
|
||||
DROP TABLE BOOLTBL4;
|
||||
CREATE TABLE ceil_floor_round (a numeric);
|
||||
DROP TABLE ceil_floor_round;
|
||||
CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
|
||||
DROP TABLE width_bucket_test;
|
||||
CREATE TABLE num_input_test (n1 numeric);
|
||||
CREATE TABLE num_variance (a numeric);
|
||||
INSERT INTO num_variance VALUES (0);
|
||||
CREATE TABLE snapshot_test (nr integer, snap txid_snapshot);
|
||||
CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now()));
|
||||
CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now()));
|
||||
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
|
||||
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
|
||||
TRUNCATE guid1;
|
||||
DROP TABLE guid1;
|
||||
DROP TABLE guid2 CASCADE;
|
||||
CREATE TABLE numrange_test (nr NUMRANGE);
|
||||
CREATE INDEX numrange_test_btree on numrange_test(nr);
|
||||
CREATE TABLE numrange_test2(nr numrange);
|
||||
CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr);
|
||||
INSERT INTO numrange_test2 VALUES('[, 5)');
|
||||
CREATE TABLE textrange_test (tr text);
|
||||
CREATE INDEX textrange_test_btree on textrange_test(tr);
|
||||
CREATE TABLE test_range_gist(ir int4range);
|
||||
CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
|
||||
DROP INDEX test_range_gist_idx;
|
||||
CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
|
||||
CREATE TABLE test_range_spgist(ir int4range);
|
||||
CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
|
||||
DROP INDEX test_range_spgist_idx;
|
||||
CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
|
||||
CREATE TABLE test_range_elem(i int4);
|
||||
CREATE INDEX test_range_elem_idx on test_range_elem (i);
|
||||
CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10));
|
||||
DROP TABLE test_range_elem;
|
||||
CREATE TABLE test_range_excl(room int4range, speaker int4range, during tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&));
|
||||
CREATE TABLE f_test(f text, i int);
|
||||
CREATE TABLE i8r_array (f1 int, f2 text);
|
||||
CREATE TYPE arrayrange as range (subtype=int4[]);
|
||||
CREATE TYPE two_ints as (a int, b int);
|
||||
DROP TYPE two_ints cascade;
|
||||
CREATE TABLE text_support_test (t text);
|
||||
CREATE TABLE TEMP_FLOAT (f1 FLOAT8);
|
||||
CREATE TABLE TEMP_INT4 (f1 INT4);
|
||||
CREATE TABLE TEMP_INT2 (f1 INT2);
|
||||
CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8);
|
||||
CREATE TABLE POLYGON_TBL(f1 polygon);
|
||||
CREATE TABLE quad_poly_tbl (id int, p polygon);
|
||||
INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y;
|
||||
CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl;
|
||||
CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl;
|
||||
"""
|
||||
|
||||
with endpoint.cursor() as cur:
|
||||
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
env.endpoints.create_start(branch_name="main", lsn=lsn)
|
||||
log.info(f"lsn: {lsn}")
|
||||
|
||||
for line in sql.split("\n"):
|
||||
if len(line.strip()) == 0 or line.startswith("--"):
|
||||
continue
|
||||
cur.execute(line)
|
||||
|
||||
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
env.endpoints.create_start(branch_name="main", lsn=lsn)
|
||||
log.info(f"lsn: {lsn}")
|
||||
|
||||
cur.execute("VACUUM FULL pg_class;")
|
||||
|
||||
for ep in env.endpoints.endpoints:
|
||||
log.info(f"{ep.endpoint_id} / {ep.pg_port}")
|
||||
pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"]
|
||||
env_vars = {
|
||||
"PGPORT": str(ep.pg_port),
|
||||
"PGUSER": endpoint.default_options["user"],
|
||||
"PGHOST": endpoint.default_options["host"],
|
||||
}
|
||||
pg_bin.run_capture(pg_dump_command, env=env_vars)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user