Compare commits

..

42 Commits

Author SHA1 Message Date
Christian Schwarz
a80790e0c4 WIP 2025-06-12 06:41:07 -07:00
Christian Schwarz
906a963351 WIP advertisement sending 2025-06-08 20:10:30 -07:00
Christian Schwarz
9e7556bef2 WIP 2025-06-06 19:03:23 -07:00
Christian Schwarz
4f4214eea3 WIP integrate 2025-06-06 18:36:16 -07:00
Christian Schwarz
3dffbda428 rip out the old wal_advertiser::advmap impl, stubbing out with todo!()s 2025-06-06 17:40:59 -07:00
Christian Schwarz
04fb256e0f leave some TODOs on the lib 2025-06-06 16:51:04 -07:00
Christian Schwarz
02fe35831c fix compile warninign in benchmark 2025-06-06 16:50:14 -07:00
Christian Schwarz
5f7bc3ce60 left equi join for remote_consistent_lsn retrieval 2025-06-06 14:28:20 -07:00
Christian Schwarz
e40b1c79fa carginality estimation for collected hashmap; maintain nodes and nodes_timelines; allocations not showing in flamegraph anymore, but replaced with btreemap, but replaced with btreemap, but replaced with btreemap, but replaced with btreemap::get 2025-06-06 13:50:03 -07:00
Christian Schwarz
2f416267dc finish implementing auto-quiescing; needs more tests 2025-06-05 02:41:56 +02:00
Christian Schwarz
d52d560f16 log int est 2025-06-05 02:08:13 +02:00
Christian Schwarz
de908a7b0d WIP auto-quiesce 2025-06-05 01:56:55 +02:00
Christian Schwarz
687fb25d41 WIP 2025-06-05 01:28:55 +02:00
Christian Schwarz
eaa91291ae add test case for initial advertisement and fix everything by switching to btrees and proper merge equi join 2025-06-05 01:09:33 +02:00
Christian Schwarz
fc9f38dd2d continue 2025-06-04 22:48:21 +02:00
Christian Schwarz
c689110ad6 continue 2025-06-04 22:34:01 +02:00
Christian Schwarz
ba6abe203d WIP promising quiescing mechanism 2025-06-04 22:07:48 +02:00
Christian Schwarz
d16e024d49 treat storage more likea n actor itself; dead end also 2025-06-04 20:46:48 +02:00
Christian Schwarz
a4b9335b73 dabble around with effect-style system 2025-06-04 20:38:55 +02:00
Christian Schwarz
2cea7a7838 sketch offloading 2025-06-04 19:19:20 +02:00
Christian Schwarz
e1c1aa74fe get benchmark to work 2025-06-04 18:28:53 +02:00
Christian Schwarz
ee775a24a0 WIP 2025-06-04 16:46:43 +02:00
Christian Schwarz
428f532f08 naive implementation of advertisement generator 2025-06-04 15:29:37 +02:00
Christian Schwarz
5efb0d8072 WIP 2025-06-03 19:45:19 +02:00
Christian Schwarz
36ba2b8e44 WIP proto 2025-06-03 19:14:48 +02:00
Christian Schwarz
1de0f41403 WIP 2025-06-03 18:42:54 +02:00
Christian Schwarz
4d2f27a33f WIP 2025-06-03 14:55:08 +02:00
Christian Schwarz
88a3c9e7fd WIP 2025-06-03 14:00:56 +02:00
Christian Schwarz
df36b9aa62 WIP(ctd): plumbing to feed commit_lsn to wal_advertiser 2025-06-02 12:39:42 +02:00
Christian Schwarz
18a43eeab3 undo the remote_consistent_lsn feedback channel brought in by the PoC merge (includes undo of funneling pageserver_connection field via connection options) 2025-06-02 12:04:35 +02:00
Christian Schwarz
39039d1be7 WIP: plumbing to feed commit_lsn to wal_advertiser 2025-06-02 12:03:30 +02:00
Christian Schwarz
9ee75ceee6 merge fixups; storcon and safekeeper compile again 2025-06-02 11:26:14 +02:00
Christian Schwarz
f5210a367d git merge --squash problame/broker-spof/poc
Squashed commit of the following:

commit 4a1b52c12e
Author: Christian Schwarz <christian@neon.tech>
Date:   Mon May 5 12:45:45 2025 +0200

    WIP

commit 257693e4f2
Author: Christian Schwarz <christian@neon.tech>
Date:   Mon May 5 10:59:45 2025 +0200

    WIP

commit 7aa9beaefd
Author: Christian Schwarz <christian@neon.tech>
Date:   Sun May 4 17:06:46 2025 +0200

    make sk compile

commit 35dbbbaf60
Author: Christian Schwarz <christian@neon.tech>
Date:   Sun May 4 16:50:23 2025 +0200

    move discovery request mechanism into that type as well

    Can't move the policy when we send disovery mechanism because that's
    tied to connection_manager loop state.

commit 6380c9674c
Author: Christian Schwarz <christian@neon.tech>
Date:   Sun May 4 16:22:46 2025 +0200

    move subscription code into new client struct

commit 1f53688189
Author: Christian Schwarz <christian@neon.tech>
Date:   Sun May 4 14:40:44 2025 +0200

    Revert "rip out broker binary target & launch of it in cplane & mention of it in docs"

    This reverts commit 8f201b1580.

commit 8f201b1580
Author: Christian Schwarz <christian@neon.tech>
Date:   Sun May 4 14:38:52 2025 +0200

    rip out broker binary target & launch of it in cplane & mention of it in docs
2025-06-02 11:19:37 +02:00
Christian Schwarz
f36520eb94 stub api impl 2025-06-02 11:19:28 +02:00
Christian Schwarz
afa35eea87 trigger now only does insertions; app loop will do cleanup; prepare API for cleanup 2025-05-30 20:36:50 +02:00
Christian Schwarz
8eb853b731 finish the stub implementation of storcon side, it now PUTs to SKs and gets back 404s 2025-05-28 19:29:32 +02:00
Christian Schwarz
a95015d967 triggers for timelines table and ps/sk row deletion 2025-05-28 13:14:37 +02:00
Christian Schwarz
3836ee8539 finish prototyping event changes via triggers 2025-05-27 18:28:22 +02:00
Christian Schwarz
a6bd4a3be6 Revert "abandoned prototype how it would be if we do what triggers do but in the app"
This reverts commit 24d96e4372.
2025-05-27 14:13:10 +02:00
Christian Schwarz
24d96e4372 abandoned prototype how it would be if we do what triggers do but in the app 2025-05-27 14:12:48 +02:00
Christian Schwarz
29ea89b61d trigger-based thing 2025-05-27 14:12:34 +02:00
Christian Schwarz
322e742e4c schema 2025-05-27 13:44:25 +02:00
102 changed files with 3353 additions and 2795 deletions

116
Cargo.lock generated
View File

@@ -3898,16 +3898,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -4192,12 +4182,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -4302,7 +4286,6 @@ dependencies = [
"enumset",
"fail",
"futures",
"hashlink",
"hex",
"hex-literal",
"http-utils",
@@ -5255,7 +5238,6 @@ dependencies = [
"tracing-log",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-test",
"tracing-utils",
"try-lock",
"typed-json",
@@ -6082,8 +6064,10 @@ dependencies = [
"serde",
"serde_json",
"sha2",
"sk_ps_discovery",
"smallvec",
"storage_broker",
"storage_controller_client",
"strum",
"strum_macros",
"thiserror 1.0.69",
@@ -6095,6 +6079,7 @@ dependencies = [
"tokio-stream",
"tokio-tar",
"tokio-util",
"tonic",
"tracing",
"tracing-subscriber",
"url",
@@ -6590,6 +6575,76 @@ version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "sk_ps_discovery"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"byteorder",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"clap",
"crc32c",
"criterion",
"desim",
"env_logger",
"fail",
"futures",
"hex",
"http 1.1.0",
"http-utils",
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
"jsonwebtoken",
"metrics",
"once_cell",
"pageserver_api",
"parking_lot 0.12.1",
"pem",
"postgres-protocol",
"postgres_backend",
"postgres_ffi",
"pprof",
"pq_proto",
"rand 0.8.5",
"regex",
"remote_storage",
"reqwest",
"rustls 0.23.18",
"safekeeper_api",
"safekeeper_client",
"scopeguard",
"sd-notify",
"serde",
"serde_json",
"sha2",
"smallvec",
"storage_broker",
"strum",
"strum_macros",
"thiserror 1.0.69",
"tikv-jemallocator",
"tokio",
"tokio-io-timeout",
"tokio-postgres",
"tokio-rustls 0.26.0",
"tokio-stream",
"tokio-tar",
"tokio-util",
"tonic",
"tracing",
"tracing-subscriber",
"url",
"utils",
"wal_decoder",
"walproposer",
"workspace_hack",
]
[[package]]
name = "slab"
version = "0.4.8"
@@ -6696,6 +6751,7 @@ dependencies = [
"rustls 0.23.18",
"tokio",
"tokio-rustls 0.26.0",
"tokio-util",
"tonic",
"tonic-build",
"tracing",
@@ -6708,6 +6764,7 @@ name = "storage_controller"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"bytes",
"camino",
"chrono",
@@ -7706,7 +7763,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -7720,27 +7776,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "tracing-test"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
dependencies = [
"tracing-core",
"tracing-subscriber",
"tracing-test-macro",
]
[[package]]
name = "tracing-test-macro"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
dependencies = [
"quote",
"syn 2.0.100",
]
[[package]]
name = "tracing-utils"
version = "0.1.0"
@@ -8593,7 +8628,6 @@ dependencies = [
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
"url",
"uuid",
"zeroize",

View File

@@ -43,7 +43,7 @@ members = [
"libs/proxy/postgres-protocol2",
"libs/proxy/postgres-types2",
"libs/proxy/tokio-postgres2",
"endpoint_storage",
"endpoint_storage", "libs/sk_ps_discovery",
]
[workspace.package]
@@ -262,6 +262,7 @@ pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
safekeeper_client = { path = "./safekeeper/client" }
sk_ps_discovery = { path = "./libs/sk_ps_discovery" }
desim = { version = "0.1", path = "./libs/desim" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
storage_controller_client = { path = "./storage_controller/client" }

View File

@@ -582,38 +582,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
#########################################################################################
#
# Layer "online_advisor-build"
# compile online_advisor extension
#
#########################################################################################
FROM build-deps AS online_advisor-src
ARG PG_VERSION
# online_advisor supports all Postgres version starting from PG14, but prior to PG17 has to be included in preload_shared_libraries
# last release 1.0 - May 15, 2025
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
"v17") \
;; \
*) \
echo "skipping the version of online_advistor for $PG_VERSION" && exit 0 \
;; \
esac && \
wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
FROM pg-build AS online_advisor-build
COPY --from=online_advisor-src /ext-src/ /ext-src/
WORKDIR /ext-src/
RUN if [ -d online_advisor-src ]; then \
cd online_advisor-src && \
make -j install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
fi
#########################################################################################
#
# Layer "pg_hashids-build"
@@ -1680,7 +1648,6 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1856,7 +1823,6 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
COPY --from=pg_graphql-src /ext-src/ /ext-src/
#COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
COPY --from=hypopg-src /ext-src/ /ext-src/
COPY --from=online_advisor-src /ext-src/ /ext-src/
COPY --from=pg_hashids-src /ext-src/ /ext-src/
COPY --from=rum-src /ext-src/ /ext-src/
COPY --from=pgtap-src /ext-src/ /ext-src/

View File

@@ -213,10 +213,8 @@ impl Escaping for PgIdent {
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long. Outer tag is always `tag + "x"`,
// so if `tag` is not present in the string, `outer_tag` is not
// present in the string either.
while self.contains(&tag.to_string()) {
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}

View File

@@ -27,40 +27,6 @@ fn get_rsyslog_pid() -> Option<String> {
}
}
fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
const MAX_WAIT: Duration = Duration::from_secs(5);
const INITIAL_SLEEP: Duration = Duration::from_millis(2);
let mut sleep_duration = INITIAL_SLEEP;
let start = std::time::Instant::now();
let mut attempts = 1;
for attempt in 1.. {
attempts = attempt;
match get_rsyslog_pid() {
Some(pid) => return Ok(pid),
None => {
if start.elapsed() >= MAX_WAIT {
break;
}
info!(
"rsyslogd is not running, attempt {}. Sleeping for {} ms",
attempt,
sleep_duration.as_millis()
);
std::thread::sleep(sleep_duration);
sleep_duration *= 2;
}
}
}
Err(anyhow::anyhow!(
"rsyslogd is not running after waiting for {} seconds and {} attempts",
attempts,
start.elapsed().as_secs()
))
}
// Restart rsyslogd to apply the new configuration.
// This is necessary, because there is no other way to reload the rsyslog configuration.
//
@@ -70,14 +36,14 @@ fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
// TODO: test it properly
//
fn restart_rsyslog() -> Result<()> {
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
info!("rsyslogd is running with pid: {}, restart it", old_pid);
// kill it to restart
let _ = Command::new("pkill")
.arg("rsyslogd")
.output()
.context("Failed to restart rsyslogd")?;
// ensure rsyslogd is running
wait_for_rsyslog_pid()?;
.context("Failed to stop rsyslogd")?;
Ok(())
}
@@ -165,11 +131,15 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result
return Ok(());
}
// Nothing to configure
// When new config is empty we can simply remove the configuration file.
if new_config.is_empty() {
// When the configuration is removed, PostgreSQL will stop sending data
// to the files watched by rsyslog, so restarting rsyslog is more effort
// than just ignoring this change.
info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
Ok(_) => {}
Err(err) if err.kind() == ErrorKind::NotFound => {}
Err(err) => return Err(err.into()),
}
restart_rsyslog()?;
return Ok(());
}

View File

@@ -71,14 +71,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
("x", ("$xx$x$xx$", "xxx")),
("xx", ("$xxx$xx$xxx$", "xxxx")),
("$x", ("$xx$$x$xx$", "xxx")),
("x$", ("$xx$x$$xx$", "xxx")),
("$x$", ("$xx$$x$$xx$", "xxx")),
("xx$", ("$xxx$xx$$xxx$", "xxxx")),
("$xx", ("$xxx$$xx$xxx$", "xxxx")),
("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
];
for (input, expected) in test_cases {

View File

@@ -546,16 +546,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("Falied to parse 'sampling_ratio'")?,
relsize_snapshot_cache_capacity: settings
.remove("relsize snapshot cache capacity")
.map(|x| x.parse::<usize>())
.transpose()
.context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?,
basebackup_cache_enabled: settings
.remove("basebackup_cache_enabled")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'basebackup_cache_enabled' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -462,8 +462,6 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
if var(REAL_S3_ENV).is_ok() {
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
}
#[cfg(target_os = "linux")]
assert!(body.contains("process_threads"));
}

View File

@@ -183,8 +183,6 @@ pub struct ConfigToml {
pub enable_tls_page_service_api: bool,
pub dev_mode: bool,
pub timeline_import_config: TimelineImportConfig,
#[serde(skip_serializing_if = "Option::is_none")]
pub basebackup_cache_config: Option<BasebackupCacheConfig>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -237,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
ScatteredLsn,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
/// The read path is fully sequential: layers are visited
@@ -310,26 +308,6 @@ pub struct TimelineImportConfig {
pub import_job_checkpoint_threshold: NonZeroUsize,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(default)]
pub struct BasebackupCacheConfig {
#[serde(with = "humantime_serde")]
pub cleanup_period: Duration,
// FIXME: Support max_size_bytes.
// pub max_size_bytes: usize,
pub max_size_entries: i64,
}
impl Default for BasebackupCacheConfig {
fn default() -> Self {
Self {
cleanup_period: Duration::from_secs(60),
// max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
max_size_entries: 1000,
}
}
}
pub mod statvfs {
pub mod mock {
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -513,14 +491,6 @@ pub struct TenantConfigToml {
/// Tenant level performance sampling ratio override. Controls the ratio of get page requests
/// that will get perf sampling for the tenant.
pub sampling_ratio: Option<Ratio>,
/// Capacity of relsize snapshot cache (used by replicas).
pub relsize_snapshot_cache_capacity: usize,
/// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests.
// FIXME: Remove skip_serializing_if when the feature is stable.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub basebackup_cache_enabled: bool,
}
pub mod defaults {
@@ -694,7 +664,6 @@ impl Default for ConfigToml {
import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
},
basebackup_cache_config: None,
}
}
}
@@ -761,7 +730,6 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
}
impl Default for TenantConfigToml {
@@ -819,8 +787,6 @@ impl Default for TenantConfigToml {
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
sampling_ratio: None,
relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
basebackup_cache_enabled: false,
}
}
}

View File

@@ -630,10 +630,6 @@ pub struct TenantConfigPatch {
pub gc_compaction_ratio_percent: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub sampling_ratio: FieldPatch<Option<Ratio>>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub basebackup_cache_enabled: FieldPatch<bool>,
}
/// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -763,12 +759,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub sampling_ratio: Option<Option<Ratio>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub relsize_snapshot_cache_capacity: Option<usize>,
#[serde(skip_serializing_if = "Option::is_none")]
pub basebackup_cache_enabled: Option<bool>,
}
impl TenantConfig {
@@ -814,8 +804,6 @@ impl TenantConfig {
mut gc_compaction_initial_threshold_kb,
mut gc_compaction_ratio_percent,
mut sampling_ratio,
mut relsize_snapshot_cache_capacity,
mut basebackup_cache_enabled,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -917,12 +905,6 @@ impl TenantConfig {
.gc_compaction_ratio_percent
.apply(&mut gc_compaction_ratio_percent);
patch.sampling_ratio.apply(&mut sampling_ratio);
patch
.relsize_snapshot_cache_capacity
.apply(&mut relsize_snapshot_cache_capacity);
patch
.basebackup_cache_enabled
.apply(&mut basebackup_cache_enabled);
Ok(Self {
checkpoint_distance,
@@ -962,8 +944,6 @@ impl TenantConfig {
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
sampling_ratio,
relsize_snapshot_cache_capacity,
basebackup_cache_enabled,
})
}
@@ -1072,12 +1052,6 @@ impl TenantConfig {
.gc_compaction_ratio_percent
.unwrap_or(global_conf.gc_compaction_ratio_percent),
sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
relsize_snapshot_cache_capacity: self
.relsize_snapshot_cache_capacity
.unwrap_or(global_conf.relsize_snapshot_cache_capacity),
basebackup_cache_enabled: self
.basebackup_cache_enabled
.unwrap_or(global_conf.basebackup_cache_enabled),
}
}
}

View File

@@ -86,27 +86,6 @@ pub struct DbError {
}
impl DbError {
pub fn new_test_error(code: SqlState, message: String) -> Self {
DbError {
severity: "ERROR".to_string(),
parsed_severity: Some(Severity::Error),
code,
message,
detail: None,
hint: None,
position: None,
where_: None,
schema: None,
table: None,
column: None,
datatype: None,
constraint: None,
file: None,
line: None,
routine: None,
}
}
pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
let mut severity = None;
let mut parsed_severity = None;

View File

@@ -6,9 +6,11 @@ use pageserver_api::shard::ShardIdentity;
use postgres_ffi::TimestampTz;
use serde::{Deserialize, Serialize};
use tokio::time::Instant;
use utils::generation::Generation;
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;
use utils::shard::ShardIndex;
use crate::membership::Configuration;
use crate::{ServerInfo, Term};
@@ -309,3 +311,29 @@ pub struct PullTimelineResponse {
pub safekeeper_host: Option<String>,
// TODO: add more fields?
}
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(tag = "action")]
pub enum TenantShardPageserverAttachmentChange {
Attach(TenantShardPageserverAttachment),
Detach(TenantShardPageserverAttachment),
}
impl TenantShardPageserverAttachmentChange {
pub fn attachment(&self) -> &TenantShardPageserverAttachment {
match self {
TenantShardPageserverAttachmentChange::Attach(a) => a,
TenantShardPageserverAttachmentChange::Detach(a) => a,
}
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TenantShardPageserverAttachment {
pub shard_id: ShardIndex,
pub generation: Generation,
pub ps_id: NodeId,
// TODO: avoid transmitting this with every request.
// How nice things could be if there were simple DNS records for ps-$node_id.$cell.$region.$cloud.neon.tech
pub ps_hostname: String, // TODO: some type safety
}

View File

@@ -0,0 +1,81 @@
[package]
name = "sk_ps_discovery"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
workspace_hack.workspace = true
async-stream.workspace = true
anyhow.workspace = true
byteorder.workspace = true
bytes.workspace = true
camino.workspace = true
camino-tempfile.workspace = true
chrono.workspace = true
clap = { workspace = true, features = ["derive"] }
crc32c.workspace = true
fail.workspace = true
hex.workspace = true
humantime.workspace = true
http.workspace = true
hyper0.workspace = true
itertools.workspace = true
jsonwebtoken.workspace = true
futures.workspace = true
once_cell.workspace = true
parking_lot.workspace = true
pageserver_api.workspace = true
postgres-protocol.workspace = true
pprof.workspace = true
rand.workspace = true
regex.workspace = true
reqwest = { workspace = true, features = ["json"] }
rustls.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json.workspace = true
smallvec.workspace = true
strum.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tikv-jemallocator.workspace = true
tokio = { workspace = true, features = ["fs"] }
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-tar.workspace = true
tokio-util = { workspace = true }
tonic = { workspace = true }
tracing.workspace = true
url.workspace = true
metrics.workspace = true
pem.workspace = true
postgres_backend.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true
remote_storage.workspace = true
safekeeper_api.workspace = true
safekeeper_client.workspace = true
sha2.workspace = true
sd-notify.workspace = true
storage_broker.workspace = true
tokio-stream.workspace = true
http-utils.workspace = true
utils.workspace = true
wal_decoder.workspace = true
env_logger.workspace = true
[dev-dependencies]
criterion.workspace = true
itertools.workspace = true
walproposer.workspace = true
rand.workspace = true
desim.workspace = true
tracing.workspace = true
tracing-subscriber = { workspace = true, features = ["json"] }
[[bench]]
name = "bench"
harness = false

View File

@@ -0,0 +1,97 @@
//! WAL ingestion benchmarks.
use std::time::Instant;
use criterion::{Criterion, criterion_group, criterion_main};
use pprof::criterion::{Output, PProfProfiler};
use sk_ps_discovery::{
AttachmentUpdate, RemoteConsistentLsnAdv, TenantShardAttachmentId, TimelineAttachmentId,
};
use utils::{
generation::Generation,
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
shard::ShardIndex,
};
/// Use jemalloc and enable profiling, to mirror bin/safekeeper.rs.
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
#[allow(non_upper_case_globals)]
#[unsafe(export_name = "malloc_conf")]
pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0";
// Register benchmarks with Criterion.
criterion_group!(
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = bench_simple,
);
criterion_main!(benches);
fn bench_simple(c: &mut Criterion) {
let mut g = c.benchmark_group("simple");
// setup
let mut world = sk_ps_discovery::World::default();
// Simplified view: lots of unsharded tenants with one timeline each
let n_pageservers = 20;
let n_tenant_shards_per_pageserver = 2000;
for ps_id in 1..=n_pageservers {
for _ in ..n_tenant_shards_per_pageserver {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
for generation in 10..=11 {
let tenant_shard_attachment_id = TenantShardAttachmentId {
tenant_id,
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(generation),
};
let timeline_attachment = TimelineAttachmentId {
tenant_timeline_id: TenantTimelineId {
tenant_id,
timeline_id,
},
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(generation),
};
world.update_attachment(AttachmentUpdate {
tenant_shard_attachment_id,
action: sk_ps_discovery::AttachmentUpdateAction::Attach {
ps_id: NodeId(ps_id),
},
});
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
remote_consistent_lsn: Lsn(23),
attachment: timeline_attachment,
});
}
world.handle_commit_lsn_advancement(
TenantTimelineId {
tenant_id,
timeline_id,
},
Lsn(42),
);
}
}
// setup done
let world = world;
g.bench_function("get_commit_lsn_advertisements", |bencher| {
bencher.iter_custom(|iters| {
let started = Instant::now();
for _ in 0..iters {
criterion::black_box(world.get_commit_lsn_advertisements());
}
let elapsed = started.elapsed();
elapsed
});
});
g.finish();
}

View File

@@ -0,0 +1,515 @@
#[cfg(test)]
mod tests;
use std::{
collections::{BTreeMap, BTreeSet, HashMap, HashSet, btree_map, hash_map},
ops::RangeInclusive,
};
use tracing::{info, warn};
use utils::{
generation::Generation,
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
merge_join,
shard::ShardIndex,
};
#[derive(Debug, Default)]
pub struct World {
attachments: BTreeMap<TenantShardAttachmentId, NodeId>,
attachment_count: HashMap<TenantId, u16>,
nodes_timelines: HashMap<NodeId, HashMap<TenantTimelineId, u16>>, // u16 is a refcount from each timeline attachment id
// continously maintained aggregate for efficient decisionmaking on quiescing;
// quiesced timelines are always caught up
// can quiesce one == attachment_count (TODO: this requires enforcing foreign key relationship between attachments and remote_consistent_lsn)
caught_up_count: HashMap<TenantTimelineId, u16>,
// BEGIN quiescing/active split
quiesced_timelines: BTreeMap<TenantTimelineId, Lsn>,
// ^
// either a timeline is in quiesced_timelines
// or it is below
// v
commit_lsns: BTreeMap<TenantTimelineId, Lsn>,
remote_consistent_lsns: BTreeMap<TimelineAttachmentId, Lsn>,
// END quiescing/active split
// other fields
}
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
pub struct TenantShardAttachmentId {
pub tenant_id: TenantId,
pub shard_id: ShardIndex,
pub generation: Generation,
}
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, PartialOrd, Ord)]
pub struct TimelineAttachmentId {
pub tenant_timeline_id: TenantTimelineId,
pub shard_id: ShardIndex,
pub generation: Generation,
}
pub struct AttachmentUpdate {
pub tenant_shard_attachment_id: TenantShardAttachmentId,
pub action: AttachmentUpdateAction,
}
pub enum AttachmentUpdateAction {
Attach { ps_id: NodeId },
Detach,
}
pub struct RemoteConsistentLsnAdv {
pub attachment: TimelineAttachmentId,
pub remote_consistent_lsn: Lsn,
}
impl World {
fn check_invariants(&self) {
if !cfg!(debug_assertions) {
return;
}
// caught_up_count maintenance
{
for (tenant_timeline_id, caught_up_count) in
self.caught_up_count.iter().map(|(k, v)| (*k, *v))
{
let attachment_count = *self
.attachment_count
.get(&tenant_timeline_id.tenant_id)
.unwrap();
assert!(caught_up_count <= attachment_count);
if caught_up_count == attachment_count {
self.quiesced_timelines.contains_key(&tenant_timeline_id);
// remote_consistent_lsn and commit_lsns is empty, checked by "quiescing XOR ..." below
} else {
let commit_lsn = self.commit_lsns[&&tenant_timeline_id];
let mut validate_caught_up = 0;
let mut validate_not_caught_up = 0;
for (_, r_c_lsn) in self
.remote_consistent_lsns
.range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
.map(|(k, v)| (*k, *v))
{
if r_c_lsn == commit_lsn {
validate_caught_up += 1;
} else {
assert!(r_c_lsn < commit_lsn);
validate_not_caught_up += 1;
}
}
assert_eq!(validate_caught_up, caught_up_count);
assert_eq!(
validate_caught_up + validate_not_caught_up,
attachment_count
);
}
}
}
// quiescing XOR ...
{
let quiesced_timelines: HashSet<TenantTimelineId> =
self.quiesced_timelines.keys().cloned().collect();
let commit_lsn_timelines: HashSet<TenantTimelineId> =
self.commit_lsns.keys().cloned().collect();
let remote_consistent_lsn_timelines: HashSet<TenantTimelineId> = self
.remote_consistent_lsns
.keys()
.map(|tlaid: &TimelineAttachmentId| tlaid.tenant_timeline_id)
.collect();
#[rustfmt::skip]
assert_eq!(0, quiesced_timelines.intersection(&commit_lsn_timelines).count());
#[rustfmt::skip]
assert_eq!(0, quiesced_timelines.intersection(&remote_consistent_lsn_timelines).count());
}
// nodes_timelines maintenance
{
let mut expect: HashMap<NodeId, HashMap<TenantTimelineId, u16>> = HashMap::new();
let all_ttids: BTreeSet<TenantTimelineId> = self
.quiesced_timelines
.keys()
.cloned()
.chain(
self.remote_consistent_lsns
.keys()
.cloned()
.map(|tlaid| tlaid.tenant_timeline_id),
)
.collect();
for ttid in all_ttids {
for (_, node_id) in self
.attachments
.range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
.map(|(k, v)| (*k, *v))
{
let expect = expect.entry(node_id).or_default();
let refcount = expect.entry(ttid).or_default();
*refcount += 1;
}
}
assert_eq!(expect, self.nodes_timelines);
}
}
pub fn update_attachment(&mut self, upd: AttachmentUpdate) {
self.check_invariants();
use AttachmentUpdateAction::*;
use btree_map::Entry::*;
let AttachmentUpdate {
tenant_shard_attachment_id,
action,
} = upd;
match (action, self.attachments.entry(tenant_shard_attachment_id)) {
(Attach { ps_id }, Occupied(e)) if *e.get() == ps_id => {
info!("attachment is already known")
}
(Attach { ps_id }, Occupied(e)) => {
warn!(current_node=%e.get(), proposed_node=%ps_id, "ignoring update that moves attachment to a different pageserver");
}
(Attach { ps_id }, Vacant(e)) => {
e.insert(ps_id);
// Keep attachmount_count up to date
let attachment_count = self
.attachment_count
.entry(tenant_shard_attachment_id.tenant_id)
.or_default();
*attachment_count += attachment_count.checked_add(1).unwrap();
// Keep nodes_timelines up to date
let nodes_timelines = self.nodes_timelines.entry(ps_id).or_default();
for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
tenant_shard_attachment_id.tenant_id,
)) {
let refcount = nodes_timelines.entry(*ttid).or_default();
*refcount = refcount.checked_add(1).unwrap();
}
if nodes_timelines.is_empty() {
self.nodes_timelines.remove(&ps_id);
}
// New shards may start at an older LSN than where we quiesced => activate all quiesced timelines.
let activate_range =
TenantTimelineId::tenant_range(tenant_shard_attachment_id.tenant_id);
let activate: HashSet<TenantTimelineId> = self
.quiesced_timelines
.range(activate_range)
.map(|(ttid, _quiesced_lsn)| *ttid)
.collect();
for tenant_timeline_id in activate {
self.activate_timeline(tenant_timeline_id);
}
}
(Detach, Occupied(e)) => {
let ps_id = e.remove();
// Keep attachment count up to date
let attachment_count = self
.attachment_count
.get_mut(&tenant_shard_attachment_id.tenant_id)
.expect("attachment action initializes the hasmap entry");
*attachment_count = attachment_count.checked_sub(1).unwrap();
// Keep nodes_timelines up to date
let nodes_timelines = self
.nodes_timelines
.get_mut(&ps_id)
.expect("attachment action initializes hashmap entry");
for (ttid, _) in self.commit_lsns.range(TenantTimelineId::tenant_range(
tenant_shard_attachment_id.tenant_id,
)) {
let refcount = nodes_timelines.entry(*ttid).or_default();
*refcount = refcount.checked_sub(1).unwrap();
}
}
(Detach, Vacant(_)) => {
info!("detachment is already known");
}
}
self.check_invariants();
}
pub fn handle_remote_consistent_lsn_advertisement(&mut self, adv: RemoteConsistentLsnAdv) {
self.check_invariants();
let RemoteConsistentLsnAdv {
attachment,
remote_consistent_lsn,
} = adv;
match self.remote_consistent_lsns.entry(attachment) {
btree_map::Entry::Occupied(mut occupied_entry) => {
let current = occupied_entry.get_mut();
use std::cmp::Ordering::*;
match (*current).cmp(&remote_consistent_lsn) {
Less => {
*current = remote_consistent_lsn;
let caught_up_count = self
.caught_up_count
.get_mut(&attachment.tenant_timeline_id)
.unwrap();
*caught_up_count = caught_up_count.checked_add(1).unwrap();
if *caught_up_count
== self.attachment_count[&attachment.tenant_timeline_id.tenant_id]
{
self.quiesce_timeline(attachment.tenant_timeline_id);
}
}
Equal => {
info!("ignoring no-op update, likely duplicate delivery");
}
Greater => {
warn!(
"ignoring advertisement because remote_consistent_lsn is moving backwards"
);
}
}
}
btree_map::Entry::Vacant(_) => {
let ttid = attachment.tenant_timeline_id;
match self.quiesced_timelines.get(&ttid).cloned() {
Some(quiesced_lsn) if quiesced_lsn == remote_consistent_lsn => {
info!("ignoring no-op update for quiesced timeline");
}
Some(_) => {
self.activate_timeline(ttid);
// recurse one level, guarnateed to hit `Occupied` case above
self.handle_remote_consistent_lsn_advertisement(adv);
}
None => {
info!("ignoring advertisement because timeline is not known");
}
}
}
}
self.check_invariants();
}
pub fn handle_commit_lsn_advancement(&mut self, ttid: TenantTimelineId, update: Lsn) {
self.check_invariants();
match self.commit_lsns.entry(ttid) {
btree_map::Entry::Occupied(mut entry) => {
let current = entry.get_mut();
use std::cmp::Ordering::*;
match (*current).cmp(&update) {
Less => {
*current = update;
// We never allow remote_consistent_lsn to be ahead of commit_lsn.
// Therefore, it is safe to say nothing is caught up anymore.
let caught_up_count = self.caught_up_count.get_mut(&ttid).unwrap();
*caught_up_count = 0;
}
Equal => {
// This code runs in safekeeper impl, no reason why there would be duplicate delivery.
warn!("ignoring no-op update; why is this happening?");
}
Greater => {
panic!(
"proposed commit_lsn would move it backwards: current={} update={}",
current, update
);
}
}
}
btree_map::Entry::Vacant(entry) => {
match self.quiesced_timelines.get(&ttid).cloned() {
Some(quiesced_lsn) if quiesced_lsn == update => {
info!("ignoring no-op update for quiesced timeline");
}
Some(_) => {
self.activate_timeline(ttid);
// recurse one level, guarnateed to hit `Occupied` case above
self.handle_commit_lsn_advancement(ttid, update);
}
None => {
info!("first time hearing about this timeline, initializing");
entry.insert(update);
let replaced = self.caught_up_count.insert(ttid, 0);
// only commit_lsn advancement makes timelines known to world
assert_eq!(None, replaced);
for (attachment, node_id) in self
.attachments
.range(TenantShardAttachmentId::tenant_range(ttid.tenant_id))
{
let replaced = self.remote_consistent_lsns.insert(
attachment.timeline_attachment_id(ttid.timeline_id),
Lsn(0),
);
// only commit_lsn advancement makes timelines known to World
assert_eq!(None, replaced);
let nodes_timelines = self.nodes_timelines.entry(*node_id).or_default();
let refcount = nodes_timelines.entry(ttid).or_default();
*refcount = refcount.checked_add(1).unwrap();
}
}
}
}
}
self.check_invariants();
}
pub fn get_commit_lsn_advertisements(&self) -> HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> {
let mut commit_lsn_advertisements_by_node: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>> =
HashMap::with_capacity(self.nodes_timelines.len());
let commit_lsns_iter = self.commit_lsns.iter().map(|(k, v)| (*k, *v));
let attachments_iter = self.attachments.iter().map(|(k, v)| (*k, *v));
let remote_consistent_lsns_iter = self.remote_consistent_lsns.iter().map(|(k, v)| (*k, *v));
let join = merge_join::inner_equi_join_with_merge_strategy(
commit_lsns_iter,
attachments_iter,
|(tenant_timeline_id, _)| tenant_timeline_id.tenant_id,
|(shard_attachment_id, _)| shard_attachment_id.tenant_id,
);
let join = merge_join::left_equi_join_with_merge_strategy(
join,
remote_consistent_lsns_iter,
|((ttid, _), _)| ttid.tenant_id,
|(tlaid, _)| tlaid.tenant_timeline_id.tenant_id,
);
for ((c, a), r) in join {
let (tenant_timeline_id, commit_lsn): (TenantTimelineId, Lsn) = c;
let (_, node_id): (TenantShardAttachmentId, NodeId) = a;
match r {
// TODO: can > ever happen?
Some((_, remote_consistent_lsn)) if remote_consistent_lsn >= commit_lsn => {
// this timeline shard attachment is already caught up
continue;
}
Some(_) | None => {
// need to advertise
// -> fallthrough
}
};
// DISTINCT node_id, array_agg(DISTINCT tenant_shard_id )
let for_node = commit_lsn_advertisements_by_node
.entry(node_id)
.or_insert_with(|| HashMap::with_capacity(self.nodes_timelines[&node_id].len()));
match for_node.entry(tenant_timeline_id) {
hash_map::Entry::Vacant(vacant_entry) => {
vacant_entry.insert(commit_lsn);
}
hash_map::Entry::Occupied(occupied_entry) => {
assert_eq!(*occupied_entry.get(), commit_lsn);
}
}
}
commit_lsn_advertisements_by_node
}
fn activate_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
let quiesced_lsn = self
.quiesced_timelines
.remove(&tenant_timeline_id)
.expect("must call this function only on quiesced tenant_timeline_id");
let replaced = self.commit_lsns.insert(tenant_timeline_id, quiesced_lsn);
assert_eq!(None, replaced);
let reconstruct_remote_consistent_lsn_entries = self
.attachments
.range(TenantShardAttachmentId::tenant_range(
tenant_timeline_id.tenant_id,
))
.map(|(k, _)| *k)
.map(|tenant_shard_attachment_id| {
(
tenant_shard_attachment_id
.timeline_attachment_id(tenant_timeline_id.timeline_id),
quiesced_lsn,
)
});
for (key, value) in reconstruct_remote_consistent_lsn_entries {
let replaced = self.remote_consistent_lsns.insert(key, value);
assert_eq!(None, replaced);
}
}
fn quiesce_timeline(&mut self, tenant_timeline_id: TenantTimelineId) {
self.check_invariants();
if self.quiesced_timelines.contains_key(&tenant_timeline_id) {
panic!("only call this function on active timelines");
}
let quiesced_lsn = self
.commit_lsns
.remove(&tenant_timeline_id)
.expect("inconsistent: we checked it's not in quiesced_timelines, so, must be active");
let caught_up_count = self
.caught_up_count
.remove(&tenant_timeline_id)
.expect("inconsistent: we checked it's not in quiesced_timleines, so, must be active");
let mut remove_remote_consistent_lsns = Vec::new();
for (k, remote_consistent_lsn) in self
.remote_consistent_lsns
.range(TimelineAttachmentId::timeline_range(tenant_timeline_id))
{
assert_eq!(*remote_consistent_lsn, quiesced_lsn);
remove_remote_consistent_lsns.push(*k);
}
assert_eq!(
caught_up_count,
u16::try_from(remove_remote_consistent_lsns.len()).unwrap()
);
for k in remove_remote_consistent_lsns {
let removed = self.remote_consistent_lsns.remove(&k);
assert!(removed.is_some(), "we just added");
}
let replaced = self
.quiesced_timelines
.insert(tenant_timeline_id, quiesced_lsn);
assert_eq!(None, replaced); // we checked at function entry
self.check_invariants();
}
}
impl TimelineAttachmentId {
pub fn timeline_range(ttid: TenantTimelineId) -> RangeInclusive<Self> {
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
let generation_range: RangeInclusive<_> = Generation::RANGE;
RangeInclusive::new(
TimelineAttachmentId {
tenant_timeline_id: ttid,
shard_id: *shard_index_range.start(),
generation: *generation_range.start(),
},
TimelineAttachmentId {
tenant_timeline_id: ttid,
shard_id: *shard_index_range.end(),
generation: *generation_range.end(),
},
)
}
pub fn tenant_shard_attachment_id(self) -> TenantShardAttachmentId {
TenantShardAttachmentId {
tenant_id: self.tenant_timeline_id.tenant_id,
shard_id: self.shard_id,
generation: self.generation,
}
}
}
impl TenantShardAttachmentId {
pub fn timeline_attachment_id(self, timeline_id: TimelineId) -> TimelineAttachmentId {
TimelineAttachmentId {
tenant_timeline_id: TenantTimelineId {
tenant_id: self.tenant_id,
timeline_id,
},
shard_id: self.shard_id,
generation: self.generation,
}
}
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
let generation_range: RangeInclusive<_> = Generation::RANGE;
RangeInclusive::new(
Self {
tenant_id,
shard_id: *shard_index_range.start(),
generation: *generation_range.start(),
},
Self {
tenant_id,
shard_id: *shard_index_range.end(),
generation: *generation_range.end(),
},
)
}
}

View File

@@ -0,0 +1,224 @@
use utils::{id::TenantId, logging};
use super::*;
use crate::World;
#[track_caller]
fn validate_advertisements(
actual: HashMap<NodeId, HashMap<TenantTimelineId, Lsn>>,
expect: Vec<(NodeId, Vec<(TenantTimelineId, Lsn)>)>,
) {
let expect: HashMap<_, _> = expect
.into_iter()
.map(|(node_id, innermap)| (node_id, innermap.into_iter().collect()))
.collect();
assert_eq!(actual, expect);
}
#[test]
fn basic() {
let mut world = World::default();
let tenant_id = TenantId::from_array([0xff; 16]);
let timeline_id = TimelineId::from_array([1; 16]);
let timeline2 = TimelineId::from_array([2; 16]);
let attachment1 = TenantShardAttachmentId {
tenant_id,
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(2),
};
let attachment2 = TenantShardAttachmentId {
tenant_id,
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(3),
};
let ps1 = NodeId(0x100);
// Out of order; in happy path, commit_lsn advances first, but let's test the
// case where safekeeper doesn't know about the attachments yet first, before
// we extend the case to the happy path.
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
attachment: attachment1.timeline_attachment_id(timeline_id),
remote_consistent_lsn: Lsn(0x23),
});
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
attachment: attachment2.timeline_attachment_id(timeline_id),
remote_consistent_lsn: Lsn(0x42),
});
// SK authoritative info on which advertisements ought exist is still empty
assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());
world.update_attachment(AttachmentUpdate {
tenant_shard_attachment_id: attachment1,
action: AttachmentUpdateAction::Attach { ps_id: ps1 },
});
// We have not inserted any commit_lsn info yet, so, still no advs expected
assert_eq!(world.get_commit_lsn_advertisements(), HashMap::default());
// insert commit_lsn info for different timeline
world.handle_commit_lsn_advancement(
TenantTimelineId {
tenant_id,
timeline_id: timeline2,
},
Lsn(0x66),
);
// Advs should still be empty
validate_advertisements(
world.get_commit_lsn_advertisements(),
vec![(
ps1,
vec![(
TenantTimelineId {
tenant_id,
timeline_id: timeline2,
},
Lsn(0x66),
)],
)],
);
// Ok, out of order part tested. Now Safekeeper learns about the attachments.
// insert commit_lsn info for the timeline we have remote_consistent_lsn info for
world.handle_commit_lsn_advancement(
TenantTimelineId {
tenant_id,
timeline_id,
},
Lsn(0x55),
);
dbg!(&world);
// Now advertisements to attachment1 will be sent out, but attachment2 is still not known, so, no advertisements to it.
validate_advertisements(
world.get_commit_lsn_advertisements(),
vec![(
ps1,
vec![(
TenantTimelineId {
tenant_id,
timeline_id,
},
Lsn(0x55),
)],
)],
);
}
#[test]
fn advertisement_for_new_timeline() {
let mut world = World::default();
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let ttid = TenantTimelineId {
tenant_id,
timeline_id,
};
let tenant_shard_attachment_id = TenantShardAttachmentId {
tenant_id,
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(2),
};
let ps_id = NodeId(0x100);
world.update_attachment(AttachmentUpdate {
tenant_shard_attachment_id,
action: AttachmentUpdateAction::Attach { ps_id },
});
world.handle_commit_lsn_advancement(ttid, Lsn(23));
let advs = world.get_commit_lsn_advertisements();
validate_advertisements(advs, vec![(ps_id, vec![(ttid, Lsn(23))])]);
}
#[test]
fn quiescing_timeline_catchup() {
let _guard = logging::init(
logging::LogFormat::Test,
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
logging::Output::Stdout,
)
.unwrap();
let mut world = World::default();
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let ttid = TenantTimelineId {
tenant_id,
timeline_id,
};
let tenant_shard_attachment_id = TenantShardAttachmentId {
tenant_id,
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(2),
};
let ps_id = NodeId(0x100);
world.update_attachment(AttachmentUpdate {
tenant_shard_attachment_id,
action: AttachmentUpdateAction::Attach { ps_id },
});
world.handle_commit_lsn_advancement(ttid, Lsn(23));
assert!(world.quiesced_timelines.is_empty());
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
attachment: tenant_shard_attachment_id.timeline_attachment_id(timeline_id),
remote_consistent_lsn: Lsn(23),
});
assert!(world.quiesced_timelines.contains_key(&ttid));
}
#[test]
fn nodes_timelines() {
let mut world = World::default();
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::from_array([0x1; 16]);
let ttid = TenantTimelineId {
tenant_id,
timeline_id,
};
let tenant_shard_attachment_id = TenantShardAttachmentId {
tenant_id,
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(2),
};
let ps_id = NodeId(0x100);
world.update_attachment(AttachmentUpdate {
tenant_shard_attachment_id,
action: AttachmentUpdateAction::Attach { ps_id },
});
assert!(world.nodes_timelines.get(&ps_id).is_none());
world.handle_commit_lsn_advancement(ttid, Lsn(0x23));
assert_eq!(world.nodes_timelines[&ps_id].len(), 1);
let timeline2 = TimelineId::from_array([0x2; 16]);
world.handle_remote_consistent_lsn_advertisement(RemoteConsistentLsnAdv {
attachment: TimelineAttachmentId {
tenant_timeline_id: TenantTimelineId {
tenant_id,
timeline_id: timeline2,
},
shard_id: ShardIndex::unsharded(),
generation: Generation::Valid(2),
},
remote_consistent_lsn: Lsn(0x42),
});
}
// TODO: need more tests, esp for the removal path

View File

@@ -1,4 +1,4 @@
use std::fmt::Debug;
use std::{fmt::Debug, ops::RangeInclusive};
use serde::{Deserialize, Serialize};
@@ -25,7 +25,9 @@ pub enum Generation {
/// scenarios where pageservers might otherwise issue conflicting writes to
/// remote storage
impl Generation {
pub const MIN: Self = Self::None;
pub const MAX: Self = Self::Valid(u32::MAX);
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
/// Create a new Generation that represents a legacy key format with
/// no generation suffix

View File

@@ -1,5 +1,6 @@
use std::fmt;
use std::num::ParseIntError;
use std::ops::RangeInclusive;
use std::str::FromStr;
use anyhow::Context;
@@ -320,6 +321,19 @@ impl TenantTimelineId {
pub fn empty() -> Self {
Self::new(TenantId::from([0u8; 16]), TimelineId::from([0u8; 16]))
}
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
RangeInclusive::new(
Self {
tenant_id,
timeline_id: TimelineId::from_array([u8::MIN; 16]),
},
Self {
tenant_id,
timeline_id: TimelineId::from_array([u8::MAX; 16]),
},
)
}
}
impl fmt::Display for TenantTimelineId {

View File

@@ -95,6 +95,9 @@ pub mod guard_arc_swap;
pub mod elapsed_accum;
pub mod merge_join;
#[cfg(target_os = "linux")]
pub mod linux_socket_ioctl;

View File

@@ -0,0 +1,164 @@
pub fn inner_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
l: L,
r: R,
key_l: FL,
key_r: FR,
) -> impl Iterator<Item = (LI, RI)>
where
L: Iterator<Item = LI>, // + Sorted
R: Iterator<Item = RI>, // + Sorted
FL: 'static + Fn(&LI) -> K,
FR: 'static + Fn(&RI) -> K,
LI: Copy,
RI: Copy,
K: PartialEq + Eq + Ord,
{
let mut l = l.map(move |i| (i, key_l(&i))).peekable();
let mut r = r.map(move |i| (i, key_r(&i))).peekable();
std::iter::from_fn(move || {
loop {
match (l.peek(), r.peek()) {
(Some((_, lk)), Some((_, rk))) if lk < rk => {
drop(l.next());
continue;
}
(Some((_, lk)), Some((_, rk))) if lk > rk => {
drop(r.next());
continue;
}
(Some((lv, lk)), Some((_, rk))) => {
assert!(lk == rk);
let (rv, _) = r.next().unwrap();
return Some((lv.clone(), rv));
}
(None, None) | (None, Some(_)) | (Some(_), None) => return None,
}
}
})
}
pub fn left_equi_join_with_merge_strategy<L, LI, R, RI, K, FL, FR>(
l: L,
r: R,
key_l: FL,
key_r: FR,
) -> impl Iterator<Item = (LI, Option<RI>)>
where
L: Iterator<Item = LI>, // + Sorted
R: Iterator<Item = RI>, // + Sorted
FL: 'static + Fn(&LI) -> K,
FR: 'static + Fn(&RI) -> K,
LI: Copy,
RI: Copy,
K: PartialEq + Eq + Ord,
{
let mut l = l.map(move |i| (i, key_l(&i))).peekable();
let mut r = r.map(move |i| (i, key_r(&i))).peekable();
let mut l_had_match = false;
std::iter::from_fn(move || {
loop {
match (l.peek(), r.peek()) {
(Some((_, lk)), Some((_, rk))) if lk < rk => {
let (lv, _) = l.next().unwrap();
if l_had_match {
l_had_match = false;
continue;
} else {
return Some((lv, None));
}
}
(Some((_, _)), None) => {
let (lv, _) = l.next().unwrap();
if l_had_match {
l_had_match = false;
continue;
} else {
return Some((lv, None));
}
}
(Some((_, lk)), Some((_, rk))) if lk > rk => {
drop(r.next());
continue;
}
(Some((lv, lk)), Some((_, rk))) => {
l_had_match = true;
assert!(lk == rk);
let (rv, _) = r.next().unwrap();
return Some((lv.clone(), Some(rv)));
}
(None, None) | (None, Some(_)) => return None,
}
}
})
}
#[cfg(test)]
mod tests {
#[test]
fn inner_equi_basic() {
let l = vec![b"a", b"c"];
let r = vec![b"aa", b"ad", b"ba", b"bb", b"ca", b"cb", b"cd", b"dd"];
let res: Vec<_> = super::inner_equi_join_with_merge_strategy(
l.into_iter(),
r.into_iter(),
|l| &l[0..1],
|r| &r[0..1],
)
.collect();
assert_eq!(
res,
vec![
(b"a", b"aa"),
(b"a", b"ad"),
(b"c", b"ca"),
(b"c", b"cb"),
(b"c", b"cd"),
]
);
}
#[test]
fn left_equi_basic() {
/*
create table aleft (id text, aleft text);
create table aright (id text, aright text);
insert into aleft values ('a', 'a'), ('b', 'b');
insert into aright values ('a', 'aa'), ('a', 'ab'), ('c', 'cd');
select * from aleft left join aright using ("id");
*/
let l = vec![b"a", b"b"];
let r = vec![b"aa", b"ab", b"cd"];
let res: Vec<_> = super::left_equi_join_with_merge_strategy(
l.into_iter(),
r.into_iter(),
|l| &l[0..1],
|r| &r[0..1],
)
.collect();
assert_eq!(
res,
vec![(b"a", Some(b"aa")), (b"a", Some(b"ab")), (b"b", None)]
);
}
#[test]
fn left_equi_basic_2() {
let l = vec![b"b"];
let r = vec![b"aa", b"ab", b"bb"];
let res: Vec<_> = super::left_equi_join_with_merge_strategy(
l.into_iter(),
r.into_iter(),
|l| &l[0..1],
|r| &r[0..1],
)
.collect();
assert_eq!(res, vec![(b"b", Some(b"bb"))])
}
}

View File

@@ -52,6 +52,7 @@ pub struct TenantShardId {
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
pub const MIN: Self = Self(0);
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
@@ -85,7 +86,9 @@ impl ShardCount {
}
impl ShardNumber {
pub const MIN: Self = Self(0);
pub const MAX: Self = Self(u8::MAX);
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
}
impl TenantShardId {
@@ -100,16 +103,17 @@ impl TenantShardId {
/// The range of all TenantShardId that belong to a particular TenantId. This is useful when
/// you have a BTreeMap of TenantShardId, and are querying by TenantId.
pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
let shard_index_range: RangeInclusive<_> = ShardIndex::RANGE;
RangeInclusive::new(
Self {
tenant_id,
shard_number: ShardNumber(0),
shard_count: ShardCount(0),
shard_number: shard_index_range.start().shard_number,
shard_count: shard_index_range.start().shard_count,
},
Self {
tenant_id,
shard_number: ShardNumber::MAX,
shard_count: ShardCount::MAX,
shard_number: shard_index_range.end().shard_number,
shard_count: shard_index_range.end().shard_count,
},
)
}
@@ -241,6 +245,16 @@ impl From<[u8; 18]> for TenantShardId {
}
impl ShardIndex {
pub const MIN: Self = ShardIndex {
shard_number: ShardNumber::MIN,
shard_count: ShardCount::MIN,
};
pub const MAX: Self = ShardIndex {
shard_number: ShardNumber::MAX,
shard_count: ShardCount::MAX,
};
pub const RANGE: RangeInclusive<Self> = RangeInclusive::new(Self::MIN, Self::MAX);
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
shard_number: number,

View File

@@ -4,3 +4,5 @@ pub mod duplex;
pub mod gate;
pub mod spsc_fold;
pub mod spsc_watch;

View File

@@ -56,7 +56,7 @@ impl<T: Send> Sender<T> {
/// # Panics
///
/// If `try_fold` panics, any subsequent call to `send` panic.
pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), SendError>
pub async fn send<F>(&mut self, value: T, try_fold: F) -> Result<(), (T, SendError)>
where
F: Fn(&mut T, T) -> Result<(), T>,
{
@@ -104,7 +104,9 @@ impl<T: Send> Sender<T> {
}
Poll::Pending
}
State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)),
State::ReceiverGone => {
Poll::Ready(Err((value.take().unwrap(), SendError::ReceiverGone)))
}
State::SenderGone(_)
| State::AllGone
| State::SenderDropping

View File

@@ -0,0 +1,55 @@
//! watch is probably not the right word, because we do take out
use tokio_util::sync::CancellationToken;
use crate::sync::spsc_fold;
pub fn channel<T: Send>() -> (Sender<T>, Receiver<T>) {
let (tx, rx) = spsc_fold::channel();
let cancel = CancellationToken::new();
(
Sender {
tx,
_cancel: cancel.clone().drop_guard(),
},
Receiver { rx, cancel },
)
}
pub struct Sender<T> {
tx: spsc_fold::Sender<T>,
_cancel: tokio_util::sync::DropGuard,
}
pub struct Receiver<T> {
rx: spsc_fold::Receiver<T>,
cancel: CancellationToken,
}
impl<T: Send> Sender<T> {
pub fn send_replace(&mut self, value: T) -> Result<(), (T, spsc_fold::SendError)> {
poll_ready(self.tx.send(value, |old, new| {
*old = new;
Ok(())
}))
}
}
impl<T: Send> Receiver<T> {
pub async fn recv(&mut self) -> Result<T, spsc_fold::RecvError> {
self.rx.recv().await
}
pub async fn cancelled(&mut self) {
self.cancel.cancelled().await
}
}
fn poll_ready<F: Future<Output = O>, O>(f: F) -> O {
futures::executor::block_on(async move {
let f = std::pin::pin!(f);
match futures::poll!(f) {
std::task::Poll::Ready(r) => r,
std::task::Poll::Pending => unreachable!("expecting future to always return Ready"),
}
})
}

View File

@@ -30,7 +30,6 @@ crc32c.workspace = true
either.workspace = true
fail.workspace = true
futures.workspace = true
hashlink.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true

View File

@@ -1,13 +1,7 @@
use std::env;
use std::path::PathBuf;
/// Generates Rust code from .proto Protobuf schemas, along with a binary file
/// descriptor set for Protobuf schema reflection.
fn main() -> Result<(), Box<dyn std::error::Error>> {
let out_dir = PathBuf::from(env::var("OUT_DIR")?);
// Generates Rust code from .proto Protobuf schemas.
tonic_build::configure()
.bytes(["."])
.file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
.compile_protos(&["proto/page_service.proto"], &["proto"])
.map_err(|err| err.into())
}

View File

@@ -11,19 +11,6 @@
// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
//
// The service can be accessed via e.g. grpcurl:
//
// ```
// grpcurl \
// -plaintext \
// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
// -H "neon-shard-id: 0b10" \
// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
// -H "authorization: Bearer $JWT" \
// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
// localhost:51051 page_api.PageService/CheckRelExists
// ```
//
// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
// However, this will require reconnecting when changing modes.
//
@@ -33,7 +20,7 @@
// - Compression
syntax = "proto3";
package page_api;
package page_service;
service PageService {
// Returns whether a relation exists.

View File

@@ -7,12 +7,7 @@
// Code generated by protobuf.
pub mod proto {
tonic::include_proto!("page_api");
/// File descriptor set for Protobuf schema reflection. This allows using
/// e.g. grpcurl with the API.
pub const FILE_DESCRIPTOR_SET: &[u8] =
tonic::include_file_descriptor_set!("page_api_descriptor");
tonic::include_proto!("page_service");
pub use page_service_client::PageServiceClient;
pub use page_service_server::{PageService, PageServiceServer};

View File

@@ -144,7 +144,7 @@ where
replica,
ctx,
io_concurrency: IoConcurrency::spawn_from_conf(
timeline.conf.get_vectored_concurrent_io,
timeline.conf,
timeline
.gate
.enter()
@@ -343,7 +343,7 @@ where
// Gather non-relational files from object storage pages.
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::at(self.lsn), self.ctx)
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await?
.partition(
self.timeline.get_shard_identity(),
@@ -378,7 +378,7 @@ where
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
@@ -517,7 +517,7 @@ where
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
let nblocks = self
.timeline
.get_rel_size(src, Version::at(self.lsn), self.ctx)
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await?;
// If the relation is empty, create an empty file
@@ -577,7 +577,7 @@ where
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx)
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
if img.len()
@@ -631,7 +631,7 @@ where
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?
.is_empty()
{

View File

@@ -1,518 +0,0 @@
use std::{collections::HashMap, sync::Arc};
use async_compression::tokio::write::GzipEncoder;
use camino::{Utf8Path, Utf8PathBuf};
use metrics::core::{AtomicU64, GenericCounter};
use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
use tokio::{
io::{AsyncWriteExt, BufWriter},
sync::mpsc::{UnboundedReceiver, UnboundedSender},
};
use tokio_util::sync::CancellationToken;
use utils::{
id::{TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
shard::TenantShardId,
};
use crate::{
basebackup::send_basebackup_tarball,
context::{DownloadBehavior, RequestContext},
metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
task_mgr::TaskKind,
tenant::{
Timeline,
mgr::{TenantManager, TenantSlot},
},
};
pub struct BasebackupPrepareRequest {
pub tenant_shard_id: TenantShardId,
pub timeline_id: TimelineId,
pub lsn: Lsn,
}
pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
/// BasebackupCache stores cached basebackup archives for timelines on local disk.
///
/// The main purpose of this cache is to speed up the startup process of compute nodes
/// after scaling to zero.
/// Thus, the basebackup is stored only for the latest LSN of the timeline and with
/// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none).
///
/// The cache receives prepare requests through the `BasebackupPrepareSender` channel,
/// generates a basebackup from the timeline in the background, and stores it on disk.
///
/// Basebackup requests are pretty rare. We expect ~thousands of entries in the cache
/// and ~1 RPS for get requests.
pub struct BasebackupCache {
data_dir: Utf8PathBuf,
config: BasebackupCacheConfig,
tenant_manager: Arc<TenantManager>,
remove_entry_sender: BasebackupRemoveEntrySender,
entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
cancel: CancellationToken,
read_hit_count: GenericCounter<AtomicU64>,
read_miss_count: GenericCounter<AtomicU64>,
read_err_count: GenericCounter<AtomicU64>,
prepare_ok_count: GenericCounter<AtomicU64>,
prepare_skip_count: GenericCounter<AtomicU64>,
prepare_err_count: GenericCounter<AtomicU64>,
}
impl BasebackupCache {
/// Creates a BasebackupCache and spawns the background task.
/// The initialization of the cache is performed in the background and does not
/// block the caller. The cache will return `None` for any get requests until
/// initialization is complete.
pub fn spawn(
runtime_handle: &tokio::runtime::Handle,
data_dir: Utf8PathBuf,
config: Option<BasebackupCacheConfig>,
prepare_receiver: BasebackupPrepareReceiver,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) -> Arc<Self> {
let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
let enabled = config.is_some();
let cache = Arc::new(BasebackupCache {
data_dir,
config: config.unwrap_or_default(),
tenant_manager,
remove_entry_sender,
entries: std::sync::Mutex::new(HashMap::new()),
cancel,
read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
});
if enabled {
runtime_handle.spawn(
cache
.clone()
.background(prepare_receiver, remove_entry_receiver),
);
}
cache
}
/// Gets a basebackup entry from the cache.
/// If the entry is found, opens a file with the basebackup archive and returns it.
/// The open file descriptor will prevent the file system from deleting the file
/// even if the entry is removed from the cache in the background.
pub async fn get(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Option<tokio::fs::File> {
// Fast path. Check if the entry exists using the in-memory state.
let tti = TenantTimelineId::new(tenant_id, timeline_id);
if self.entries.lock().unwrap().get(&tti) != Some(&lsn) {
self.read_miss_count.inc();
return None;
}
let path = self.entry_path(tenant_id, timeline_id, lsn);
match tokio::fs::File::open(path).await {
Ok(file) => {
self.read_hit_count.inc();
Some(file)
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
// We may end up here if the basebackup was concurrently removed by the cleanup task.
self.read_miss_count.inc();
} else {
self.read_err_count.inc();
tracing::warn!("Unexpected error opening basebackup cache file: {:?}", e);
}
None
}
}
}
// Private methods.
fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
// The default format for LSN is 0/ABCDEF.
// The backslash is not filename friendly, so serialize it as plain hex.
let lsn = lsn.0;
format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz")
}
fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf {
self.data_dir
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
}
fn entry_tmp_path(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Utf8PathBuf {
self.data_dir
.join("tmp")
.join(Self::entry_filename(tenant_id, timeline_id, lsn))
}
fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
let parts: Vec<&str> = filename
.strip_prefix("basebackup_")?
.strip_suffix(".tar.gz")?
.split('_')
.collect();
if parts.len() != 3 {
return None;
}
let tenant_id = parts[0].parse::<TenantId>().ok()?;
let timeline_id = parts[1].parse::<TimelineId>().ok()?;
let lsn = Lsn(u64::from_str_radix(parts[2], 16).ok()?);
Some((tenant_id, timeline_id, lsn))
}
async fn cleanup(&self) -> anyhow::Result<()> {
// Cleanup tmp directory.
let tmp_dir = self.data_dir.join("tmp");
let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
while let Some(dir_entry) = tmp_dir.next_entry().await? {
if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
}
}
// Remove outdated entries.
let entries_old = self.entries.lock().unwrap().clone();
let mut entries_new = HashMap::new();
for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
if !tenant_shard_id.is_shard_zero() {
continue;
}
let TenantSlot::Attached(tenant) = tenant_slot else {
continue;
};
let tenant_id = tenant_shard_id.tenant_id;
for timeline in tenant.list_timelines() {
let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
if let Some(&entry_lsn) = entries_old.get(&tti) {
if timeline.get_last_record_lsn() <= entry_lsn {
entries_new.insert(tti, entry_lsn);
}
}
}
}
for (&tti, &lsn) in entries_old.iter() {
if !entries_new.contains_key(&tti) {
self.remove_entry_sender
.send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
.unwrap();
}
}
BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
*self.entries.lock().unwrap() = entries_new;
Ok(())
}
async fn on_startup(&self) -> anyhow::Result<()> {
// Create data_dir and tmp directory if they do not exist.
tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
.await
.map_err(|e| {
anyhow::anyhow!(
"Failed to create basebackup cache data_dir {:?}: {:?}",
self.data_dir,
e
)
})?;
// Read existing entries from the data_dir and add them to in-memory state.
let mut entries = HashMap::new();
let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
while let Some(dir_entry) = dir.next_entry().await? {
let filename = dir_entry.file_name();
if filename == "tmp" {
// Skip the tmp directory.
continue;
}
let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
let Some((tenant_id, timeline_id, lsn)) = parsed else {
tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
continue;
};
let tti = TenantTimelineId::new(tenant_id, timeline_id);
use std::collections::hash_map::Entry::*;
match entries.entry(tti) {
Occupied(mut entry) => {
let entry_lsn = *entry.get();
// Leave only the latest entry, remove the old one.
if lsn < entry_lsn {
self.remove_entry_sender.send(self.entry_path(
tenant_id,
timeline_id,
lsn,
))?;
} else if lsn > entry_lsn {
self.remove_entry_sender.send(self.entry_path(
tenant_id,
timeline_id,
entry_lsn,
))?;
entry.insert(lsn);
} else {
// Two different filenames parsed to the same timline_id and LSN.
// Should never happen.
return Err(anyhow::anyhow!(
"Duplicate basebackup cache entry with the same LSN: {:?}",
filename
));
}
}
Vacant(entry) => {
entry.insert(lsn);
}
}
}
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
*self.entries.lock().unwrap() = entries;
Ok(())
}
async fn background(
self: Arc<Self>,
mut prepare_receiver: BasebackupPrepareReceiver,
mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
) {
// Panic in the background is a safe fallback.
// It will drop receivers and the cache will be effectively disabled.
self.on_startup()
.await
.expect("Failed to initialize basebackup cache");
let mut cleanup_ticker = tokio::time::interval(self.config.cleanup_period);
cleanup_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
tokio::select! {
Some(req) = prepare_receiver.recv() => {
if let Err(err) = self.prepare_basebackup(
req.tenant_shard_id,
req.timeline_id,
req.lsn,
).await {
tracing::info!("Failed to prepare basebackup: {:#}", err);
self.prepare_err_count.inc();
continue;
}
}
Some(req) = remove_entry_receiver.recv() => {
if let Err(e) = tokio::fs::remove_file(req).await {
tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
}
}
_ = cleanup_ticker.tick() => {
self.cleanup().await.unwrap_or_else(|e| {
tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
});
}
_ = self.cancel.cancelled() => {
tracing::info!("BasebackupCache background task cancelled");
break;
}
}
}
}
/// Prepare a basebackup for the given timeline.
///
/// If the basebackup already exists with a higher LSN or the timeline already
/// has a higher last_record_lsn, skip the preparation.
///
/// The basebackup is prepared in a temporary directory and then moved to the final
/// location to make the operation atomic.
async fn prepare_basebackup(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
req_lsn: Lsn,
) -> anyhow::Result<()> {
tracing::info!(
tenant_id = %tenant_shard_id.tenant_id,
%timeline_id,
%req_lsn,
"Preparing basebackup for timeline",
);
let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
{
let entries = self.entries.lock().unwrap();
if let Some(&entry_lsn) = entries.get(&tti) {
if entry_lsn >= req_lsn {
tracing::info!(
%timeline_id,
%req_lsn,
%entry_lsn,
"Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
}
if entries.len() as i64 >= self.config.max_size_entries {
tracing::info!(
%timeline_id,
%req_lsn,
"Basebackup cache is full, skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
}
let tenant = self
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let tenant_state = tenant.current_state();
if tenant_state != TenantState::Active {
anyhow::bail!(
"Tenant {} is not active, current state: {:?}",
tenant_shard_id.tenant_id,
tenant_state
)
}
let timeline = tenant.get_timeline(timeline_id, true)?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn > req_lsn {
tracing::info!(
%timeline_id,
%req_lsn,
%last_record_lsn,
"Timeline has a higher LSN than the requested one, skipping basebackup",
);
self.prepare_skip_count.inc();
return Ok(());
}
let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
let res = self
.prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
.await;
if let Err(err) = res {
tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
// Try to clean up tmp file. If we fail, the background clean up task will take care of it.
match tokio::fs::remove_file(&entry_tmp_path).await {
Ok(_) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => {
tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
}
}
return Err(err);
}
// Move the tmp file to the final location atomically.
let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
let mut entries = self.entries.lock().unwrap();
if let Some(old_lsn) = entries.insert(tti, req_lsn) {
// Remove the old entry if it exists.
self.remove_entry_sender
.send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
.unwrap();
}
BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
self.prepare_ok_count.inc();
Ok(())
}
/// Prepares a basebackup in a temporary file.
async fn prepare_basebackup_tmp(
&self,
emptry_tmp_path: &Utf8Path,
timeline: &Arc<Timeline>,
req_lsn: Lsn,
) -> anyhow::Result<()> {
let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
let ctx = ctx.with_scope_timeline(timeline);
let file = tokio::fs::File::create(emptry_tmp_path).await?;
let mut writer = BufWriter::new(file);
let mut encoder = GzipEncoder::with_quality(
&mut writer,
// Level::Best because compression is not on the hot path of basebackup requests.
// The decompression is almost not affected by the compression level.
async_compression::Level::Best,
);
// We may receive a request before the WAL record is applied to the timeline.
// Wait for the requested LSN to be applied.
timeline
.wait_lsn(
req_lsn,
crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache,
crate::tenant::timeline::WaitLsnTimeout::Default,
&ctx,
)
.await?;
send_basebackup_tarball(
&mut encoder,
timeline,
Some(req_lsn),
None,
false,
false,
&ctx,
)
.await?;
encoder.shutdown().await?;
writer.flush().await?;
writer.into_inner().sync_all().await?;
Ok(())
}
}

View File

@@ -16,7 +16,6 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
use metrics::set_build_info_metric;
use nix::sys::socket::{setsockopt, sockopt};
use pageserver::basebackup_cache::BasebackupCache;
use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
use pageserver::controller_upcall_client::StorageControllerUpcallClient;
use pageserver::deletion_queue::DeletionQueue;
@@ -424,11 +423,14 @@ fn start_pageserver(
.map(storage_broker::Certificate::from_pem),
);
// Note: we do not attempt connecting here (but validate endpoints sanity).
storage_broker::connect(
let service_client = storage_broker::connect(
conf.broker_endpoint.clone(),
conf.broker_keepalive_interval,
tls_config,
)
)?;
anyhow::Ok(storage_broker::TimelineUpdatesSubscriber::new(
service_client,
))
})
.with_context(|| {
format!(
@@ -542,8 +544,6 @@ fn start_pageserver(
pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());
// Scan the local 'tenants/' directory and start loading the tenants
let (basebackup_prepare_sender, basebackup_prepare_receiver) =
tokio::sync::mpsc::unbounded_channel();
let deletion_queue_client = deletion_queue.new_client();
let background_purges = mgr::BackgroundPurges::default();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -554,22 +554,12 @@ fn start_pageserver(
remote_storage: remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
basebackup_prepare_sender,
},
order,
shutdown_pageserver.clone(),
))?;
let tenant_manager = Arc::new(tenant_manager);
let basebackup_cache = BasebackupCache::spawn(
BACKGROUND_RUNTIME.handle(),
conf.basebackup_cache_dir(),
conf.basebackup_cache_config.clone(),
basebackup_prepare_receiver,
Arc::clone(&tenant_manager),
shutdown_pageserver.child_token(),
);
BACKGROUND_RUNTIME.spawn({
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
@@ -776,7 +766,6 @@ fn start_pageserver(
} else {
None
},
basebackup_cache,
);
// All started up! Now just sit and wait for shutdown signal.

View File

@@ -232,8 +232,6 @@ pub struct PageServerConf {
pub dev_mode: bool,
pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
}
/// Token for authentication to safekeepers
@@ -263,10 +261,6 @@ impl PageServerConf {
self.workdir.join("metadata.json")
}
pub fn basebackup_cache_dir(&self) -> Utf8PathBuf {
self.workdir.join("basebackup_cache")
}
pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
// Encode a version in the filename, so that if we ever switch away from JSON we can
// increment this.
@@ -413,7 +407,6 @@ impl PageServerConf {
enable_tls_page_service_api,
dev_mode,
timeline_import_config,
basebackup_cache_config,
} = config_toml;
let mut conf = PageServerConf {
@@ -468,7 +461,6 @@ impl PageServerConf {
enable_tls_page_service_api,
dev_mode,
timeline_import_config,
basebackup_cache_config,
// ------------------------------------------------------------
// fields that require additional validation or custom handling
@@ -552,23 +544,6 @@ impl PageServerConf {
ratio.numerator, ratio.denominator
)
);
let url = Url::parse(&tracing_config.export_config.endpoint)
.map_err(anyhow::Error::msg)
.with_context(|| {
format!(
"tracing endpoint URL is invalid : {}",
tracing_config.export_config.endpoint
)
})?;
ensure!(
url.scheme() == "http" || url.scheme() == "https",
format!(
"tracing endpoint URL must start with http:// or https://: {}",
tracing_config.export_config.endpoint
)
);
}
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
@@ -685,25 +660,4 @@ mod tests {
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect("parse_and_validate");
}
#[test]
fn test_config_tracing_endpoint_is_invalid() {
let input = r#"
control_plane_api = "http://localhost:6666"
[tracing]
sampling_ratio = { numerator = 1, denominator = 0 }
[tracing.export_config]
endpoint = "localhost:4317"
protocol = "http-binary"
timeout = "1ms"
"#;
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
.expect("config has valid fields");
let workdir = Utf8PathBuf::from("/nonexistent");
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect_err("parse_and_validate should fail for endpoint without scheme");
}
}

View File

@@ -18,25 +18,12 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize;
// management.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(super) enum Name {
/// Timeline last_record_lsn, absolute.
/// Timeline last_record_lsn, absolute
#[serde(rename = "written_size")]
WrittenSize,
/// Timeline last_record_lsn, incremental
#[serde(rename = "written_data_bytes_delta")]
WrittenSizeDelta,
/// Written bytes only on this timeline (not including ancestors):
/// written_size - ancestor_lsn
///
/// On the root branch, this is equivalent to `written_size`.
#[serde(rename = "written_size_since_parent")]
WrittenSizeSinceParent,
/// PITR history size only on this timeline (not including ancestors):
/// last_record_lsn - max(pitr_cutoff, ancestor_lsn).
///
/// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed
/// the PITR cutoff yet. 0 if PITR is disabled.
#[serde(rename = "pitr_history_size_since_parent")]
PitrHistorySizeSinceParent,
/// Timeline logical size
#[serde(rename = "timeline_logical_size")]
LogicalSize,
@@ -170,32 +157,6 @@ impl MetricsKey {
.incremental_values()
}
/// `written_size` - `ancestor_lsn`.
const fn written_size_since_parent(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::WrittenSizeSinceParent,
}
.absolute_values()
}
/// `written_size` - max(`pitr_cutoff`, `ancestor_lsn`).
const fn pitr_history_size_since_parent(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::PitrHistorySizeSinceParent,
}
.absolute_values()
}
/// Exact [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
@@ -373,13 +334,7 @@ impl TenantSnapshot {
struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
ancestor_lsn: Lsn,
current_exact_logical_size: Option<u64>,
/// Whether PITR is enabled (pitr_interval > 0).
pitr_enabled: bool,
/// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately
/// Some(last_record_lsn), but may lag behind it since it's computed periodically.
pitr_cutoff: Option<Lsn>,
}
impl TimelineSnapshot {
@@ -399,9 +354,6 @@ impl TimelineSnapshot {
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();
let ancestor_lsn = t.get_ancestor_lsn();
let pitr_enabled = !t.get_pitr_interval().is_zero();
let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time;
let current_exact_logical_size = {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
@@ -421,10 +373,7 @@ impl TimelineSnapshot {
Ok(Some(TimelineSnapshot {
loaded_at,
last_record_lsn,
ancestor_lsn,
current_exact_logical_size,
pitr_enabled,
pitr_cutoff,
}))
}
}
@@ -475,8 +424,6 @@ impl TimelineSnapshot {
let up_to = now;
let written_size_last = written_size_now.value.max(prev.1); // don't regress
if let Some(delta) = written_size_now.value.checked_sub(prev.1) {
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
// written_size_delta
@@ -494,27 +441,6 @@ impl TimelineSnapshot {
});
}
// Compute the branch-local written size.
let written_size_since_parent_key =
MetricsKey::written_size_since_parent(tenant_id, timeline_id);
metrics.push(
written_size_since_parent_key
.at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)),
);
// Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the
// PITR cutoff. 0 if PITR is disabled.
let pitr_history_size_since_parent_key =
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id);
if !self.pitr_enabled {
metrics.push(pitr_history_size_since_parent_key.at(now, 0));
} else if let Some(pitr_cutoff) = self.pitr_cutoff {
metrics.push(pitr_history_size_since_parent_key.at(
now,
written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0),
));
}
{
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
let current_or_previous = self

View File

@@ -12,17 +12,12 @@ fn startup_collected_timeline_metrics_before_advancing() {
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let logical_size = 0x42000;
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
ancestor_lsn: Lsn(0),
current_exact_logical_size: Some(logical_size),
pitr_enabled: true,
pitr_cutoff: Some(pitr_cutoff),
current_exact_logical_size: Some(0x42000),
};
let now = DateTime::<Utc>::from(SystemTime::now());
@@ -38,11 +33,7 @@ fn startup_collected_timeline_metrics_before_advancing() {
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
@@ -58,9 +49,7 @@ fn startup_collected_timeline_metrics_second_round() {
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let logical_size = 0x42000;
let mut metrics = Vec::new();
let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
@@ -70,10 +59,7 @@ fn startup_collected_timeline_metrics_second_round() {
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
ancestor_lsn: Lsn(0),
current_exact_logical_size: Some(logical_size),
pitr_enabled: true,
pitr_cutoff: Some(pitr_cutoff),
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -83,11 +69,7 @@ fn startup_collected_timeline_metrics_second_round() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
@@ -104,9 +86,7 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let logical_size = 0x42000;
let mut metrics = Vec::new();
let cache = HashMap::from([
@@ -123,10 +103,7 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
ancestor_lsn: Lsn(0),
current_exact_logical_size: Some(logical_size),
pitr_enabled: true,
pitr_cutoff: Some(pitr_cutoff),
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -136,18 +113,16 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
/// Tests that written sizes do not regress across restarts.
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
// it can happen that we lose the inmemorylayer but have previously sent metrics and we
// should never go backwards
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
@@ -165,10 +140,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
ancestor_lsn: Lsn(0),
current_exact_logical_size: None,
pitr_enabled: true,
pitr_cutoff: Some(Lsn(20)),
};
let mut cache = HashMap::from([
@@ -197,8 +169,6 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80),
]
);
@@ -213,157 +183,6 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80),
]
);
}
/// Tests that written sizes do not regress across restarts, even on child branches.
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [later, now, at_restart] = time_backwards();
// FIXME: tests would be so much easier if we did not need to juggle back and forth
// SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
let now = DateTime::<Utc>::from(now);
let later = DateTime::<Utc>::from(later);
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
let before_restart = DateTime::<Utc>::from(before_restart);
let way_before = DateTime::<Utc>::from(way_before);
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
ancestor_lsn: Lsn(40),
current_exact_logical_size: None,
pitr_enabled: true,
pitr_cutoff: Some(Lsn(20)),
};
let mut cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id)
.at(before_restart, 100)
.to_kv_pair(),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_until(
way_before,
before_restart,
// not taken into account, but the timestamps are important
999_999_999,
)
.to_kv_pair(),
]);
let mut metrics = Vec::new();
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
before_restart,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
]
);
// now if we cache these metrics, and re-run while "still in recovery"
cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
// "still in recovery", because our snapshot did not change
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
]
);
}
/// Tests that written sizes do not regress across restarts, even on child branches and
/// with a PITR cutoff after the branch point.
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [later, now, at_restart] = time_backwards();
// FIXME: tests would be so much easier if we did not need to juggle back and forth
// SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
let now = DateTime::<Utc>::from(now);
let later = DateTime::<Utc>::from(later);
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
let before_restart = DateTime::<Utc>::from(before_restart);
let way_before = DateTime::<Utc>::from(way_before);
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
ancestor_lsn: Lsn(30),
current_exact_logical_size: None,
pitr_enabled: true,
pitr_cutoff: Some(Lsn(40)),
};
let mut cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id)
.at(before_restart, 100)
.to_kv_pair(),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_until(
way_before,
before_restart,
// not taken into account, but the timestamps are important
999_999_999,
)
.to_kv_pair(),
]);
let mut metrics = Vec::new();
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
before_restart,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
]
);
// now if we cache these metrics, and re-run while "still in recovery"
cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
// "still in recovery", because our snapshot did not change
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
]
);
}
@@ -382,10 +201,7 @@ fn post_restart_current_exact_logical_size_uses_cached() {
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
ancestor_lsn: Lsn(0),
current_exact_logical_size: None,
pitr_enabled: true,
pitr_cutoff: None,
};
let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
@@ -470,101 +286,16 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
times
}
/// Tests that disabled PITR history does not yield any history size, even when the PITR cutoff
/// indicates otherwise.
#[test]
fn pitr_disabled_yields_no_history_size() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let pitr_cutoff = Lsn(0x11000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
ancestor_lsn: Lsn(0),
current_exact_logical_size: None,
pitr_enabled: false,
pitr_cutoff: Some(pitr_cutoff),
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
]
);
}
/// Tests that uninitialized PITR cutoff does not emit any history size metric at all.
#[test]
fn pitr_uninitialized_does_not_emit_history_size() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
ancestor_lsn: Lsn(0),
current_exact_logical_size: None,
pitr_enabled: true,
pitr_cutoff: None,
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id)
.at(now, disk_consistent_lsn.0),
]
);
}
pub(crate) const fn metric_examples_old(
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [RawMetric; 7] {
) -> [RawMetric; 5] {
[
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_until_old_format(before, now, 0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
@@ -576,12 +307,10 @@ pub(crate) const fn metric_examples(
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [NewRawMetric; 7] {
) -> [NewRawMetric; 5] {
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0),
MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::synthetic_size(tenant_id).at(now, 1),

View File

@@ -513,14 +513,6 @@ mod tests {
line!(),
r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
@@ -568,7 +560,7 @@ mod tests {
assert_eq!(upgraded_samples, new_samples);
}
fn metric_samples_old() -> [RawMetric; 7] {
fn metric_samples_old() -> [RawMetric; 5] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);
@@ -580,7 +572,7 @@ mod tests {
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
}
fn metric_samples() -> [NewRawMetric; 7] {
fn metric_samples() -> [NewRawMetric; 5] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);

View File

@@ -100,7 +100,7 @@ pub struct State {
auth: Option<Arc<SwappableJwtAuth>>,
allowlist_routes: &'static [&'static str],
remote_storage: GenericRemoteStorage,
broker_client: storage_broker::BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
@@ -114,7 +114,7 @@ impl State {
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
remote_storage: GenericRemoteStorage,
broker_client: storage_broker::BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
@@ -449,7 +449,7 @@ async fn build_timeline_info_common(
// Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
// actually trimmed data to), which can pass each other when PITR is changed.
let min_readable_lsn = std::cmp::max(
timeline.get_gc_cutoff_lsn().unwrap_or_default(),
timeline.get_gc_cutoff_lsn(),
*timeline.get_applied_gc_cutoff_lsn(),
);
@@ -3199,7 +3199,7 @@ async fn list_aux_files(
.await?;
let io_concurrency = IoConcurrency::spawn_from_conf(
state.conf.get_vectored_concurrent_io,
state.conf,
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
);

View File

@@ -3,7 +3,6 @@
mod auth;
pub mod basebackup;
pub mod basebackup_cache;
pub mod config;
pub mod consumption_metrics;
pub mod context;

View File

@@ -843,50 +843,23 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_relsize_latest_cache_entries",
"Number of entries in the latest relation size cache",
"pageserver_relsize_cache_entries",
"Number of entries in the relation size cache",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_latest_cache_hits",
"Latest relation size cache hits",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_latest_cache_misses",
"Relation size latest cache misses",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_relsize_snapshot_cache_entries",
"Number of entries in the pitr relation size cache",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_snapshot_cache_hits",
"Pitr relation size cache hits",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_snapshot_cache_misses",
"Relation size snapshot cache misses",
"pageserver_relsize_cache_misses",
"Relation size cache misses",
)
.expect("failed to define a metric")
});
@@ -1066,15 +1039,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
});
pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_offloaded_timelines",
"Number of offloaded timelines of a tenant",
&["tenant_id", "shard_id"]
)
.expect("Failed to register pageserver_tenant_offloaded_timelines metric")
});
pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_eviction_iteration_duration_seconds_global",
@@ -3560,14 +3524,11 @@ impl TimelineMetrics {
}
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
let tid = tenant_shard_id.tenant_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
// Only shard zero deals in synthetic sizes
if tenant_shard_id.is_shard_zero() {
let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);
tenant_throttling::remove_tenant_metrics(tenant_shard_id);
@@ -4359,42 +4320,6 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
.set(u64::try_from(num_threads.get()).unwrap());
}
pub(crate) static BASEBACKUP_CACHE_READ: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_basebackup_cache_read_total",
"Number of read accesses to the basebackup cache grouped by hit/miss/error",
&["result"]
)
.expect("failed to define a metric")
});
pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_basebackup_cache_prepare_total",
"Number of prepare requests processed by the basebackup cache grouped by ok/skip/error",
&["result"]
)
.expect("failed to define a metric")
});
pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_basebackup_cache_entries_total",
"Number of entries in the basebackup cache"
)
.expect("failed to define a metric")
});
// FIXME: Support basebackup cache size metrics.
#[allow(dead_code)]
pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
register_int_gauge!(
"pageserver_basebackup_cache_size_bytes",
"Total size of all basebackup cache entries on disk in bytes"
)
.expect("failed to define a metric")
});
static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_config_ignored_items",

View File

@@ -9,6 +9,7 @@ use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime};
use std::{io, str};
use crate::PERF_TRACE_TARGET;
use anyhow::{Context, bail};
use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
@@ -17,7 +18,7 @@ use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
use pageserver_api::config::{
GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::key::rel_block_to_key;
@@ -51,10 +52,8 @@ use utils::simple_rcu::RcuReadGuard;
use utils::sync::gate::{Gate, GateGuard};
use utils::sync::spsc_fold;
use crate::PERF_TRACE_TARGET;
use crate::auth::check_permission;
use crate::basebackup::BasebackupError;
use crate::basebackup_cache::BasebackupCache;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -63,7 +62,7 @@ use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
};
use crate::pgdatadir_mapping::{LsnRange, Version};
use crate::pgdatadir_mapping::Version;
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -108,7 +107,6 @@ pub fn spawn(
perf_trace_dispatch: Option<Dispatch>,
tcp_listener: tokio::net::TcpListener,
tls_config: Option<Arc<rustls::ServerConfig>>,
basebackup_cache: Arc<BasebackupCache>,
) -> Listener {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
@@ -130,7 +128,6 @@ pub fn spawn(
conf.pg_auth_type,
tls_config,
conf.page_service_pipelining.clone(),
basebackup_cache,
libpq_ctx,
cancel.clone(),
)
@@ -189,7 +186,6 @@ pub async fn libpq_listener_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
basebackup_cache: Arc<BasebackupCache>,
listener_ctx: RequestContext,
listener_cancel: CancellationToken,
) -> Connections {
@@ -233,7 +229,6 @@ pub async fn libpq_listener_main(
auth_type,
tls_config.clone(),
pipelining_config.clone(),
Arc::clone(&basebackup_cache),
connection_ctx,
connections_cancel.child_token(),
gate_guard,
@@ -276,7 +271,6 @@ async fn page_service_conn_main(
auth_type: AuthType,
tls_config: Option<Arc<rustls::ServerConfig>>,
pipelining_config: PageServicePipeliningConfig,
basebackup_cache: Arc<BasebackupCache>,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
@@ -337,12 +331,11 @@ async fn page_service_conn_main(
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(
conf,
tenant_manager,
auth,
pipelining_config,
conf.get_vectored_concurrent_io,
perf_span_fields,
basebackup_cache,
connection_ctx,
cancel.clone(),
gate_guard,
@@ -378,6 +371,7 @@ async fn page_service_conn_main(
}
struct PageServerHandler {
conf: &'static PageServerConf,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -395,9 +389,6 @@ struct PageServerHandler {
timeline_handles: Option<TimelineHandles>,
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
basebackup_cache: Arc<BasebackupCache>,
gate_guard: GateGuard,
}
@@ -651,7 +642,7 @@ impl std::fmt::Display for BatchedPageStreamError {
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
lsn_range: LsnRange,
effective_request_lsn: Lsn,
ctx: RequestContext,
}
@@ -773,12 +764,12 @@ impl BatchedFeMessage {
match batching_strategy {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
if let Some(last_in_batch) = accum_pages.last() {
if last_in_batch.lsn_range.effective_lsn
!= this_pages[0].lsn_range.effective_lsn
if last_in_batch.effective_request_lsn
!= this_pages[0].effective_request_lsn
{
trace!(
accum_lsn = %last_in_batch.lsn_range.effective_lsn,
this_lsn = %this_pages[0].lsn_range.effective_lsn,
accum_lsn = %last_in_batch.effective_request_lsn,
this_lsn = %this_pages[0].effective_request_lsn,
"stopping batching because LSN changed"
);
@@ -793,15 +784,15 @@ impl BatchedFeMessage {
let same_page_different_lsn = accum_pages.iter().any(|batched| {
batched.req.rel == this_pages[0].req.rel
&& batched.req.blkno == this_pages[0].req.blkno
&& batched.lsn_range.effective_lsn
!= this_pages[0].lsn_range.effective_lsn
&& batched.effective_request_lsn
!= this_pages[0].effective_request_lsn
});
if same_page_different_lsn {
trace!(
rel=%this_pages[0].req.rel,
blkno=%this_pages[0].req.blkno,
lsn=%this_pages[0].lsn_range.effective_lsn,
lsn=%this_pages[0].effective_request_lsn,
"stopping batching because same page was requested at different LSNs"
);
@@ -853,17 +844,17 @@ impl BatchedFeMessage {
impl PageServerHandler {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
perf_span_fields: ConnectionPerfSpanFields,
basebackup_cache: Arc<BasebackupCache>,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
) -> Self {
PageServerHandler {
conf,
auth,
claims: None,
connection_ctx,
@@ -871,8 +862,6 @@ impl PageServerHandler {
timeline_handles: Some(TimelineHandles::new(tenant_manager)),
cancel,
pipelining_config,
get_vectored_concurrent_io,
basebackup_cache,
gate_guard,
}
}
@@ -1169,7 +1158,7 @@ impl PageServerHandler {
.await?;
// We're holding the Handle
let effective_lsn = match Self::effective_request_lsn(
let effective_request_lsn = match Self::effective_request_lsn(
&shard,
shard.get_last_record_lsn(),
req.hdr.request_lsn,
@@ -1188,10 +1177,7 @@ impl PageServerHandler {
pages: smallvec::smallvec![BatchedGetPageRequest {
req,
timer,
lsn_range: LsnRange {
effective_lsn,
request_lsn: req.hdr.request_lsn
},
effective_request_lsn,
ctx,
}],
// The executor grabs the batch when it becomes idle.
@@ -1637,7 +1623,7 @@ impl PageServerHandler {
}
let io_concurrency = IoConcurrency::spawn_from_conf(
self.get_vectored_concurrent_io,
self.conf,
match self.gate_guard.try_clone() {
Ok(guard) => guard,
Err(_) => {
@@ -2141,14 +2127,7 @@ impl PageServerHandler {
.await?;
let exists = timeline
.get_rel_exists(
req.rel,
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: req.hdr.request_lsn,
}),
ctx,
)
.get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -2175,14 +2154,7 @@ impl PageServerHandler {
.await?;
let n_blocks = timeline
.get_rel_size(
req.rel,
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: req.hdr.request_lsn,
}),
ctx,
)
.get_rel_size(req.rel, Version::Lsn(lsn), ctx)
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -2209,15 +2181,7 @@ impl PageServerHandler {
.await?;
let total_blocks = timeline
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: req.hdr.request_lsn,
}),
ctx,
)
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -2250,7 +2214,7 @@ impl PageServerHandler {
// Ignore error (trace buffer may be full or tracer may have disconnected).
_ = page_trace.try_send(PageTraceEvent {
key,
effective_lsn: batch.lsn_range.effective_lsn,
effective_lsn: batch.effective_request_lsn,
time,
});
}
@@ -2265,7 +2229,7 @@ impl PageServerHandler {
perf_instrument = true;
}
req.lsn_range.effective_lsn
req.effective_request_lsn
})
.max()
.expect("batch is never empty");
@@ -2319,7 +2283,7 @@ impl PageServerHandler {
(
&p.req.rel,
&p.req.blkno,
p.lsn_range,
p.effective_request_lsn,
p.ctx.attached_child(),
)
}),
@@ -2504,8 +2468,6 @@ impl PageServerHandler {
.map_err(QueryError::Disconnected)?;
self.flush_cancellable(pgb, &self.cancel).await?;
let mut from_cache = false;
// Send a tarball of the latest layer on the timeline. Compress if not
// fullbackup. TODO Compress in that case too (tests need to be updated)
if full_backup {
@@ -2523,33 +2485,7 @@ impl PageServerHandler {
.map_err(map_basebackup_error)?;
} else {
let mut writer = BufWriter::new(pgb.copyout_writer());
let cached = {
// Basebackup is cached only for this combination of parameters.
if timeline.is_basebackup_cache_enabled()
&& gzip
&& lsn.is_some()
&& prev_lsn.is_none()
{
self.basebackup_cache
.get(tenant_id, timeline_id, lsn.unwrap())
.await
} else {
None
}
};
if let Some(mut cached) = cached {
from_cache = true;
tokio::io::copy(&mut cached, &mut writer)
.await
.map_err(|e| {
map_basebackup_error(BasebackupError::Client(
e,
"handle_basebackup_request,cached,copy",
))
})?;
} else if gzip {
if gzip {
let mut encoder = GzipEncoder::with_quality(
&mut writer,
// NOTE using fast compression because it's on the critical path
@@ -2608,7 +2544,6 @@ impl PageServerHandler {
info!(
lsn_await_millis = lsn_awaited_after.as_millis(),
basebackup_millis = basebackup_after.as_millis(),
%from_cache,
"basebackup complete"
);

View File

@@ -43,9 +43,7 @@ use crate::aux_file;
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
RELSIZE_SNAPSHOT_CACHE_MISSES,
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
};
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
@@ -92,28 +90,6 @@ pub enum LsnForTimestamp {
NoData(Lsn),
}
/// Each request to page server contains LSN range: `not_modified_since..request_lsn`.
/// See comments libs/pageserver_api/src/models.rs.
/// Based on this range and `last_record_lsn` PS calculates `effective_lsn`.
/// But to distinguish requests from primary and replicas we need also to pass `request_lsn`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LsnRange {
pub effective_lsn: Lsn,
pub request_lsn: Lsn,
}
impl LsnRange {
pub fn at(lsn: Lsn) -> LsnRange {
LsnRange {
effective_lsn: lsn,
request_lsn: lsn,
}
}
pub fn is_latest(&self) -> bool {
self.request_lsn == Lsn::MAX
}
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CalculateLogicalSizeError {
#[error("cancelled")]
@@ -226,13 +202,13 @@ impl Timeline {
io_concurrency: IoConcurrency,
) -> Result<Bytes, PageReconstructError> {
match version {
Version::LsnRange(lsns) => {
Version::Lsn(effective_lsn) => {
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
let res = self
.get_rel_page_at_lsn_batched(
pages
.iter()
.map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
pages.iter().map(|(tag, blknum)| {
(tag, blknum, effective_lsn, ctx.attached_child())
}),
io_concurrency.clone(),
ctx,
)
@@ -270,7 +246,7 @@ impl Timeline {
/// The ordering of the returned vec corresponds to the ordering of `pages`.
pub(crate) async fn get_rel_page_at_lsn_batched(
&self,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
io_concurrency: IoConcurrency,
ctx: &RequestContext,
) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -289,7 +265,7 @@ impl Timeline {
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
HashMap::with_capacity(pages.len());
for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
if tag.relnode == 0 {
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
@@ -298,7 +274,7 @@ impl Timeline {
slots_filled += 1;
continue;
}
let lsn = lsns.effective_lsn;
let nblocks = {
let ctx = RequestContextBuilder::from(&ctx)
.perf_span(|crnt_perf_span| {
@@ -313,7 +289,7 @@ impl Timeline {
.attached_child();
match self
.get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
.await
{
@@ -494,7 +470,7 @@ impl Timeline {
));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, version) {
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(nblocks);
}
@@ -512,7 +488,7 @@ impl Timeline {
let mut buf = version.get(self, key, ctx).await?;
let nblocks = buf.get_u32_le();
self.update_cached_rel_size(tag, version, nblocks);
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
Ok(nblocks)
}
@@ -534,7 +510,7 @@ impl Timeline {
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(true);
}
// then check if the database was already initialized.
@@ -610,7 +586,7 @@ impl Timeline {
// scan directory listing (new), merge with the old results
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
@@ -656,7 +632,7 @@ impl Timeline {
) -> Result<Bytes, PageReconstructError> {
assert!(self.tenant_shard_id.is_shard_zero());
let n_blocks = self
.get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
.await?;
let keyspace = KeySpace::single(
@@ -669,7 +645,7 @@ impl Timeline {
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
@@ -891,11 +867,11 @@ impl Timeline {
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
) -> Result<T, PageReconstructError> {
for segno in self
.list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
.await?
{
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.await?;
let keyspace = KeySpace::single(
@@ -909,7 +885,7 @@ impl Timeline {
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
@@ -1161,7 +1137,7 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self
.list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
.await?
{
if self.cancel.is_cancelled() {
@@ -1236,7 +1212,7 @@ impl Timeline {
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, Version::at(lsn), ctx)
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
.await?
.into_iter()
.collect();
@@ -1353,75 +1329,59 @@ impl Timeline {
Ok((dense_keyspace, sparse_keyspace))
}
/// Get cached size of relation. There are two caches: one for primary updates, it captures the latest state of
/// of the timeline and snapshot cache, which key includes LSN and so can be used by replicas to get relation size
/// at the particular LSN (snapshot).
pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option<BlockNumber> {
let lsn = version.get_lsn();
{
let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_LATEST_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
/// Get cached size of relation if it not updated after specified LSN
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
}
{
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
RELSIZE_SNAPSHOT_CACHE_HITS.inc();
return Some(*nblock);
}
}
if version.is_latest() {
RELSIZE_LATEST_CACHE_MISSES.inc();
} else {
RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
}
RELSIZE_CACHE_MISSES.inc();
None
}
/// Update cached relation size if there is no more recent update
pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
let lsn = version.get_lsn();
if version.is_latest() {
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
match rel_size_cache.entry(tag) {
hash_map::Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_LATEST_CACHE_ENTRIES.inc();
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if lsn < rel_size_cache.complete_as_of {
// Do not cache old values. It's safe to cache the size on read, as long as
// the read was at an LSN since we started the WAL ingestion. Reasoning: we
// never evict values from the cache, so if the relation size changed after
// 'lsn', the new value is already in the cache.
return;
}
match rel_size_cache.map.entry(tag) {
hash_map::Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
} else {
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
if rel_size_cache.capacity() != 0 {
rel_size_cache.insert((lsn, tag), nblocks);
RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_CACHE_ENTRIES.inc();
}
}
}
/// Store cached relation size
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_LATEST_CACHE_ENTRIES.inc();
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_CACHE_ENTRIES.inc();
}
}
/// Remove cached relation size
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
if rel_size_cache.remove(tag).is_some() {
RELSIZE_LATEST_CACHE_ENTRIES.dec();
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if rel_size_cache.map.remove(tag).is_some() {
RELSIZE_CACHE_ENTRIES.dec();
}
}
}
@@ -1625,10 +1585,7 @@ impl DatadirModification<'_> {
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
if let Some(nblocks) = self
.tline
.get_cached_rel_size(&rel, Version::Modified(self))
{
if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
Ok(nblocks)
} else if !self
.tline
@@ -2710,7 +2667,7 @@ pub struct DatadirModificationStats {
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
LsnRange(LsnRange),
Lsn(Lsn),
Modified(&'a DatadirModification<'a>),
}
@@ -2722,7 +2679,7 @@ impl Version<'_> {
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await,
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}
@@ -2744,26 +2701,12 @@ impl Version<'_> {
}
}
pub fn is_latest(&self) -> bool {
fn get_lsn(&self) -> Lsn {
match self {
Version::LsnRange(lsns) => lsns.is_latest(),
Version::Modified(_) => true,
}
}
pub fn get_lsn(&self) -> Lsn {
match self {
Version::LsnRange(lsns) => lsns.effective_lsn,
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
}
pub fn at(lsn: Lsn) -> Self {
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: lsn,
})
}
}
//--- Metadata structs stored in key-value pairs in the repository.

View File

@@ -380,10 +380,6 @@ pub enum TaskKind {
DetachAncestor,
ImportPgdata,
/// Background task of [`crate::basebackup_cache::BasebackupCache`].
/// Prepares basebackups and clears outdated entries.
BasebackupCache,
}
#[derive(Default)]

View File

@@ -48,7 +48,6 @@ use remote_timeline_client::{
download_tenant_manifest,
};
use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
use storage_broker::BrokerClientChannel;
use timeline::compaction::{CompactionOutcome, GcCompactionQueue};
use timeline::import_pgdata::ImportingTimeline;
use timeline::offload::{OffloadError, offload_timeline};
@@ -78,7 +77,6 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
use self::timeline::{
EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
};
use crate::basebackup_cache::BasebackupPrepareSender;
use crate::config::PageServerConf;
use crate::context;
use crate::context::RequestContextBuilder;
@@ -87,8 +85,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::{
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
};
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationMode;
@@ -154,11 +152,10 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
/// as the shared remote storage client and process initialization state.
#[derive(Clone)]
pub struct TenantSharedResources {
pub broker_client: storage_broker::BrokerClientChannel,
pub broker_client: storage_broker::TimelineUpdatesSubscriber,
pub remote_storage: GenericRemoteStorage,
pub deletion_queue_client: DeletionQueueClient,
pub l0_flush_global_state: L0FlushGlobalState,
pub basebackup_prepare_sender: BasebackupPrepareSender,
}
/// A [`TenantShard`] is really an _attached_ tenant. The configuration
@@ -319,15 +316,12 @@ pub struct TenantShard {
gc_cs: tokio::sync::Mutex<()>,
walredo_mgr: Option<Arc<WalRedoManager>>,
/// Provides access to timeline data sitting in the remote storage.
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: GenericRemoteStorage,
/// Access to global deletion queue for when this tenant wants to schedule a deletion.
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_prepare_sender: BasebackupPrepareSender,
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -1291,7 +1285,6 @@ impl TenantShard {
remote_storage,
deletion_queue_client,
l0_flush_global_state,
basebackup_prepare_sender,
} = resources;
let attach_mode = attached_conf.location.attach_mode;
@@ -1307,7 +1300,6 @@ impl TenantShard {
remote_storage.clone(),
deletion_queue_client,
l0_flush_global_state,
basebackup_prepare_sender,
));
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -2114,7 +2106,7 @@ impl TenantShard {
async fn unoffload_timeline(
self: &Arc<Self>,
timeline_id: TimelineId,
broker_client: storage_broker::BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
ctx: RequestContext,
) -> Result<Arc<Timeline>, TimelineArchivalError> {
info!("unoffloading timeline");
@@ -2249,7 +2241,7 @@ impl TenantShard {
self: &Arc<Self>,
timeline_id: TimelineId,
new_state: TimelineArchivalState,
broker_client: storage_broker::BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
ctx: RequestContext,
) -> Result<(), TimelineArchivalError> {
info!("setting timeline archival config");
@@ -2578,7 +2570,7 @@ impl TenantShard {
pub(crate) async fn create_timeline(
self: &Arc<TenantShard>,
params: CreateTimelineParams,
broker_client: storage_broker::BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
if !self.is_active() {
@@ -3306,7 +3298,7 @@ impl TenantShard {
/// to delay background jobs. Background jobs can be started right away when None is given.
fn activate(
self: &Arc<Self>,
broker_client: BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
@@ -3355,13 +3347,6 @@ impl TenantShard {
activated_timelines += 1;
}
let tid = self.tenant_shard_id.tenant_id.to_string();
let shard_id = self.tenant_shard_id.shard_slug().to_string();
let offloaded_timeline_count = timelines_offloaded_accessor.len();
TENANT_OFFLOADED_TIMELINES
.with_label_values(&[&tid, &shard_id])
.set(offloaded_timeline_count as u64);
self.state.send_modify(move |current_state| {
assert!(
matches!(current_state, TenantState::Activating(_)),
@@ -4246,7 +4231,6 @@ impl TenantShard {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
basebackup_prepare_sender: BasebackupPrepareSender,
) -> TenantShard {
assert!(!attached_conf.location.generation.is_none());
@@ -4350,7 +4334,6 @@ impl TenantShard {
ongoing_timeline_detach: std::sync::Mutex::default(),
gc_block: Default::default(),
l0_flush_global_state,
basebackup_prepare_sender,
}
}
@@ -4603,7 +4586,7 @@ impl TenantShard {
target.cutoffs = GcCutoffs {
space: space_cutoff,
time: None,
time: Lsn::INVALID,
};
}
}
@@ -4687,7 +4670,7 @@ impl TenantShard {
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time;
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
}
}
@@ -4700,15 +4683,13 @@ impl TenantShard {
} else {
0
});
if let Some(time_cutoff) = target.cutoffs.time {
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(time_cutoff)
.unwrap_or_default()
.0,
);
}
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.time)
.unwrap_or(Lsn(0))
.0,
);
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
// - this timeline was created while we were finding cutoffs
@@ -4717,8 +4698,8 @@ impl TenantShard {
let original_cutoffs = target.cutoffs.clone();
// GC cutoffs should never go back
target.cutoffs = GcCutoffs {
space: cutoffs.space.max(original_cutoffs.space),
time: cutoffs.time.max(original_cutoffs.time),
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
}
}
}
@@ -5270,7 +5251,6 @@ impl TenantShard {
pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
l0_compaction_trigger: self.l0_compaction_trigger.clone(),
l0_flush_global_state: self.l0_flush_global_state.clone(),
basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
}
}
@@ -5579,14 +5559,6 @@ impl TenantShard {
}
}
// Update metrics
let tid = self.tenant_shard_id.to_string();
let shard_id = self.tenant_shard_id.shard_slug().to_string();
let set_key = &[tid.as_str(), shard_id.as_str()][..];
TENANT_OFFLOADED_TIMELINES
.with_label_values(set_key)
.set(manifest.offloaded_timelines.len() as u64);
// Upload the manifest. Remote storage does no retries internally, so retry here.
match backoff::retry(
|| async {
@@ -5853,8 +5825,6 @@ pub(crate) mod harness {
) -> anyhow::Result<Arc<TenantShard>> {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
let tenant = Arc::new(TenantShard::new(
TenantState::Attaching,
self.conf,
@@ -5872,7 +5842,6 @@ pub(crate) mod harness {
self.deletion_queue.new_client(),
// TODO: ideally we should run all unit tests with both configs
L0FlushGlobalState::new(L0FlushConfig::default()),
basebackup_requst_sender,
));
let preload = tenant
@@ -8626,10 +8595,8 @@ mod tests {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let io_concurrency = IoConcurrency::spawn_from_conf(
tline.conf.get_vectored_concurrent_io,
tline.gate.enter().unwrap(),
);
let io_concurrency =
IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
let mut res = tline
@@ -8967,7 +8934,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x30));
guard.cutoffs.time = Lsn(0x30);
guard.cutoffs.space = Lsn(0x30);
}
@@ -9075,7 +9042,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x40));
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
@@ -9493,7 +9460,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -9577,7 +9544,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x40));
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
@@ -10048,7 +10015,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -10111,7 +10078,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -10189,7 +10156,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x38));
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline
@@ -10297,7 +10264,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -10360,7 +10327,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -10546,7 +10513,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Some(Lsn(0x10)),
time: Lsn(0x10),
space: Lsn(0x10),
},
leases: Default::default(),
@@ -10566,7 +10533,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Some(Lsn(0x50)),
time: Lsn(0x50),
space: Lsn(0x50),
},
leases: Default::default(),
@@ -11287,7 +11254,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -11676,7 +11643,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -11739,7 +11706,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -11928,7 +11895,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -11991,7 +11958,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -12254,7 +12221,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),

View File

@@ -235,7 +235,7 @@ pub(super) async fn gather_inputs(
// than our internal space cutoff. This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// the space cutoff.
let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None
let mut next_pitr_cutoff = gc_info.cutoffs.time;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {

View File

@@ -31,7 +31,6 @@ pub use inmemory_layer::InMemoryLayer;
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
use pageserver_api::config::GetVectoredConcurrentIo;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
@@ -44,6 +43,7 @@ use self::inmemory_layer::InMemoryLayerFileId;
use super::PageReconstructError;
use super::layer_map::InMemoryLayerDesc;
use super::timeline::{GetVectoredError, ReadPath};
use crate::config::PageServerConf;
use crate::context::{
AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
@@ -318,10 +318,11 @@ impl IoConcurrency {
}
pub(crate) fn spawn_from_conf(
conf: GetVectoredConcurrentIo,
conf: &'static PageServerConf,
gate_guard: GateGuard,
) -> IoConcurrency {
let selected = match conf {
use pageserver_api::config::GetVectoredConcurrentIo;
let selected = match conf.get_vectored_concurrent_io {
GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
};

View File

@@ -63,28 +63,7 @@ pub struct InMemoryLayer {
opened_at: Instant,
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The [`IndexEntry`] is an offset into the
/// ephemeral file where the page version is stored.
///
/// We use a separate lock for the index to reduce the critical section
/// during which reads cannot be planned.
///
/// If you need access to both the index and the underlying file at the same time,
/// respect the following locking order to avoid deadlocks:
/// 1. [`InMemoryLayer::inner`]
/// 2. [`InMemoryLayer::index`]
///
/// Note that the file backing [`InMemoryLayer::inner`] is append-only,
/// so it is not necessary to hold simultaneous locks on index.
/// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
/// In particular:
/// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
/// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
/// The above fields never change, except for `end_lsn`, which is only set once,
/// and `index` (see rationale there).
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
@@ -102,6 +81,11 @@ impl std::fmt::Debug for InMemoryLayer {
}
pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The [`IndexEntry`] is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
@@ -121,7 +105,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
trailing_ones
};
/// See [`InMemoryLayer::index`].
/// See [`InMemoryLayerInner::index`].
///
/// For memory efficiency, the data is packed into a u64.
///
@@ -441,7 +425,7 @@ impl InMemoryLayer {
.page_content_kind(PageContentKind::InMemoryLayer)
.attached_child();
let index = self.index.read().await;
let inner = self.inner.read().await;
struct ValueRead {
entry_lsn: Lsn,
@@ -451,7 +435,10 @@ impl InMemoryLayer {
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
for range in keyspace.ranges.iter() {
for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
for (key, vec_map) in inner
.index
.range(range.start.to_compact()..range.end.to_compact())
{
let key = Key::from_compact(*key);
let slice = vec_map.slice_range(lsn_range.clone());
@@ -479,7 +466,7 @@ impl InMemoryLayer {
}
}
}
drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
let read_from = Arc::clone(self);
let read_ctx = ctx.attached_child();
reconstruct_state
@@ -586,8 +573,8 @@ impl InMemoryLayer {
start_lsn,
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
index: RwLock::new(BTreeMap::new()),
inner: RwLock::new(InMemoryLayerInner {
index: BTreeMap::new(),
file,
resource_units: GlobalResourceUnits::new(),
}),
@@ -605,39 +592,31 @@ impl InMemoryLayer {
serialized_batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let (base_offset, metadata) = {
let mut inner = self.inner.write().await;
self.assert_writable();
let mut inner = self.inner.write().await;
self.assert_writable();
let base_offset = inner.file.len();
let base_offset = inner.file.len();
let SerializedValueBatch {
raw,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
let SerializedValueBatch {
raw,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
//the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
inner.resource_units.maybe_publish_size(new_size);
(base_offset, metadata)
};
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
//the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
// Update the index with the new entries
let mut index = self.index.write().await;
for meta in metadata {
let SerializedValueMeta {
key,
@@ -660,7 +639,7 @@ impl InMemoryLayer {
will_init,
})?;
let vec_map = index.entry(key).or_default();
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
if old.is_some() {
// This should not break anything, but is unexpected: ingestion code aims to filter out
@@ -679,6 +658,8 @@ impl InMemoryLayer {
);
}
inner.resource_units.maybe_publish_size(new_size);
Ok(())
}
@@ -699,18 +680,6 @@ impl InMemoryLayer {
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
///
/// A note on locking:
/// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
/// writes while freezing the layer. This is enforced at a higher level via
/// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
/// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
/// Timeline::write_lock for its lifetime. The rolling is handled in
/// [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
/// so can't be called from different threads.
/// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
/// This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
/// hence there can be no concurrent writes
pub async fn freeze(&self, end_lsn: Lsn) {
assert!(
self.start_lsn < end_lsn,
@@ -731,8 +700,8 @@ impl InMemoryLayer {
#[cfg(debug_assertions)]
{
let index = self.index.read().await;
for vec_map in index.values() {
let inner = self.inner.write().await;
for vec_map in inner.index.values() {
for (lsn, _) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
@@ -755,11 +724,14 @@ impl InMemoryLayer {
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. See the comment on
// [`InMemoryLayer::freeze`] to understand how locking between the append path
// and layer flushing works.
// write lock on it, so we shouldn't block anyone. There's one exception
// though: another thread might have grabbed a reference to this layer
// in `get_layer_for_write' just before the checkpointer called
// `freeze`, and then `write_to_disk` on it. When the thread gets the
// lock, it will see that it's not writeable anymore and retry, but it
// would have to wait until we release it. That race condition is very
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await;
let index = self.index.read().await;
use l0_flush::Inner;
let _concurrency_permit = match l0_flush_global_state {
@@ -771,9 +743,13 @@ impl InMemoryLayer {
let key_count = if let Some(key_range) = key_range {
let key_range = key_range.start.to_compact()..key_range.end.to_compact();
index.iter().filter(|(k, _)| key_range.contains(k)).count()
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.count()
} else {
index.len()
inner.index.len()
};
if key_count == 0 {
return Ok(None);
@@ -796,7 +772,7 @@ impl InMemoryLayer {
let file_contents = inner.file.load_to_io_buf(ctx).await?;
let file_contents = file_contents.freeze();
for (key, vec_map) in index.iter() {
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, entry) in vec_map
.as_slice()

View File

@@ -14,7 +14,6 @@ pub mod span;
pub mod uninit;
mod walreceiver;
use hashlink::LruCache;
use std::array;
use std::cmp::{max, min};
use std::collections::btree_map::Entry;
@@ -24,6 +23,8 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
use crate::PERF_TRACE_TARGET;
use crate::walredo::RedoAttemptType;
use anyhow::{Context, Result, anyhow, bail, ensure};
use arc_swap::{ArcSwap, ArcSwapOption};
use bytes::Bytes;
@@ -60,7 +61,6 @@ use postgres_ffi::{WAL_SEGMENT_SIZE, to_pg_timestamp};
use rand::Rng;
use remote_storage::DownloadError;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::runtime::Handle;
use tokio::sync::mpsc::Sender;
use tokio::sync::{Notify, oneshot, watch};
@@ -92,12 +92,10 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
use super::tasks::log_compaction_error;
use super::upload_queue::NotInitialized;
use super::{
AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
debug_assert_current_span_has_tenant_and_timeline_id,
};
use crate::PERF_TRACE_TARGET;
use crate::aux_file::AuxFileSizeEstimator;
use crate::basebackup_cache::BasebackupPrepareRequest;
use crate::config::PageServerConf;
use crate::context::{
DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -131,7 +129,6 @@ use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::walingest::WalLagCooldown;
use crate::walredo::RedoAttemptType;
use crate::{ZERO_PAGE, task_mgr, walredo};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -197,7 +194,16 @@ pub struct TimelineResources {
pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
pub l0_compaction_trigger: Arc<Notify>,
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
pub basebackup_prepare_sender: BasebackupPrepareSender,
}
/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
/// ingestion considerably, because WAL ingestion needs to check on most records if the record
/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end
/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the
/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
pub(crate) struct RelSizeCache {
pub(crate) complete_as_of: Lsn,
pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
}
pub struct Timeline {
@@ -358,8 +364,7 @@ pub struct Timeline {
pub walreceiver: Mutex<Option<WalReceiver>>,
/// Relation size cache
pub(crate) rel_size_latest_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
pub(crate) rel_size_snapshot_cache: Mutex<LruCache<(Lsn, RelTag), BlockNumber>>,
pub(crate) rel_size_cache: RwLock<RelSizeCache>,
download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
@@ -441,9 +446,6 @@ pub struct Timeline {
pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
wait_lsn_log_slow: tokio::sync::Semaphore,
/// A channel to send async requests to prepare a basebackup for the basebackup cache.
basebackup_prepare_sender: BasebackupPrepareSender,
}
pub(crate) enum PreviousHeatmap {
@@ -534,24 +536,29 @@ impl GcInfo {
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
#[derive(Clone, Debug, Default)]
#[derive(Debug, Clone)]
pub(crate) struct GcCutoffs {
/// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.
pub(crate) space: Lsn,
/// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates
/// how much history we must keep to enable reading back at least the PITR interval duration.
///
/// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield
/// Some(last_record_lsn).
pub(crate) time: Option<Lsn>,
/// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much
/// history we must keep to enable reading back at least the PITR interval duration.
pub(crate) time: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
space: Lsn::INVALID,
time: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
// NB: if we haven't computed the PITR cutoff yet, we can't GC anything.
self.space.min(self.time.unwrap_or_default())
std::cmp::min(self.space, self.time)
}
}
@@ -1033,7 +1040,6 @@ pub(crate) enum WaitLsnWaiter<'a> {
Tenant,
PageService,
HttpEndpoint,
BaseBackupCache,
}
/// Argument to [`Timeline::shutdown`].
@@ -1089,14 +1095,11 @@ impl Timeline {
/// Get the bytes written since the PITR cutoff on this branch, and
/// whether this branch's ancestor_lsn is within its parent's PITR.
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
// TODO: for backwards compatibility, we return the full history back to 0 when the PITR
// cutoff has not yet been initialized. This should return None instead, but this is exposed
// in external HTTP APIs and callers may not handle a null value.
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.time.unwrap_or_default())
.unwrap_or_default()
.checked_sub(gc_info.cutoffs.time)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
}
@@ -1106,10 +1109,9 @@ impl Timeline {
self.applied_gc_cutoff_lsn.read()
}
/// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed
/// to read (based on configured PITR), even if physically we have more history. Returns None
/// if the PITR cutoff has not yet been initialized.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Option<Lsn> {
/// Read timeline's planned GC cutoff: this is the logical end of history that users
/// are allowed to read (based on configured PITR), even if physically we have more history.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
self.gc_info.read().unwrap().cutoffs.time
}
@@ -1560,8 +1562,7 @@ impl Timeline {
}
WaitLsnWaiter::Tenant
| WaitLsnWaiter::PageService
| WaitLsnWaiter::HttpEndpoint
| WaitLsnWaiter::BaseBackupCache => unreachable!(
| WaitLsnWaiter::HttpEndpoint => unreachable!(
"tenant or page_service context are not expected to have task kind {:?}",
ctx.task_kind()
),
@@ -2078,7 +2079,7 @@ impl Timeline {
pub(crate) fn activate(
self: &Arc<Self>,
parent: Arc<crate::tenant::TenantShard>,
broker_client: BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
@@ -2466,41 +2467,6 @@ impl Timeline {
false
}
}
pub(crate) fn is_basebackup_cache_enabled(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.basebackup_cache_enabled
.unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
}
/// Prepare basebackup for the given LSN and store it in the basebackup cache.
/// The method is asynchronous and returns immediately.
/// The actual basebackup preparation is performed in the background
/// by the basebackup cache on a best-effort basis.
pub(crate) fn prepare_basebackup(&self, lsn: Lsn) {
if !self.is_basebackup_cache_enabled() {
return;
}
if !self.tenant_shard_id.is_shard_zero() {
// In theory we should never get here, but just in case check it.
// Preparing basebackup doesn't make sense for shards other than shard zero.
return;
}
let res = self
.basebackup_prepare_sender
.send(BasebackupPrepareRequest {
tenant_shard_id: self.tenant_shard_id,
timeline_id: self.timeline_id,
lsn,
});
if let Err(e) = res {
// May happen during shutdown, it's not critical.
info!("Failed to send shutdown checkpoint: {e:#}");
}
}
}
/// Number of times we will compute partition within a checkpoint distance.
@@ -2578,13 +2544,6 @@ impl Timeline {
.unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
}
pub(crate) fn get_pitr_interval(&self) -> Duration {
let tenant_conf = &self.tenant_conf.load().tenant_conf;
tenant_conf
.pitr_interval
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
fn get_compaction_period(&self) -> Duration {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
@@ -2860,13 +2819,6 @@ impl Timeline {
self.remote_client.update_config(&new_conf.location);
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
if new_capacity != rel_size_cache.capacity() {
rel_size_cache.set_capacity(new_capacity);
}
}
self.metrics
.evictions_with_low_residence_duration
.write()
@@ -2925,14 +2877,6 @@ impl Timeline {
ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
}
let relsize_snapshot_cache_capacity = {
let loaded_tenant_conf = tenant_conf.load();
loaded_tenant_conf
.tenant_conf
.relsize_snapshot_cache_capacity
.unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
};
Arc::new_cyclic(|myself| {
let metrics = Arc::new(TimelineMetrics::new(
&tenant_shard_id,
@@ -3024,8 +2968,10 @@ impl Timeline {
last_image_layer_creation_check_instant: Mutex::new(None),
last_received_wal: Mutex::new(None),
rel_size_latest_cache: RwLock::new(HashMap::new()),
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
rel_size_cache: RwLock::new(RelSizeCache {
complete_as_of: disk_consistent_lsn,
map: HashMap::new(),
}),
download_all_remote_layers_task_info: RwLock::new(None),
@@ -3070,8 +3016,6 @@ impl Timeline {
rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
basebackup_prepare_sender: resources.basebackup_prepare_sender,
};
result.repartition_threshold =
@@ -3169,7 +3113,7 @@ impl Timeline {
fn launch_wal_receiver(
self: &Arc<Self>,
ctx: &RequestContext,
broker_client: BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
) {
info!(
"launching WAL receiver for timeline {} of tenant {}",
@@ -3585,7 +3529,7 @@ impl Timeline {
};
let io_concurrency = IoConcurrency::spawn_from_conf(
self_ref.conf.get_vectored_concurrent_io,
self_ref.conf,
self_ref
.gate
.enter()
@@ -5614,7 +5558,7 @@ impl Timeline {
});
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| CreateImageLayersError::Cancelled)?,
@@ -6285,12 +6229,14 @@ impl Timeline {
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
if cfg!(test) && pitr == Duration::ZERO {
if cfg!(test) {
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
return Ok(GcCutoffs {
time: Some(self.get_last_record_lsn()),
space: space_cutoff,
});
if pitr == Duration::ZERO {
return Ok(GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
});
}
}
// Calculate a time-based limit on how much to retain:
@@ -6304,14 +6250,14 @@ impl Timeline {
// PITR is not set. Retain the size-based limit, or the default time retention,
// whichever requires less data.
GcCutoffs {
time: Some(self.get_last_record_lsn()),
time: self.get_last_record_lsn(),
space: std::cmp::max(time_cutoff, space_cutoff),
}
}
(Duration::ZERO, None) => {
// PITR is not set, and time lookup failed
GcCutoffs {
time: Some(self.get_last_record_lsn()),
time: self.get_last_record_lsn(),
space: space_cutoff,
}
}
@@ -6319,7 +6265,7 @@ impl Timeline {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: Some(*self.get_applied_gc_cutoff_lsn()),
time: *self.get_applied_gc_cutoff_lsn(),
space: space_cutoff,
}
}
@@ -6327,7 +6273,7 @@ impl Timeline {
// PITR interval is set and we looked up timestamp successfully. Ignore
// size based retention and make time cutoff authoritative
GcCutoffs {
time: Some(time_cutoff),
time: time_cutoff,
space: time_cutoff,
}
}
@@ -6380,7 +6326,7 @@ impl Timeline {
)
};
let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
@@ -6429,7 +6375,7 @@ impl Timeline {
async fn gc_timeline(
&self,
space_cutoff: Lsn,
time_cutoff: Option<Lsn>, // None if uninitialized
time_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
max_lsn_with_valid_lease: Option<Lsn>,
new_gc_cutoff: Lsn,
@@ -6448,12 +6394,6 @@ impl Timeline {
return Ok(result);
}
let Some(time_cutoff) = time_cutoff else {
// The GC cutoff should have been computed by now, but let's be defensive.
info!("Nothing to GC: time_cutoff not yet computed");
return Ok(result);
};
// We need to ensure that no one tries to read page versions or create
// branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
// for details. This will block until the old value is no longer in use.

View File

@@ -1526,7 +1526,7 @@ impl Timeline {
info!(
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
checked {layers_checked}/{layers_total} layers \
(latest_gc_cutoff={} pitr_cutoff={:?})",
(latest_gc_cutoff={} pitr_cutoff={})",
layers_to_rewrite.len(),
drop_layers.len(),
*latest_gc_cutoff,

View File

@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
"removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
);
let io_concurrency = IoConcurrency::spawn_from_conf(
detached.conf.get_vectored_concurrent_io,
detached.conf,
detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
);
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);

View File

@@ -161,7 +161,7 @@ impl<'t> UninitializedTimeline<'t> {
tenant: Arc<TenantShard>,
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
broker_client: storage_broker::BrokerClientChannel,
broker_client: storage_broker::TimelineUpdatesSubscriber,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
self.write(|raw_timeline| async move {

View File

@@ -28,7 +28,6 @@ use std::num::NonZeroU64;
use std::sync::Arc;
use std::time::Duration;
use storage_broker::BrokerClientChannel;
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -70,7 +69,7 @@ impl WalReceiver {
pub fn start(
timeline: Arc<Timeline>,
conf: WalReceiverConf,
mut broker_client: BrokerClientChannel,
mut broker_client: storage_broker::TimelineUpdatesSubscriber,
ctx: &RequestContext,
) -> Self {
let tenant_shard_id = timeline.tenant_shard_id;

View File

@@ -17,19 +17,12 @@ use std::time::Duration;
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use futures::StreamExt;
use pageserver_api::models::TimelineState;
use postgres_connection::PgConnectionConfig;
use storage_broker::proto::{
FilterTenantTimelineId, MessageType, SafekeeperDiscoveryRequest, SafekeeperDiscoveryResponse,
SubscribeByFilterRequest, TenantTimelineId as ProtoTenantTimelineId, TypeSubscription,
TypedMessage,
};
use storage_broker::{BrokerClientChannel, Code, Streaming};
use storage_broker::proto::SafekeeperDiscoveryResponse;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::backoff::{
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
};
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
use utils::postgres_client::{
@@ -56,7 +49,7 @@ pub(crate) struct Cancelled;
///
/// Not cancellation-safe. Use `cancel` token to request cancellation.
pub(super) async fn connection_manager_loop_step(
broker_client: &mut BrokerClientChannel,
broker_client: &mut storage_broker::TimelineUpdatesSubscriber,
connection_manager_state: &mut ConnectionManagerState,
ctx: &RequestContext,
cancel: &CancellationToken,
@@ -81,11 +74,6 @@ pub(super) async fn connection_manager_loop_step(
WALRECEIVER_ACTIVE_MANAGERS.dec();
}
let id = TenantTimelineId {
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
timeline_id: connection_manager_state.timeline.timeline_id,
};
let mut timeline_state_updates = connection_manager_state
.timeline
.subscribe_for_state_updates();
@@ -101,7 +89,12 @@ pub(super) async fn connection_manager_loop_step(
// Subscribe to the broker updates. Stream shares underlying TCP connection
// with other streams on this client (other connection managers). When
// object goes out of scope, stream finishes in drop() automatically.
let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
let (timeline_updates, mut discovery_requester) = broker_client.subscribe(
connection_manager_state.timeline.tenant_shard_id,
connection_manager_state.timeline.timeline_id,
cancel,
);
let mut timeline_updates = Box::pin(timeline_updates);
debug!("Subscribed for broker timeline updates");
loop {
@@ -155,29 +148,10 @@ pub(super) async fn connection_manager_loop_step(
}
},
// Got a new update from the broker
broker_update = broker_subscription.message() /* TODO: review cancellation-safety */ => {
match broker_update {
Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
Err(status) => {
match status.code() {
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
// tonic's error handling doesn't provide a clear code for disconnections: we get
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
// => https://github.com/neondatabase/neon/issues/9562
info!("broker disconnected: {status}");
},
_ => {
warn!("broker subscription failed: {status}");
}
}
return Ok(());
}
Ok(None) => {
error!("broker subscription stream ended"); // can't happen
return Ok(());
}
}
// Got a new update from the broker.
// The stream ends with None if and only if `cancel` is cancelled.
Some(timeline_update) = timeline_updates.next() => {
connection_manager_state.register_timeline_update(timeline_update)
},
new_event = async {
@@ -258,32 +232,11 @@ pub(super) async fn connection_manager_loop_step(
tokio::time::sleep(next_discovery_ts - now).await;
}
let tenant_timeline_id = Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
});
let request = SafekeeperDiscoveryRequest { tenant_timeline_id };
let msg = TypedMessage {
r#type: MessageType::SafekeeperDiscoveryRequest as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: Some(request),
safekeeper_discovery_response: None,
};
info!("No active connection and no candidates, sending discovery request to the broker");
discovery_requester.request().await;
last_discovery_ts = Some(std::time::Instant::now());
info!("No active connection and no candidates, sending discovery request to the broker");
// Cancellation safety: we want to send a message to the broker, but publish_one()
// function can get cancelled by the other select! arm. This is absolutely fine, because
// we just want to receive broker updates and discovery is not important if we already
// receive updates.
//
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
// This is totally fine because of the reason above.
// This is a fire-and-forget request, we don't care about the response
let _ = broker_client.publish_one(msg).await;
debug!("Discovery request sent to the broker");
None
} => {}
}
@@ -298,63 +251,6 @@ pub(super) async fn connection_manager_loop_step(
}
}
/// Endlessly try to subscribe for broker updates for a given timeline.
async fn subscribe_for_timeline_updates(
broker_client: &mut BrokerClientChannel,
id: TenantTimelineId,
cancel: &CancellationToken,
) -> Result<Streaming<TypedMessage>, Cancelled> {
let mut attempt = 0;
loop {
exponential_backoff(
attempt,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
cancel,
)
.await;
attempt += 1;
// subscribe to the specific timeline
let request = SubscribeByFilterRequest {
types: vec![
TypeSubscription {
r#type: MessageType::SafekeeperTimelineInfo as i32,
},
TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
},
],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: true,
tenant_timeline_id: Some(ProtoTenantTimelineId {
tenant_id: id.tenant_id.as_ref().to_owned(),
timeline_id: id.timeline_id.as_ref().to_owned(),
}),
}),
};
match {
tokio::select! {
r = broker_client.subscribe_by_filter(request) => { r }
_ = cancel.cancelled() => { return Err(Cancelled); }
}
} {
Ok(resp) => {
return Ok(resp.into_inner());
}
Err(e) => {
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
info!(
"Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"
);
continue;
}
}
}
}
const WALCONNECTION_RETRY_MIN_BACKOFF_SECONDS: f64 = 0.1;
const WALCONNECTION_RETRY_MAX_BACKOFF_SECONDS: f64 = 15.0;
const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
@@ -695,44 +591,14 @@ impl ConnectionManagerState {
}
/// Adds another broker timeline into the state, if its more recent than the one already added there for the same key.
fn register_timeline_update(&mut self, typed_msg: TypedMessage) {
let mut is_discovery = false;
let timeline_update = match typed_msg.r#type() {
MessageType::SafekeeperTimelineInfo => {
let info = match typed_msg.safekeeper_timeline_info {
Some(info) => info,
None => {
warn!("bad proto message from broker: no safekeeper_timeline_info");
return;
}
};
SafekeeperDiscoveryResponse {
safekeeper_id: info.safekeeper_id,
tenant_timeline_id: info.tenant_timeline_id,
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
standby_horizon: info.standby_horizon,
}
}
MessageType::SafekeeperDiscoveryResponse => {
is_discovery = true;
match typed_msg.safekeeper_discovery_response {
Some(response) => response,
None => {
warn!("bad proto message from broker: no safekeeper_discovery_response");
return;
}
}
}
_ => {
// unexpected message
return;
}
};
fn register_timeline_update(&mut self, timeline_update: storage_broker::TimelineShardUpdate) {
WALRECEIVER_BROKER_UPDATES.inc();
let storage_broker::TimelineShardUpdate {
is_discovery,
inner: timeline_update,
} = timeline_update;
trace!(
"safekeeper info update: standby_horizon(cutoff)={}",
timeline_update.standby_horizon
@@ -1013,7 +879,7 @@ impl ConnectionManagerState {
shard_stripe_size,
listen_pg_addr_str: info.safekeeper_connstr.as_ref(),
auth_token: self.conf.auth_token.as_ref().map(|t| t.as_str()),
availability_zone: self.conf.availability_zone.as_deref()
availability_zone: self.conf.availability_zone.as_deref(),
};
match wal_stream_connection_config(connection_conf_args) {

View File

@@ -1316,10 +1316,6 @@ impl WalIngest {
}
});
if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
modification.tline.prepare_basebackup(lsn);
}
Ok(())
}
@@ -1688,31 +1684,31 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await?,
false
);
assert!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await
.is_err()
);
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.await?,
3
);
@@ -1723,7 +1719,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x20)),
Version::Lsn(Lsn(0x20)),
&ctx,
io_concurrency.clone()
)
@@ -1737,7 +1733,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x30)),
Version::Lsn(Lsn(0x30)),
&ctx,
io_concurrency.clone()
)
@@ -1751,7 +1747,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x40)),
Version::Lsn(Lsn(0x40)),
&ctx,
io_concurrency.clone()
)
@@ -1764,7 +1760,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x40)),
Version::Lsn(Lsn(0x40)),
&ctx,
io_concurrency.clone()
)
@@ -1778,7 +1774,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1791,7 +1787,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1804,7 +1800,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
2,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1824,7 +1820,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.await?,
2
);
@@ -1833,7 +1829,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x60)),
Version::Lsn(Lsn(0x60)),
&ctx,
io_concurrency.clone()
)
@@ -1846,7 +1842,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x60)),
Version::Lsn(Lsn(0x60)),
&ctx,
io_concurrency.clone()
)
@@ -1858,7 +1854,7 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.await?,
3
);
@@ -1867,7 +1863,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
2,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1884,7 +1880,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
.await?,
0
);
@@ -1897,7 +1893,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
.await?,
2
);
@@ -1906,7 +1902,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x70)),
Version::Lsn(Lsn(0x70)),
&ctx,
io_concurrency.clone()
)
@@ -1919,7 +1915,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x70)),
Version::Lsn(Lsn(0x70)),
&ctx,
io_concurrency.clone()
)
@@ -1936,7 +1932,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.await?,
1501
);
@@ -1946,7 +1942,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blk,
Version::at(Lsn(0x80)),
Version::Lsn(Lsn(0x80)),
&ctx,
io_concurrency.clone()
)
@@ -1960,7 +1956,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1500,
Version::at(Lsn(0x80)),
Version::Lsn(Lsn(0x80)),
&ctx,
io_concurrency.clone()
)
@@ -1994,13 +1990,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
1
);
@@ -2015,7 +2011,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
.await?,
false
);
@@ -2033,13 +2029,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.await?,
1
);
@@ -2081,26 +2077,26 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await?,
false
);
assert!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await
.is_err()
);
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
relsize
);
@@ -2114,7 +2110,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(lsn),
Version::Lsn(lsn),
&ctx,
io_concurrency.clone()
)
@@ -2135,7 +2131,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.await?,
1
);
@@ -2148,7 +2144,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(Lsn(0x60)),
Version::Lsn(Lsn(0x60)),
&ctx,
io_concurrency.clone()
)
@@ -2161,7 +2157,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.await?,
relsize
);
@@ -2173,7 +2169,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -2197,13 +2193,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.await?,
relsize
);
@@ -2216,7 +2212,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(Lsn(0x80)),
Version::Lsn(Lsn(0x80)),
&ctx,
io_concurrency.clone()
)
@@ -2254,7 +2250,7 @@ mod tests {
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
RELSEG_SIZE + 1
);
@@ -2268,7 +2264,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
RELSEG_SIZE
);
@@ -2283,7 +2279,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
RELSEG_SIZE - 1
);
@@ -2301,7 +2297,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
size as BlockNumber
);

17
poetry.lock generated
View File

@@ -3170,24 +3170,19 @@ pbr = "*"
[[package]]
name = "setuptools"
version = "78.1.1"
version = "70.0.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"},
{file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"},
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
]
[package.extras]
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
cover = ["pytest-cov"]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
enabler = ["pytest-enabler (>=2.2)"]
test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"

View File

@@ -127,4 +127,3 @@ rstest.workspace = true
walkdir.workspace = true
rand_distr = "0.4"
tokio-postgres.workspace = true
tracing-test = "0.2"

View File

@@ -80,22 +80,10 @@ impl std::fmt::Display for Backend<'_, ()> {
.field(&endpoint.url())
.finish(),
#[cfg(any(test, feature = "testing"))]
ControlPlaneClient::PostgresMock(endpoint) => {
let url = endpoint.url();
match url::Url::parse(url) {
Ok(mut url) => {
let _ = url.set_password(Some("_redacted_"));
let url = url.as_str();
fmt.debug_tuple("ControlPlane::PostgresMock")
.field(&url)
.finish()
}
Err(_) => fmt
.debug_tuple("ControlPlane::PostgresMock")
.field(&url)
.finish(),
}
}
ControlPlaneClient::PostgresMock(endpoint) => fmt
.debug_tuple("ControlPlane::PostgresMock")
.field(&endpoint.url())
.finish(),
#[cfg(test)]
ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
},

View File

@@ -1,13 +1,9 @@
#[cfg(any(test, feature = "testing"))]
use std::env;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::pin::pin;
use std::sync::Arc;
use std::time::Duration;
#[cfg(any(test, feature = "testing"))]
use anyhow::Context;
use anyhow::{bail, ensure};
use arc_swap::ArcSwapOption;
use futures::future::Either;
@@ -39,8 +35,6 @@ use crate::scram::threadpool::ThreadPool;
use crate::serverless::GlobalConnPoolOptions;
use crate::serverless::cancel_set::CancelSet;
use crate::tls::client_config::compute_client_config_with_root_certs;
#[cfg(any(test, feature = "testing"))]
use crate::url::ApiUrl;
use crate::{auth, control_plane, http, serverless, usage_metrics};
project_git_version!(GIT_VERSION);
@@ -167,11 +161,8 @@ struct ProxyCliArgs {
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
redis_rps_limit: Vec<RateBucketInfo>,
/// Cancellation channel size (max queue size for redis kv client)
#[clap(long, default_value_t = 1024)]
#[clap(long, default_value = "1024")]
cancellation_ch_size: usize,
/// Cancellation ops batch size for redis
#[clap(long, default_value_t = 8)]
cancellation_batch_size: usize,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
allowed_ips_cache: String,
@@ -551,12 +542,7 @@ pub async fn run() -> anyhow::Result<()> {
if let Some(mut redis_kv_client) = redis_kv_client {
maintenance_tasks.spawn(async move {
redis_kv_client.try_connect().await?;
handle_cancel_messages(
&mut redis_kv_client,
rx_cancel,
args.cancellation_batch_size,
)
.await?;
handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
drop(redis_kv_client);
@@ -783,13 +769,7 @@ fn build_auth_backend(
#[cfg(any(test, feature = "testing"))]
AuthBackendType::Postgres => {
let mut url: ApiUrl = args.auth_endpoint.parse()?;
if url.password().is_none() {
let password = env::var("PGPASSWORD")
.with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
url.set_password(Some(&password))
.expect("Failed to set password");
}
let url = args.auth_endpoint.parse()?;
let api = control_plane::client::mock::MockControlPlane::new(
url,
!args.is_private_access_proxy,

View File

@@ -30,6 +30,8 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
type IpSubnetKey = IpNet;
const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
const BATCH_SIZE: usize = 8;
// Message types for sending through mpsc channel
pub enum CancelKeyOp {
@@ -229,13 +231,12 @@ impl CancelReplyOp {
pub async fn handle_cancel_messages(
client: &mut RedisKVClient,
mut rx: mpsc::Receiver<CancelKeyOp>,
batch_size: usize,
) -> anyhow::Result<()> {
let mut batch = Vec::with_capacity(batch_size);
let mut pipeline = Pipeline::with_capacity(batch_size);
let mut batch = Vec::with_capacity(BATCH_SIZE);
let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
loop {
if rx.recv_many(&mut batch, batch_size).await == 0 {
if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
warn!("shutting down cancellation queue");
break Ok(());
}
@@ -366,7 +367,8 @@ impl CancellationHandler {
return Err(CancelError::InternalError);
};
tx.try_send(op)
tx.send_timeout(op, REDIS_SEND_TIMEOUT)
.await
.map_err(|e| {
tracing::warn!("failed to send GetCancelData for {key}: {e}");
})
@@ -568,7 +570,7 @@ impl Session {
}
// Send the store key op to the cancellation handler and set TTL for the key
pub(crate) fn write_cancel_key(
pub(crate) async fn write_cancel_key(
&self,
cancel_closure: CancelClosure,
) -> Result<(), CancelError> {
@@ -594,14 +596,14 @@ impl Session {
expire: CANCEL_KEY_TTL,
};
let _ = tx.try_send(op).map_err(|e| {
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
let key = self.key;
tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
});
Ok(())
}
pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
let Some(tx) = &self.cancellation_handler.tx else {
tracing::warn!("cancellation handler is not available");
return Err(CancelError::InternalError);
@@ -617,7 +619,7 @@ impl Session {
.guard(RedisMsgKind::HDel),
};
let _ = tx.try_send(op).map_err(|e| {
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
let key = self.key;
tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
});

View File

@@ -244,7 +244,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session = cancellation_handler_clone.get_key();
session.write_cancel_key(node.cancel_closure.clone())?;
session
.write_cancel_key(node.cancel_closure.clone())
.await?;
prepare_client_connection(&node, *session.key(), &mut stream).await?;

View File

@@ -1,11 +1,13 @@
use std::cell::RefCell;
use std::cell::{Cell, RefCell};
use std::collections::HashMap;
use std::sync::Arc;
use std::hash::BuildHasher;
use std::sync::atomic::{AtomicU32, Ordering};
use std::{env, io};
use std::{array, env, fmt, io};
use chrono::{DateTime, Utc};
use indexmap::IndexSet;
use opentelemetry::trace::TraceContextExt;
use scopeguard::defer;
use serde::ser::{SerializeMap, Serializer};
use tracing::subscriber::Interest;
use tracing::{Event, Metadata, Span, Subscriber, callsite, span};
@@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;
use tracing_subscriber::registry::{LookupSpan, SpanRef};
use try_lock::TryLock;
/// Initialize logging and OpenTelemetry tracing and exporter.
///
@@ -52,7 +55,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
StderrWriter {
stderr: std::io::stderr(),
},
&["request_id", "session_id", "conn_id"],
["request_id", "session_id", "conn_id"],
))
} else {
None
@@ -180,65 +183,50 @@ impl Clock for RealClock {
/// Name of the field used by tracing crate to store the event message.
const MESSAGE_FIELD: &str = "message";
/// Tracing used to enforce that spans/events have no more than 32 fields.
/// It seems this is no longer the case, but it's still documented in some places.
/// Generally, we shouldn't expect more than 32 fields anyway, so we can try and
/// rely on it for some (minor) performance gains.
const MAX_TRACING_FIELDS: usize = 32;
thread_local! {
/// Protects against deadlocks and double panics during log writing.
/// The current panic handler will use tracing to log panic information.
static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
/// Thread-local instance with per-thread buffer for log writing.
static EVENT_FORMATTER: RefCell<EventFormatter> = const { RefCell::new(EventFormatter::new()) };
static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
/// Cached OS thread ID.
static THREAD_ID: u64 = gettid::gettid();
}
/// Map for values fixed at callsite registration.
// We use papaya here because registration rarely happens post-startup.
// papaya is good for read-heavy workloads.
//
// We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy,
// since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy.
type CallsiteMap<T> =
papaya::HashMap<callsite::Identifier, T, std::hash::BuildHasherDefault<rustc_hash::FxHasher>>;
/// Implements tracing layer to handle events specific to logging.
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
struct JsonLoggingLayer<C: Clock, W: MakeWriter, const F: usize> {
clock: C,
skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
callsite_ids: papaya::HashMap<callsite::Identifier, CallsiteId>,
writer: W,
/// tracks which fields of each **event** are duplicates
skipped_field_indices: CallsiteMap<SkippedFieldIndices>,
span_info: CallsiteMap<CallsiteSpanInfo>,
/// Fields we want to keep track of in a separate json object.
extract_fields: &'static [&'static str],
// We use a const generic and arrays to bypass one heap allocation.
extract_fields: IndexSet<&'static str>,
_marker: std::marker::PhantomData<[&'static str; F]>,
}
impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self {
impl<C: Clock, W: MakeWriter, const F: usize> JsonLoggingLayer<C, W, F> {
fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self {
JsonLoggingLayer {
clock,
skipped_field_indices: CallsiteMap::default(),
span_info: CallsiteMap::default(),
skipped_field_indices: papaya::HashMap::default(),
callsite_ids: papaya::HashMap::default(),
writer,
extract_fields,
extract_fields: IndexSet::from_iter(extract_fields),
_marker: std::marker::PhantomData,
}
}
#[inline]
fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo {
self.span_info
fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId {
*self
.callsite_ids
.pin()
.get_or_insert_with(metadata.callsite(), || {
CallsiteSpanInfo::new(metadata, self.extract_fields)
})
.clone()
.get_or_insert_with(cs, CallsiteId::next)
}
}
impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
impl<S, C: Clock + 'static, W: MakeWriter + 'static, const F: usize> Layer<S>
for JsonLoggingLayer<C, W, F>
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
@@ -249,25 +237,35 @@ where
// early, before OTel machinery, and add as event extension.
let now = self.clock.now();
let res: io::Result<()> = EVENT_FORMATTER.with(|f| {
let mut borrow = f.try_borrow_mut();
let formatter = match borrow.as_deref_mut() {
Ok(formatter) => formatter,
// If the thread local formatter is borrowed,
// then we likely hit an edge case were we panicked during formatting.
// We allow the logging to proceed with an uncached formatter.
Err(_) => &mut EventFormatter::new(),
};
let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
if entered.get() {
let mut formatter = EventFormatter::new();
formatter.format::<S, F>(
now,
event,
&ctx,
&self.skipped_field_indices,
&self.callsite_ids,
&self.extract_fields,
)?;
self.writer.make_writer().write_all(formatter.buffer())
} else {
entered.set(true);
defer!(entered.set(false););
formatter.reset();
formatter.format(
now,
event,
&ctx,
&self.skipped_field_indices,
self.extract_fields,
)?;
self.writer.make_writer().write_all(formatter.buffer())
EVENT_FORMATTER.with_borrow_mut(move |formatter| {
formatter.reset();
formatter.format::<S, F>(
now,
event,
&ctx,
&self.skipped_field_indices,
&self.callsite_ids,
&self.extract_fields,
)?;
self.writer.make_writer().write_all(formatter.buffer())
})
}
});
// In case logging fails we generate a simpler JSON object.
@@ -289,48 +287,50 @@ where
/// Registers a SpanFields instance as span extension.
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
let fields = SpanFields::default();
fields.record_fields(attrs);
let mut fields = SpanFields::new(self.span_info(span.metadata()));
attrs.record(&mut fields);
// This could deadlock when there's a panic somewhere in the tracing
// event handling and a read or write guard is still held. This includes
// the OTel subscriber.
let mut exts = span.extensions_mut();
// This is a new span: the extensions should not be locked
// unless some layer spawned a thread to process this span.
// I don't think any layers do that.
span.extensions_mut().insert(fields);
exts.insert(fields);
}
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
// assumption: `on_record` is rarely called.
// assumption: a span being updated by one thread,
// and formatted by another thread is even rarer.
let mut ext = span.extensions_mut();
if let Some(fields) = ext.get_mut::<SpanFields>() {
values.record(fields);
let ext = span.extensions();
if let Some(data) = ext.get::<SpanFields>() {
data.record_fields(values);
}
}
/// Called (lazily) roughly once per event/span instance. We quickly check
/// for duplicate field names and record duplicates as skippable. Last field wins.
/// Called (lazily) whenever a new log call is executed. We quickly check
/// for duplicate field names and record duplicates as skippable. Last one
/// wins.
fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
debug_assert!(
metadata.fields().len() <= MAX_TRACING_FIELDS,
"callsite {metadata:?} has too many fields."
);
if !metadata.is_event() {
// register the span info.
self.span_info(metadata);
self.callsite_id(metadata.callsite());
// Must not be never because we wouldn't get trace and span data.
return Interest::always();
}
let mut field_indices = SkippedFieldIndices::default();
let mut seen_fields = HashMap::new();
let mut seen_fields = HashMap::<&'static str, usize>::new();
for field in metadata.fields() {
if let Some(old_index) = seen_fields.insert(field.name(), field.index()) {
field_indices.set(old_index);
use std::collections::hash_map::Entry;
match seen_fields.entry(field.name()) {
Entry::Vacant(entry) => {
// field not seen yet
entry.insert(field.index());
}
Entry::Occupied(mut entry) => {
// replace currently stored index
let old_index = entry.insert(field.index());
// ... and append it to list of skippable indices
field_indices.push(old_index);
}
}
}
@@ -344,113 +344,110 @@ where
}
}
/// Any span info that is fixed to a particular callsite. Not variable between span instances.
#[derive(Clone)]
struct CallsiteSpanInfo {
/// index of each field to extract. usize::MAX if not found.
extract: Arc<[usize]>,
#[derive(Copy, Clone, Debug, Default)]
#[repr(transparent)]
struct CallsiteId(u32);
/// tracks the fixed "callsite ID" for each span.
/// note: this is not stable between runs.
normalized_name: Arc<str>,
}
impl CallsiteSpanInfo {
fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self {
impl CallsiteId {
#[inline]
fn next() -> Self {
// Start at 1 to reserve 0 for default.
static COUNTER: AtomicU32 = AtomicU32::new(1);
CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed))
}
}
let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();
// get all the indices of span fields we want to focus
let extract = extract_fields
.iter()
// use rposition, since we want last match wins.
.map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX))
.collect();
// normalized_name is unique for each callsite, but it is not
// unified across separate proxy instances.
// todo: can we do better here?
let cid = COUNTER.fetch_add(1, Ordering::Relaxed);
let normalized_name = format!("{}#{cid}", metadata.name()).into();
Self {
extract,
normalized_name,
}
impl fmt::Display for CallsiteId {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
/// Stores span field values recorded during the spans lifetime.
#[derive(Default)]
struct SpanFields {
values: [serde_json::Value; MAX_TRACING_FIELDS],
/// cached span info so we can avoid extra hashmap lookups in the hot path.
span_info: CallsiteSpanInfo,
// TODO: Switch to custom enum with lasso::Spur for Strings?
fields: papaya::HashMap<&'static str, serde_json::Value>,
}
impl SpanFields {
fn new(span_info: CallsiteSpanInfo) -> Self {
Self {
span_info,
values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS],
}
#[inline]
fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
fields.record(&mut SpanFieldsRecorder {
fields: self.fields.pin(),
});
}
}
impl tracing::field::Visit for SpanFields {
/// Implements a tracing field visitor to convert and store values.
struct SpanFieldsRecorder<'m, S, G> {
fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
}
impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if let Ok(value) = i64::try_from(value) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if let Ok(value) = u64::try_from(value) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
self.values[field.index()] = serde_json::Value::from(format!("{value:?}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value:?}")));
}
#[inline]
@@ -459,33 +456,38 @@ impl tracing::field::Visit for SpanFields {
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
/// List of field indices skipped during logging. Can list duplicate fields or
/// metafields not meant to be logged.
#[derive(Copy, Clone, Default)]
#[derive(Clone, Default)]
struct SkippedFieldIndices {
// 32-bits is large enough for `MAX_TRACING_FIELDS`
bits: u32,
bits: u64,
}
impl SkippedFieldIndices {
#[inline]
fn is_empty(self) -> bool {
fn is_empty(&self) -> bool {
self.bits == 0
}
#[inline]
fn set(&mut self, index: usize) {
debug_assert!(index <= 32, "index out of bounds of 32-bit set");
self.bits |= 1 << index;
fn push(&mut self, index: usize) {
self.bits |= 1u64
.checked_shl(index as u32)
.expect("field index too large");
}
#[inline]
fn contains(self, index: usize) -> bool {
self.bits & (1 << index) != 0
fn contains(&self, index: usize) -> bool {
self.bits
& 1u64
.checked_shl(index as u32)
.expect("field index too large")
!= 0
}
}
@@ -497,7 +499,7 @@ struct EventFormatter {
impl EventFormatter {
#[inline]
const fn new() -> Self {
fn new() -> Self {
EventFormatter {
logline_buffer: Vec::new(),
}
@@ -513,13 +515,14 @@ impl EventFormatter {
self.logline_buffer.clear();
}
fn format<S>(
fn format<S, const F: usize>(
&mut self,
now: DateTime<Utc>,
event: &Event<'_>,
ctx: &Context<'_, S>,
skipped_field_indices: &CallsiteMap<SkippedFieldIndices>,
extract_fields: &'static [&'static str],
skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
callsite_ids: &papaya::HashMap<callsite::Identifier, CallsiteId>,
extract_fields: &IndexSet<&'static str>,
) -> io::Result<()>
where
S: Subscriber + for<'a> LookupSpan<'a>,
@@ -530,11 +533,8 @@ impl EventFormatter {
let normalized_meta = event.normalized_metadata();
let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());
let skipped_field_indices = skipped_field_indices
.pin()
.get(&meta.callsite())
.copied()
.unwrap_or_default();
let skipped_field_indices = skipped_field_indices.pin();
let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
let mut serialize = || {
let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
@@ -565,11 +565,9 @@ impl EventFormatter {
}
let spans = SerializableSpans {
// collect all spans from parent to root.
spans: ctx
.event_span(event)
.map_or(vec![], |parent| parent.scope().collect()),
extracted: ExtractedSpanFields::new(extract_fields),
ctx,
callsite_ids,
extract: ExtractedSpanFields::<'_, F>::new(extract_fields),
};
serializer.serialize_entry("spans", &spans)?;
@@ -622,9 +620,9 @@ impl EventFormatter {
}
}
if spans.extracted.has_values() {
if spans.extract.has_values() {
// TODO: add fields from event, too?
serializer.serialize_entry("extract", &spans.extracted)?;
serializer.serialize_entry("extract", &spans.extract)?;
}
serializer.end()
@@ -637,15 +635,15 @@ impl EventFormatter {
}
/// Extracts the message field that's mixed will other fields.
struct MessageFieldExtractor<S: serde::ser::SerializeMap> {
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: SkippedFieldIndices,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Option<Result<(), S::Error>>,
}
impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
@@ -667,11 +665,13 @@ impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
fn accept_field(&self, field: &tracing::field::Field) -> bool {
self.state.is_none()
&& field.name() == MESSAGE_FIELD
&& !self.skipped_field_indices.contains(field.index())
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<S> {
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
@@ -751,14 +751,14 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtracto
/// can be skipped.
// This is entirely optional and only cosmetic, though maybe helps a
// bit during log parsing in dashboards when there's no field with empty object.
struct FieldsPresent(pub bool, SkippedFieldIndices);
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
// Even though some methods have an overhead (error, bytes) it is assumed the
// compiler won't include this since we ignore the value entirely.
impl tracing::field::Visit for FieldsPresent {
impl tracing::field::Visit for FieldsPresent<'_> {
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
if !self.1.contains(field.index())
if !self.1.is_some_and(|i| i.contains(field.index()))
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
{
@@ -768,7 +768,10 @@ impl tracing::field::Visit for FieldsPresent {
}
/// Serializes the fields directly supplied with a log event.
struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices);
struct SerializableEventFields<'a, 'event>(
&'a tracing::Event<'event>,
Option<&'a SkippedFieldIndices>,
);
impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -785,15 +788,15 @@ impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
}
/// A tracing field visitor that skips the message field.
struct MessageFieldSkipper<S: serde::ser::SerializeMap> {
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: SkippedFieldIndices,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Result<(), S::Error>,
}
impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
@@ -806,7 +809,9 @@ impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
self.state.is_ok()
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
&& !self.skipped_field_indices.contains(field.index())
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
#[inline]
@@ -816,7 +821,7 @@ impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<S> {
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
@@ -900,17 +905,18 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
/// with the span names as keys. To prevent collision we append a numberic value
/// to the name. Also, collects any span fields we're interested in. Last one
/// wins.
struct SerializableSpans<'ctx, S>
struct SerializableSpans<'a, 'ctx, Span, const F: usize>
where
S: for<'lookup> LookupSpan<'lookup>,
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
spans: Vec<SpanRef<'ctx, S>>,
extracted: ExtractedSpanFields,
ctx: &'a Context<'ctx, Span>,
callsite_ids: &'a papaya::HashMap<callsite::Identifier, CallsiteId>,
extract: ExtractedSpanFields<'a, F>,
}
impl<S> serde::ser::Serialize for SerializableSpans<'_, S>
impl<Span, const F: usize> serde::ser::Serialize for SerializableSpans<'_, '_, Span, F>
where
S: for<'lookup> LookupSpan<'lookup>,
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
where
@@ -918,22 +924,25 @@ where
{
let mut serializer = serializer.serialize_map(None)?;
for span in self.spans.iter().rev() {
let ext = span.extensions();
if let Some(leaf_span) = self.ctx.lookup_current() {
for span in leaf_span.scope().from_root() {
// Append a numeric callsite ID to the span name to keep the name unique
// in the JSON object.
let cid = self
.callsite_ids
.pin()
.get(&span.metadata().callsite())
.copied()
.unwrap_or_default();
// all spans should have this extension.
let Some(fields) = ext.get() else { continue };
// Loki turns the # into an underscore during field name concatenation.
serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?;
self.extracted.layer_span(fields);
let SpanFields { values, span_info } = fields;
serializer.serialize_entry(
&*span_info.normalized_name,
&SerializableSpanFields {
fields: span.metadata().fields(),
values,
},
)?;
serializer.serialize_value(&SerializableSpanFields {
span: &span,
extract: &self.extract,
})?;
}
}
serializer.end()
@@ -941,77 +950,80 @@ where
}
/// Serializes the span fields as object.
struct SerializableSpanFields<'span> {
fields: &'span tracing::field::FieldSet,
values: &'span [serde_json::Value; MAX_TRACING_FIELDS],
struct SerializableSpanFields<'a, 'span, Span, const F: usize>
where
Span: for<'lookup> LookupSpan<'lookup>,
{
span: &'a SpanRef<'span, Span>,
extract: &'a ExtractedSpanFields<'a, F>,
}
impl serde::ser::Serialize for SerializableSpanFields<'_> {
impl<Span, const F: usize> serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F>
where
Span: for<'lookup> LookupSpan<'lookup>,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
for (field, value) in std::iter::zip(self.fields, self.values) {
if value.is_null() {
continue;
let ext = self.span.extensions();
if let Some(data) = ext.get::<SpanFields>() {
for (name, value) in &data.fields.pin() {
serializer.serialize_entry(name, value)?;
// TODO: replace clone with reference, if possible.
self.extract.set(name, value.clone());
}
serializer.serialize_entry(field.name(), value)?;
}
serializer.end()
}
}
struct ExtractedSpanFields {
names: &'static [&'static str],
values: RefCell<Vec<serde_json::Value>>,
struct ExtractedSpanFields<'a, const F: usize> {
names: &'a IndexSet<&'static str>,
// TODO: replace TryLock with something local thread and interior mutability.
// serde API doesn't let us use `mut`.
values: TryLock<([Option<serde_json::Value>; F], bool)>,
}
impl ExtractedSpanFields {
fn new(names: &'static [&'static str]) -> Self {
impl<'a, const F: usize> ExtractedSpanFields<'a, F> {
fn new(names: &'a IndexSet<&'static str>) -> Self {
ExtractedSpanFields {
names,
values: RefCell::new(vec![serde_json::Value::Null; names.len()]),
values: TryLock::new((array::from_fn(|_| Option::default()), false)),
}
}
fn layer_span(&self, fields: &SpanFields) {
let mut v = self.values.borrow_mut();
let SpanFields { values, span_info } = fields;
// extract the fields
for (i, &j) in span_info.extract.iter().enumerate() {
let Some(value) = values.get(j) else { continue };
if !value.is_null() {
// TODO: replace clone with reference, if possible.
v[i] = value.clone();
}
#[inline]
fn set(&self, name: &'static str, value: serde_json::Value) {
if let Some((index, _)) = self.names.get_full(name) {
let mut fields = self.values.try_lock().expect("thread-local use");
fields.0[index] = Some(value);
fields.1 = true;
}
}
#[inline]
fn has_values(&self) -> bool {
self.values.borrow().iter().any(|v| !v.is_null())
self.values.try_lock().expect("thread-local use").1
}
}
impl serde::ser::Serialize for ExtractedSpanFields {
impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
let values = self.values.borrow();
for (key, value) in std::iter::zip(self.names, &*values) {
if value.is_null() {
continue;
let values = self.values.try_lock().expect("thread-local use");
for (i, value) in values.0.iter().enumerate() {
if let Some(value) = value {
let key = self.names[i];
serializer.serialize_entry(key, value)?;
}
serializer.serialize_entry(key, value)?;
}
serializer.end()
@@ -1020,6 +1032,7 @@ impl serde::ser::Serialize for ExtractedSpanFields {
#[cfg(test)]
mod tests {
use std::marker::PhantomData;
use std::sync::{Arc, Mutex, MutexGuard};
use assert_json_diff::assert_json_eq;
@@ -1068,9 +1081,10 @@ mod tests {
let log_layer = JsonLoggingLayer {
clock: clock.clone(),
skipped_field_indices: papaya::HashMap::default(),
span_info: papaya::HashMap::default(),
callsite_ids: papaya::HashMap::default(),
writer: buffer.clone(),
extract_fields: &["x"],
extract_fields: IndexSet::from_iter(["x"]),
_marker: PhantomData::<[&'static str; 1]>,
};
let registry = tracing_subscriber::Registry::default().with(log_layer);

View File

@@ -383,7 +383,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session = cancellation_handler_clone.get_key();
session.write_cancel_key(node.cancel_closure.clone())?;
session
.write_cancel_key(node.cancel_closure.clone())
.await?;
prepare_client_connection(&node, *session.key(), &mut stream).await?;

View File

@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
}
drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
res
}

View File

@@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
use postgres_client::error::SqlState;
// Here are errors that happens after the user successfully authenticated to the database.
// TODO: there are pgbouncer errors that should be retried, but they are not listed here.
let non_retriable_pg_errors = matches!(
!matches!(
self.code(),
&SqlState::TOO_MANY_CONNECTIONS
| &SqlState::OUT_OF_MEMORY
@@ -56,20 +56,8 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
| &SqlState::T_R_SERIALIZATION_FAILURE
| &SqlState::INVALID_CATALOG_NAME
| &SqlState::INVALID_SCHEMA_NAME
| &SqlState::INVALID_PARAMETER_VALUE,
);
if non_retriable_pg_errors {
return false;
}
// PGBouncer errors that should not trigger a wake_compute retry.
if self.code() == &SqlState::PROTOCOL_VIOLATION {
// Source for the error message:
// https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
return !self
.message()
.contains("no more connections allowed (max_client_conn)");
}
true
| &SqlState::INVALID_PARAMETER_VALUE
)
}
}
@@ -122,55 +110,3 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati
.base_delay
.mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
}
#[cfg(test)]
mod tests {
use super::ShouldRetryWakeCompute;
use postgres_client::error::{DbError, SqlState};
#[test]
fn should_retry_wake_compute_for_db_error() {
// These SQLStates should NOT trigger a wake_compute retry.
let non_retry_states = [
SqlState::TOO_MANY_CONNECTIONS,
SqlState::OUT_OF_MEMORY,
SqlState::SYNTAX_ERROR,
SqlState::T_R_SERIALIZATION_FAILURE,
SqlState::INVALID_CATALOG_NAME,
SqlState::INVALID_SCHEMA_NAME,
SqlState::INVALID_PARAMETER_VALUE,
];
for state in non_retry_states {
let err = DbError::new_test_error(state.clone(), "oops".to_string());
assert!(
!err.should_retry_wake_compute(),
"State {state:?} unexpectedly retried"
);
}
// Errors coming from pgbouncer should not trigger a wake_compute retry
let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
for error in non_retry_pgbouncer_errors {
let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
assert!(
!err.should_retry_wake_compute(),
"PGBouncer error {error:?} unexpectedly retried"
);
}
// These SQLStates should trigger a wake_compute retry.
let retry_states = [
SqlState::CONNECTION_FAILURE,
SqlState::CONNECTION_EXCEPTION,
SqlState::CONNECTION_DOES_NOT_EXIST,
SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
];
for state in retry_states {
let err = DbError::new_test_error(state.clone(), "oops".to_string());
assert!(
err.should_retry_wake_compute(),
"State {state:?} unexpectedly skipped retry"
);
}
}
}

View File

@@ -15,7 +15,6 @@ use rstest::rstest;
use rustls::crypto::ring;
use rustls::pki_types;
use tokio::io::DuplexStream;
use tracing_test::traced_test;
use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry;
@@ -382,14 +381,8 @@ enum ConnectAction {
WakeFail,
WakeRetry,
Connect,
// connect_once -> Err, could_retry = true, should_retry_wake_compute = true
Retry,
// connect_once -> Err, could_retry = true, should_retry_wake_compute = false
RetryNoWake,
// connect_once -> Err, could_retry = false, should_retry_wake_compute = true
Fail,
// connect_once -> Err, could_retry = false, should_retry_wake_compute = false
FailNoWake,
}
#[derive(Clone)]
@@ -431,7 +424,6 @@ struct TestConnection;
#[derive(Debug)]
struct TestConnectError {
retryable: bool,
wakeable: bool,
kind: crate::error::ErrorKind,
}
@@ -456,7 +448,7 @@ impl CouldRetry for TestConnectError {
}
impl ShouldRetryWakeCompute for TestConnectError {
fn should_retry_wake_compute(&self) -> bool {
self.wakeable
true
}
}
@@ -479,22 +471,10 @@ impl ConnectMechanism for TestConnectMechanism {
ConnectAction::Connect => Ok(TestConnection),
ConnectAction::Retry => Err(TestConnectError {
retryable: true,
wakeable: true,
kind: ErrorKind::Compute,
}),
ConnectAction::RetryNoWake => Err(TestConnectError {
retryable: true,
wakeable: false,
kind: ErrorKind::Compute,
}),
ConnectAction::Fail => Err(TestConnectError {
retryable: false,
wakeable: true,
kind: ErrorKind::Compute,
}),
ConnectAction::FailNoWake => Err(TestConnectError {
retryable: false,
wakeable: false,
kind: ErrorKind::Compute,
}),
x => panic!("expecting action {x:?}, connect is called instead"),
@@ -729,92 +709,3 @@ async fn wake_non_retry() {
.unwrap_err();
mechanism.verify();
}
#[tokio::test]
#[traced_test]
async fn fail_but_wake_invalidates_cache() {
let ctx = RequestContext::test();
let mech = TestConnectMechanism::new(vec![
ConnectAction::Wake,
ConnectAction::Fail,
ConnectAction::Wake,
ConnectAction::Connect,
]);
let user = helper_create_connect_info(&mech);
let cfg = config();
connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
.await
.unwrap();
assert!(logs_contain(
"invalidating stalled compute node info cache entry"
));
}
#[tokio::test]
#[traced_test]
async fn fail_no_wake_skips_cache_invalidation() {
let ctx = RequestContext::test();
let mech = TestConnectMechanism::new(vec![
ConnectAction::Wake,
ConnectAction::FailNoWake,
ConnectAction::Connect,
]);
let user = helper_create_connect_info(&mech);
let cfg = config();
connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
.await
.unwrap();
assert!(!logs_contain(
"invalidating stalled compute node info cache entry"
));
}
#[tokio::test]
#[traced_test]
async fn retry_but_wake_invalidates_cache() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestContext::test();
// Wake → Retry (retryable + wakeable) → Wake → Connect
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
let user_info = helper_create_connect_info(&mechanism);
let cfg = config();
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
.await
.unwrap();
mechanism.verify();
// Because Retry has wakeable=true, we should see invalidate_cache
assert!(logs_contain(
"invalidating stalled compute node info cache entry"
));
}
#[tokio::test]
#[traced_test]
async fn retry_no_wake_skips_invalidation() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestContext::test();
// Wake → RetryNoWake (retryable + NOT wakeable)
let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
let user_info = helper_create_connect_info(&mechanism);
let cfg = config();
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
.await
.unwrap_err();
mechanism.verify();
// Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache
assert!(!logs_contain(
"invalidating stalled compute node info cache entry"
));
}

View File

@@ -13,19 +13,22 @@ pub(crate) struct Pbkdf2 {
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
impl Pbkdf2 {
pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
// key the HMAC and derive the first block in-place
let mut hmac =
let hmac =
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
hmac.update(salt);
hmac.update(&1u32.to_be_bytes());
let init_block = hmac.finalize_reset().into_bytes();
let prev = hmac
.clone()
.chain_update(salt)
.chain_update(1u32.to_be_bytes())
.finalize()
.into_bytes();
Self {
hmac,
// one iteration spent above
// one consumed for the hash above
iterations: iterations - 1,
hi: init_block,
prev: init_block,
hi: prev,
prev,
}
}
@@ -41,17 +44,14 @@ impl Pbkdf2 {
iterations,
} = self;
// only do up to 4096 iterations per turn for fairness
// only do 4096 iterations per turn before sharing the thread for fairness
let n = (*iterations).clamp(0, 4096);
for _ in 0..n {
hmac.update(prev);
let block = hmac.finalize_reset().into_bytes();
*prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) {
*hi_byte ^= b;
for (hi, prev) in hi.iter_mut().zip(*prev) {
*hi ^= prev;
}
*prev = block;
}
*iterations -= n;

View File

@@ -43,12 +43,6 @@ impl std::ops::Deref for ApiUrl {
}
}
impl std::ops::DerefMut for ApiUrl {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl std::fmt::Display for ApiUrl {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)

View File

@@ -52,6 +52,7 @@ tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-tar.workspace = true
tokio-util = { workspace = true }
tonic = { workspace = true }
tracing.workspace = true
url.workspace = true
metrics.workspace = true
@@ -62,9 +63,11 @@ pq_proto.workspace = true
remote_storage.workspace = true
safekeeper_api.workspace = true
safekeeper_client.workspace = true
sk_ps_discovery.workspace = true
sha2.workspace = true
sd-notify.workspace = true
storage_broker.workspace = true
storage_controller_client.workspace = true
tokio-stream.workspace = true
http-utils.workspace = true
utils.workspace = true

View File

@@ -8,11 +8,12 @@ use std::error::Error as _;
use http_utils::error::HttpErrorBody;
use reqwest::{IntoUrl, Method, StatusCode};
use safekeeper_api::models::{
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
TimelineStatus,
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
TenantShardPageserverAttachmentChange, TimelineCreateRequest, TimelineStatus,
};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::logging::SecretString;
use utils::shard::TenantShardId;
#[derive(Debug, Clone)]
pub struct Client {
@@ -189,6 +190,20 @@ impl Client {
resp.json().await.map_err(Error::ReceiveBody)
}
pub async fn post_tenant_shard_pageserver_attachments(
&self,
tenant_shard_id: TenantShardId,
body: TenantShardPageserverAttachmentChange,
) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/pageserver_attachments",
tenant_shard_id.tenant_id,
self.mgmt_api_endpoint
);
let resp = self.post(uri, body).await?;
resp.json().await.map_err(Error::ReceiveBody)
}
async fn post<B: serde::Serialize, U: IntoUrl>(
&self,
uri: U,

View File

@@ -25,7 +25,7 @@ use safekeeper::defaults::{
use safekeeper::wal_backup::WalBackup;
use safekeeper::{
BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
WAL_ADVERTISER_RUNTIME, WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
};
use sd_notify::NotifyState;
use storage_broker::{DEFAULT_ENDPOINT, Uri};
@@ -626,6 +626,30 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
.map(|res| ("broker main".to_owned(), res));
tasks_handles.push(Box::pin(broker_task_handle));
let ps_connectivity_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| HTTP_RUNTIME.handle())
.spawn(
global_timelines
.get_pageserver_connectivity()
.task_main()
.instrument(info_span!("pageserver_connectivity")),
)
.map(|res| ("pageserver connectivity".to_owned(), res));
tasks_handles.push(Box::pin(ps_connectivity_handle));
let wal_advertiser_task_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_ADVERTISER_RUNTIME.handle())
.spawn(
global_timelines
.get_wal_advertiser()
.task_main()
.instrument(info_span!("wal_advertiser_main")),
)
.map(|res| ("wal advertiser task handle".to_owned(), res));
tasks_handles.push(Box::pin(wal_advertiser_task_handle));
set_build_info_metric(GIT_VERSION, BUILD_TAG);
// TODO: update tokio-stream, convert to real async Stream with

View File

@@ -50,7 +50,8 @@ async fn push_loop(
conf.broker_endpoint.clone(),
conf.broker_keepalive_interval,
make_tls_config(&conf),
)?;
)?
.into_raw_grpc_client();
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
let outbound = async_stream::stream! {
@@ -97,7 +98,8 @@ async fn pull_loop(
conf.broker_endpoint.clone(),
conf.broker_keepalive_interval,
make_tls_config(&conf),
)?;
)?
.into_raw_grpc_client();
// TODO: subscribe only to local timelines instead of all
let request = SubscribeSafekeeperInfoRequest {
@@ -153,7 +155,8 @@ async fn discover_loop(
conf.broker_endpoint.clone(),
conf.broker_keepalive_interval,
make_tls_config(&conf),
)?;
)?
.into_raw_grpc_client();
let request = SubscribeByFilterRequest {
types: vec![TypeSubscription {

View File

@@ -67,6 +67,19 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
})
}
async fn post_tenant_pageserver_attachments(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let body: models::TenantShardPageserverAttachmentChange = json_request(&mut request).await?;
let global_timelines = get_global_timelines(&request);
let wal_advertiser = global_timelines.get_wal_advertiser();
wal_advertiser
.update_pageserver_attachments(tenant_id, body)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
/// Deactivates all timelines for the tenant and removes its data directory.
/// See `timeline_delete_handler`.
async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -718,6 +731,9 @@ pub fn make_router(
})
})
.get("/v1/utilization", |r| request_span(r, utilization_handler))
.post("/v1/tenant/:tenant_id/pageserver_attachments", |r| {
request_span(r, post_tenant_pageserver_attachments)
})
.delete("/v1/tenant/:tenant_id", |r| {
request_span(r, tenant_delete_handler)
})

View File

@@ -38,11 +38,13 @@ pub mod timeline_eviction;
pub mod timeline_guard;
pub mod timeline_manager;
pub mod timelines_set;
pub mod wal_advertiser;
pub mod wal_backup;
pub mod wal_backup_partial;
pub mod wal_reader_stream;
pub mod wal_service;
pub mod wal_storage;
pub(crate) mod pageserver_connectivity;
#[cfg(any(test, feature = "benchmarking"))]
pub mod test_utils;
@@ -123,6 +125,7 @@ pub struct SafeKeeperConf {
pub ssl_ca_certs: Vec<Pem>,
pub use_https_safekeeper_api: bool,
pub enable_tls_wal_service_api: bool,
pub storage_controller_api: Option<Uri>,
}
impl SafeKeeperConf {
@@ -168,6 +171,7 @@ impl SafeKeeperConf {
ssl_ca_certs: Vec::new(),
use_https_safekeeper_api: false,
enable_tls_wal_service_api: false,
storage_controller_api: None,
}
}
}
@@ -198,6 +202,14 @@ pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
.expect("Failed to create broker runtime")
});
pub static WAL_ADVERTISER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("wal advertiser worker")
.enable_all()
.build()
.expect("Failed to create broker runtime")
});
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
tokio::runtime::Builder::new_multi_thread()
.thread_name("WAL backup worker")

View File

@@ -0,0 +1,117 @@
use desim::world::Node;
use hyper::Uri;
use pageserver_api::controller_api;
use utils::id::TenantId;
use crate::timeline::Timeline;
use std::{
collections::{HashMap, hash_map},
sync::{Arc, Mutex},
time::{Duration, Instant},
};
use anyhow::Context;
use tracing::{Instrument, error, info, info_span, warn};
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
sync::{spsc_fold, spsc_watch},
};
use crate::{GlobalTimelines, SafeKeeperConf};
type Advs = HashMap<TenantTimelineId, Lsn>;
#[derive(Default)]
pub struct GlobalState {
inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
}
enum Message {
Resolve {
ps_id: NodeId,
reply: tokio::sync::oneshot::Sender<tokio::sync::watch::Receiver<hyper::Uri>>,
},
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("cancelled")]
Cancelled,
}
impl GlobalState {
pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
let mut ret = None;
self.inner.get_or_init(|| {
let (tx, task_fut) = MainTask::prepare_run();
ret = Some(task_fut);
tx
});
ret.expect("must only call this method once")
}
}
struct MainTask {
rx: tokio::sync::mpsc::Receiver<Message>,
}
impl MainTask {
fn prepare_run() -> (
tokio::sync::mpsc::Sender<Message>,
impl Future<Output = anyhow::Result<()>> + Send,
) {
let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
let task = MainTask { rx };
(tx, task.task())
}
async fn task(mut self) -> anyhow::Result<()> {
// TODO: persistence
let storcon_client = todo!();
let mut resolution: HashMap<NodeId, tokio::sync::watch::Sender<hyper::Uri>> =
HashMap::new();
while let Some(rx) = self.rx.recv().await {
match rx {
Message::Resolve { ps_id, reply } => match resolution.entry(ps_id) {
hash_map::Entry::Occupied(e) => {}
hash_map::Entry::Vacant(e) => {
tokio::spawn(
ResolutionTask { ps_id, storcon_client }.run()
)
},
},
}
}
}
}
struct ResolutionTask {
ps_id: NodeId,
storcon_client: storage_controller_client::control_api::Client,
}
impl ResolutionTask {
pub async fn run(self) -> Result<Uri, Error> {
loop {
// XXX: well-defined upcall API?
let res = self
.storcon_client
.dispatch(
reqwest::Method::GET,
format!("control/v1/node/{}", self.node_id),
None,
)
.await;
let node: NodeDescribeResponse = match res {
Ok(res) => res,
Err(err) => {
warn!("storcon upcall failed")
}
};
}
}
}

View File

@@ -117,6 +117,7 @@ impl Env {
Arc::new(TimelinesSet::default()), // ignored for now
RateLimiter::new(0, 0),
wal_backup,
todo!(),
);
Ok(timeline)
}

View File

@@ -39,7 +39,9 @@ use crate::wal_backup;
use crate::wal_backup::{WalBackup, remote_timeline_path};
use crate::wal_backup_partial::PartialRemoteSegment;
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage};
use crate::{
SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_advertiser, wal_storage,
};
fn peer_info_from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
PeerInfo {
@@ -547,6 +549,7 @@ impl Timeline {
broker_active_set: Arc<TimelinesSet>,
partial_backup_rate_limiter: RateLimiter,
wal_backup: Arc<WalBackup>,
wal_advertiser: Arc<wal_advertiser::GlobalState>,
) {
let (tx, rx) = self.manager_ctl.bootstrap_manager();
@@ -570,6 +573,7 @@ impl Timeline {
rx,
partial_backup_rate_limiter,
wal_backup,
wal_advertiser,
)
.await
}

View File

@@ -42,6 +42,7 @@ impl Manager {
&& next_event.is_none()
&& self.access_service.is_empty()
&& !self.tli_broker_active.get()
&& self.wal_advertiser.ready_for_eviction()
// Partial segment of current flush_lsn is uploaded up to this flush_lsn.
&& !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded)
// And it is the next one after the last removed. Given that local

View File

@@ -22,7 +22,6 @@ use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, info, info_span, instrument, warn};
use utils::lsn::Lsn;
use crate::SafeKeeperConf;
use crate::control_file::{FileStorage, Storage};
use crate::metrics::{
MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, NUM_EVICTED_TIMELINES,
@@ -37,6 +36,7 @@ use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard};
use crate::timelines_set::{TimelineSetGuard, TimelinesSet};
use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle};
use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment};
use crate::{SafeKeeperConf, wal_advertiser};
pub(crate) struct StateSnapshot {
// inmem values
@@ -201,6 +201,7 @@ pub(crate) struct Manager {
pub(crate) wal_seg_size: usize,
pub(crate) walsenders: Arc<WalSenders>,
pub(crate) wal_backup: Arc<WalBackup>,
pub(crate) wal_advertiser: wal_advertiser::SafekeeperTimelineHandle,
// current state
pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
@@ -240,6 +241,7 @@ pub async fn main_task(
mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
global_rate_limiter: RateLimiter,
wal_backup: Arc<WalBackup>,
wal_advertiser: Arc<wal_advertiser::GlobalState>,
) {
tli.set_status(Status::Started);
@@ -259,6 +261,7 @@ pub async fn main_task(
manager_tx,
global_rate_limiter,
wal_backup,
wal_advertiser,
)
.await;
@@ -287,7 +290,8 @@ pub async fn main_task(
mgr.set_status(Status::UpdateBackup);
let is_wal_backup_required = mgr.update_backup(num_computes, &state_snapshot).await;
mgr.update_is_active(is_wal_backup_required, num_computes, &state_snapshot);
mgr.update_broker_active(is_wal_backup_required, num_computes, &state_snapshot);
mgr.set_status(Status::UpdateControlFile);
mgr.update_control_file_save(&state_snapshot, &mut next_event)
@@ -419,6 +423,7 @@ impl Manager {
manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
global_rate_limiter: RateLimiter,
wal_backup: Arc<WalBackup>,
wal_advertiser: Arc<wal_advertiser::GlobalState>,
) -> Manager {
let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
Manager {
@@ -428,6 +433,7 @@ impl Manager {
state_version_rx: tli.get_state_version_rx(),
num_computes_rx: tli.get_walreceivers().get_num_rx(),
tli_broker_active: broker_active_set.guard(tli.clone()),
wal_advertiser: wal_advertiser.new_timeline(tli.clone()).await.unwrap(),
last_removed_segno: 0,
is_offloaded,
backup_task: None,
@@ -494,8 +500,8 @@ impl Manager {
is_wal_backup_required
}
/// Update is_active flag and returns its value.
fn update_is_active(
/// Update broker is_active flag and returns its value.
fn update_broker_active(
&mut self,
is_wal_backup_required: bool,
num_computes: usize,
@@ -505,6 +511,7 @@ impl Manager {
|| num_computes > 0
|| state.remote_consistent_lsn < state.commit_lsn;
// update the broker timeline set
if self.tli_broker_active.set(is_active) {
// write log if state has changed

View File

@@ -27,7 +27,7 @@ use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_t
use crate::timelines_set::TimelinesSet;
use crate::wal_backup::WalBackup;
use crate::wal_storage::Storage;
use crate::{SafeKeeperConf, control_file, wal_storage};
use crate::{SafeKeeperConf, control_file, pageserver_connectivity, wal_advertiser, wal_storage};
// Timeline entry in the global map: either a ready timeline, or mark that it is
// being created.
@@ -47,6 +47,8 @@ struct GlobalTimelinesState {
conf: Arc<SafeKeeperConf>,
broker_active_set: Arc<TimelinesSet>,
wal_advertisement: Arc<wal_advertiser::GlobalState>,
pageserver_connectivity: Arc<pageserver_connectivity::GlobalState>,
global_rate_limiter: RateLimiter,
wal_backup: Arc<WalBackup>,
}
@@ -60,12 +62,14 @@ impl GlobalTimelinesState {
Arc<TimelinesSet>,
RateLimiter,
Arc<WalBackup>,
Arc<wal_advertiser::GlobalState>,
) {
(
self.conf.clone(),
self.broker_active_set.clone(),
self.global_rate_limiter.clone(),
self.wal_backup.clone(),
self.wal_advertisement.clone(),
)
}
@@ -101,6 +105,8 @@ impl GlobalTimelines {
tombstones: HashMap::new(),
conf,
broker_active_set: Arc::new(TimelinesSet::default()),
wal_advertisement: Arc::new(wal_advertiser::GlobalState::default()),
pageserver_connectivity: Arc::new(pageserver_connectivity::GlobalState::default()),
global_rate_limiter: RateLimiter::new(1, 1),
wal_backup,
}),
@@ -158,12 +164,13 @@ impl GlobalTimelines {
/// just lock and unlock it for each timeline -- this function is called
/// during init when nothing else is running, so this is fine.
async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
let state = self.state.lock().unwrap();
state.get_dependencies()
};
let timelines_dir = get_tenant_dir(&conf, &tenant_id);
for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
.with_context(|| format!("failed to list timelines dir {}", timelines_dir))?
{
@@ -187,6 +194,7 @@ impl GlobalTimelines {
broker_active_set.clone(),
partial_backup_rate_limiter.clone(),
wal_backup.clone(),
wal_advertiser.clone(),
);
}
// If we can't load a timeline, it's most likely because of a corrupted
@@ -238,7 +246,7 @@ impl GlobalTimelines {
start_lsn: Lsn,
commit_lsn: Lsn,
) -> Result<Arc<Timeline>> {
let (conf, _, _, _) = {
let (conf, _, _, _, _) = {
let state = self.state.lock().unwrap();
if let Ok(timeline) = state.get(&ttid) {
// Timeline already exists, return it.
@@ -283,7 +291,7 @@ impl GlobalTimelines {
check_tombstone: bool,
) -> Result<Arc<Timeline>> {
// Check for existence and mark that we're creating it.
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup, wal_advertiser) = {
let mut state = self.state.lock().unwrap();
match state.timelines.get(&ttid) {
Some(GlobalMapTimeline::CreationInProgress) => {
@@ -338,6 +346,7 @@ impl GlobalTimelines {
broker_active_set,
partial_backup_rate_limiter,
wal_backup,
wal_advertiser.clone(),
);
drop(timeline_shared_state);
Ok(timeline)
@@ -590,6 +599,14 @@ impl GlobalTimelines {
Ok(deleted)
}
pub fn get_wal_advertiser(&self) -> Arc<wal_advertiser::GlobalState> {
self.state.lock().unwrap().wal_advertisement.clone()
}
pub fn get_pageserver_connectivity(&self) -> Arc<pageserver_connectivity::GlobalState> {
self.state.lock().unwrap().pageserver_connectivity.clone()
}
pub fn housekeeping(&self, tombstone_ttl: &Duration) {
let mut state = self.state.lock().unwrap();

View File

@@ -0,0 +1,201 @@
mod persistence;
mod pageserver_connectivity;
use utils::id::TenantId;
use crate::timeline::Timeline;
use std::{
collections::HashMap,
sync::{Arc, Mutex},
time::{Duration, Instant},
};
use anyhow::Context;
use tracing::{Instrument, error, info, info_span, warn};
use utils::{
id::{NodeId, TenantTimelineId},
lsn::Lsn,
sync::{spsc_fold, spsc_watch},
};
use crate::{GlobalTimelines, SafeKeeperConf};
type Advs = HashMap<TenantTimelineId, Lsn>;
#[derive(Default)]
pub struct GlobalState {
inner: once_cell::sync::OnceCell<tokio::sync::mpsc::Sender<Message>>,
}
pub struct SafekeeperTimelineHandle {
tx: tokio::sync::mpsc::Sender<Message>,
}
enum Message {
NewTimeline {
reply: tokio::sync::oneshot::Sender<Result<(), Error>>,
},
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("cancelled")]
Cancelled,
}
impl GlobalState {
pub fn task_main(&self) -> impl 'static + Future<Output = anyhow::Result<()>> + Send {
let mut ret = None;
self.inner.get_or_init(|| {
let (tx, task_fut) = MainTask::prepare_run();
ret = Some(task_fut);
tx
});
ret.expect("must only call this method once")
}
pub async fn new_timeline(
&self,
tli: Arc<Timeline>,
) -> Result<SafekeeperTimelineHandle, Error> {
let tx = self.inner.get().unwrap().clone();
let handle = SafekeeperTimelineHandle { tx };
let (reply, rx) = tokio::sync::oneshot::channel();
let Ok(()) = handle.tx.send(Message::NewTimeline { reply }).await else {
return Err(Error::Cancelled);
};
let Ok(res) = rx.await else {
return Err(Error::Cancelled);
};
Ok(handle)
}
pub fn update_pageserver_attachments(
&self,
tenant_id: TenantId,
update: safekeeper_api::models::TenantShardPageserverAttachmentChange,
) -> anyhow::Result<()> {
todo!()
}
}
impl SafekeeperTimelineHandle {
pub fn ready_for_eviction(&self) -> bool {
todo!()
}
}
struct MainTask {
rx: tokio::sync::mpsc::Receiver<Message>,
world: sk_ps_discovery::World,
senders: HashMap<utils::id::NodeId, spsc_watch::Sender<Advs>>,
}
impl MainTask {
fn prepare_run() -> (
tokio::sync::mpsc::Sender<Message>,
impl Future<Output = anyhow::Result<()>> + Send,
) {
let (tx, rx) = tokio::sync::mpsc::channel(100 /* TODO think */);
let task = MainTask {
rx,
world: sk_ps_discovery::World::default(),
senders: Default::default(),
};
(tx, task.task())
}
async fn task(mut self) -> anyhow::Result<()> {
let mut adv_frequency = tokio::time::interval(Duration::from_secs(1));
loop {
tokio::select! {
_ = adv_frequency.tick() => {
let start = Instant::now();
self.advertisements_iteration();
let elapsed = start.elapsed();
if elapsed > Duration::from_millis(10) {
warn!(?elapsed, "advertisements iteration is slow");
}
},
message = self.rx.recv() => {
match message {
None => anyhow::bail!("last main task sender dropped, shouldn't happen, exiting"),
Some(_) => todo!(),
}
},
}
}
}
fn advertisements_iteration(&mut self) {
loop {
let advertisements = self.world.get_commit_lsn_advertisements();
for (node_id, mut advs) in advertisements {
'inner: loop {
let tx = self.senders.entry(node_id).or_insert_with(|| {
let (tx, rx) = spsc_watch::channel();
tokio::spawn(
PageserverTask {
ps_id: node_id,
endpoint: todo!(),
advs: rx,
}
.run()
.instrument(info_span!("wal_advertiser", ps_id=%node_id)),
);
tx
});
if let Err((failed, err)) = tx.send_replace(advs) {
self.senders.remove(&node_id);
advs = failed;
} else {
break 'inner;
}
}
}
}
}
}
struct PageserverTask {
ps_id: NodeId,
advs: spsc_watch::Receiver<Advs>,
}
impl PageserverTask {
/// Cancellation: happens through last PageserverHandle being dropped.
async fn run(mut self) {
loop {
let Ok(advs) = self.advs.recv().await else {
info!("main task gone, exiting");
return;
};
let res = self.run0(advs).await;
match res {
Ok(()) => {}
Err(err) => {
error!(?err, "error sending advertisements");
// TODO: proper backoff?
tokio::time::sleep(Duration::from_secs(5)).await;
}
}
}
}
async fn run0(&mut self, advs: HashMap<TenantTimelineId, Lsn>) -> anyhow::Result<()> {
use storage_broker::wal_advertisement as proto;
use storage_broker::wal_advertisement::pageserver_client::PageserverClient;
let stream = async_stream::stream! {
for (tenant_timeline_id, commit_lsn) in advs {
yield proto::CommitLsnAdvertisement {tenant_timeline_id: Some(proto::TenantTimelineId {
tenant_id: tenant_timeline_id.tenant_id.as_ref().to_owned(),
timeline_id: tenant_timeline_id.timeline_id.as_ref().to_owned(),
}), commit_lsn: commit_lsn.0 };
}
};
let mut client: PageserverClient<_> = PageserverClient::connect(self.endpoint.clone())
.await
.context("connect")?;
let publish_stream = client
.publish_commit_lsn_advertisements(stream)
.await
.context("publish stream")?;
Ok(())
}
}

View File

@@ -27,6 +27,7 @@ parking_lot.workspace = true
prost.workspace = true
tonic.workspace = true
tokio = { workspace = true, features = ["rt-multi-thread"] }
tokio-util.workspace = true
tokio-rustls.workspace = true
tracing.workspace = true
metrics.workspace = true

View File

@@ -5,7 +5,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// easy location, but apparently interference with cachepot sometimes fails
// the build then. Anyway, per cargo docs build script shouldn't output to
// anywhere but $OUT_DIR.
tonic_build::compile_protos("proto/broker.proto")
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
let protos = [
"proto/broker.proto",
"proto/wal_advertisement.proto",
];
for proto in protos {
tonic_build::compile_protos(proto)
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
}
Ok(())
}

View File

@@ -35,14 +35,14 @@ message SafekeeperTimelineInfo {
// LSN of the last record.
uint64 flush_lsn = 4;
// Up to which LSN safekeeper regards its WAL as committed.
uint64 commit_lsn = 5;
uint64 commit_lsn = 5; // yes
// LSN up to which safekeeper has backed WAL.
uint64 backup_lsn = 6;
// LSN of last checkpoint uploaded by pageserver.
uint64 remote_consistent_lsn = 7;
uint64 peer_horizon_lsn = 8;
uint64 local_start_lsn = 9;
uint64 standby_horizon = 14;
uint64 standby_horizon = 14; // yes
// A connection string to use for WAL receiving.
string safekeeper_connstr = 10;
// HTTP endpoint connection string.

View File

@@ -0,0 +1,29 @@
syntax = "proto3";
import "google/protobuf/empty.proto";
package wal_advertisement;
service Pageserver {
rpc PublishCommitLsnAdvertisements(stream CommitLsnAdvertisement) returns (google.protobuf.Empty) {};
rpc SubscribeRemoteConsistentLsnAdvertisements(google.protobuf.Empty) returns (stream RemoteConsistentLsnAdvertisement) {};
}
message CommitLsnAdvertisement {
TenantTimelineId tenant_timeline_id = 1;
uint64 commit_lsn = 2;
}
message RemoteConsistentLsnAdvertisement {
bytes tenant_id = 1;
uint32 shard_id = 2;
bytes timeline_id = 3;
uint64 generation = 4;
uint64 remote_consistent_lsn = 5;
}
message TenantTimelineId {
bytes tenant_id = 1;
bytes timeline_id = 2;
}

View File

@@ -1,10 +1,14 @@
use std::time::Duration;
use futures::Stream;
use proto::TenantTimelineId as ProtoTenantTimelineId;
use proto::broker_service_client::BrokerServiceClient;
use tokio_util::sync::CancellationToken;
use tonic::Status;
use tonic::codegen::StdError;
use tonic::transport::{Channel, Endpoint};
use tonic::transport::Endpoint;
use tracing::{debug, error, info, warn};
use utils::backoff::{
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff,
};
use utils::id::{TenantId, TenantTimelineId, TimelineId};
// Code generated by protobuf.
@@ -16,12 +20,18 @@ pub mod proto {
tonic::include_proto!("storage_broker");
}
pub mod wal_advertisement {
#![allow(clippy::derive_partial_eq_without_eq)]
tonic::include_proto!("wal_advertisement");
}
pub mod metrics;
// Re-exports to avoid direct tonic dependency in user crates.
pub use hyper::Uri;
pub use tonic::transport::{Certificate, ClientTlsConfig};
pub use tonic::{Code, Request, Streaming};
use utils::shard::TenantShardId;
pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
@@ -29,9 +39,199 @@ pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LIST
pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);
// BrokerServiceClient charged with tonic provided Channel transport; helps to
// avoid depending on tonic directly in user crates.
pub type BrokerClientChannel = BrokerServiceClient<Channel>;
#[derive(Clone)]
pub struct TimelineUpdatesSubscriber {
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
}
/// Wrapper type to weed out all places in the codebase that interact directly with the gRPC generated code.
pub struct BrokerClientChannel {
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
}
impl BrokerClientChannel {
pub fn into_raw_grpc_client(
self,
) -> proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel> {
self.client
}
}
pub struct TimelineShardUpdate {
pub is_discovery: bool,
pub inner: proto::SafekeeperDiscoveryResponse,
}
pub struct DiscoveryRequester {
id: ProtoTenantTimelineId,
client: proto::broker_service_client::BrokerServiceClient<tonic::transport::Channel>,
}
impl TimelineUpdatesSubscriber {
pub fn new(service_client: BrokerClientChannel) -> Self {
Self {
client: service_client.client.clone(),
}
}
pub fn subscribe(
&mut self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
cancel: &CancellationToken,
) -> (impl Stream<Item = TimelineShardUpdate>, DiscoveryRequester) {
let id = ProtoTenantTimelineId {
tenant_id: tenant_shard_id.tenant_id.as_ref().to_owned(),
timeline_id: timeline_id.as_ref().to_owned(),
};
let discovery_requester = DiscoveryRequester {
id: id.clone(),
client: self.client.clone(),
};
let stream = async_stream::stream! {
let mut attempt = 0;
'resubscribe: loop {
exponential_backoff(
attempt,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
cancel,
)
.await;
attempt += 1;
use proto::*;
// subscribe to the specific timeline
let request = SubscribeByFilterRequest {
types: vec![
TypeSubscription {
r#type: MessageType::SafekeeperTimelineInfo as i32,
},
TypeSubscription {
r#type: MessageType::SafekeeperDiscoveryResponse as i32,
},
],
tenant_timeline_id: Some(FilterTenantTimelineId {
enabled: true,
tenant_timeline_id: Some(id.clone()),
}),
};
let res = tokio::select! {
r = self.client.subscribe_by_filter(request) => { r }
_ = cancel.cancelled() => { return; }
};
let mut update_stream = match res
{
Ok(resp) => {
resp.into_inner()
}
Err(e) => {
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
// entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
info!(
attempt, "failed to subscribe: {e:#}"
);
continue 'resubscribe;
}
};
loop {
let broker_update = tokio::select!{
_ = cancel.cancelled() => {
return;
}
update = update_stream.message() => { update }
};
match broker_update {
Ok(Some(typed_msg)) => {
let mut is_discovery = false;
let timeline_update = match typed_msg.r#type() {
MessageType::SafekeeperTimelineInfo => {
let info = match typed_msg.safekeeper_timeline_info {
Some(info) => info,
None => {
warn!("bad proto message from broker: no safekeeper_timeline_info");
continue 'resubscribe;
}
};
SafekeeperDiscoveryResponse {
safekeeper_id: info.safekeeper_id,
tenant_timeline_id: info.tenant_timeline_id,
commit_lsn: info.commit_lsn,
safekeeper_connstr: info.safekeeper_connstr,
availability_zone: info.availability_zone,
standby_horizon: info.standby_horizon,
}
}
MessageType::SafekeeperDiscoveryResponse => {
is_discovery = true;
match typed_msg.safekeeper_discovery_response {
Some(response) => response,
None => {
warn!("bad proto message from broker: no safekeeper_discovery_response");
continue 'resubscribe;
}
}
}
_ => {
// unexpected message
warn!("unexpected message from broker: {typed_msg:?}");
continue 'resubscribe;
}
};
attempt = 0; // reset backoff iff we received a valid update
yield TimelineShardUpdate{is_discovery, inner: timeline_update };
},
Err(status) => {
match status.code() {
Code::Unknown if status.message().contains("stream closed because of a broken pipe") || status.message().contains("connection reset") || status.message().contains("error reading a body from connection") => {
// tonic's error handling doesn't provide a clear code for disconnections: we get
// "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
// => https://github.com/neondatabase/neon/issues/9562
info!("broker disconnected: {status}");
},
_ => {
warn!("broker subscription failed: {status}");
}
}
continue 'resubscribe;
}
Ok(None) => {
error!("broker subscription stream ended"); // can't happen
continue 'resubscribe;
}
}
}
}
};
(stream, discovery_requester)
}
}
impl DiscoveryRequester {
pub async fn request(&mut self) {
let request = proto::SafekeeperDiscoveryRequest {
tenant_timeline_id: Some(self.id.clone()),
};
let msg = proto::TypedMessage {
r#type: proto::MessageType::SafekeeperDiscoveryRequest as i32,
safekeeper_timeline_info: None,
safekeeper_discovery_request: Some(request),
safekeeper_discovery_response: None,
};
// Cancellation safety: we want to send a message to the broker, but publish_one()
// function can get cancelled by the other select! arm. This is absolutely fine, because
// we just want to receive broker updates and discovery is not important if we already
// receive updates.
//
// It is possible that `last_discovery_ts` will be updated, but the message will not be sent.
// This is totally fine because of the reason above.
// This is a fire-and-forget request, we don't care about the response
let _ = self.client.publish_one(msg).await;
debug!("Discovery request sent to the broker");
}
}
// Create connection object configured to run TLS if schema starts with https://
// and plain text otherwise. Connection is lazy, only endpoint sanity is
@@ -67,19 +267,9 @@ where
.connect_timeout(DEFAULT_CONNECT_TIMEOUT);
// keep_alive_timeout is 20s by default on both client and server side
let channel = tonic_endpoint.connect_lazy();
Ok(BrokerClientChannel::new(channel))
}
impl BrokerClientChannel {
/// Create a new client to the given endpoint, but don't actually connect until the first request.
pub async fn connect_lazy<D>(dst: D) -> Result<Self, tonic::transport::Error>
where
D: std::convert::TryInto<tonic::transport::Endpoint>,
D::Error: Into<StdError>,
{
let conn = tonic::transport::Endpoint::new(dst)?.connect_lazy();
Ok(Self::new(conn))
}
Ok(BrokerClientChannel {
client: proto::broker_service_client::BrokerServiceClient::new(channel),
})
}
// parse variable length bytes from protobuf

View File

@@ -15,6 +15,7 @@ testing = []
[dependencies]
anyhow.workspace = true
async-stream.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
@@ -69,4 +70,4 @@ http-utils = { path = "../libs/http-utils/" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -0,0 +1,16 @@
DROP TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery on "timelines";
DROP FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery on "timelines";
DROP FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery on "timelines";
DROP FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery on "tenant_shards";
DROP FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery on "tenant_shards";
DROP FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn;
DROP TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery on "tenant_shards";
DROP FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn;
DROP FUNCTION IF EXISTS sk_ps_discovery_enqueue_attachment_create;
DROP TABLE "sk_ps_discovery";

View File

@@ -0,0 +1,122 @@
CREATE TABLE "sk_ps_discovery"(
"tenant_id" VARCHAR NOT NULL,
"shard_number" INT4 NOT NULL,
"shard_count" INT4 NOT NULL,
"ps_generation" INT4 NOT NULL,
"sk_id" INT8 NOT NULL REFERENCES "safekeepers"("id") ON DELETE CASCADE, -- more efficient that trigger on "safekeepers"
"intent_state" VARCHAR NOT NULL, -- attached,detached
"ps_id" INT8 NOT NULL REFERENCES "nodes"("node_id") ON DELETE CASCADE, -- more efficient that trigger on "nodes"
"created_at" TIMESTAMPTZ NOT NULL,
"retries" INT4 NOT NULL DEFAULT 0,
"last_retry_at" TIMESTAMPTZ,
"acknowledged_at" TIMESTAMPTZ,
PRIMARY KEY("tenant_id", "shard_number", "shard_count", "ps_generation", "sk_id")
);
CREATE OR REPLACE FUNCTION sk_ps_discovery_enqueue_attachment_create(ARG_TENANT_ID VARCHAR)
RETURNS VOID AS $$
BEGIN
INSERT INTO sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id, intent_state, ps_id, created_at)
WITH intent_attachments AS (
SELECT DISTINCT tenant_id,unnest(array_cat(sk_set, new_sk_set)) as sk_id FROM timelines
WHERE
tenant_id = ARG_TENANT_ID
AND
timelines.deleted_at IS NULL
)
SELECT tenant_shards.tenant_id, tenant_shards.shard_number, tenant_shards.shard_count,
tenant_shards.generation, intent_attachments.sk_id, 'attached', tenant_shards.generation_pageserver, NOW()
FROM tenant_shards
INNER JOIN intent_attachments ON tenant_shards.tenant_id = intent_attachments.tenant_id
ON CONFLICT DO NOTHING; -- the first trigger creates the attachment, all others are identical because tenant shard generations are monotonic
PERFORM pg_notify('sk_ps_discovery', json_build_object(
'tenant_id', ARG_TENANT_ID
)::text);
END;
$$ LANGUAGE plpgsql;
-- Trigger on tenant_shards table
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn()
RETURNS TRIGGER AS $$
BEGIN
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE TRIGGER on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery
AFTER INSERT
ON "tenant_shards"
FOR EACH ROW
EXECUTE FUNCTION on_ps_tenant_shard_INSERT_enqueue_sk_ps_discovery_triggerfn();
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn()
RETURNS TRIGGER AS $$
BEGIN
PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
RETURN OLD;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE TRIGGER on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery
AFTER DELETE
ON "tenant_shards"
FOR EACH ROW
EXECUTE FUNCTION on_ps_tenant_shard_DELETE_enqueue_sk_ps_discovery_triggerfn();
CREATE OR REPLACE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn()
RETURNS TRIGGER AS $$
BEGIN
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE TRIGGER on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery
AFTER UPDATE
ON "tenant_shards"
FOR EACH ROW
EXECUTE FUNCTION on_ps_tenant_shard_UPDATE_enqueue_sk_ps_discovery_triggerfn();
-- Trigger on timelines table
CREATE OR REPLACE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn()
RETURNS TRIGGER AS $$
BEGIN
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE TRIGGER on_timelines_INSERT_enqueue_sk_ps_discovery
AFTER INSERT
ON "timelines"
FOR EACH ROW
EXECUTE FUNCTION on_timelines_INSERT_enqueue_sk_ps_discovery_triggerfn();
CREATE OR REPLACE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn()
RETURNS TRIGGER AS $$
BEGIN
PERFORM sk_ps_discovery_enqueue_attachment_create(OLD.tenant_id);
RETURN OLD;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE TRIGGER on_timelines_DELETE_enqueue_sk_ps_discovery
AFTER DELETE
ON "timelines"
FOR EACH ROW
EXECUTE FUNCTION on_timelines_DELETE_enqueue_sk_ps_discovery_triggerfn();
CREATE OR REPLACE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn()
RETURNS TRIGGER AS $$
BEGIN
PERFORM sk_ps_discovery_enqueue_attachment_create(NEW.tenant_id);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE TRIGGER on_timelines_UPDATE_enqueue_sk_ps_discovery
AFTER UPDATE
ON "timelines"
FOR EACH ROW
EXECUTE FUNCTION on_timelines_UPDATE_enqueue_sk_ps_discovery_triggerfn();

View File

@@ -436,7 +436,8 @@ async fn async_main() -> anyhow::Result<()> {
};
// Validate that we can connect to the database
Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
Persistence::await_connection(secrets.database_url.clone(), args.db_connect_timeout.into())
.await?;
let persistence = Arc::new(Persistence::new(secrets.database_url).await);

View File

@@ -1,6 +1,8 @@
pub(crate) mod split_state;
use std::collections::HashMap;
use std::io::Write;
use std::ops::Add;
use std::pin::Pin;
use std::str::FromStr;
use std::sync::Arc;
use std::time::{Duration, Instant};
@@ -15,8 +17,8 @@ use diesel_async::pooled_connection::bb8::Pool;
use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfig};
use diesel_async::{AsyncPgConnection, RunQueryDsl};
use diesel_migrations::{EmbeddedMigrations, embed_migrations};
use futures::FutureExt;
use futures::future::BoxFuture;
use futures::{FutureExt, StreamExt};
use itertools::Itertools;
use pageserver_api::controller_api::{
AvailabilityZone, MetadataHealthRecord, NodeSchedulingPolicy, PlacementPolicy,
@@ -31,6 +33,7 @@ use rustls::client::danger::{ServerCertVerified, ServerCertVerifier};
use rustls::crypto::ring;
use scoped_futures::ScopedBoxFuture;
use serde::{Deserialize, Serialize};
use tracing::info;
use utils::generation::Generation;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -74,6 +77,8 @@ const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
/// we can UPDATE a node's scheduling mode reasonably quickly to mark a bad node offline.
pub struct Persistence {
connection_pool: Pool<AsyncPgConnection>,
connect_tokio_postgres:
Box<dyn Sync + Send + 'static + Fn() -> BoxFuture<'static, TokioPostgresConnectResult>>,
}
/// Legacy format, for use in JSON compat objects in test environment
@@ -135,6 +140,8 @@ pub(crate) enum DatabaseOperation {
DeleteTimelineImport,
ListTimelineImports,
IsTenantImportingTimeline,
ListSkPsDiscovery,
UpdateSkPsDiscoveryAttempt,
}
#[must_use]
@@ -177,10 +184,11 @@ impl Persistence {
pub async fn new(database_url: String) -> Self {
let mut mgr_config = ManagerConfig::default();
mgr_config.custom_setup = Box::new(establish_connection_rustls);
mgr_config.custom_setup =
Box::new(|config| establish_connection_rustls_diesel(config.to_owned()));
let manager = AsyncDieselConnectionManager::<AsyncPgConnection>::new_with_config(
database_url,
database_url.clone(),
mgr_config,
);
@@ -197,20 +205,25 @@ impl Persistence {
.await
.expect("Could not build connection pool");
Self { connection_pool }
Self {
connection_pool,
connect_tokio_postgres: Box::new(move || {
establish_connection_rustls_tokio_postgres(database_url.clone())
}),
}
}
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
/// database and the storage controller, therefore the database might not be available right away
pub async fn await_connection(
database_url: &str,
database_url: String,
timeout: Duration,
) -> Result<(), diesel::ConnectionError> {
let started_at = Instant::now();
log_postgres_connstr_info(database_url)
log_postgres_connstr_info(&database_url)
.map_err(|e| diesel::ConnectionError::InvalidConnectionUrl(e.to_string()))?;
loop {
match establish_connection_rustls(database_url).await {
match establish_connection_rustls_diesel(database_url.clone()).await {
Ok(_) => {
tracing::info!("Connected to database.");
return Ok(());
@@ -1821,6 +1834,151 @@ impl Persistence {
})
.await
}
pub(crate) async fn listen_sk_ps_discovery(
&self,
) -> DatabaseResult<
Pin<Box<dyn Send + 'static + futures::Stream<Item = Result<TenantId, serde_json::Error>>>>,
> {
let (client, mut conn) = (&self.connect_tokio_postgres)().await?;
let (tx, mut rx) = tokio::sync::mpsc::channel(1);
tokio::spawn(async move {
let mut stream = futures::stream::poll_fn(move |cx| conn.poll_message(cx));
while let Some(msg) = stream.next().await {
info!(?msg, "async message");
match msg {
Ok(tokio_postgres::AsyncMessage::Notification(notification))
if notification.channel() == "sk_ps_discovery" =>
{
let Ok(()) = tx.send(notification).await else {
tracing::info!(
"sk_ps_discovery notification rx dropped, stopping async notification processing"
);
break;
};
}
Ok(_) => {}
Err(err) => {
tracing::error!(?err, "tokio_postgres poll_message error");
break;
}
}
}
tracing::info!("sk_ps_discovery notification stream returned None, exiting");
});
client
.batch_execute("LISTEN sk_ps_discovery;")
.await
.expect("TODO");
#[derive(serde::Deserialize)]
struct Notification {
tenant_id: TenantId,
}
Ok(Box::pin(async_stream::stream! {
while let Some(msg) = rx.recv().await {
let msg: Result<Notification, _> = serde_json::from_str(msg.payload());
let msg = msg.map(|Notification { tenant_id }| tenant_id );
yield msg;
}
tracing::info!("sk_ps_discovery channel closed, stopping stream");
// keep client alive inside the returned sream object, othrwise `conn` ends as soon as we return from this function
drop(client);
}))
}
pub(crate) async fn get_all_sk_ps_discovery_work(
&self,
) -> DatabaseResult<Vec<SkPsDiscoveryPersistence>> {
use crate::schema::sk_ps_discovery::dsl;
self.with_measured_conn(DatabaseOperation::ListSkPsDiscovery, move |conn| {
Box::pin(async move {
let vec: Vec<SkPsDiscoveryPersistence> = dsl::sk_ps_discovery.load(conn).await?;
Ok(vec)
})
})
.await
}
pub(crate) async fn update_sk_ps_discovery_attempt(
&self,
pk: SkPsDiscoveryPersistencePk,
intent_state: String,
update: Result<(), ()>,
) -> DatabaseResult<()> {
use crate::schema::sk_ps_discovery::dsl;
self.with_measured_conn(DatabaseOperation::UpdateSkPsDiscoveryAttempt, move |conn| {
let pk = pk.clone();
let intent_state = intent_state.clone();
Box::pin(async move {
match update {
Ok(()) => {
let SkPsDiscoveryPersistencePk {
tenant_id,
shard_number,
shard_count,
ps_generation,
sk_id,
} = pk;
let nrows = diesel::delete(dsl::sk_ps_discovery)
// primary key
.filter(dsl::tenant_id.eq(tenant_id))
.filter(dsl::shard_number.eq(shard_number))
.filter(dsl::shard_count.eq(shard_count))
.filter(dsl::ps_generation.eq(ps_generation))
.filter(dsl::sk_id.eq(sk_id))
// intent_state could have changed beneath us (split brain or concurrent state gc)
// TODO: this could also just be a globally monotonic sequence number, maybe easier to reason about?
.filter(dsl::intent_state.eq(intent_state))
.execute(conn)
.await?;
if nrows != 1 {
return Err(DatabaseError::Logical(format!(
"unexpected number of deletes: {nrows}"
)));
}
}
Err(_) => {
let SkPsDiscoveryPersistencePk {
tenant_id,
shard_number,
shard_count,
ps_generation,
sk_id,
} = pk;
let nrows = diesel::update(dsl::sk_ps_discovery)
// primary key
.filter(dsl::tenant_id.eq(tenant_id))
.filter(dsl::shard_number.eq(shard_number))
.filter(dsl::shard_count.eq(shard_count))
.filter(dsl::ps_generation.eq(ps_generation))
.filter(dsl::sk_id.eq(sk_id))
// intent_state could have changed beneath us (split brain or concurrent state gc)
// TODO: this could also just be a globally monotonic sequence number, maybe easier to reason about?
.filter(dsl::intent_state.eq(intent_state))
// action:
.set((
dsl::retries.eq(dsl::retries.add(1)), // XXX: in split-brain situation we would bump twice...
dsl::last_retry_at.eq(diesel::dsl::now),
))
.execute(conn) // TODO: check update count?
.await?;
if nrows != 1 {
return Err(DatabaseError::Logical(format!(
"unexpected number of updates: {nrows}"
)));
}
}
}
Ok(())
})
})
.await
}
}
pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
@@ -1909,21 +2067,40 @@ fn client_config_with_root_certs() -> anyhow::Result<rustls::ClientConfig> {
})
}
fn establish_connection_rustls(config: &str) -> BoxFuture<ConnectionResult<AsyncPgConnection>> {
let fut = async {
type TokioPostgresConnectResult = ConnectionResult<(
tokio_postgres::Client,
tokio_postgres::Connection<
tokio_postgres::Socket,
tokio_postgres_rustls::RustlsStream<tokio_postgres::Socket>,
>,
)>;
fn establish_connection_rustls_tokio_postgres(
config: String,
) -> BoxFuture<'static, TokioPostgresConnectResult> {
let fut = async move {
// We first set up the way we want rustls to work.
let rustls_config = client_config_with_root_certs()
.map_err(|err| ConnectionError::BadConnection(format!("{err:?}")))?;
let tls = tokio_postgres_rustls::MakeRustlsConnect::new(rustls_config);
let (client, conn) = tokio_postgres::connect(config, tls)
let (client, conn) = tokio_postgres::connect(&config, tls)
.await
.map_err(|e| ConnectionError::BadConnection(e.to_string()))?;
AsyncPgConnection::try_from_client_and_connection(client, conn).await
Ok((client, conn))
};
fut.boxed()
}
fn establish_connection_rustls_diesel(
config: String,
) -> BoxFuture<'static, ConnectionResult<AsyncPgConnection>> {
async {
let (client, conn) = establish_connection_rustls_tokio_postgres(config).await?;
AsyncPgConnection::try_from_client_and_connection(client, conn).await
}
.boxed()
}
#[cfg_attr(test, test)]
fn test_config_debug_censors_password() {
let has_pw =
@@ -2386,3 +2563,61 @@ pub(crate) struct TimelineImportPersistence {
pub(crate) timeline_id: String,
pub(crate) shard_statuses: serde_json::Value,
}
#[derive(Insertable, AsChangeset, Selectable, Clone, PartialEq, Eq, Hash, Debug)]
#[diesel(table_name = crate::schema::sk_ps_discovery)]
pub(crate) struct SkPsDiscoveryPersistencePk {
pub(crate) tenant_id: String,
pub(crate) shard_number: i32,
pub(crate) shard_count: i32,
pub(crate) ps_generation: i32,
pub(crate) sk_id: i64,
}
#[derive(Queryable, Selectable, Clone, PartialEq, Eq)]
#[diesel(table_name = crate::schema::sk_ps_discovery)]
pub(crate) struct SkPsDiscoveryPersistence {
pub(crate) tenant_id: String,
pub(crate) shard_number: i32,
pub(crate) shard_count: i32,
pub(crate) ps_generation: i32,
pub(crate) sk_id: i64,
pub(crate) intent_state: String,
pub(crate) ps_id: i64,
pub(crate) created_at: chrono::DateTime<chrono::Utc>,
pub(crate) retries: i32,
pub(crate) last_retry_at: Option<chrono::DateTime<chrono::Utc>>,
pub(crate) acknowledged_at: Option<chrono::DateTime<chrono::Utc>>,
}
impl SkPsDiscoveryPersistence {
pub(crate) fn tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
pub(crate) fn primary_key(&self) -> SkPsDiscoveryPersistencePk {
let SkPsDiscoveryPersistence {
tenant_id,
shard_number,
shard_count,
ps_generation,
sk_id,
intent_state: _,
ps_id: _,
created_at: _,
retries: _,
last_retry_at: _,
acknowledged_at: _,
} = self;
SkPsDiscoveryPersistencePk {
tenant_id: tenant_id.clone(),
shard_number: *shard_number,
shard_count: *shard_count,
ps_generation: *ps_generation,
sk_id: *sk_id,
}
}
}

View File

@@ -1,9 +1,11 @@
use safekeeper_api::models::{
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest,
self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization,
TenantShardPageserverAttachmentChange, TimelineCreateRequest,
};
use safekeeper_client::mgmt_api::{Client, Result};
use utils::id::{NodeId, TenantId, TimelineId};
use utils::logging::SecretString;
use utils::shard::TenantShardId;
use crate::metrics::PageserverRequestLabelGroup;
@@ -164,4 +166,19 @@ impl SafekeeperClient {
self.inner.utilization().await
)
}
pub async fn post_tenant_shard_pageserver_attachments(
&self,
tenant_shard_id: TenantShardId,
body: TenantShardPageserverAttachmentChange,
) -> Result<()> {
measured_request!(
"post_tenant_shard_pageserver_attachments",
crate::metrics::Method::Post,
&self.node_id_label,
self.inner
.post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
.await
)
}
}

View File

@@ -60,6 +60,22 @@ diesel::table! {
}
}
diesel::table! {
sk_ps_discovery (tenant_id, shard_number, shard_count, ps_generation, sk_id) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
ps_generation -> Int4,
sk_id -> Int8,
intent_state -> Varchar,
ps_id -> Int8,
created_at -> Timestamptz,
retries -> Int4,
last_retry_at -> Nullable<Timestamptz>,
acknowledged_at -> Nullable<Timestamptz>,
}
}
diesel::table! {
tenant_shards (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
@@ -100,12 +116,16 @@ diesel::table! {
}
}
diesel::joinable!(sk_ps_discovery -> nodes (ps_id));
diesel::joinable!(sk_ps_discovery -> safekeepers (sk_id));
diesel::allow_tables_to_appear_in_same_query!(
controllers,
metadata_health,
nodes,
safekeeper_timeline_pending_ops,
safekeepers,
sk_ps_discovery,
tenant_shards,
timeline_imports,
timelines,

View File

@@ -2,6 +2,7 @@ pub mod chaos_injector;
mod context_iterator;
pub(crate) mod safekeeper_reconciler;
mod safekeeper_service;
mod sk_ps_discovery;
use std::borrow::Cow;
use std::cmp::Ordering;
@@ -1192,6 +1193,16 @@ impl Service {
}
}
}
#[instrument(skip_all)]
async fn run_sk_ps_discovery(self: &Arc<Self>) {
self.startup_complete.clone().wait().await;
sk_ps_discovery::run(
self.clone(),
self.http_client.clone(), /* TODO this client is configured to openf resh TCP connection each time, very inefficient */
).await;
}
/// Heartbeat all storage nodes once in a while.
#[instrument(skip_all)]
async fn spawn_heartbeat_driver(&self) {
@@ -1797,7 +1808,7 @@ impl Service {
reconcilers_gate: Gate::default(),
tenant_op_locks: Default::default(),
node_op_locks: Default::default(),
http_client,
http_client: http_client.clone(),
step_down_barrier: Default::default(),
});
@@ -1865,6 +1876,15 @@ impl Service {
}
});
tokio::task::spawn({
let this = this.clone();
let startup_complete = startup_complete.clone();
async move {
startup_complete.wait().await;
this.run_sk_ps_discovery().await
}
});
tokio::task::spawn({
let this = this.clone();
let startup_complete = startup_complete.clone();

View File

@@ -647,6 +647,11 @@ impl Service {
sk.describe_response()
}
pub(crate) fn get_safekeeper_object(&self, node_id: i64) -> Option<Safekeeper> {
let locked = self.inner.read().unwrap();
locked.safekeepers.get(&NodeId(node_id as u64)).cloned()
}
pub(crate) async fn upsert_safekeeper(
self: &Arc<Service>,
record: crate::persistence::SafekeeperUpsert,

View File

@@ -0,0 +1,266 @@
use std::{
collections::{HashMap, hash_map},
sync::Arc,
time::Duration,
};
use anyhow::Context;
use futures::{StreamExt, stream::FuturesUnordered};
use safekeeper_api::models::{
TenantShardPageserverAttachment, TenantShardPageserverAttachmentChange,
};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, Span, error, info, info_span};
use utils::{
generation::Generation,
id::{NodeId, TenantId},
logging::SecretString,
shard::ShardIndex,
};
use crate::{
heartbeater::SafekeeperState,
persistence::{Persistence, SkPsDiscoveryPersistence},
};
use super::Service;
struct Actor {
service: Arc<Service>,
persistence: Arc<Persistence>,
http_client: reqwest::Client,
}
pub async fn run(service: Arc<Service>, http_client: reqwest::Client) {
let actor = Actor {
persistence: service.persistence.clone(),
service,
http_client, // XXX: build our own client instead of getting Service's client; we probably want idle conn to each sk
};
actor.run().await;
}
impl Actor {
async fn run(mut self) {
loop {
match self.run0().await {
Ok(()) => {
info!("sk_ps_discovery actor exiting after shutdown signal observed");
return;
}
Err(err) => {
tracing::error!(
?err,
"sk_ps_discovery actor encountered an error, restarting after backoff"
);
// TODO: proper backoff
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
}
}
}
async fn run0(&mut self) -> anyhow::Result<()> {
let mut subscription = self
.persistence
.listen_sk_ps_discovery()
.await
.context("listen to sk_ps_discovery")?;
let mut sync_full_ticker = tokio::time::interval(std::time::Duration::from_secs(5));
struct Task {
work: SkPsDiscoveryPersistence,
cancel: CancellationToken,
join_handle: Option<JoinHandle<()>>,
}
let mut tasks = HashMap::new();
loop {
tokio::select! {
biased; // control messages have higher priority, the periodic full tick, then subscriptions.
_ = sync_full_ticker.tick() => {
info!("rebuild");
}
maybe_res = subscription.next() => {
match maybe_res {
None => {
anyhow::bail!("subscription should never end");
}
Some(Ok(tenant_id)) => {
let tenant_id: TenantId = tenant_id;
info!(?tenant_id, "notify for tenant_id");
// for now, just also rebuild everything
}
Some(Err(err)) => {
let err: serde_json::Error = err;
anyhow::bail!("incorrect notification format: {err:?}"); // FIXME repeat message in error so it can be debugged ?
}
}
}
}
// get list of tasks from database
let mut new_tasks = self
.persistence
.get_all_sk_ps_discovery_work()
.await
.context("get_all_sk_ps_discovery_work")?
.into_iter()
.map(|work: SkPsDiscoveryPersistence| {
(
work.primary_key(),
Task {
work,
cancel: CancellationToken::new(),
join_handle: None,
},
)
})
.collect::<HashMap<_, _>>();
// Carry over ongoing tasks
let mut cancelled_wait = FuturesUnordered::new();
for (
task_key,
Task {
work: ongoing_persistence,
cancel,
join_handle,
},
) in tasks.drain()
{
match new_tasks.entry(task_key) {
hash_map::Entry::Occupied(mut planned) => {
let Task {
work: planned_persistence,
cancel: planned_cancel,
join_handle: planned_jh,
} = planned.get_mut();
assert!(planned_jh.is_none());
if *planned_persistence == ongoing_persistence {
*planned_jh = join_handle;
*planned_cancel = cancel;
continue;
}
}
hash_map::Entry::Vacant(_) => (),
}
cancel.cancel();
cancelled_wait.push(async move {
if let Some(jh) = join_handle {
let _ = jh.await;
}
});
}
while let Some(_) = cancelled_wait.next().await {}
tasks = new_tasks;
// Kick off new tasks
for (key, task) in tasks.iter_mut() {
if task.join_handle.is_none() {
task.join_handle = Some(tokio::spawn(
DeliveryAttempt {
cancel: task.cancel.clone(),
persistence: self.persistence.clone(),
service: self.service.clone(),
http_client: self.http_client.clone(),
work: task.work.clone(),
}
.run()
.instrument({
let span = info_span!(parent: None, "sk_ps_discovery_delivery", ?key);
span.follows_from(Span::current());
span
}),
))
}
}
}
}
}
struct DeliveryAttempt {
cancel: CancellationToken,
persistence: Arc<Persistence>,
service: Arc<super::Service>,
http_client: reqwest::Client,
work: SkPsDiscoveryPersistence,
}
impl DeliveryAttempt {
pub async fn run(self) {
let res = self.run0().await;
if self.cancel.is_cancelled() {
return;
}
if let Err(ref err) = res {
error!(?err, "attempt failed");
}
let res = self
.persistence
.update_sk_ps_discovery_attempt(
self.work.primary_key(),
self.work.intent_state.clone(),
res.map_err(|_| ()),
)
.await;
if let Err(ref err) = res {
error!(?err, "persistence of attempt result failed");
}
}
async fn run0(&self) -> anyhow::Result<()> {
let Some(sk) = self.service.get_safekeeper_object(self.work.sk_id) else {
anyhow::bail!("safekeeper object does not exist");
};
match sk.availability() {
SafekeeperState::Available { .. } => (),
SafekeeperState::Offline => {
anyhow::bail!("safekeeper is offline");
}
}
let body = {
let val = TenantShardPageserverAttachment {
shard_id: ShardIndex {
shard_number: utils::shard::ShardNumber(self.work.shard_number as u8),
shard_count: utils::shard::ShardCount(self.work.shard_count as u8),
},
ps_id: NodeId(self.work.ps_id as u64),
generation: Generation::new(self.work.ps_generation as u32),
};
match self.work.intent_state.as_str() {
"attached" => TenantShardPageserverAttachmentChange::Attach { field1: val },
"detached" => TenantShardPageserverAttachmentChange::Detach(val),
x => anyhow::bail!("unknown intent state {x:?}"),
}
};
let tenant_shard_id = self.work.tenant_shard_id()?;
sk.with_client_retries(
|client| {
let body = body.clone();
async move {
client
.post_tenant_shard_pageserver_attachments(tenant_shard_id, body)
.await
}
},
&self.http_client,
&self
.service
.config
.safekeeper_jwt_token
.clone()
.map(SecretString::from),
1,
3,
Duration::from_secs(1),
&self.cancel,
)
.await?;
Ok(())
}
}

View File

@@ -184,7 +184,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
"pageserver_valid_lsn_lease_count",
"pageserver_tenant_offloaded_timelines",
counter("pageserver_tenant_throttling_count_accounted_start"),
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),

View File

@@ -187,7 +187,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
},
"rel_size_v2_enabled": True,
"relsize_snapshot_cache_capacity": 10000,
"gc_compaction_enabled": True,
"gc_compaction_verification": False,
"gc_compaction_initial_threshold_kb": 1024000,

View File

@@ -1,77 +0,0 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from fixtures.utils import wait_until
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnvBuilder
def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
"""
Simple test for basebackup cache.
1. Check that we always hit the cache after compute restart.
2. Check that we eventually delete old basebackup files, but not the latest one.
3. Check that we delete basebackup file for timeline with active compute.
"""
neon_env_builder.pageserver_config_override = """
tenant_config = { basebackup_cache_enabled = true }
basebackup_cache_config = { cleanup_period = '1s' }
"""
env = neon_env_builder.init_start()
ep = env.endpoints.create("main")
ps = env.pageserver
ps_http = ps.http_client()
# 1. Check that we always hit the cache after compute restart.
for i in range(3):
ep.start()
ep.stop()
def check_metrics(i=i):
metrics = ps_http.get_metrics()
# Never miss.
# The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests.
# All other requests should be a hit
assert (
metrics.query_one(
"pageserver_basebackup_cache_read_total", {"result": "miss"}
).value
== 0
)
# All but the first requests are hits.
assert (
metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value
== i
)
# Every compute shut down should trigger a prepare reuest.
assert (
metrics.query_one(
"pageserver_basebackup_cache_prepare_total", {"result": "ok"}
).value
== i + 1
)
wait_until(check_metrics)
# 2. Check that we eventually delete old basebackup files, but not the latest one.
def check_bb_file_count():
bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir())
# tmp dir + 1 basebackup file.
assert len(bb_files) == 2
wait_until(check_bb_file_count)
# 3. Check that we delete basebackup file for timeline with active compute.
ep.start()
ep.safe_psql("create table t1 as select generate_series(1, 10) as n")
def check_bb_dir_empty():
bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir())
# only tmp dir.
assert len(bb_files) == 1
wait_until(check_bb_dir_empty)

View File

@@ -19,16 +19,6 @@ TEST_ROLE_NAMES = [
{"name": "role$"},
{"name": "role$$"},
{"name": "role$x$"},
{"name": "x"},
{"name": "xx"},
{"name": "$x"},
{"name": "x$"},
{"name": "$x$"},
{"name": "xx$"},
{"name": "$xx"},
{"name": "$xx$"},
# 63 bytes is the limit for role/DB names in Postgres
{"name": "x" * 63},
]
TEST_DB_NAMES = [
@@ -84,43 +74,6 @@ TEST_DB_NAMES = [
"name": "db name$x$",
"owner": "role$x$",
},
{
"name": "x",
"owner": "x",
},
{
"name": "xx",
"owner": "xx",
},
{
"name": "$x",
"owner": "$x",
},
{
"name": "x$",
"owner": "x$",
},
{
"name": "$x$",
"owner": "$x$",
},
{
"name": "xx$",
"owner": "xx$",
},
{
"name": "$xx",
"owner": "$xx",
},
{
"name": "$xx$",
"owner": "$xx$",
},
# 63 bytes is the limit for role/DB names in Postgres
{
"name": "x" * 63,
"owner": "x" * 63,
},
]
@@ -193,10 +146,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
"""
Test that compute_ctl can create and work with databases and roles
with special characters (whitespaces, %, tabs, etc.) in the name.
Also use `drop_subscriptions_before_start: true`. We do not actually
have any subscriptions in this test, so it should be no-op, but it
i) simulates the case when we create a second dev branch together with
a new project creation, and ii) just generally stresses more code paths.
"""
env = neon_simple_env
@@ -210,7 +159,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
**{
"spec": {
"skip_pg_catalog_updates": False,
"drop_subscriptions_before_start": True,
"cluster": {
"roles": TEST_ROLE_NAMES,
"databases": TEST_DB_NAMES,
@@ -254,7 +202,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
**{
"spec": {
"skip_pg_catalog_updates": False,
"drop_subscriptions_before_start": True,
"cluster": {
"roles": [],
"databases": [],

View File

@@ -508,9 +508,6 @@ PER_METRIC_VERIFIERS = {
"remote_storage_size": CannotVerifyAnything,
"written_size": WrittenDataVerifier,
"written_data_bytes_delta": WrittenDataDeltaVerifier,
"written_size_since_parent": WrittenDataVerifier, # same as written_size on root
"pitr_cutoff": CannotVerifyAnything,
"pitr_history_size_since_parent": WrittenDataVerifier, # same as written_size on root w/o GC
"timeline_logical_size": CannotVerifyAnything,
"synthetic_storage_size": SyntheticSizeVerifier,
}

View File

@@ -27,9 +27,8 @@ from contextlib import closing
import psycopg2
import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup
from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from fixtures.utils import query_scalar, skip_on_postgres, wait_until
@@ -696,110 +695,3 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
with secondary.cursor() as secondary_cur:
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (n_restarts,)
def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
endpoint = env.endpoints.create_start("main")
sql = """
CREATE TABLE CHAR_TBL(f1 char(4));
CREATE TABLE FLOAT8_TBL(f1 float8);
CREATE TABLE INT2_TBL(f1 int2);
CREATE TABLE INT4_TBL(f1 int4);
CREATE TABLE INT8_TBL(q1 int8, q2 int8);
CREATE TABLE POINT_TBL(f1 point);
CREATE TABLE TEXT_TBL (f1 text);
CREATE TABLE VARCHAR_TBL(f1 varchar(4));
CREATE TABLE onek (unique1 int4);
CREATE TABLE onek2 AS SELECT * FROM onek;
CREATE TABLE tenk1 (unique1 int4);
CREATE TABLE tenk2 AS SELECT * FROM tenk1;
CREATE TABLE person (name text, age int4,location point);
CREATE TABLE emp (salary int4, manager name) INHERITS (person);
CREATE TABLE student (gpa float8) INHERITS (person);
CREATE TABLE stud_emp ( percent int4) INHERITS (emp, student);
CREATE TABLE road (name text,thepath path);
CREATE TABLE ihighway () INHERITS (road);
CREATE TABLE shighway(surface text) INHERITS (road);
CREATE TABLE BOOLTBL3 (d text, b bool, o int);
CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool);
DROP TABLE BOOLTBL3;
DROP TABLE BOOLTBL4;
CREATE TABLE ceil_floor_round (a numeric);
DROP TABLE ceil_floor_round;
CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
DROP TABLE width_bucket_test;
CREATE TABLE num_input_test (n1 numeric);
CREATE TABLE num_variance (a numeric);
INSERT INTO num_variance VALUES (0);
CREATE TABLE snapshot_test (nr integer, snap txid_snapshot);
CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now()));
CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now()));
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
TRUNCATE guid1;
DROP TABLE guid1;
DROP TABLE guid2 CASCADE;
CREATE TABLE numrange_test (nr NUMRANGE);
CREATE INDEX numrange_test_btree on numrange_test(nr);
CREATE TABLE numrange_test2(nr numrange);
CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr);
INSERT INTO numrange_test2 VALUES('[, 5)');
CREATE TABLE textrange_test (tr text);
CREATE INDEX textrange_test_btree on textrange_test(tr);
CREATE TABLE test_range_gist(ir int4range);
CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
DROP INDEX test_range_gist_idx;
CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
CREATE TABLE test_range_spgist(ir int4range);
CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
DROP INDEX test_range_spgist_idx;
CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
CREATE TABLE test_range_elem(i int4);
CREATE INDEX test_range_elem_idx on test_range_elem (i);
CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10));
DROP TABLE test_range_elem;
CREATE TABLE test_range_excl(room int4range, speaker int4range, during tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&));
CREATE TABLE f_test(f text, i int);
CREATE TABLE i8r_array (f1 int, f2 text);
CREATE TYPE arrayrange as range (subtype=int4[]);
CREATE TYPE two_ints as (a int, b int);
DROP TYPE two_ints cascade;
CREATE TABLE text_support_test (t text);
CREATE TABLE TEMP_FLOAT (f1 FLOAT8);
CREATE TABLE TEMP_INT4 (f1 INT4);
CREATE TABLE TEMP_INT2 (f1 INT2);
CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8);
CREATE TABLE POLYGON_TBL(f1 polygon);
CREATE TABLE quad_poly_tbl (id int, p polygon);
INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y;
CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl;
CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl;
"""
with endpoint.cursor() as cur:
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
env.endpoints.create_start(branch_name="main", lsn=lsn)
log.info(f"lsn: {lsn}")
for line in sql.split("\n"):
if len(line.strip()) == 0 or line.startswith("--"):
continue
cur.execute(line)
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
env.endpoints.create_start(branch_name="main", lsn=lsn)
log.info(f"lsn: {lsn}")
cur.execute("VACUUM FULL pg_class;")
for ep in env.endpoints.endpoints:
log.info(f"{ep.endpoint_id} / {ep.pg_port}")
pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"]
env_vars = {
"PGPORT": str(ep.pg_port),
"PGUSER": endpoint.default_options["user"],
"PGHOST": endpoint.default_options["host"],
}
pg_bin.run_capture(pg_dump_command, env=env_vars)

Some files were not shown because too many files have changed in this diff Show More