Compare commits


4 Commits

Author     | SHA1       | Message                                                               | Date
John Spray | 94e1d80a64 | scrubber: make scan-metadata enumerate relics & unreadable timelines | 2023-12-08 14:48:41 +00:00
John Spray | a28d91e8bc | scrubber: report on generation-ful-ness of indices                   | 2023-12-08 13:59:42 +00:00
John Spray | 041d610fbe | scrubber: handle initdb files                                         | 2023-12-08 13:49:40 +00:00
John Spray | ec03c29644 | scrubber: only trim prefix if it ends with /                          | 2023-12-08 13:49:40 +00:00
113 changed files with 2097 additions and 4226 deletions

View File

@@ -199,10 +199,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v3
@@ -1101,10 +1097,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v3

View File

@@ -142,10 +142,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
@@ -242,20 +238,6 @@ jobs:
options: --init
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:

Cargo.lock (generated, 124 changed lines)
View File

@@ -44,12 +44,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -184,7 +178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
dependencies = [
"concurrent-queue",
"event-listener 2.5.3",
"event-listener",
"futures-core",
]
@@ -205,13 +199,11 @@ dependencies = [
[[package]]
name = "async-lock"
version = "3.2.0"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b"
dependencies = [
"event-listener 4.0.0",
"event-listener-strategy",
"pin-project-lite",
"event-listener",
]
[[package]]
@@ -694,9 +686,9 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.18.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467"
dependencies = [
"async-trait",
"base64 0.21.1",
@@ -704,10 +696,8 @@ dependencies = [
"dyn-clone",
"futures",
"getrandom 0.2.11",
"hmac",
"http-types",
"log",
"once_cell",
"paste",
"pin-project",
"quick-xml",
@@ -716,7 +706,6 @@ dependencies = [
"rustc_version",
"serde",
"serde_json",
"sha2",
"time",
"url",
"uuid",
@@ -724,9 +713,9 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.18.1"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
dependencies = [
"async-lock",
"async-trait",
@@ -736,6 +725,7 @@ dependencies = [
"oauth2",
"pin-project",
"serde",
"serde_json",
"time",
"tz-rs",
"url",
@@ -744,18 +734,21 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.18.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97"
dependencies = [
"RustyXML",
"async-lock",
"async-trait",
"azure_core",
"bytes",
"futures",
"hmac",
"log",
"serde",
"serde_derive",
"serde_json",
"sha2",
"time",
"url",
"uuid",
@@ -763,14 +756,13 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.18.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4"
dependencies = [
"RustyXML",
"azure_core",
"azure_storage",
"azure_svc_blobstorage",
"bytes",
"futures",
"log",
@@ -782,22 +774,6 @@ dependencies = [
"uuid",
]
[[package]]
name = "azure_svc_blobstorage"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
dependencies = [
"azure_core",
"bytes",
"futures",
"log",
"once_cell",
"serde",
"serde_json",
"time",
]
[[package]]
name = "backtrace"
version = "0.3.67"
@@ -914,7 +890,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
dependencies = [
"memchr",
"once_cell",
"regex-automata 0.1.10",
"regex-automata",
"serde",
]
@@ -1704,27 +1680,6 @@ version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "event-listener"
version = "4.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae"
dependencies = [
"concurrent-queue",
"parking",
"pin-project-lite",
]
[[package]]
name = "event-listener-strategy"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
dependencies = [
"event-listener 4.0.0",
"pin-project-lite",
]
[[package]]
name = "fail"
version = "0.5.1"
@@ -2087,10 +2042,6 @@ name = "hashbrown"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "hashlink"
@@ -2582,7 +2533,7 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
dependencies = [
"regex-automata 0.1.10",
"regex-automata",
]
[[package]]
@@ -2608,9 +2559,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "memchr"
version = "2.6.4"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memoffset"
@@ -3103,7 +3054,6 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"md5",
"metrics",
"nix 0.26.2",
"num-traits",
@@ -3718,9 +3668,9 @@ dependencies = [
[[package]]
name = "quick-xml"
version = "0.31.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
dependencies = [
"memchr",
"serde",
@@ -3860,14 +3810,13 @@ dependencies = [
[[package]]
name = "regex"
version = "1.10.2"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"regex-syntax 0.7.2",
]
[[package]]
@@ -3879,17 +3828,6 @@ dependencies = [
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-automata"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax 0.8.2",
]
[[package]]
name = "regex-syntax"
version = "0.6.29"
@@ -3898,9 +3836,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.8.2"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
[[package]]
name = "relative-path"
@@ -5286,8 +5224,6 @@ dependencies = [
"futures-core",
"futures-io",
"futures-sink",
"futures-util",
"hashbrown 0.14.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -5765,7 +5701,6 @@ dependencies = [
"serde",
"serde_assert",
"serde_json",
"serde_path_to_error",
"serde_with",
"signal-hook",
"strum",
@@ -6284,8 +6219,7 @@ dependencies = [
"prost",
"rand 0.8.5",
"regex",
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"regex-syntax 0.7.2",
"reqwest",
"ring 0.16.20",
"rustls",

View File

@@ -38,10 +38,10 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
azure_storage_blobs = "0.18"
azure_core = "0.16"
azure_identity = "0.16"
azure_storage = "0.16"
azure_storage_blobs = "0.16"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
regex = "1.10.2"
regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
@@ -149,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
tokio-util = { version = "0.7", features = ["io"] }
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}

View File

@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));

View File

@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
/// Build a list of existing Postgres roles
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
let postgres_roles = xact
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
.query(
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
&[],
)?
.iter()
.map(|row| Role {
name: row.get("rolname"),
encrypted_password: row.get("rolpassword"),
replication: Some(row.get("rolreplication")),
bypassrls: Some(row.get("rolbypassrls")),
options: None,
})
.collect();

View File

@@ -252,6 +252,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|| !r.bypassrls.unwrap_or(false)
|| !r.replication.unwrap_or(false)
{
RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password {
@@ -283,22 +285,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
match action {
RoleAction::None => {}
RoleAction::Update => {
// This can be run on /every/ role! Not just ones created through the console.
// This means that if you add some funny ALTER here that adds a permission,
// this will get run even on user-created roles! This will result in different
// behavior before and after a spec gets reapplied. The below ALTER as it stands
// now only grants LOGIN and changes the password. Please do not allow this branch
// to do anything silly.
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
let mut query: String =
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);
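
The hunk above builds role-management SQL as strings keyed on a per-role action. A minimal sketch of that shape, where `quote_ident` is only an illustrative stand-in for the crate's `pg_quote` helper and the query texts follow one side of the diff:

```rust
// Minimal sketch of the action -> SQL mapping. `quote_ident` is an
// illustrative stand-in for the real `pg_quote` helper.
enum RoleAction {
    None,
    Update,
    Create,
}

fn quote_ident(name: &str) -> String {
    // double any embedded quotes, then wrap in double quotes
    format!("\"{}\"", name.replace('"', "\"\""))
}

fn role_query(action: &RoleAction, name: &str) -> Option<String> {
    match action {
        RoleAction::None => None,
        RoleAction::Update => Some(format!(
            "ALTER ROLE {} BYPASSRLS REPLICATION",
            quote_ident(name)
        )),
        RoleAction::Create => Some(format!(
            "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
            quote_ident(name)
        )),
    }
}

fn main() {
    let q = role_query(&RoleAction::Update, "my role").unwrap();
    assert_eq!(q, "ALTER ROLE \"my role\" BYPASSRLS REPLICATION");
    println!("{q}");
}
```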

View File

@@ -168,7 +168,7 @@ fn print_timelines_tree(
info: t.clone(),
children: BTreeSet::new(),
name: timeline_name_mappings
.remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
.remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
},
)
})

View File

@@ -407,7 +407,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
let request = models::TenantCreateRequest {
@@ -505,7 +504,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
}
};

View File

@@ -165,7 +165,7 @@ pub fn migrate_tenant(
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i.tenant_id == tenant_id);
.any(|i| i == tenant_id);
if !found {
continue;
}

View File

@@ -207,6 +207,8 @@ pub struct DeltaOp {
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
pub replication: Option<bool>,
pub bypassrls: Option<bool>,
pub options: GenericOptions,
}

View File

@@ -3,11 +3,8 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
};
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
@@ -135,137 +132,3 @@ fn get_rusage_stats() -> libc::rusage {
rusage.assume_init()
}
}
/// Create an [`IntCounterPairVec`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair_vec {
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
match (
$crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES),
$crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES),
) {
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPairVec::new(inc, dec)),
(Err(e), _) | (_, Err(e)) => Err(e),
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
match (
$crate::register_int_counter!($NAME1, $HELP1),
$crate::register_int_counter!($NAME2, $HELP2),
) {
(Ok(inc), Ok(dec)) => Ok($crate::IntCounterPair::new(inc, dec)),
(Err(e), _) | (_, Err(e)) => Err(e),
}
}};
}
/// A Pair of [`GenericCounterVec`]s. Like a [`GenericGaugeVec`] but will always observe changes
pub struct GenericCounterPairVec<P: Atomic> {
inc: GenericCounterVec<P>,
dec: GenericCounterVec<P>,
}
/// A Pair of [`GenericCounter`]s. Like a [`GenericGauge`] but will always observe changes
pub struct GenericCounterPair<P: Atomic> {
inc: GenericCounter<P>,
dec: GenericCounter<P>,
}
impl<P: Atomic> GenericCounterPairVec<P> {
pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
Self { inc, dec }
}
/// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
})
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<P: Atomic> GenericCounterPair<P> {
pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
Self { inc, dec }
}
/// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
pub fn guard(&self) -> GenericCounterPairGuard<P> {
self.inc.inc();
GenericCounterPairGuard(self.dec.clone())
}
/// Increment the gauge by n, returning a guard that decrements by n on drop.
pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
self.inc.inc_by(n);
GenericCounterPairGuardBy(self.dec.clone(), n)
}
/// Increase the gauge by 1.
#[inline]
pub fn inc(&self) {
self.inc.inc();
}
/// Decrease the gauge by 1.
#[inline]
pub fn dec(&self) {
self.dec.inc();
}
/// Add the given value to the gauge. (The value can be
/// negative, resulting in a decrement of the gauge.)
#[inline]
pub fn inc_by(&self, v: P::T) {
self.inc.inc_by(v);
}
/// Subtract the given value from the gauge. (The value can be
/// negative, resulting in an increment of the gauge.)
#[inline]
pub fn dec_by(&self, v: P::T) {
self.dec.inc_by(v);
}
}
/// Guard returned by [`GenericCounterPair::guard`]
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);
impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
fn drop(&mut self) {
self.0.inc();
}
}
/// Guard returned by [`GenericCounterPair::guard_by`]
pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);
impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
fn drop(&mut self) {
self.0.inc_by(self.1);
}
}
/// A Pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes
pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
/// A Pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes
pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
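
The block above pairs two monotonic counters to emulate a gauge while still recording every change. A small usage sketch of the same idea, assuming only the `prometheus` crate and a simplified guard type in place of the macros shown above:

```rust
// Two monotonic counters ("started"/"finished") emulate a gauge while still
// exposing every change; the guard bumps "finished" on drop.
use prometheus::IntCounter;

struct CounterPair {
    inc: IntCounter,
    dec: IntCounter,
}

struct PairGuard<'a>(&'a IntCounter);

impl Drop for PairGuard<'_> {
    fn drop(&mut self) {
        self.0.inc();
    }
}

impl CounterPair {
    fn guard(&self) -> PairGuard<'_> {
        self.inc.inc();
        PairGuard(&self.dec)
    }
}

fn main() {
    let pair = CounterPair {
        inc: IntCounter::new("tasks_started_total", "tasks started").unwrap(),
        dec: IntCounter::new("tasks_finished_total", "tasks finished").unwrap(),
    };
    {
        let _running = pair.guard(); // started += 1 here, finished += 1 on drop
    }
    assert_eq!(pair.inc.get(), 1);
    assert_eq!(pair.dec.get(), 1);
}
```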

View File

@@ -237,7 +237,6 @@ pub struct TenantConfig {
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
}
/// A flattened analog of a `pagesever::tenant::LocationMode`, which
@@ -358,7 +357,7 @@ pub enum TenantAttachmentStatus {
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
pub id: TenantShardId,
pub id: TenantId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState,
/// Sum of the size of all layer files.
@@ -370,7 +369,7 @@ pub struct TenantInfo {
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
pub tenant_id: TenantShardId,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub ancestor_timeline_id: Option<TimelineId>,
@@ -386,9 +385,6 @@ pub struct TimelineInfo {
/// The LSN that we are advertizing to safekeepers
pub remote_consistent_lsn_visible: Lsn,
/// The LSN from the start of the root timeline (never changes)
pub initdb_lsn: Lsn,
pub current_logical_size: u64,
pub current_logical_size_is_accurate: bool,
@@ -827,7 +823,7 @@ mod tests {
fn test_tenantinfo_serde() {
// Test serialization/deserialization of TenantInfo
let original_active = TenantInfo {
id: TenantShardId::unsharded(TenantId::generate()),
id: TenantId::generate(),
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
@@ -844,7 +840,7 @@ mod tests {
});
let original_broken = TenantInfo {
id: TenantShardId::unsharded(TenantId::generate()),
id: TenantId::generate(),
state: TenantState::Broken {
reason: "reason".into(),
backtrace: "backtrace info".into(),

View File

@@ -76,11 +76,6 @@ impl TenantShardId {
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
}
/// Formatting helper

View File

@@ -271,12 +271,17 @@ impl RemoteStorage for AzureBlobStorage {
let mut builder = blob_client.get();
let range: Range = if let Some(end_exclusive) = end_exclusive {
(start_inclusive..end_exclusive).into()
if let Some(end_exclusive) = end_exclusive {
builder = builder.range(Range::new(start_inclusive, end_exclusive));
} else {
(start_inclusive..).into()
};
builder = builder.range(range);
// Open ranges are not supported by the SDK so we work around
// by setting the upper limit extremely high (but high enough
// to still be representable by signed 64 bit integers).
// TODO remove workaround once the SDK adds open range support
// https://github.com/Azure/azure-sdk-for-rust/issues/1438
let end_exclusive = u64::MAX / 4;
builder = builder.range(Range::new(start_inclusive, end_exclusive));
}
self.download_for_builder(builder).await
}
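
The comment in the hunk above describes working around the lack of open-ended ranges by substituting a very large upper bound. A minimal sketch of that substitution, kept free of Azure SDK types (the resulting pair would then be handed to the SDK's range builder):

```rust
// The SDK range is assumed to need both bounds, so an open-ended read is
// turned into a closed range with a huge (but i64-representable) end.
fn effective_range(start_inclusive: u64, end_exclusive: Option<u64>) -> (u64, u64) {
    const OPEN_RANGE_END: u64 = u64::MAX / 4;
    (start_inclusive, end_exclusive.unwrap_or(OPEN_RANGE_END))
}

fn main() {
    assert_eq!(effective_range(0, Some(1024)), (0, 1024));
    assert_eq!(effective_range(4096, None), (4096, u64::MAX / 4));
    println!("open-ended reads get an upper bound of {}", u64::MAX / 4);
}
```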

View File

@@ -50,8 +50,6 @@ const_format.workspace = true
# why is it only here? no other crate should use it, streams are rarely needed.
tokio-stream = { version = "0.1.14" }
serde_path_to_error.workspace = true
[dev-dependencies]
byteorder.workspace = true
bytes.workspace = true

View File

@@ -1,14 +1,16 @@
use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(TaskTrackerToken);
pub struct Completion(mpsc::Sender<()>);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(TaskTracker);
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
impl Default for Barrier {
fn default() -> Self {
@@ -19,7 +21,7 @@ impl Default for Barrier {
impl Barrier {
pub async fn wait(self) {
self.0.wait().await;
self.0.lock().await.recv().await;
}
pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -31,7 +33,8 @@ impl Barrier {
impl PartialEq for Barrier {
fn eq(&self, other: &Self) -> bool {
TaskTracker::ptr_eq(&self.0, &other.0)
// we don't use dyn so this is good
Arc::ptr_eq(&self.0, &other.0)
}
}
@@ -39,10 +42,8 @@ impl Eq for Barrier {}
/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
let tracker = TaskTracker::new();
// otherwise wait never exits
tracker.close();
let token = tracker.token();
(Completion(token), Barrier(tracker))
let (tx, rx) = mpsc::channel::<()>(1);
let rx = Mutex::new(rx);
let rx = Arc::new(rx);
(Completion(tx), Barrier(rx))
}
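
One side of the diff above implements the completion/barrier pair on a bounded `mpsc` channel instead of `TaskTracker`. A self-contained sketch of that pattern, assuming only the `tokio` crate (with default timer and macro features):

```rust
// `Barrier::wait` returns once every `Completion` clone has been dropped,
// because the receiver yields `None` only after all senders are gone
// (nothing is ever actually sent on the channel).
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};

#[derive(Clone)]
struct Completion(mpsc::Sender<()>);

#[derive(Clone)]
struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

impl Barrier {
    async fn wait(self) {
        // `recv` resolves with `None` when the last `Completion` is dropped.
        self.0.lock().await.recv().await;
    }
}

fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel::<()>(1);
    (Completion(tx), Barrier(Arc::new(Mutex::new(rx))))
}

#[tokio::main]
async fn main() {
    let (done, barrier) = channel();
    tokio::spawn(async move {
        let _guard = done; // dropped when this task finishes
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
    });
    barrier.wait().await; // returns only after the spawned task drops `done`
}
```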

View File

@@ -25,12 +25,8 @@ pub async fn json_request_or_empty_body<T: for<'de> Deserialize<'de>>(
if body.remaining() == 0 {
return Ok(None);
}
let mut deser = serde_json::de::Deserializer::from_reader(body.reader());
serde_path_to_error::deserialize(&mut deser)
// intentionally stringify because the debug version is not helpful in python logs
.map_err(|e| anyhow::anyhow!("Failed to parse json request: {e}"))
serde_json::from_reader(body.reader())
.context("Failed to parse json request")
.map(Some)
.map_err(ApiError::BadRequest)
}
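
One side of the hunk above routes JSON parsing through `serde_path_to_error` so that a failure names the offending field. A small sketch of that wrapping, assuming `serde` (with the derive feature), `serde_json`, and `serde_path_to_error`; the request struct is hypothetical:

```rust
// Deserialize through `serde_path_to_error` so the error message carries the
// path of the offending field (here: `count`).
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct CreateRequest {
    name: String,
    count: u32,
}

fn main() {
    let body = r#"{ "name": "x", "count": "not-a-number" }"#;
    let mut deser = serde_json::Deserializer::from_str(body);
    let res: Result<CreateRequest, _> = serde_path_to_error::deserialize(&mut deser);
    // Stringified, as in the diff, because the Debug form is less readable in logs.
    let err = res.unwrap_err();
    println!("Failed to parse json request: {err}");
}
```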

View File

@@ -1,7 +1,6 @@
use std::str::FromStr;
use anyhow::Context;
use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, EnumVariantNames};
@@ -25,48 +24,16 @@ impl LogFormat {
}
}
struct TracingEventCountMetric {
error: IntCounter,
warn: IntCounter,
info: IntCounter,
debug: IntCounter,
trace: IntCounter,
}
static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",
"Number of tracing events, by level",
&["level"]
)
.expect("failed to define metric");
TracingEventCountMetric::new(vec)
.expect("failed to define metric")
});
impl TracingEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self {
error: vec.with_label_values(&["error"]),
warn: vec.with_label_values(&["warn"]),
info: vec.with_label_values(&["info"]),
debug: vec.with_label_values(&["debug"]),
trace: vec.with_label_values(&["trace"]),
}
}
fn inc_for_level(&self, level: tracing::Level) {
let counter = match level {
tracing::Level::ERROR => &self.error,
tracing::Level::WARN => &self.warn,
tracing::Level::INFO => &self.info,
tracing::Level::DEBUG => &self.debug,
tracing::Level::TRACE => &self.trace,
};
counter.inc();
}
}
struct TracingEventCountLayer(&'static TracingEventCountMetric);
struct TracingEventCountLayer(&'static metrics::IntCounterVec);
impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
where
@@ -77,7 +44,15 @@ where
event: &tracing::Event<'_>,
_ctx: tracing_subscriber::layer::Context<'_, S>,
) {
self.0.inc_for_level(*event.metadata().level());
let level = event.metadata().level();
let level = match *level {
tracing::Level::ERROR => "error",
tracing::Level::WARN => "warn",
tracing::Level::INFO => "info",
tracing::Level::DEBUG => "debug",
tracing::Level::TRACE => "trace",
};
self.0.with_label_values(&[level]).inc();
}
}
@@ -131,9 +106,7 @@ pub fn init(
};
log_layer.with_filter(rust_log_env_filter())
});
let r = r.with(
TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
);
let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
match tracing_error_layer_enablement {
TracingErrorLayerEnablement::EnableWithRustLogFilter => r
.with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -284,14 +257,14 @@ impl std::fmt::Debug for SecretString {
mod tests {
use metrics::{core::Opts, IntCounterVec};
use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
use super::TracingEventCountLayer;
#[test]
fn tracing_event_count_metric() {
let counter_vec =
IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
let layer = TracingEventCountLayer(metric);
let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
let layer = TracingEventCountLayer(counter_vec);
use tracing_subscriber::prelude::*;
tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {
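
Both sides of the diff above count tracing events per level from a `tracing_subscriber` layer; they differ only in whether the counter is looked up by label string or pre-resolved per level. A self-contained sketch of such a layer, assuming the `tracing` and `tracing-subscriber` crates and using plain atomics in place of prometheus counters:

```rust
// A layer that counts events per level. Plain atomics stand in for the
// prometheus counters used in the real code.
use std::sync::atomic::{AtomicU64, Ordering};
use tracing_subscriber::layer::{Context, Layer, SubscriberExt};

#[derive(Default)]
struct EventCounts {
    error: AtomicU64,
    warn: AtomicU64,
    info: AtomicU64,
    debug: AtomicU64,
    trace: AtomicU64,
}

struct CountLayer(&'static EventCounts);

impl<S: tracing::Subscriber> Layer<S> for CountLayer {
    fn on_event(&self, event: &tracing::Event<'_>, _ctx: Context<'_, S>) {
        let counter = match *event.metadata().level() {
            tracing::Level::ERROR => &self.0.error,
            tracing::Level::WARN => &self.0.warn,
            tracing::Level::INFO => &self.0.info,
            tracing::Level::DEBUG => &self.0.debug,
            tracing::Level::TRACE => &self.0.trace,
        };
        counter.fetch_add(1, Ordering::Relaxed);
    }
}

fn main() {
    let counts: &'static EventCounts = Box::leak(Box::new(EventCounts::default()));
    let subscriber = tracing_subscriber::registry().with(CountLayer(counts));
    tracing::subscriber::with_default(subscriber, || {
        tracing::info!("hello");
        tracing::warn!("careful");
    });
    assert_eq!(counts.info.load(Ordering::Relaxed), 1);
    assert_eq!(counts.warn.load(Ordering::Relaxed), 1);
}
```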

View File

@@ -30,32 +30,18 @@ async fn warn_if_stuck<Fut: std::future::Future>(
let mut fut = std::pin::pin!(fut);
let mut warned = false;
let ret = loop {
loop {
match tokio::time::timeout(warn_period, &mut fut).await {
Ok(ret) => break ret,
Ok(ret) => return ret,
Err(_) => {
tracing::warn!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
warned = true;
}
}
};
// If we emitted a warning for slowness, also emit a message when we complete, so that
// someone debugging a shutdown can know for sure whether we have moved past this operation.
if warned {
tracing::info!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, after taking longer than expected"
)
}
ret
}
#[derive(Debug)]
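
The hunk above wraps a future so that a warning is logged for each period it is still pending (one side also logs a completion message once a warning has been emitted). A minimal sketch of the wrapper, assuming the `tokio` crate and plain `eprintln!` in place of `tracing`:

```rust
// Poll the future with a timeout; warn each period it is still pending.
use std::time::{Duration, Instant};

async fn warn_if_stuck<F: std::future::Future>(
    fut: F,
    name: &str,
    warn_period: Duration,
) -> F::Output {
    let started = Instant::now();
    let mut fut = std::pin::pin!(fut);
    loop {
        match tokio::time::timeout(warn_period, &mut fut).await {
            Ok(ret) => return ret,
            Err(_) => eprintln!(
                "gate {name}: still waiting after {}ms, taking longer than expected...",
                started.elapsed().as_millis()
            ),
        }
    }
}

#[tokio::main]
async fn main() {
    let slow = tokio::time::sleep(Duration::from_millis(250));
    warn_if_stuck(slow, "example-gate", Duration::from_millis(100)).await;
}
```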

View File

@@ -436,9 +436,9 @@ mod tests {
event_mask: 0,
}),
expected_messages: vec![
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
// walproposer will panic when it finishes sync_safekeepers
std::panic::catch_unwind(|| wp.start()).unwrap_err();
// validate the resulting LSN
assert_eq!(receiver.try_recv(), Ok(1337));
assert_eq!(receiver.recv()?, 1337);
Ok(())
// drop() will free up resources here
}

View File

@@ -36,7 +36,6 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }

View File

@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::tenant::TenantSharedResources;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
@@ -425,6 +425,7 @@ fn start_pageserver(
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx;
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -504,17 +505,6 @@ fn start_pageserver(
}
});
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
// is still accessible even if background task is not configured as long as remote storage has
@@ -544,7 +534,6 @@ fn start_pageserver(
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
)
.context("Failed to initialize router state")?,
);
@@ -571,6 +560,7 @@ fn start_pageserver(
}
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.

View File

@@ -70,8 +70,6 @@ pub mod defaults {
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
///
/// Default built-in configuration file.
///
@@ -119,8 +117,6 @@ pub mod defaults {
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
[remote_storage]
"#
@@ -219,10 +215,6 @@ pub struct PageServerConf {
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
/// How many heatmap uploads may be done concurrently: lower values implicitly deprioritize
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -301,8 +293,6 @@ struct PageServerConfigBuilder {
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
}
impl Default for PageServerConfigBuilder {
@@ -371,8 +361,6 @@ impl Default for PageServerConfigBuilder {
control_plane_api: Set(None),
control_plane_api_token: Set(None),
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
}
}
}
@@ -513,10 +501,6 @@ impl PageServerConfigBuilder {
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
}
pub fn heatmap_upload_concurrency(&mut self, value: usize) {
self.heatmap_upload_concurrency = BuilderValue::Set(value)
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
@@ -611,10 +595,6 @@ impl PageServerConfigBuilder {
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
})
}
}
@@ -848,9 +828,7 @@ impl PageServerConf {
},
"control_plane_emergency_mode" => {
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
},
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
_ => bail!("unrecognized pageserver option '{key}'"),
}
@@ -918,7 +896,6 @@ impl PageServerConf {
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
}
}
}
@@ -1143,8 +1120,7 @@ background_task_maximum_delay = '334 s'
)?,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
control_plane_emergency_mode: false
},
"Correct defaults should be used when no config values are provided"
);
@@ -1201,8 +1177,7 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
control_plane_emergency_mode: false
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -256,6 +256,8 @@ async fn calculate_synthetic_size_worker(
info!("calculate_synthetic_size_worker stopped");
};
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
loop {
let started_at = Instant::now();
@@ -267,25 +269,26 @@ async fn calculate_synthetic_size_worker(
}
};
for (tenant_shard_id, tenant_state) in tenants {
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards.
continue;
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
return Ok(());
}
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
}
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
continue;
};
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
calculate_and_log(&tenant, cancel, ctx).await;
}
crate::tenant::tasks::warn_when_period_overrun(
@@ -296,7 +299,7 @@ async fn calculate_synthetic_size_worker(
let res = tokio::time::timeout_at(
started_at + synthetic_size_calculation_interval,
cancel.cancelled(),
task_mgr::shutdown_token().cancelled(),
)
.await;
if res.is_ok() {
@@ -304,31 +307,3 @@ async fn calculate_synthetic_size_worker(
}
}
}
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
const CAUSE: LogicalSizeCalculationCause =
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
return;
};
// this error can be returned if timeline is shutting down, but it does not
// mean the synthetic size worker should terminate. we do not need any checks
// in this function because `mgr::get_tenant` will error out after shutdown has
// progressed to shutting down tenants.
let shutting_down = matches!(
e.downcast_ref::<PageReconstructError>(),
Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
);
if !shutting_down {
let tenant_shard_id = tenant.tenant_shard_id();
error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
}
}
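
One side of the hunk above downcasts an `anyhow::Error` to `PageReconstructError` and stays quiet when the failure only reflects shutdown. A sketch of that inspection pattern, assuming the `anyhow` and `thiserror` crates; `ReconstructError` is a hypothetical stand-in for the real error type:

```rust
// Downcast to a specific error type and only log when the failure is not a
// cancellation. `ReconstructError` is a hypothetical stand-in for the real
// `PageReconstructError`.
#[derive(Debug, thiserror::Error)]
enum ReconstructError {
    #[error("operation cancelled")]
    Cancelled,
    #[error("other failure: {0}")]
    Other(String),
}

fn log_unless_shutting_down(e: &anyhow::Error) {
    let shutting_down = matches!(
        e.downcast_ref::<ReconstructError>(),
        Some(ReconstructError::Cancelled)
    );
    if !shutting_down {
        eprintln!("failed to calculate synthetic size: {e:#}");
    }
}

fn main() {
    log_unless_shutting_down(&anyhow::Error::new(ReconstructError::Cancelled)); // stays silent
    log_unless_shutting_down(&anyhow::Error::new(ReconstructError::Other("disk error".into()))); // logs
}
```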

View File

@@ -2,6 +2,7 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use pageserver_api::shard::ShardNumber;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
@@ -197,12 +198,12 @@ pub(super) async fn collect_all_metrics(
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
if state != TenantState::Active || !id.is_zero() {
if state != TenantState::Active {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.ok()
.map(|tenant| (id.tenant_id, tenant))
.map(|tenant| (id, tenant))
}
});
@@ -228,6 +229,11 @@ where
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
// Sharded tenants report all consumption metrics from shard zero
if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
continue;
}
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;

View File

@@ -42,6 +42,7 @@
// reading these fields. We use the Debug impl for semi-structured logging, though.
use std::{
collections::HashMap,
sync::Arc,
time::{Duration, SystemTime},
};
@@ -124,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
_storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: CancellationToken,
) {
@@ -148,14 +149,8 @@ async fn disk_usage_eviction_task(
let start = Instant::now();
async {
let res = disk_usage_eviction_task_iteration(
state,
task_config,
storage,
tenants_dir,
&cancel,
)
.await;
let res =
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
match res {
Ok(()) => {}
@@ -186,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -274,9 +268,8 @@ struct LayerCount {
count: usize,
}
pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -328,16 +321,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Walk through the list of candidates, until we have accumulated enough layers to get
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
// how much disk space would be used after evicting all the layers up to the current
// point in the list.
// point in the list. The layers are collected in 'batched', grouped per timeline.
//
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
@@ -346,13 +339,25 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
break;
}
if partition == &MinResidentSizePartition::Below && warned.is_none() {
if partition == MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
evicted_amount += 1;
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
// tasks to evict all seen layers until we have evicted enough
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
}
let usage_planned = match warned {
@@ -367,79 +372,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
};
debug!(?usage_planned, "usage planned");
// phase2: evict layers
// phase2: evict victims batched by timeline
let mut js = tokio::task::JoinSet::new();
let limit = 1000;
let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
let mut consumed_all = false;
// ratelimit to 1k files or any higher max batch size
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched {
let tenant_shard_id = timeline.tenant_shard_id;
let timeline_id = timeline.timeline_id;
let batch_size =
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
let evict_layers = async move {
loop {
let next = if js.len() >= limit || consumed_all {
js.join_next().await
} else if !js.is_empty() {
// opportunistically consume ready result, one per each new evicted
futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
} else {
None
};
// I dislike naming of `available_permits` but it means current total amount of permits
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
usage_assumed.add_available_bytes(file_size);
debug!(%timeline_id, "evicting batch for timeline");
let evict = {
let limit = limit.clone();
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
// semaphore closing means cancelled
return (evicted_bytes, evictions_failed);
};
let results = timeline.evict_layers(&batch).await;
match results {
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
evicted_bytes += file_size;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
}
}
}
}
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
(evicted_bytes, evictions_failed)
}
if consumed_all && js.is_empty() {
break;
}
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
consumed_all = true;
continue;
};
js.spawn(async move {
let rtc = candidate.timeline.remote_client.as_ref().expect(
"holding the witness, all timelines must have a remote timeline client",
);
let file_size = candidate.layer.layer_desc().file_size;
candidate
.layer
.evict_and_wait(rtc)
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
tokio::task::yield_now().await;
}
.instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));
js.spawn(evict);
// spawning multiple thousands of these is essentially blocking, so give already spawned a
// chance of making progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
tuple = join_all => { tuple },
_ = cancel.cancelled() => {
// dropping joinset will abort all pending evict_and_waits and that is fine, our
// requests will still stand
// close the semaphore to stop any pending acquires
limit.close();
return Ok(IterationOutcome::Cancelled);
}
};
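
One side of the diff above groups evictions into per-timeline batches, has each batch acquire as many semaphore permits as it holds layers, and drains results through a `JoinSet`. A compact sketch of that concurrency shape, assuming only the `tokio` crate; the batch contents are stand-in numbers rather than real layers:

```rust
// Each batch acquires as many permits as it has items, so in-flight work
// stays bounded while whole batches run under the same limit.
use std::sync::Arc;
use tokio::sync::Semaphore;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    // Stand-ins for per-timeline layer batches (values play the role of file sizes).
    let batches: Vec<Vec<u64>> = vec![vec![100, 200, 300], vec![400, 500]];
    let max_batch = batches.iter().map(Vec::len).max().unwrap_or(0);
    // Never allow fewer permits than the largest batch, otherwise
    // `acquire_many` could never succeed for that batch.
    let limit = Arc::new(Semaphore::new(1000.max(max_batch)));

    let mut js = JoinSet::new();
    for batch in batches {
        let limit = limit.clone();
        js.spawn(async move {
            let permits = batch.len() as u32;
            let _permits = limit
                .acquire_many_owned(permits)
                .await
                .expect("semaphore closed");
            // ... evict the batch while holding the permits ...
            batch.iter().sum::<u64>() // pretend this is "bytes evicted"
        });
    }

    let mut evicted_bytes = 0;
    while let Some(res) = js.join_next().await {
        evicted_bytes += res.expect("eviction task panicked");
    }
    println!("evicted {evicted_bytes} bytes");
}
```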

View File

@@ -84,6 +84,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: Get tenant status
responses:
@@ -180,6 +181,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: Get timelines for tenant
responses:
@@ -230,6 +232,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -335,6 +338,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -397,6 +401,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -464,6 +469,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -517,6 +523,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Schedules attach operation to happen in the background for the given tenant.
@@ -624,6 +631,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: flush_ms
in: query
required: false
@@ -716,6 +724,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: detach_ignored
in: query
required: false
@@ -775,6 +784,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -823,6 +833,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Schedules an operation that attempts to load a tenant from the local disk and
@@ -879,6 +890,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: |
Calculate tenant's synthetic size
@@ -921,6 +933,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: inputs_only
in: query
required: false
@@ -990,10 +1003,11 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Create a timeline. Returns new timeline id on success.
Recreating the same timeline will succeed if the parameters match the existing timeline.
Create a timeline. Returns new timeline id on success.\
If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver.
requestBody:
content:
@@ -1123,6 +1137,7 @@ paths:
application/json:
schema:
type: string
format: hex
"400":
description: Malformed tenant create request
content:
@@ -1219,6 +1234,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: |
Returns tenant's config description: specific config overrides a tenant has
@@ -1324,6 +1340,7 @@ components:
properties:
new_tenant_id:
type: string
format: hex
generation:
type: integer
description: Attachment generation number.
@@ -1352,6 +1369,7 @@ components:
properties:
tenant_id:
type: string
format: hex
TenantLocationConfigRequest:
type: object
required:
@@ -1359,6 +1377,7 @@ components:
properties:
tenant_id:
type: string
format: hex
mode:
type: string
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1405,8 +1424,6 @@ components:
type: integer
trace_read_requests:
type: boolean
heatmap_period:
type: integer
TenantConfigResponse:
type: object
properties:
@@ -1429,6 +1446,7 @@ components:
format: hex
tenant_id:
type: string
format: hex
last_record_lsn:
type: string
format: hex

View File

@@ -42,7 +42,6 @@ use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::CompactFlags;
@@ -76,11 +75,9 @@ pub struct State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
}
impl State {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
@@ -89,7 +86,6 @@ impl State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
.iter()
@@ -104,7 +100,6 @@ impl State {
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
secondary_controller,
})
}
@@ -141,6 +136,11 @@ impl From<PageReconstructError> for ApiError {
fn from(pre: PageReconstructError) -> ApiError {
match pre {
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
PageReconstructError::NeedsDownload(_, _) => {
// This shouldn't happen, because we use a RequestContext that requests to
// download any missing layer files on-demand.
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
@@ -319,7 +319,6 @@ async fn build_timeline_info_common(
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
let initdb_lsn = timeline.initdb_lsn;
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
@@ -353,14 +352,14 @@ async fn build_timeline_info_common(
let walreceiver_status = timeline.walreceiver_status();
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
// TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
tenant_id: timeline.tenant_shard_id.tenant_id,
timeline_id: timeline.timeline_id,
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
remote_consistent_lsn: remote_consistent_lsn_projected,
remote_consistent_lsn_visible,
initdb_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -453,7 +452,7 @@ async fn timeline_create_handler(
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
Err(tenant::CreateTimelineError::AlreadyExists) => {
json_response(StatusCode::CONFLICT, ())
}
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
@@ -481,15 +480,15 @@ async fn timeline_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_id, true)?;
let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len());
@@ -508,9 +507,7 @@ async fn timeline_list_handler(
}
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
}
.instrument(info_span!("timeline_list",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("timeline_list", %tenant_id))
.await?;
json_response(StatusCode::OK, response_data)
@@ -520,17 +517,17 @@ async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
// Logical size calculation needs downloading.
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline_info = async {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -547,10 +544,7 @@ async fn timeline_detail_handler(
Ok::<_, ApiError>(timeline_info)
}
.instrument(info_span!("timeline_detail",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
%timeline_id))
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
.await?;
json_response(StatusCode::OK, timeline_info)
@@ -560,15 +554,8 @@ async fn get_lsn_by_timestamp_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
}
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let version: Option<u8> = parse_query_param(&request, "version")?;
@@ -580,7 +567,7 @@ async fn get_lsn_by_timestamp_handler(
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
@@ -615,15 +602,8 @@ async fn get_timestamp_of_lsn_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
}
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -633,7 +613,7 @@ async fn get_timestamp_of_lsn_handler(
.map_err(ApiError::BadRequest)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
match result {
@@ -825,11 +805,11 @@ async fn tenant_status(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant_info = async {
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = mgr::get_tenant(tenant_id, false)?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
@@ -839,15 +819,13 @@ async fn tenant_status(
let state = tenant.current_state();
Result::<_, ApiError>::Ok(TenantInfo {
id: tenant_shard_id,
id: tenant_id,
state: state.clone(),
current_physical_size: Some(current_physical_size),
attachment_status: state.attachment_status(),
})
}
.instrument(info_span!("tenant_status_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("tenant_status_handler", %tenant_id))
.await?;
json_response(StatusCode::OK, tenant_info)
@@ -890,20 +868,14 @@ async fn tenant_size_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
}
let tenant = mgr::get_tenant(tenant_id, true)?;
// this can be long operation
let inputs = tenant
@@ -955,7 +927,7 @@ async fn tenant_size_handler(
json_response(
StatusCode::OK,
TenantHistorySize {
id: tenant_shard_id.tenant_id,
id: tenant_id,
size: sizes.as_ref().map(|x| x.total_size),
segment_sizes: sizes.map(|x| x.segments),
inputs,
@@ -967,14 +939,14 @@ async fn layer_map_info_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let reset: LayerAccessStatsReset =
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let layer_map_info = timeline.layer_map_info(reset).await;
json_response(StatusCode::OK, layer_map_info)
@@ -984,12 +956,13 @@ async fn layer_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let downloaded = timeline
.download_layer(layer_file_name)
.await
@@ -1000,7 +973,7 @@ async fn layer_download_handler(
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
None => json_response(
StatusCode::BAD_REQUEST,
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
),
}
}
@@ -1009,12 +982,12 @@ async fn evict_timeline_layer_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let evicted = timeline
.evict_layer(layer_file_name)
.await
@@ -1025,7 +998,7 @@ async fn evict_timeline_layer_handler(
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
None => json_response(
StatusCode::BAD_REQUEST,
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
),
}
}
@@ -1157,10 +1130,10 @@ async fn get_tenant_config_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = mgr::get_tenant(tenant_id, false)?;
let response = HashMap::from([
(
@@ -1254,9 +1227,9 @@ async fn handle_tenant_break(
r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await;
@@ -1297,15 +1270,14 @@ async fn timeline_gc_handler(
mut request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let wait_task_done =
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -1320,9 +1292,9 @@ async fn timeline_compact_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1330,14 +1302,14 @@ async fn timeline_compact_handler(
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
.await
}
@@ -1346,9 +1318,9 @@ async fn timeline_checkpoint_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1356,7 +1328,7 @@ async fn timeline_checkpoint_handler(
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.freeze_and_flush()
.await
@@ -1368,7 +1340,7 @@ async fn timeline_checkpoint_handler(
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
.await
}
@@ -1376,12 +1348,12 @@ async fn timeline_download_remote_layers_handler_post(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
match timeline.spawn_download_all_remote_layers(body).await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1392,11 +1364,11 @@ async fn timeline_download_remote_layers_handler_get(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
@@ -1442,9 +1414,9 @@ async fn getpage_at_lsn_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
struct Key(crate::repository::Key);
@@ -1463,7 +1435,7 @@ async fn getpage_at_lsn_handler(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let page = timeline.get(key.0, lsn, &ctx).await?;
@@ -1475,7 +1447,7 @@ async fn getpage_at_lsn_handler(
.unwrap(),
)
}
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
.await
}
@@ -1483,9 +1455,9 @@ async fn timeline_collect_keyspace(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
struct Partitioning {
keys: crate::keyspace::KeySpace,
@@ -1554,7 +1526,7 @@ async fn timeline_collect_keyspace(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
@@ -1563,15 +1535,15 @@ async fn timeline_collect_keyspace(
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
}
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
.await
}
async fn active_timeline_of_active_tenant(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_id, true)?;
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))
@@ -1593,7 +1565,7 @@ async fn always_panic_handler(
async fn disk_usage_eviction_run(
mut r: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
@@ -1621,48 +1593,57 @@ async fn disk_usage_eviction_run(
}
}
let config = json_request::<Config>(&mut r).await?;
let config = json_request::<Config>(&mut r)
.await
.map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
let usage = Usage {
config,
freed_bytes: 0,
};
let (tx, rx) = tokio::sync::oneshot::channel();
let state = get_state(&r);
let Some(storage) = state.remote_storage.as_ref() else {
if state.remote_storage.as_ref().is_none() {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
};
}
let state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state, storage, usage, &cancel,
)
.await;
let cancel = CancellationToken::new();
let child_cancel = cancel.clone();
let _g = cancel.drop_guard();
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
crate::task_mgr::spawn(
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,
"ondemand disk usage eviction",
false,
async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
usage,
&child_cancel,
)
.await;
let res = res.map_err(ApiError::InternalServerError)?;
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
json_response(StatusCode::OK, res)
}
let _ = tx.send(res);
Ok(())
}
.in_current_span(),
);
async fn secondary_upload_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
state
.secondary_controller
.upload_tenant(tenant_shard_id)
.await
.map_err(ApiError::InternalServerError)?;
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
json_response(StatusCode::OK, response)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1839,25 +1820,23 @@ pub fn make_router(
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_shard_id", |r| {
api_handler(r, tenant_status)
})
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
.delete("/v1/tenant/:tenant_shard_id", |r| {
api_handler(r, tenant_delete_handler)
})
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
})
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
})
.get("/v1/tenant/:tenant_shard_id/config", |r| {
.get("/v1/tenant/:tenant_id/config", |r| {
api_handler(r, get_tenant_config_handler)
})
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
api_handler(r, put_tenant_location_config_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
.get("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
@@ -1878,74 +1857,67 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|r| api_handler(r, get_lsn_by_timestamp_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
api_handler(r, timeline_gc_handler)
})
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
})
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|r| api_handler(r, timeline_gc_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_post),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_get),
)
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_delete_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
api_handler(r, layer_map_info_handler)
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
|r| api_handler(r, layer_map_info_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, layer_download_handler),
)
.delete(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_shard_id/break", |r| {
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
.post("/v1/tracing/event", |r| {
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
"/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
)
.any(handler_404))
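A note on the reworked `disk_usage_eviction_run` handler above: one side of this diff runs the eviction iteration inline under a drop-guarded `CancellationToken`, while the other spawns it via `task_mgr::spawn` and passes the result back over a `tokio::sync::oneshot` channel. Below is a minimal sketch of that spawn-and-report-back pattern, assuming only the `tokio` crate; the task body, the `u64` "freed bytes" result, and the `String` error type are placeholders, not the pageserver's real types.

```
use tokio::sync::oneshot;

#[tokio::main]
async fn main() {
    // Handler side: create a oneshot channel and spawn the long-running
    // iteration onto a background task, keeping the receiver.
    let (tx, rx) = oneshot::channel::<Result<u64, String>>();

    tokio::spawn(async move {
        // Stand-in for disk_usage_eviction_task_iteration_impl: pretend we
        // freed some bytes, then report the outcome to whoever is waiting.
        let freed_bytes: u64 = 42;
        // If the receiver is gone (e.g. the HTTP request was dropped),
        // send() returns Err; the spawned task just ignores that.
        let _ = tx.send(Ok(freed_bytes));
    });

    // Back in the handler: await the result and map it to a response/error.
    match rx.await {
        Ok(Ok(freed)) => println!("eviction iteration finished, freed {freed} bytes"),
        Ok(Err(e)) => eprintln!("eviction iteration failed: {e}"),
        Err(_) => eprintln!("background task dropped the sender without reporting"),
    }
}
```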

View File

@@ -2,10 +2,9 @@ use enum_map::EnumMap;
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
@@ -286,63 +285,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
pub(crate) mod page_cache_eviction_metrics {
use std::num::NonZeroUsize;
use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
#[derive(Clone, Copy)]
pub(crate) enum Outcome {
FoundSlotUnused { iters: NonZeroUsize },
FoundSlotEvicted { iters: NonZeroUsize },
ItersExceeded { iters: NonZeroUsize },
}
static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
&["outcome"],
)
.expect("failed to define a metric")
});
static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_cache_find_victim_calls",
"Incremented at the end of each find_victim() call.\
Filter by outcome to get e.g., eviction rate.",
&["outcome"]
)
.unwrap()
});
pub(crate) fn observe(outcome: Outcome) {
macro_rules! dry {
($label:literal, $iters:expr) => {{
static LABEL: &'static str = $label;
static ITERS_TOTAL: Lazy<IntCounter> =
Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
static CALLS: Lazy<IntCounter> =
Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
ITERS_TOTAL.inc_by(($iters.get()) as u64);
CALLS.inc();
}};
}
match outcome {
Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
Outcome::FoundSlotEvicted { iters } => {
dry!("found_evicted", iters)
}
Outcome::ItersExceeded { iters } => {
dry!("err_iters_exceeded", iters);
super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
}
}
}
}
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -352,6 +294,14 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
.expect("failed to define a metric")
});
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
@@ -651,7 +601,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
"pageserver_evictions_with_low_residence_duration",
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
Residence duration is determined using the `residence_duration_data_source`.",
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
)
.expect("failed to define a metric")
});
@@ -715,16 +665,10 @@ impl EvictionsWithLowResidenceDurationBuilder {
}
}
fn build(
&self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> EvictionsWithLowResidenceDuration {
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
.get_metric_with_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
@@ -755,24 +699,21 @@ impl EvictionsWithLowResidenceDuration {
pub fn change_threshold(
&mut self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
new_threshold: Duration,
) {
if new_threshold == self.threshold {
return;
}
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
self.data_source,
new_threshold,
)
.build(tenant_id, shard_id, timeline_id);
let mut with_new =
EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
.build(tenant_id, timeline_id);
std::mem::swap(self, &mut with_new);
with_new.remove(tenant_id, shard_id, timeline_id);
with_new.remove(tenant_id, timeline_id);
}
// This could be a `Drop` impl, but we need the `tenant_id` and `timeline_id`.
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
let Some(_counter) = self.counter.take() else {
return;
};
@@ -781,7 +722,6 @@ impl EvictionsWithLowResidenceDuration {
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&threshold,
@@ -902,26 +842,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
)
.expect("failed to define a metric")
});
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_virtual_file_descriptor_cache_size_max",
"Maximum number of open file descriptors in the cache."
)
.unwrap()
});
// SIZE_CURRENT: derive it like so:
// ```
// sum(pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$"})
// -ignoring(operation)
// sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"})
// ```
}
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,
@@ -1271,28 +1191,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
)
.expect("failed to define a metric"),
});
pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
pub(crate) upload_heatmap_duration: Histogram,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
)
.expect("failed to define a metric"),
upload_heatmap_errors: register_int_counter!(
"pageserver_secondary_upload_heatmap_errors",
"Failures writing heatmap to remote storage"
)
.expect("failed to define a metric"),
upload_heatmap_duration: register_histogram!(
"pageserver_secondary_upload_heatmap_duration",
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
@@ -1344,16 +1242,25 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
@@ -1627,7 +1534,6 @@ impl StorageTimeMetrics {
#[derive(Debug)]
pub struct TimelineMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
@@ -1648,12 +1554,11 @@ pub struct TimelineMetrics {
impl TimelineMetrics {
pub fn new(
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
@@ -1690,12 +1595,11 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
TimelineMetrics {
tenant_id,
shard_id,
timeline_id,
flush_time_histo,
compact_time_histo,
@@ -1741,7 +1645,6 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -1755,7 +1658,7 @@ impl Drop for TimelineMetrics {
self.evictions_with_low_residence_duration
.write()
.unwrap()
.remove(tenant_id, shard_id, timeline_id);
.remove(tenant_id, timeline_id);
// The following metrics are born outside of the TimelineMetrics lifecycle but still
// removed at the end of it. The idea is to have the metrics outlive the

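Two metrics idioms change in the hunks above: a paired counter (`register_int_counter_pair_vec!`) is traded for two separate `IntCounterVec`s, and the removed `page_cache_eviction_metrics` module caches per-label counter handles behind `Lazy` so the hot path does not repeat the label lookup on every call. A small sketch of that label-caching pattern, written against the upstream `prometheus` and `once_cell` crates rather than the pageserver's `metrics` wrapper; the metric names are made up for the example.

```
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

// The labelled counter family, registered once with the default registry.
static FIND_VICTIM_ITERS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_find_victim_iters_total",
        "Iterations spent in the find_victim loop, by outcome",
        &["outcome"]
    )
    .expect("failed to define a metric")
});

// with_label_values() does a map lookup under a lock, so a hot path caches
// the per-label handle once, as the dry! macro in the removed module does.
static FOUND_EVICTED_ITERS: Lazy<IntCounter> =
    Lazy::new(|| FIND_VICTIM_ITERS.with_label_values(&["found_evicted"]));

fn main() {
    FOUND_EVICTED_ITERS.inc_by(3);
    println!("found_evicted iters so far: {}", FOUND_EVICTED_ITERS.get());
}
```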
View File

@@ -28,7 +28,7 @@
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,15 +83,13 @@ use std::{
use anyhow::Context;
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use utils::{id::TimelineId, lsn::Lsn};
use crate::{
context::RequestContext,
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
repository::Key,
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -152,13 +150,7 @@ enum CacheKey {
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
struct MaterializedPageHashKey {
/// Why is this TenantShardId rather than TenantId?
///
/// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this
/// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
/// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
/// special-cased in some other way.
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
}
@@ -382,7 +374,7 @@ impl PageCache {
/// returned page.
pub async fn lookup_materialized_page(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
@@ -399,7 +391,7 @@ impl PageCache {
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
tenant_id,
timeline_id,
key: *key,
},
@@ -440,7 +432,7 @@ impl PageCache {
///
pub async fn memorize_materialized_page(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
lsn: Lsn,
@@ -448,7 +440,7 @@ impl PageCache {
) -> anyhow::Result<()> {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
tenant_id,
timeline_id,
key,
},
@@ -905,10 +897,8 @@ impl PageCache {
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::ItersExceeded {
iters: iters.try_into().unwrap(),
},
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
);
anyhow::bail!("exceeded evict iter limit");
}
@@ -919,18 +909,8 @@ impl PageCache {
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
iters: iters.try_into().unwrap(),
},
);
} else {
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotUnused {
iters: iters.try_into().unwrap(),
},
);
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}
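For context on the `MaterializedPageHashKey` change above: the materialized-page half of the cache is keyed by (tenant, timeline, page key) plus an LSN, and a lookup wants the newest cached image at or below the requested LSN. The toy model below shows only that keying and lookup rule, with plain integers standing in for `TenantShardId`/`TenantId`, `TimelineId`, `Key`, and `Lsn`; the real cache works on fixed buffer slots with pinning and usage counts, none of which is modeled here.

```
use std::collections::HashMap;

// Stand-ins for the real id types.
type TenantKey = u64;
type TimelineKey = u64;
type PageKey = u64;
type Lsn = u64;

#[derive(PartialEq, Eq, Hash, Clone, Copy)]
struct MaterializedPageHashKey {
    tenant: TenantKey,
    timeline: TimelineKey,
    key: PageKey,
}

/// Toy materialized-page cache: each hash key maps to page images at
/// several LSNs, and lookup answers "newest image at or below this LSN".
#[derive(Default)]
struct MaterializedPageCache {
    // Versions are kept sorted by LSN, oldest first.
    pages: HashMap<MaterializedPageHashKey, Vec<(Lsn, Vec<u8>)>>,
}

impl MaterializedPageCache {
    fn memorize(&mut self, key: MaterializedPageHashKey, lsn: Lsn, img: Vec<u8>) {
        let versions = self.pages.entry(key).or_default();
        versions.push((lsn, img));
        versions.sort_by_key(|(l, _)| *l);
    }

    fn lookup(&self, key: &MaterializedPageHashKey, lsn: Lsn) -> Option<(Lsn, &[u8])> {
        let versions = self.pages.get(key)?;
        versions
            .iter()
            .rev()
            .find(|(l, _)| *l <= lsn)
            .map(|(l, img)| (*l, img.as_slice()))
    }
}

fn main() {
    let key = MaterializedPageHashKey { tenant: 1, timeline: 7, key: 42 };
    let mut cache = MaterializedPageCache::default();
    cache.memorize(key, 100, b"page@100".to_vec());
    cache.memorize(key, 200, b"page@200".to_vec());

    // A read at LSN 150 sees the image that was materialized at LSN 100.
    let (hit_lsn, img) = cache.lookup(&key, 150).unwrap();
    println!("hit at lsn {hit_lsn}: {}", String::from_utf8_lossy(img));
}
```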

View File

@@ -822,7 +822,10 @@ impl<'a> DatadirModification<'a> {
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory
self.init_aux_dir()?;
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
@@ -930,7 +933,10 @@ impl<'a> DatadirModification<'a> {
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory as well
self.init_aux_dir()?;
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
}
if r.is_none() {
// Create RelDirectory
@@ -1255,14 +1261,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
Ok(())
}
pub async fn put_file(
&mut self,
path: &str,
@@ -1769,13 +1767,6 @@ const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
// AUX_FILES currently stores only data for logical replication (slots etc.), and
// we don't preserve it on a branch because safekeepers can't follow a timeline
// switch (and it should probably be optional in general), so these keys are not inherited.
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (

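The `init_aux_dir` hunk above is about one idea: the aux-file directory lives in the key/value store as a single serialized image of an empty map, written once at timeline init and then read-modified-rewritten by later writers. A rough sketch of that initialize-then-update flow, assuming `serde` (with the derive feature) and `bincode` 1.x as stand-ins for the pageserver's own serialization helper and `Value::Image` storage; the struct and path names are illustrative only.

```
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

// Stand-in for AuxFilesDirectory: a map from aux file path to contents,
// stored as one serialized image under a single key (AUX_FILES_KEY).
#[derive(Serialize, Deserialize, Default)]
struct AuxFilesDirectory {
    files: HashMap<String, Vec<u8>>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // init_aux_dir-style step: serialize an empty directory image...
    let empty = AuxFilesDirectory::default();
    let image: Vec<u8> = bincode::serialize(&empty)?;

    // ...which later writers deserialize, modify, and re-serialize.
    let mut dir: AuxFilesDirectory = bincode::deserialize(&image)?;
    dir.files
        .insert("pg_replslot/slot1/state".to_string(), b"...".to_vec());
    let updated = bincode::serialize(&dir)?;

    println!(
        "empty image: {} bytes, updated image: {} bytes",
        image.len(),
        updated.len()
    );
    Ok(())
}
```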
View File

@@ -42,7 +42,6 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use futures::FutureExt;
use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle;
use tokio::task_local;
@@ -52,7 +51,7 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use crate::shutdown_pageserver;
@@ -258,9 +257,6 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryUploads,
// Initial logical size calculation
InitialLogicalSizeCalculation,
@@ -321,7 +317,7 @@ struct PageServerTask {
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>,
@@ -333,7 +329,7 @@ struct PageServerTask {
pub fn spawn<F>(
runtime: &tokio::runtime::Handle,
kind: TaskKind,
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
name: &str,
shutdown_process_on_error: bool,
@@ -349,7 +345,7 @@ where
kind,
name: name.to_string(),
cancel: cancel.clone(),
tenant_shard_id,
tenant_id,
timeline_id,
mutable: Mutex::new(MutableTaskState { join_handle: None }),
});
@@ -428,28 +424,28 @@ async fn task_finish(
Ok(Err(err)) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
}
}
Err(err) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
}
}
@@ -471,11 +467,11 @@ async fn task_finish(
///
/// Or to shut down all tasks for given timeline:
///
/// shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
///
pub async fn shutdown_tasks(
kind: Option<TaskKind>,
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
) {
let mut victim_tasks = Vec::new();
@@ -484,35 +480,35 @@ pub async fn shutdown_tasks(
let tasks = TASKS.lock().unwrap();
for task in tasks.values() {
if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
{
task.cancel.cancel();
victim_tasks.push((
Arc::clone(task),
task.kind,
task.tenant_shard_id,
task.tenant_id,
task.timeline_id,
));
}
}
}
let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
let join_handle = {
let mut task_mut = task.mutable.lock().unwrap();
task_mut.join_handle.take()
};
if let Some(mut join_handle) = join_handle {
if log_all {
if tenant_shard_id.is_none() {
if tenant_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -521,13 +517,12 @@ pub async fn shutdown_tasks(
{
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for task {} to shut down", task.name);
info!("waiting for {} to shut down", task.name);
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)
// - task errors are already logged in the wrapper
let _ = join_handle.await;
info!("task {} completed", task.name);
}
} else {
// Possibly one of:
@@ -561,14 +556,9 @@ pub async fn shutdown_watcher() {
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
/// `tokio::task::JoinSet::spawn`.
pub fn shutdown_token() -> CancellationToken {
let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
if cfg!(test) {
// in tests this method is called from non-taskmgr spawned tasks, and that is all ok.
res.unwrap_or_default()
} else {
res.expect("shutdown_token() called in an unexpected task or thread")
}
SHUTDOWN_TOKEN
.try_with(|t| t.clone())
.expect("shutdown_token() called in an unexpected task or thread")
}
/// Has the current task been requested to shut down?
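The `shutdown_tasks` hunks above keep one selection rule regardless of whether tasks are tagged with a `tenant_id` or a `tenant_shard_id`: a `None` filter matches every task, a `Some` filter must match exactly, across kind, tenant, and timeline. The sketch below isolates just that rule over simplified task descriptors; the real function additionally cancels each victim's `CancellationToken` and awaits its join handle with a timeout.

```
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum TaskKind {
    Attach,
    Compaction,
    WalReceiver,
}

struct TaskEntry {
    name: &'static str,
    kind: TaskKind,
    tenant_id: Option<u64>,
    timeline_id: Option<u64>,
}

/// Mirrors the selection rule in shutdown_tasks: a `None` filter matches
/// everything, a `Some` filter must match exactly.
fn select_victims<'a>(
    tasks: &'a [TaskEntry],
    kind: Option<TaskKind>,
    tenant_id: Option<u64>,
    timeline_id: Option<u64>,
) -> Vec<&'a TaskEntry> {
    tasks
        .iter()
        .filter(|t| kind.is_none() || Some(t.kind) == kind)
        .filter(|t| tenant_id.is_none() || t.tenant_id == tenant_id)
        .filter(|t| timeline_id.is_none() || t.timeline_id == timeline_id)
        .collect()
}

fn main() {
    let tasks = [
        TaskEntry { name: "attach", kind: TaskKind::Attach, tenant_id: Some(1), timeline_id: None },
        TaskEntry { name: "compaction", kind: TaskKind::Compaction, tenant_id: Some(1), timeline_id: Some(7) },
        TaskEntry { name: "wal receiver", kind: TaskKind::WalReceiver, tenant_id: Some(2), timeline_id: Some(9) },
    ];

    // Like shutdown_tasks(None, Some(tenant_id), None): everything for tenant 1.
    for t in select_victims(&tasks, None, Some(1), None) {
        println!("would cancel {} ({:?})", t.name, t.kind);
    }
}
```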

View File

@@ -48,7 +48,6 @@ use self::mgr::GetActiveTenantError;
use self::mgr::GetTenantError;
use self::mgr::TenantsMap;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineExclusionError;
use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
@@ -88,6 +87,7 @@ use std::process::Stdio;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::sync::MutexGuard;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};
@@ -144,7 +144,6 @@ pub mod storage_layer;
pub mod config;
pub mod delete;
pub mod mgr;
pub mod secondary;
pub mod tasks;
pub mod upload_queue;
@@ -249,12 +248,6 @@ pub struct Tenant {
generation: Generation,
timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
/// During timeline creation, we first insert the TimelineId into the
/// creating map, then into `timelines`, then remove it from the creating map.
/// **Lock order**: if acquiring both, acquire `timelines` before `timelines_creating`
timelines_creating: std::sync::Mutex<HashSet<TimelineId>>,
// This mutex prevents creation of new timelines during GC.
// Adding yet another mutex (in addition to `timelines`) is needed because holding
// `timelines` mutex during all GC iteration
@@ -413,10 +406,8 @@ impl Debug for SetStoppingError {
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("creation of timeline with the given ID is in progress")]
AlreadyCreating,
#[error("timeline already exists with different parameters")]
Conflict,
#[error("a timeline with the given ID already exists")]
AlreadyExists,
#[error(transparent)]
AncestorLsn(anyhow::Error),
#[error("ancestor timeline is not active")]
@@ -617,7 +608,7 @@ impl Tenant {
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
Some(tenant_shard_id),
Some(tenant_shard_id.tenant_id),
None,
"attach tenant",
false,
@@ -1466,7 +1457,7 @@ impl Tenant {
/// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the
/// minimum amount of keys required to get a writable timeline.
/// (Without it, `put` might fail due to `repartition` failing.)
pub(crate) async fn create_empty_timeline(
pub async fn create_empty_timeline(
&self,
new_timeline_id: TimelineId,
initdb_lsn: Lsn,
@@ -1478,7 +1469,10 @@ impl Tenant {
"Cannot create empty timelines on inactive tenant"
);
let timeline_uninit_mark = self.create_timeline_uninit_mark(new_timeline_id)?;
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(new_timeline_id, &timelines)?
};
let new_metadata = TimelineMetadata::new(
// Initialize disk_consistent LSN to 0, The caller must import some data to
// make it valid, before calling finish_creation()
@@ -1555,7 +1549,7 @@ impl Tenant {
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline(
pub async fn create_timeline(
&self,
new_timeline_id: TimelineId,
ancestor_timeline_id: Option<TimelineId>,
@@ -1576,51 +1570,26 @@ impl Tenant {
.enter()
.map_err(|_| CreateTimelineError::ShuttingDown)?;
// Get exclusive access to the timeline ID: this ensures that it does not already exist,
// and that no other creation attempts will be allowed while we are working. The
// uninit_mark is a guard.
let uninit_mark = match self.create_timeline_uninit_mark(new_timeline_id) {
Ok(m) => m,
Err(TimelineExclusionError::AlreadyCreating) => {
// Creation is in progress, we cannot create it again, and we cannot
// check if this request matches the existing one, so caller must try
// again later.
return Err(CreateTimelineError::AlreadyCreating);
}
Err(TimelineExclusionError::Other(e)) => {
return Err(CreateTimelineError::Other(e));
}
Err(TimelineExclusionError::AlreadyExists(existing)) => {
debug!("timeline {new_timeline_id} already exists");
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
debug!("timeline {new_timeline_id} already exists");
// Idempotency: creating the same timeline twice is not an error, unless
// the second creation has different parameters.
if existing.get_ancestor_timeline_id() != ancestor_timeline_id
|| existing.pg_version != pg_version
|| (ancestor_start_lsn.is_some()
&& ancestor_start_lsn != Some(existing.get_ancestor_lsn()))
{
return Err(CreateTimelineError::Conflict);
}
if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
}
return Ok(existing);
if let Some(remote_client) = existing.remote_client.as_ref() {
// Wait for uploads to complete, so that when we return Ok, the timeline
// is known to be durable on remote storage. Just like we do at the end of
// this function, after we have created the timeline ourselves.
//
// We only really care that the initial version of `index_part.json` has
// been uploaded. That's enough to remember that the timeline
// exists. However, there is no function to wait specifically for that so
// we just wait for all in-progress uploads to finish.
remote_client
.wait_completion()
.await
.context("wait for timeline uploads to complete")?;
}
};
return Err(CreateTimelineError::AlreadyExists);
}
let loaded_timeline = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
@@ -1657,32 +1626,18 @@ impl Tenant {
ancestor_timeline.wait_lsn(*lsn, ctx).await?;
}
self.branch_timeline(
&ancestor_timeline,
new_timeline_id,
ancestor_start_lsn,
uninit_mark,
ctx,
)
.await?
self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx)
.await?
}
None => {
self.bootstrap_timeline(
new_timeline_id,
pg_version,
load_existing_initdb,
uninit_mark,
ctx,
)
.await?
self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
.await?
}
};
// At this point we have dropped our guard on [`Self::timelines_creating`], and
// the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must
// not send a success to the caller until it is. The same applies to handling retries,
// see the handling of [`TimelineExclusionError::AlreadyExists`] above.
if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
// Wait for the upload of the 'index_part.json` file to finish, so that when we return
// Ok, the timeline is durable in remote storage.
let kind = ancestor_timeline_id
.map(|_| "branched")
.unwrap_or("bootstrapped");
@@ -1962,7 +1917,7 @@ impl Tenant {
//
// this will additionally shutdown and await all timeline tasks.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await;
// Wait for any in-flight operations to complete
self.gate.close().await;
@@ -2159,14 +2114,6 @@ impl Tenant {
.attach_mode
.clone()
}
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
pub(crate) fn get_generation(&self) -> Generation {
self.generation
}
}
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2305,18 +2252,6 @@ impl Tenant {
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_heatmap_period(&self) -> Option<Duration> {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
let heatmap_period = tenant_conf
.heatmap_period
.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
if heatmap_period.is_zero() {
None
} else {
Some(heatmap_period)
}
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
// Don't hold self.timelines.lock() during the notifies.
@@ -2466,7 +2401,6 @@ impl Tenant {
loading_started_at: Instant::now(),
tenant_conf: Arc::new(RwLock::new(attached_conf)),
timelines: Mutex::new(HashMap::new()),
timelines_creating: Mutex::new(HashSet::new()),
gc_cs: tokio::sync::Mutex::new(()),
walredo_mgr,
remote_storage,
@@ -2858,9 +2792,8 @@ impl Tenant {
start_lsn: Option<Lsn>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let uninit_mark = self.create_timeline_uninit_mark(dst_id).unwrap();
let tl = self
.branch_timeline_impl(src_timeline, dst_id, start_lsn, uninit_mark, ctx)
.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
.await?;
tl.set_state(TimelineState::Active);
Ok(tl)
@@ -2874,10 +2807,9 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
timeline_uninit_mark: TimelineUninitMark<'_>,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_uninit_mark, ctx)
self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx)
.await
}
@@ -2886,14 +2818,13 @@ impl Tenant {
src_timeline: &Arc<Timeline>,
dst_id: TimelineId,
start_lsn: Option<Lsn>,
timeline_uninit_mark: TimelineUninitMark<'_>,
_ctx: &RequestContext,
) -> Result<Arc<Timeline>, CreateTimelineError> {
let src_id = src_timeline.timeline_id;
// We will validate our ancestor LSN in this function. Acquire the GC lock so that
// this check cannot race with GC, and the ancestor LSN is guaranteed to remain
// valid while we are creating the branch.
// First acquire the GC lock so that another task cannot advance the GC
// cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
// creating the branch.
let _gc_cs = self.gc_cs.lock().await;
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
@@ -2903,6 +2834,13 @@ impl Tenant {
lsn
});
// Create a placeholder for the new branch. This will error
// out if the new timeline ID is already in use.
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(dst_id, &timelines)?
};
// Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR
// horizon on the source timeline
//
@@ -2994,38 +2932,21 @@ impl Tenant {
Ok(new_timeline)
}
/// For unit tests, make this visible so that other modules can directly create timelines
#[cfg(test)]
pub(crate) async fn bootstrap_timeline_test(
&self,
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let uninit_mark = self.create_timeline_uninit_mark(timeline_id).unwrap();
self.bootstrap_timeline(
timeline_id,
pg_version,
load_existing_initdb,
uninit_mark,
ctx,
)
.await
}
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization completes, tar up the temp dir and upload it to S3.
///
/// The caller is responsible for activating the returned timeline.
async fn bootstrap_timeline(
pub(crate) async fn bootstrap_timeline(
&self,
timeline_id: TimelineId,
pg_version: u32,
load_existing_initdb: Option<TimelineId>,
timeline_uninit_mark: TimelineUninitMark<'_>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(timeline_id, &timelines)?
};
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
@@ -3106,9 +3027,8 @@ impl Tenant {
3,
u32::MAX,
"persist_initdb_tar_zst",
backoff::Cancel::new(self.cancel.clone(), || {
anyhow::anyhow!("initdb upload cancelled")
}),
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
@@ -3223,11 +3143,11 @@ impl Tenant {
/// at 'disk_consistent_lsn'. After any initial data has been imported, call
/// `finish_creation` to insert the Timeline into the timelines map and to remove the
/// uninit mark file.
async fn prepare_new_timeline<'a>(
&'a self,
async fn prepare_new_timeline(
&self,
new_timeline_id: TimelineId,
new_metadata: &TimelineMetadata,
uninit_mark: TimelineUninitMark<'a>,
uninit_mark: TimelineUninitMark,
start_lsn: Lsn,
ancestor: Option<Arc<Timeline>>,
) -> anyhow::Result<UninitializedTimeline> {
@@ -3300,38 +3220,23 @@ impl Tenant {
fn create_timeline_uninit_mark(
&self,
timeline_id: TimelineId,
) -> Result<TimelineUninitMark, TimelineExclusionError> {
timelines: &MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
) -> anyhow::Result<TimelineUninitMark> {
let tenant_shard_id = self.tenant_shard_id;
anyhow::ensure!(
timelines.get(&timeline_id).is_none(),
"Timeline {tenant_shard_id}/{timeline_id} already exists in pageserver's memory"
);
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
anyhow::ensure!(
!timeline_path.exists(),
"Timeline {timeline_path} already exists, cannot create its uninit mark file",
);
let uninit_mark_path = self
.conf
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id);
let uninit_mark = TimelineUninitMark::new(
self,
timeline_id,
uninit_mark_path.clone(),
timeline_path.clone(),
)?;
// At this stage, we have got exclusive access to in-memory state for this timeline ID
// for creation.
// A timeline directory should never exist on disk already:
// - a previous failed creation would have cleaned up after itself
// - a pageserver restart would clean up timeline directories that don't have valid remote state
//
// Therefore it is an unexpected internal error to encounter a timeline directory already existing here;
// this error may indicate a bug in cleanup on failed creations.
if timeline_path.exists() {
return Err(TimelineExclusionError::Other(anyhow::anyhow!(
"Timeline directory already exists! This is a bug."
)));
}
// Create the on-disk uninit mark _after_ the in-memory acquisition of the timeline ID: this guarantees
// that during process runtime, colliding creations will be caught in-memory without getting
// as far as failing to write a file.
fs::OpenOptions::new()
.write(true)
.create_new(true)
@@ -3345,6 +3250,8 @@ impl Tenant {
format!("Failed to crate uninit mark for timeline {tenant_shard_id}/{timeline_id}")
})?;
let uninit_mark = TimelineUninitMark::new(uninit_mark_path, timeline_path);
Ok(uninit_mark)
}
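
The uninit mark is claimed with `create_new(true)`, so a second concurrent creation attempt fails at the filesystem instead of silently proceeding. A standalone sketch of that exclusive-creation pattern (the path and helper name are illustrative, not the pageserver's API):

```rust
use std::fs::OpenOptions;
use std::io::ErrorKind;
use std::path::Path;

/// Try to claim a marker file; returns Ok(true) if we created it,
/// Ok(false) if someone else already holds it.
fn try_claim_marker(path: &Path) -> std::io::Result<bool> {
    match OpenOptions::new().write(true).create_new(true).open(path) {
        Ok(_file) => Ok(true),
        Err(e) if e.kind() == ErrorKind::AlreadyExists => Ok(false),
        Err(e) => Err(e),
    }
}

fn main() -> std::io::Result<()> {
    let marker = std::env::temp_dir().join("timeline-1234.uninit.tmp");
    assert!(try_claim_marker(&marker)?); // first claim succeeds
    assert!(!try_claim_marker(&marker)?); // second claim sees AlreadyExists
    std::fs::remove_file(&marker)?;
    Ok(())
}
```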
@@ -3787,7 +3694,6 @@ pub(crate) mod harness {
tenant_conf.evictions_low_residence_duration_metric_threshold,
),
gc_feedback: Some(tenant_conf.gc_feedback),
heatmap_period: Some(tenant_conf.heatmap_period),
}
}
}
@@ -4094,7 +4000,13 @@ mod tests {
.await
{
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()),
Err(e) => assert_eq!(
e.to_string(),
format!(
"Timeline {}/{} already exists in pageserver's memory",
tenant.tenant_shard_id, TIMELINE_ID
)
),
}
Ok(())

View File

@@ -334,11 +334,6 @@ pub struct TenantConf {
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
/// may be disabled if a Tenant will not have secondary locations: only secondary
/// locations will use the heatmap uploaded by attached locations.
pub heatmap_period: Duration,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -419,11 +414,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
pub heatmap_period: Option<Duration>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -492,7 +482,6 @@ impl TenantConfOpt {
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
}
}
}
@@ -530,7 +519,6 @@ impl Default for TenantConf {
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
}
}
}

View File

@@ -463,7 +463,7 @@ impl DeleteTenantFlow {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
Some(tenant_shard_id.tenant_id),
None,
"tenant_delete",
false,
@@ -550,7 +550,7 @@ impl DeleteTenantFlow {
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
let barrier = {
let mut locked = tenants.write().unwrap();
let removed = locked.remove(tenant.tenant_shard_id);
let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)

View File

@@ -98,6 +98,33 @@ pub(crate) enum TenantsMap {
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
}
/// Helper for mapping shard-unaware functions to a sharding-aware map
/// TODO(sharding): all users of this must be made shard-aware.
fn exactly_one_or_none<'a>(
map: &'a BTreeMap<TenantShardId, TenantSlot>,
tenant_id: &TenantId,
) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
// Retrieve the first two slots in the range: if both are populated, we must panic because the caller
// needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
let slot_a = slots.next();
let slot_b = slots.next();
match (slot_a, slot_b) {
(None, None) => None,
(Some(slot), None) => {
// Exactly one matching slot
Some(slot)
}
(Some(_slot_a), Some(_slot_b)) => {
// Multiple shards for this tenant: cannot handle this yet.
// TODO(sharding): callers of get() should be shard-aware.
todo!("Attaching multiple shards in teh same tenant to the same pageserver")
}
(None, Some(_)) => unreachable!(),
}
}
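
The helper above relies on all shards of one tenant sorting contiguously in the `BTreeMap`, so a range query bounded by the tenant id yields zero, one, or many slots. A self-contained sketch of the same shape, with a simplified `(tenant, shard)` tuple standing in for `TenantShardId`:

```rust
use std::collections::BTreeMap;

// Simplified stand-ins for TenantId / TenantShardId: a shard key is
// (tenant, shard_number), and all shards of one tenant sort contiguously.
type TenantId = u32;
type ShardKey = (TenantId, u8);

/// Return the single entry for `tenant`, `None` if absent, and panic if the
/// map holds more than one shard for it (mirroring the shard-naive helper).
fn exactly_one_or_none<'a, V>(
    map: &'a BTreeMap<ShardKey, V>,
    tenant: TenantId,
) -> Option<(&'a ShardKey, &'a V)> {
    let mut range = map.range((tenant, u8::MIN)..=(tenant, u8::MAX));
    match (range.next(), range.next()) {
        (None, None) => None,
        (Some(entry), None) => Some(entry),
        (Some(_), Some(_)) => panic!("multiple shards for tenant {tenant}"),
        (None, Some(_)) => unreachable!(),
    }
}

fn main() {
    let mut map = BTreeMap::new();
    map.insert((7, 0), "slot for tenant 7");
    map.insert((9, 0), "slot for tenant 9");
    assert!(exactly_one_or_none(&map, 7).is_some());
    assert!(exactly_one_or_none(&map, 8).is_none());
}
```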
pub(crate) enum TenantsMapRemoveResult {
Occupied(TenantSlot),
Vacant,
@@ -120,11 +147,12 @@ impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
/// None is returned.
pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
// TODO(sharding): callers of get() should be shard-aware.
exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
}
}
}
@@ -176,19 +204,25 @@ impl TenantsMap {
///
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
/// slot if the enclosed tenant is shutdown.
pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
use std::collections::btree_map::Entry;
match self {
TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
Entry::Occupied(entry) => match entry.get() {
TenantSlot::InProgress(barrier) => {
TenantsMapRemoveResult::InProgress(barrier.clone())
}
_ => TenantsMapRemoveResult::Occupied(entry.remove()),
},
Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
},
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
match key {
Some(key) => match m.entry(key) {
Entry::Occupied(entry) => match entry.get() {
TenantSlot::InProgress(barrier) => {
TenantsMapRemoveResult::InProgress(barrier.clone())
}
_ => TenantsMapRemoveResult::Occupied(entry.remove()),
},
Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
},
None => TenantsMapRemoveResult::Vacant,
}
}
}
}
@@ -788,16 +822,14 @@ pub(crate) async fn set_new_tenant_config(
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
// Legacy API: does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
let tenant = get_tenant(tenant_id, true)?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
@@ -807,12 +839,6 @@ pub(crate) async fn set_new_tenant_config(
}
impl TenantManager {
/// Convenience function so that anyone with a TenantManager can get at the global configuration, without
/// having to pass it around everywhere as a separate object.
pub(crate) fn get_conf(&self) -> &'static PageServerConf {
self.conf
}
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or does not fit the query.
/// `active_only = true` allows querying only tenants that are ready for operations, erroring on other kinds of tenants.
pub(crate) fn get_attached_tenant_shard(
@@ -1093,20 +1119,6 @@ impl TenantManager {
Ok(())
}
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
let locked = self.tenants.read().unwrap();
match &*locked {
TenantsMap::Initializing => Vec::new(),
TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => map
.values()
.filter_map(|slot| {
slot.get_attached()
.and_then(|t| if t.is_active() { Some(t.clone()) } else { None })
})
.collect(),
}
}
}
#[derive(Debug, thiserror::Error)]
@@ -1131,11 +1143,14 @@ pub(crate) enum GetTenantError {
///
/// This method is cancel-safe.
pub(crate) fn get_tenant(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let locked = TENANTS.read().unwrap();
// TODO(sharding): make all callers of get_tenant shard-aware
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
match peek_slot {
@@ -1147,18 +1162,14 @@ pub(crate) fn get_tenant(
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
}
}
@@ -1531,8 +1542,7 @@ pub(crate) enum TenantMapListError {
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
{
pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1540,10 +1550,12 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
TenantSlot::Secondary => None,
TenantSlot::InProgress(_) => None,
})
// TODO(sharding): make callers of this function shard-aware
.map(|(k, v)| (k.tenant_id, v))
.collect())
}
@@ -2077,20 +2089,22 @@ use {
};
pub(crate) async fn immediate_gc(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().unwrap();
let tenant = guard
.get(&tenant_shard_id)
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_shard_id}"))
.with_context(|| format!("tenant {tenant_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
// TODO(sharding): make callers of this function shard-aware
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
@@ -2102,9 +2116,9 @@ pub(crate) async fn immediate_gc(
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(tenant_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");

View File

@@ -180,7 +180,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
pub(crate) mod download;
mod download;
pub mod index;
mod upload;
@@ -1223,7 +1223,7 @@ impl RemoteTimelineClient {
task_mgr::spawn(
&self.runtime,
TaskKind::RemoteUploadTask,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"remote upload",
false,
@@ -1604,23 +1604,6 @@ impl RemoteTimelineClient {
}
}
}
pub(crate) fn get_layers_metadata(
&self,
layers: Vec<LayerFileName>,
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
let q = self.upload_queue.lock().unwrap();
let q = match &*q {
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
anyhow::bail!("queue is in state {}", q.as_str())
}
UploadQueue::Initialized(inner) => inner,
};
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
Ok(decorated.collect())
}
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1676,13 +1659,6 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {

View File

@@ -4,9 +4,8 @@ use anyhow::{bail, Context};
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
use std::io::{ErrorKind, SeekFrom};
use std::io::ErrorKind;
use tokio::fs::{self, File};
use tokio::io::AsyncSeekExt;
use super::Generation;
use crate::{
@@ -120,14 +119,11 @@ pub(crate) async fn upload_initdb_dir(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
mut initdb_tar_zst: File,
initdb_tar_zst: File,
size: u64,
) -> anyhow::Result<()> {
tracing::trace!("uploading initdb dir");
// We might have already read partway into the file in a prior retry attempt
initdb_tar_zst.seek(SeekFrom::Start(0)).await?;
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);

View File

@@ -1,104 +0,0 @@
pub mod heatmap;
mod heatmap_uploader;
use std::sync::Arc;
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use self::heatmap_uploader::heatmap_uploader_task;
use super::mgr::TenantManager;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::completion::Barrier;
enum UploadCommand {
Upload(TenantShardId),
}
struct CommandRequest<T> {
payload: T,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
}
struct CommandResponse {
result: anyhow::Result<()>,
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
}
impl SecondaryController {
async fn dispatch<T>(
&self,
queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
payload: T,
) -> anyhow::Result<()> {
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
queue
.send(CommandRequest {
payload,
response_tx,
})
.await
.map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
let response = response_rx
.await
.map_err(|_| anyhow::anyhow!("Request dropped"))?;
response.result
}
pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
.await
}
}
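
The controller's `dispatch` is a pseudo-RPC: the command travels over an mpsc queue and the reply comes back on a per-request oneshot channel. A runnable sketch of that pattern, assuming the `tokio` crate with the sync/macros/rt features; the command and error types here are placeholders, not the real controller's:

```rust
use tokio::sync::{mpsc, oneshot};

// Hypothetical command/response shapes; the real controller carries tenant
// shard ids and an anyhow::Result.
struct CommandRequest {
    payload: String,
    response_tx: oneshot::Sender<Result<(), String>>,
}

/// Send one command and wait for the worker's reply (the pseudo-RPC pattern).
async fn dispatch(
    queue: &mpsc::Sender<CommandRequest>,
    payload: String,
) -> Result<(), String> {
    let (response_tx, response_rx) = oneshot::channel();
    queue
        .send(CommandRequest { payload, response_tx })
        .await
        .map_err(|_| "receiver shut down".to_string())?;
    response_rx.await.map_err(|_| "request dropped".to_string())?
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel::<CommandRequest>(16);

    // Worker task: handle each command and answer on its oneshot channel.
    tokio::spawn(async move {
        while let Some(cmd) = rx.recv().await {
            println!("handling command: {}", cmd.payload);
            let _ = cmd.response_tx.send(Ok(()));
        }
    });

    dispatch(&tx, "upload tenant".into()).await.unwrap();
}
```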
pub fn spawn_tasks(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> SecondaryController {
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryUploads,
None,
None,
"heatmap uploads",
false,
async move {
heatmap_uploader_task(
tenant_manager,
remote_storage,
upload_req_rx,
background_jobs_can_start,
cancel,
)
.await
},
);
SecondaryController { upload_req_tx }
}
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
pub fn null_controller() -> SecondaryController {
let (upload_req_tx, _upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
SecondaryController { upload_req_tx }
}

View File

@@ -1,64 +0,0 @@
use std::time::SystemTime;
use crate::tenant::{
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
use utils::{generation::Generation, id::TimelineId};
#[derive(Serialize, Deserialize)]
pub(super) struct HeatMapTenant {
/// Generation of the attached location that uploaded the heatmap: this is not required
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
pub(super) generation: Generation,
pub(super) timelines: Vec<HeatMapTimeline>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(super) timeline_id: TimelineId,
pub(super) layers: Vec<HeatMapLayer>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerFileName,
pub(super) metadata: IndexLayerMetadata,
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}
impl HeatMapLayer {
pub(crate) fn new(
name: LayerFileName,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
Self {
name,
metadata,
access_time,
}
}
}
impl HeatMapTimeline {
pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
Self {
timeline_id,
layers,
}
}
}

View File

@@ -1,582 +0,0 @@
use std::{
collections::HashMap,
sync::{Arc, Weak},
time::{Duration, Instant},
};
use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
},
};
use md5;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{backoff, completion::Barrier};
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
/// Period between heatmap uploader walking Tenants to look for work to do.
/// If any tenants have a heatmap upload period lower than this, it will be adjusted
/// downward to match.
const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
struct WriteInProgress {
barrier: Barrier,
}
struct UploadPending {
tenant: Arc<Tenant>,
last_digest: Option<md5::Digest>,
}
struct WriteComplete {
tenant_shard_id: TenantShardId,
completed_at: Instant,
digest: Option<md5::Digest>,
next_upload: Option<Instant>,
}
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
/// uploads disabled.
struct UploaderTenantState {
// This Weak only exists to enable culling idle instances of this type
// when the Tenant has been deallocated.
tenant: Weak<Tenant>,
/// Digest of the serialized heatmap that we last successfully uploaded
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
last_digest: Option<md5::Digest>,
/// When the last upload attempt completed (may have been successful or failed)
last_upload: Option<Instant>,
/// When should we next do an upload? None means never.
next_upload: Option<Instant>,
}
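
The `last_digest` field lets the uploader skip a write when the serialized heatmap has not changed since the last successful upload. A standalone sketch of that skip logic; it swaps the real code's md5 (kept there for S3 ETag interop) for a std hasher purely for illustration:

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Stand-in digest; any stable digest works for the "skip if unchanged" check.
fn digest(bytes: &[u8]) -> u64 {
    let mut h = DefaultHasher::new();
    bytes.hash(&mut h);
    h.finish()
}

/// Returns `Some(new_digest)` if an upload should happen, `None` if the
/// serialized heatmap is unchanged since the last successful upload.
fn needs_upload(serialized: &[u8], last_digest: Option<u64>) -> Option<u64> {
    let d = digest(serialized);
    if Some(d) == last_digest {
        None
    } else {
        Some(d)
    }
}

fn main() {
    let heatmap = br#"{"timelines":[]}"#;
    let first = needs_upload(heatmap, None);
    assert!(first.is_some());
    // After a successful upload we remember the digest and skip next time.
    assert!(needs_upload(heatmap, first).is_none());
}
```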
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
tenants: HashMap<TenantShardId, UploaderTenantState>,
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
/// limits permit it.
tenants_pending: std::collections::VecDeque<UploadPending>,
/// Tenants for which a task in `tasks` has been spawned.
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
tasks: JoinSet<()>,
/// Channel for our child tasks to send results to: we use a channel for results rather than
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
/// behavior.
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
concurrent_uploads: usize,
scheduling_interval: Duration,
}
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
/// tenants that require an upload, or handles any commands that have been sent into
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
/// spawn.
///
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
/// all tenants that require an upload, and in between scheduling iterations we will
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
///
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
/// we might block waiting on a Tenant.
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
let mut uploader = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tasks: JoinSet::new(),
tenants: HashMap::new(),
tenants_pending: std::collections::VecDeque::new(),
tenants_uploading: HashMap::new(),
task_result_tx: result_tx,
task_result_rx: result_rx,
concurrent_uploads,
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
uploader.schedule_iteration().await?;
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
let next_scheduling_iteration = Instant::now()
.checked_add(uploader.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s), running immediately!",
uploader.scheduling_interval.as_secs_f64()
);
Instant::now()
});
loop {
tokio::select! {
_ = cancel.cancelled() => {
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
tracing::info!("Heatmap uploader joining tasks");
while let Some(_r) = uploader.tasks.join_next().await {};
tracing::info!("Heatmap uploader terminating");
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("heatmap_uploader_task: woke for command queue");
let cmd = match cmd {
Some(c) => c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap uploader terminating");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
uploader.handle_command(payload, response_tx);
},
_ = uploader.process_next_completion() => {
if !cancel.is_cancelled() {
uploader.spawn_pending();
}
}
}
}
}
Ok(())
}
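
The loop above has a characteristic two-level shape: an outer scheduling pass, then an inner select over cancellation, the next scheduling deadline, and the command queue. A compact, runnable sketch of that skeleton, assuming `tokio` and `tokio_util` as dependencies; the `String` commands are placeholders:

```rust
use std::time::Duration;
use tokio::sync::mpsc;
use tokio_util::sync::CancellationToken;

/// Skeleton of the uploader's event loop: no I/O here, just scheduling,
/// command handling, and shutdown.
async fn scheduler_loop(mut commands: mpsc::Receiver<String>, cancel: CancellationToken) {
    let scheduling_interval = Duration::from_secs(60);
    while !cancel.is_cancelled() {
        // (1) Scheduling pass: walk tenants and enqueue any pending work here.
        println!("scheduling pass");

        let next_pass = tokio::time::Instant::now() + scheduling_interval;
        loop {
            tokio::select! {
                _ = cancel.cancelled() => return,
                _ = tokio::time::sleep_until(next_pass) => break, // time for the next pass
                cmd = commands.recv() => match cmd {
                    Some(cmd) => println!("handling command: {cmd}"),
                    None => return, // the controller side was dropped
                },
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::channel(16);
    let cancel = CancellationToken::new();
    let task = tokio::spawn(scheduler_loop(rx, cancel.clone()));
    tx.send("upload tenant".into()).await.unwrap();
    cancel.cancel();
    task.await.unwrap();
}
```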
impl HeatmapUploader {
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
// Cull any entries in self.tenants whose Arc<Tenant> is gone
self.tenants
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
// The priority order of previously scheduled work may be invalidated by current state: drop
// all pending work (it will be re-scheduled if still needed)
self.tenants_pending.clear();
// Use a fixed 'now' throughout the following loop, for efficiency and fairness.
let now = Instant::now();
// While iterating over the potentially-long list of tenants, we will periodically yield
// to avoid blocking the executor.
const YIELD_ITERATIONS: usize = 1000;
// Iterate over tenants looking for work to do.
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
for (i, tenant) in tenants.into_iter().enumerate() {
// Process is shutting down, drop out
if self.cancel.is_cancelled() {
return Ok(());
}
// Skip tenants that already have a write in flight
if self
.tenants_uploading
.contains_key(tenant.get_tenant_shard_id())
{
continue;
}
self.maybe_schedule_upload(&now, tenant);
if (i + 1) % YIELD_ITERATIONS == 0 {
tokio::task::yield_now().await;
}
}
// Spawn tasks for as many of our pending tenants as we can.
self.spawn_pending();
Ok(())
}
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) {
match self.task_result_rx.recv().await {
Some(r) => {
self.on_completion(r);
}
None => {
unreachable!("Result sender is stored on Self");
}
}
}
/// The 'maybe' refers to the tenant's state: whether it is configured
/// for heatmap uploads at all, and whether sufficient time has passed
/// since the last upload.
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
if period < self.scheduling_interval {
self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
}
}
}
// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
}
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(Instant::now()),
last_digest: None,
});
// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
return;
}
let last_digest = state.last_digest;
self.tenants_pending.push_back(UploadPending {
tenant,
last_digest,
})
}
fn spawn_pending(&mut self) {
while !self.tenants_pending.is_empty()
&& self.tenants_uploading.len() < self.concurrent_uploads
{
// unwrap: loop condition includes !is_empty()
let pending = self.tenants_pending.pop_front().unwrap();
self.spawn_upload(pending.tenant, pending.last_digest);
}
}
fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
let remote_storage = self.remote_storage.clone();
let tenant_shard_id = *tenant.get_tenant_shard_id();
let (completion, barrier) = utils::completion::channel();
let result_tx = self.task_result_tx.clone();
self.tasks.spawn(async move {
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;
let started_at = Instant::now();
let digest = match upload_tenant_heatmap(remote_storage, &tenant, last_digest).await {
Ok(UploadHeatmapOutcome::Uploaded(digest)) => {
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap.inc();
Some(digest)
}
Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
Err(UploadHeatmapError::Upload(e)) => {
tracing::warn!(
"Failed to upload heatmap for tenant {}: {e:#}",
tenant.get_tenant_shard_id(),
);
let duration = Instant::now().duration_since(started_at);
SECONDARY_MODE
.upload_heatmap_duration
.observe(duration.as_secs_f64());
SECONDARY_MODE.upload_heatmap_errors.inc();
last_digest
}
Err(UploadHeatmapError::Cancelled) => {
tracing::info!("Cancelled heatmap upload, shutting down");
last_digest
}
};
let now = Instant::now();
let next_upload = tenant
.get_heatmap_period()
.and_then(|period| now.checked_add(period));
result_tx
.send(WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
digest,
next_upload,
})
.ok();
});
self.tenants_uploading
.insert(tenant_shard_id, WriteInProgress { barrier });
}
#[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
fn on_completion(&mut self, completion: WriteComplete) {
tracing::debug!("Heatmap upload completed");
let WriteComplete {
tenant_shard_id,
completed_at,
digest,
next_upload,
} = completion;
self.tenants_uploading.remove(&tenant_shard_id);
use std::collections::hash_map::Entry;
match self.tenants.entry(tenant_shard_id) {
Entry::Vacant(_) => {
// Tenant state was dropped, nothing to update.
}
Entry::Occupied(mut entry) => {
entry.get_mut().last_upload = Some(completed_at);
entry.get_mut().last_digest = digest;
entry.get_mut().next_upload = next_upload
}
}
}
fn handle_command(
&mut self,
command: UploadCommand,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
match command {
UploadCommand::Upload(tenant_shard_id) => {
// If an upload was ongoing for this tenant, let it finish first.
let barrier = if let Some(writing_state) =
self.tenants_uploading.get(&tenant_shard_id)
{
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap write to complete");
writing_state.barrier.clone()
} else {
// Spawn the upload then immediately wait for it. This will block processing of other commands and
// starting of other background work.
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = match self
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, true)
{
Ok(t) => t,
Err(e) => {
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse {
result: Err(e.into()),
}));
return;
}
};
self.spawn_upload(tenant, None);
let writing_state = self
.tenants_uploading
.get(&tenant_shard_id)
.expect("We just inserted this");
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap upload to complete");
writing_state.barrier.clone()
};
// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Heatmap upload complete");
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse { result: Ok(()) }))
});
}
}
}
}
enum UploadHeatmapOutcome {
/// We successfully wrote to remote storage, with this digest.
Uploaded(md5::Digest),
/// We did not upload because the heatmap digest was unchanged since the last upload
NoChange,
/// We skipped the upload for some reason, such as tenant/timeline not ready
Skipped,
}
#[derive(thiserror::Error, Debug)]
enum UploadHeatmapError {
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Upload(#[from] anyhow::Error),
}
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,
last_digest: Option<md5::Digest>,
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
debug_assert_current_span_has_tenant_id();
let generation = tenant.get_generation();
if generation.is_none() {
// We do not expect this: generations were implemented before heatmap uploads. However,
// handle it so that we don't have to make the generation in the heatmap an Option<>
// (Generation::none is not serializable)
tracing::warn!("Skipping heatmap upload for tenant with generation==None");
return Ok(UploadHeatmapOutcome::Skipped);
}
let mut heatmap = HeatMapTenant {
timelines: Vec::new(),
generation,
};
let timelines = tenant.timelines.lock().unwrap().clone();
let tenant_cancel = tenant.cancel.clone();
// Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
// when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
// in remote storage.
let _guard = match tenant.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::info!("Skipping heatmap upload for tenant which is shutting down");
return Err(UploadHeatmapError::Cancelled);
}
};
for (timeline_id, timeline) in timelines {
let heatmap_timeline = timeline.generate_heatmap().await;
match heatmap_timeline {
None => {
tracing::debug!(
"Skipping heatmap upload because timeline {timeline_id} is not ready"
);
return Ok(UploadHeatmapOutcome::Skipped);
}
Some(heatmap_timeline) => {
heatmap.timelines.push(heatmap_timeline);
}
}
}
// Serialize the heatmap
let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
let size = bytes.len();
// Drop out early if nothing changed since our last upload
let digest = md5::compute(&bytes);
if Some(digest) == last_digest {
return Ok(UploadHeatmapOutcome::NoChange);
}
let path = remote_heatmap_path(tenant.get_tenant_shard_id());
// Write the heatmap.
tracing::debug!("Uploading {size} byte heatmap to {path}");
if let Err(e) = backoff::retry(
|| async {
let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
bytes.clone(),
))));
remote_storage
.upload_storage_object(bytes, size, &path)
.await
},
|_| false,
3,
u32::MAX,
"Uploading heatmap",
backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
)
.await
{
if tenant_cancel.is_cancelled() {
return Err(UploadHeatmapError::Cancelled);
} else {
return Err(e.into());
}
}
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(UploadHeatmapOutcome::Uploaded(digest))
}
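
The upload above goes through the pageserver's backoff helper with a cancellation hook. As a rough standalone equivalent, a retry loop with capped exponential backoff that bails out on cancellation could look like the following (hypothetical helper, not the real `backoff::retry` signature; assumes `tokio` and `tokio_util`):

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

/// Retry `op` with capped exponential backoff, giving up early on cancellation.
/// Returns `None` if cancelled before a final result was produced.
async fn retry_with_backoff<T, E, F, Fut>(
    mut op: F,
    max_attempts: u32,
    cancel: &CancellationToken,
) -> Option<Result<T, E>>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
{
    assert!(max_attempts >= 1, "need at least one attempt");
    let mut delay = Duration::from_millis(100);
    for attempt in 1..=max_attempts {
        match op().await {
            Ok(v) => return Some(Ok(v)),
            Err(e) if attempt == max_attempts => return Some(Err(e)),
            Err(_) => {
                tokio::select! {
                    _ = cancel.cancelled() => return None, // shutting down
                    _ = tokio::time::sleep(delay) => {}
                }
                delay = (delay * 2).min(Duration::from_secs(10));
            }
        }
    }
    unreachable!("the final attempt always returns above")
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let mut failures_left = 2u32;
    let res = retry_with_backoff(
        || {
            let fail = failures_left > 0;
            failures_left = failures_left.saturating_sub(1);
            async move { if fail { Err("transient") } else { Ok("uploaded") } }
        },
        5,
        &cancel,
    )
    .await;
    assert_eq!(res, Some(Ok("uploaded")));
}
```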

View File

@@ -457,8 +457,6 @@ struct LayerInner {
/// For loaded layers, this may be some other value if the tenant has undergone
/// a shard split since the layer was originally written.
shard: ShardIndex,
last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,
}
impl std::fmt::Display for LayerInner {
@@ -589,7 +587,6 @@ impl LayerInner {
consecutive_failures: AtomicUsize::new(0),
generation,
shard,
last_evicted_at: std::sync::Mutex::default(),
}
}
@@ -725,14 +722,6 @@ impl LayerInner {
permit
};
let since_last_eviction =
self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
if let Some(since_last_eviction) = since_last_eviction {
// FIXME: this will not always be recorded correctly until #6028 (the no
// download needed branch above)
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
}
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
@@ -848,7 +837,7 @@ impl LayerInner {
crate::task_mgr::spawn(
&tokio::runtime::Handle::current(),
crate::task_mgr::TaskKind::RemoteDownloadTask,
Some(self.desc.tenant_shard_id),
Some(self.desc.tenant_shard_id.tenant_id),
Some(self.desc.timeline_id),
&task_name,
false,
@@ -1128,8 +1117,6 @@ impl LayerInner {
// we are still holding the permit, so no new spawn_download_and_wait can happen
drop(self.status.send(Status::Evicted));
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
res
}
@@ -1434,7 +1421,6 @@ pub(crate) struct LayerImplMetrics {
rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
redownload_after: metrics::Histogram,
}
impl Default for LayerImplMetrics {
@@ -1510,26 +1496,6 @@ impl Default for LayerImplMetrics {
)
.unwrap();
let redownload_after = {
let minute = 60.0;
let hour = 60.0 * minute;
metrics::register_histogram!(
"pageserver_layer_redownloaded_after",
"Time between evicting and re-downloading.",
vec![
10.0,
30.0,
minute,
5.0 * minute,
15.0 * minute,
30.0 * minute,
hour,
12.0 * hour,
]
)
.unwrap()
};
Self {
started_evictions,
completed_evictions,
@@ -1541,7 +1507,6 @@ impl Default for LayerImplMetrics {
rare_counters,
inits_cancelled,
redownload_after,
}
}
}
@@ -1609,10 +1574,6 @@ impl LayerImplMetrics {
fn inc_init_cancelled(&self) {
self.inits_cancelled.inc()
}
fn record_redownloaded_after(&self, duration: std::time::Duration) {
self.redownload_after.observe(duration.as_secs_f64())
}
}
#[derive(enum_map::Enum)]

View File

@@ -54,18 +54,31 @@ impl BackgroundLoopKind {
}
}
/// Cancellation safe.
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
pub(crate) enum RateLimitError {
Cancelled,
}
pub(crate) async fn concurrent_background_tasks_rate_limit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> impl Drop {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
cancel: &CancellationToken,
) -> Result<impl Drop, RateLimitError> {
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
.with_label_values(&[loop_kind.as_static_str()])
.guard();
match CONCURRENT_BACKGROUND_TASKS.acquire().await {
Ok(permit) => permit,
Err(_closed) => unreachable!("we never close the semaphore"),
.inc();
scopeguard::defer!(
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
);
tokio::select! {
permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
match permit {
Ok(permit) => Ok(permit),
Err(_closed) => unreachable!("we never close the semaphore"),
}
},
_ = cancel.cancelled() => {
Err(RateLimitError::Cancelled)
}
}
}
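
The permit function races a shared semaphore against a cancellation token, so background work is both capped in concurrency and quick to abort on shutdown. A runnable sketch of the same shape, assuming `tokio` and `tokio_util`; the names and the limit of 2 are illustrative:

```rust
use std::sync::Arc;
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;

#[derive(Debug)]
enum RateLimitError {
    Cancelled,
}

/// Acquire a slot from the shared background-task semaphore, or bail out if
/// shutdown is requested while we wait.
async fn background_task_permit(
    semaphore: Arc<Semaphore>,
    cancel: &CancellationToken,
) -> Result<tokio::sync::OwnedSemaphorePermit, RateLimitError> {
    tokio::select! {
        permit = semaphore.acquire_owned() => {
            Ok(permit.expect("semaphore is never closed"))
        }
        _ = cancel.cancelled() => Err(RateLimitError::Cancelled),
    }
}

#[tokio::main]
async fn main() {
    // Allow at most 2 concurrent "background tasks".
    let semaphore = Arc::new(Semaphore::new(2));
    let cancel = CancellationToken::new();

    let mut handles = Vec::new();
    for i in 0..4 {
        let semaphore = semaphore.clone();
        let cancel = cancel.clone();
        handles.push(tokio::spawn(async move {
            let _permit = background_task_permit(semaphore, &cancel).await.unwrap();
            println!("task {i} running under the concurrency limit");
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
        }));
    }
    for h in handles {
        h.await.unwrap();
    }
}
```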
@@ -74,13 +87,13 @@ pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_shard_id = tenant.tenant_shard_id;
let tenant_id = tenant.tenant_shard_id.tenant_id;
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
Some(tenant_shard_id),
Some(tenant_id),
None,
&format!("compactor for tenant {tenant_shard_id}"),
&format!("compactor for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
@@ -92,7 +105,7 @@ pub fn start_background_loops(
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
compaction_loop(tenant, cancel)
.instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
.await;
Ok(())
}
@@ -101,9 +114,9 @@ pub fn start_background_loops(
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(tenant_id),
None,
&format!("garbage collector for tenant {tenant_shard_id}"),
&format!("garbage collector for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
@@ -115,7 +128,7 @@ pub fn start_background_loops(
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
.await;
Ok(())
}

View File

@@ -29,7 +29,7 @@ use tokio::{
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::sync::gate::Gate;
use utils::{id::TenantTimelineId, sync::gate::Gate};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
@@ -51,7 +51,7 @@ use crate::tenant::storage_layer::{
LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
ValueReconstructState,
};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError};
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
layer_map::{LayerMap, SearchResult},
@@ -66,7 +66,7 @@ use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
use pageserver_api::reltag::RelTag;
@@ -77,7 +77,7 @@ use postgres_ffi::to_pg_timestamp;
use utils::{
completion,
generation::Generation,
id::TimelineId,
id::{TenantId, TimelineId},
lsn::{AtomicLsn, Lsn, RecordLsn},
seqwait::SeqWait,
simple_rcu::{Rcu, RcuReadGuard},
@@ -98,9 +98,8 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -378,6 +377,9 @@ pub enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error),
/// The operation would require downloading a layer that is missing locally.
NeedsDownload(TenantTimelineId, LayerFileName),
/// The operation was cancelled
Cancelled,
@@ -406,6 +408,14 @@ impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
write!(
f,
"layer {}/{} needs download",
tenant_timeline_id,
layer_file_name.file_name()
)
}
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -419,6 +429,14 @@ impl std::fmt::Display for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
write!(
f,
"layer {}/{} needs download",
tenant_timeline_id,
layer_file_name.file_name()
)
}
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -446,12 +464,6 @@ pub(crate) enum CompactFlags {
ForceRepartition,
}
impl std::fmt::Debug for Timeline {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "Timeline<{}>", self.timeline_id)
}
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -715,27 +727,19 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
// most likely the cancellation token is from a background task, but in tests it could be the
// request task as well.
let prepare = async move {
let guard = self.compaction_lock.lock().await;
let permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Compaction,
ctx,
)
.await;
(guard, permit)
};
let _g = self.compaction_lock.lock().await;
// this wait probably never needs any "long time spent" logging, because we already nag if
// compaction task goes over its period (20s), which is quite often in production.
let (_guard, _permit) = tokio::select! {
tuple = prepare => { tuple },
_ = self.cancel.cancelled() => return Ok(()),
_ = cancel.cancelled() => return Ok(()),
let _permit = match super::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::Compaction,
ctx,
cancel,
)
.await
{
Ok(permit) => permit,
Err(RateLimitError::Cancelled) => return Ok(()),
};
let last_record_lsn = self.get_last_record_lsn();
@@ -922,7 +926,7 @@ impl Timeline {
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
)
.await;
@@ -973,7 +977,7 @@ impl Timeline {
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
)
.await;
@@ -991,7 +995,12 @@ impl Timeline {
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
task_mgr::shutdown_tasks(
None,
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
)
.await;
// Finally wait until any gate-holders are complete
self.gate.close().await;
@@ -1114,9 +1123,8 @@ impl Timeline {
Ok(Some(true))
}
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
/// The additional `Ok(None)` case covers a layer that could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
@@ -1127,17 +1135,109 @@ impl Timeline {
return Ok(None);
};
let rtc = self
let Some(local_layer) = local_layer.keep_resident().await? else {
return Ok(Some(false));
};
let local_layer: Layer = local_layer.into();
let remote_client = self
.remote_client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
match local_layer.evict_and_wait(rtc).await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
let results = self
.evict_layer_batch(remote_client, &[local_layer])
.await?;
assert_eq!(results.len(), 1);
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
match result {
None => anyhow::bail!("task_mgr shutdown requested"),
Some(Ok(())) => Ok(Some(true)),
Some(Err(e)) => Err(anyhow::Error::new(e)),
}
}
/// Evict a batch of layers.
pub(crate) async fn evict_layers(
&self,
layers_to_evict: &[Layer],
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
let _gate = self
.gate
.enter()
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
let remote_client = self
.remote_client
.as_ref()
.context("timeline must have RemoteTimelineClient")?;
self.evict_layer_batch(remote_client, layers_to_evict).await
}
/// Evict multiple layers at once, continuing through errors.
///
/// The `remote_client` should be this timeline's `self.remote_client`.
/// We make the caller provide it so that they are responsible for handling the case
/// where someone wants to evict the layer but no remote storage is configured.
///
/// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
/// If `Err()` is returned, no eviction was attempted.
/// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
/// Meaning of each `result[i]`:
/// - `Some(Err(...))` if layer replacement failed for some reason
///   - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
/// - `Some(Ok(()))` if everything went well.
/// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
async fn evict_layer_batch(
&self,
remote_client: &Arc<RemoteTimelineClient>,
layers_to_evict: &[Layer],
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
{
// to avoid racing with detach and delete_timeline
let state = self.current_state();
anyhow::ensure!(
state == TimelineState::Active,
"timeline is not active but {state:?}"
);
}
let mut results = Vec::with_capacity(layers_to_evict.len());
for _ in 0..layers_to_evict.len() {
results.push(None);
}
let mut js = tokio::task::JoinSet::new();
for (i, l) in layers_to_evict.iter().enumerate() {
js.spawn({
let l = l.to_owned();
let remote_client = remote_client.clone();
async move { (i, l.evict_and_wait(&remote_client).await) }
});
}
let join = async {
while let Some(next) = js.join_next().await {
match next {
Ok((i, res)) => results[i] = Some(res),
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
};
tokio::select! {
_ = self.cancel.cancelled() => {},
_ = join => {}
}
assert_eq!(results.len(), layers_to_evict.len());
Ok(results)
}
}
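
The batch eviction fans work out into a `JoinSet`, records each outcome by input index, and abandons collection on cancellation, leaving the remaining slots as `None`. A self-contained sketch of that fan-out/collect shape with placeholder per-item work (assumes `tokio` and `tokio_util`):

```rust
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;

/// Run one async job per item, returning one slot per input: `None` means the
/// result was never collected (e.g. we were cancelled), `Some(res)` carries
/// the job's outcome.
async fn run_batch(
    items: Vec<String>,
    cancel: CancellationToken,
) -> Vec<Option<Result<(), String>>> {
    let mut results: Vec<Option<Result<(), String>>> = vec![None; items.len()];

    let mut js = JoinSet::new();
    for (i, item) in items.into_iter().enumerate() {
        js.spawn(async move {
            // Placeholder for the real per-item work (eviction, upload, ...).
            let outcome = if item.is_empty() {
                Err("empty item".to_string())
            } else {
                Ok(())
            };
            (i, outcome)
        });
    }

    let join = async {
        while let Some(next) = js.join_next().await {
            match next {
                Ok((i, res)) => results[i] = Some(res),
                Err(join_err) => eprintln!("task failed: {join_err}"),
            }
        }
    };
    tokio::select! {
        _ = cancel.cancelled() => {} // leave uncollected slots as None
        _ = join => {}
    }
    results
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let results = run_batch(vec!["layer-a".into(), "".into()], cancel).await;
    assert_eq!(results.len(), 2);
    println!("{results:?}");
}
```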
/// Number of times we will compute partition within a checkpoint distance.
@@ -1214,20 +1314,16 @@ impl Timeline {
&self.conf.default_tenant_conf,
);
// TODO(sharding): make evictions state shard aware
// (https://github.com/neondatabase/neon/issues/5953)
let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
let timeline_id_str = self.timeline_id.to_string();
self.metrics
.evictions_with_low_residence_duration
.write()
.unwrap()
.change_threshold(
&tenant_id_str,
&shard_id_str,
&timeline_id_str,
new_threshold,
);
.change_threshold(&tenant_id_str, &timeline_id_str, new_threshold);
}
}
@@ -1299,7 +1395,7 @@ impl Timeline {
ancestor_lsn: metadata.ancestor_lsn(),
metrics: TimelineMetrics::new(
&tenant_shard_id,
&tenant_shard_id.tenant_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
@@ -1400,7 +1496,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
@@ -1751,7 +1847,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::InitialLogicalSizeCalculation,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"initial size calculation",
false,
@@ -1790,22 +1886,22 @@ impl Timeline {
let skip_concurrency_limiter = &skip_concurrency_limiter;
async move {
let cancel = task_mgr::shutdown_token();
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit_permit(
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
background_ctx,
&cancel,
);
use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
permit = wait_for_permit => {
(Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
res = wait_for_permit => {
match res {
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
Err(RateLimitError::Cancelled) => {
return Err(BackgroundCalculationError::Cancelled);
}
}
}
_ = self_ref.cancel.cancelled() => {
return Err(BackgroundCalculationError::Cancelled);
}
_ = cancel.cancelled() => {
return Err(BackgroundCalculationError::Cancelled);
},
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of an end-user interaction requested the logical size
// => break out of the rate limit
@@ -1924,7 +2020,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"ondemand logical size calculation",
false,
@@ -2070,55 +2166,6 @@ impl Timeline {
None
}
/// The timeline heatmap is a hint to secondary locations from the primary location,
/// indicating which layers are currently on-disk on the primary.
///
/// None is returned if the Timeline is in a state where uploading a heatmap
/// doesn't make sense, such as shutting down or initializing. The caller
/// should treat this as a cue to simply skip doing any heatmap uploading
/// for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
let remote_client = match &self.remote_client {
Some(c) => c,
None => return None,
};
let layer_file_names = eviction_info
.resident_layers
.iter()
.map(|l| l.layer.layer_desc().filename())
.collect::<Vec<_>>();
let decorated = match remote_client.get_layers_metadata(layer_file_names) {
Ok(d) => d,
Err(_) => {
// Getting metadata only fails on Timeline in bad state.
return None;
}
};
let heatmap_layers = std::iter::zip(
eviction_info.resident_layers.into_iter(),
decorated.into_iter(),
)
.filter_map(|(layer, remote_info)| {
remote_info.map(|remote_info| {
HeatMapLayer::new(
layer.layer.layer_desc().filename(),
IndexLayerMetadata::from(remote_info),
layer.last_activity_ts,
)
})
});
Some(HeatMapTimeline::new(
self.timeline_id,
heatmap_layers.collect(),
))
}
}
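// Minimal sketch of the intended caller behaviour described in the doc comment above:
// `None` is a normal outcome and simply means "skip heatmap uploading for now". The
// `upload` closure stands in for whatever the secondary-location uploader does.
async fn maybe_upload_heatmap<F, Fut>(timeline: &Timeline, upload: F)
where
    F: FnOnce(HeatMapTimeline) -> Fut,
    Fut: std::future::Future<Output = ()>,
{
    match timeline.generate_heatmap().await {
        Some(heatmap) => upload(heatmap).await,
        // shutting down or initializing: skip quietly
        None => {}
    }
}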
type TraversalId = String;
@@ -2232,7 +2279,7 @@ impl Timeline {
}
// Recurse into ancestor if needed
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
@@ -2414,7 +2461,13 @@ impl Timeline {
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
// We should look at the key to determine if it's a cacheable object
let (lsn, read_guard) = cache
.lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
.lookup_materialized_page(
self.tenant_shard_id.tenant_id,
self.timeline_id,
key,
lsn,
ctx,
)
.await?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
@@ -3156,7 +3209,7 @@ impl DurationRecorder {
#[derive(Default)]
struct CompactLevel0Phase1StatsBuilder {
version: Option<u64>,
tenant_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
read_lock_acquisition_micros: DurationRecorder,
read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
@@ -3173,7 +3226,7 @@ struct CompactLevel0Phase1StatsBuilder {
#[derive(serde::Serialize)]
struct CompactLevel0Phase1Stats {
version: u64,
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
read_lock_acquisition_micros: RecordedDuration,
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
@@ -3692,7 +3745,7 @@ impl Timeline {
let ctx = ctx.attached_child();
let mut stats = CompactLevel0Phase1StatsBuilder {
version: Some(2),
tenant_id: Some(self.tenant_shard_id),
tenant_id: Some(self.tenant_shard_id.tenant_id),
timeline_id: Some(self.timeline_id),
..Default::default()
};
@@ -3860,14 +3913,7 @@ impl Timeline {
/// within a layer file. We can only remove the whole file if it's fully
/// obsolete.
pub(super) async fn gc(&self) -> anyhow::Result<GcResult> {
// this is most likely the background task, but it might be the task spawned by
// immediate_gc
let cancel = crate::task_mgr::shutdown_token();
let _g = tokio::select! {
guard = self.gc_lock.lock() => guard,
_ = self.cancel.cancelled() => return Ok(GcResult::default()),
_ = cancel.cancelled() => return Ok(GcResult::default()),
};
let _g = self.gc_lock.lock().await;
let timer = self.metrics.garbage_collect_histo.start_timer();
fail_point!("before-timeline-gc");
@@ -4161,7 +4207,7 @@ impl Timeline {
let cache = page_cache::get();
if let Err(e) = cache
.memorize_materialized_page(
self.tenant_shard_id,
self.tenant_shard_id.tenant_id,
self.timeline_id,
key,
last_rec_lsn,
@@ -4205,7 +4251,7 @@ impl Timeline {
let task_id = task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::DownloadAllRemoteLayers,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"download all remote layers task",
false,
@@ -4566,7 +4612,7 @@ mod tests {
.await
.unwrap();
let rtc = timeline
let rc = timeline
.remote_client
.clone()
.expect("just configured this");
@@ -4579,12 +4625,16 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let first = async { layer.evict_and_wait(&rtc).await };
let second = async { layer.evict_and_wait(&rtc).await };
let batch = [layer];
let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let (first, second) = tokio::join!(first, second);
let res = layer.keep_resident().await;
let (first, second) = (only_one(first), only_one(second));
let res = batch[0].keep_resident().await;
assert!(matches!(res, Ok(None)), "{res:?}");
match (first, second) {
@@ -4605,6 +4655,14 @@ mod tests {
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
fn only_one<T>(mut input: Vec<Option<T>>) -> T {
assert_eq!(1, input.len());
input
.pop()
.expect("length just checked")
.expect("no cancellation")
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let desc = layers

View File

@@ -43,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(timeline.tenant_shard_id),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
)
.await;
@@ -71,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(
None,
Some(timeline.tenant_shard_id),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
)
.await;
@@ -528,7 +528,7 @@ impl DeleteTimelineFlow {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
Some(tenant_shard_id.tenant_id),
Some(timeline_id),
"timeline_delete",
false,

View File

@@ -30,7 +30,7 @@ use crate::{
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
tasks::BackgroundLoopKind,
tasks::{BackgroundLoopKind, RateLimitError},
timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
@@ -60,7 +60,7 @@ impl Timeline {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
&format!(
"layer eviction for {}/{}",
@@ -158,15 +158,15 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::Eviction,
ctx,
);
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
cancel,
)
.await
{
Ok(permit) => permit,
Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
};
// If we evict layers but keep cached values derived from those layers, then
@@ -212,21 +212,11 @@ impl Timeline {
// Gather layers for eviction.
// NB: all the checks can be invalidated as soon as we release the layer map lock.
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let remote_client = match self.remote_client.as_ref() {
Some(c) => c,
None => {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
};
let mut js = tokio::task::JoinSet::new();
{
let candidates: Vec<_> = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = guard.get_from_desc(&hist_layer);
@@ -272,49 +262,54 @@ impl Timeline {
continue;
}
};
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
let remote_client = remote_client.clone();
// this could cause a lot of allocations in some cases
js.spawn(async move { layer.evict_and_wait(&remote_client).await });
stats.candidates += 1;
candidates.push(guard.drop_eviction_guard())
}
}
candidates
};
stats.candidates = candidates.len();
let remote_client = match self.remote_client.as_ref() {
None => {
error!(
num_candidates = candidates.len(),
"no remote storage configured, cannot evict layers"
);
return ControlFlow::Continue(());
}
Some(c) => c,
};
let join_all = async move {
while let Some(next) = js.join_next().await {
match next {
Ok(Ok(())) => stats.evicted += 1,
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
/* already logged */
stats.errors += 1;
}
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
let results = match self.evict_layer_batch(remote_client, &candidates).await {
Err(pre_err) => {
stats.errors += candidates.len();
error!("could not do any evictions: {pre_err:#}");
return ControlFlow::Continue(());
}
stats
Ok(results) => results,
};
tokio::select! {
stats = join_all => {
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
assert_eq!(results.len(), candidates.len());
for result in results {
match result {
None => {
stats.skipped_for_shutdown += 1;
}
Some(Ok(())) => {
stats.evicted += 1;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
}
_ = cancel.cancelled() => {
// just drop the joinset to "abort"
}
}
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
}
ControlFlow::Continue(())
}
@@ -348,7 +343,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());

View File

@@ -19,14 +19,14 @@ use super::Timeline;
pub struct UninitializedTimeline<'t> {
pub(crate) owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
}
impl<'t> UninitializedTimeline<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark<'t>)>,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
) -> Self {
Self {
owning_tenant,
@@ -169,55 +169,18 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
///
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
#[must_use]
pub(crate) struct TimelineUninitMark<'t> {
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
pub(crate) struct TimelineUninitMark {
uninit_mark_deleted: bool,
uninit_mark_path: Utf8PathBuf,
pub(crate) timeline_path: Utf8PathBuf,
}
/// Errors when acquiring exclusive access to a timeline ID for creation
#[derive(thiserror::Error, Debug)]
pub(crate) enum TimelineExclusionError {
#[error("Already exists")]
AlreadyExists(Arc<Timeline>),
#[error("Already creating")]
AlreadyCreating,
// e.g. I/O errors, or some failure deep in postgres initdb
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl<'t> TimelineUninitMark<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
uninit_mark_path: Utf8PathBuf,
timeline_path: Utf8PathBuf,
) -> Result<Self, TimelineExclusionError> {
// Lock order: this is the only place we take both locks. During drop() we only
// lock creating_timelines
let timelines = owning_tenant.timelines.lock().unwrap();
let mut creating_timelines: std::sync::MutexGuard<
'_,
std::collections::HashSet<TimelineId>,
> = owning_tenant.timelines_creating.lock().unwrap();
if let Some(existing) = timelines.get(&timeline_id) {
Err(TimelineExclusionError::AlreadyExists(existing.clone()))
} else if creating_timelines.contains(&timeline_id) {
Err(TimelineExclusionError::AlreadyCreating)
} else {
creating_timelines.insert(timeline_id);
Ok(Self {
owning_tenant,
timeline_id,
uninit_mark_deleted: false,
uninit_mark_path,
timeline_path,
})
impl TimelineUninitMark {
pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
Self {
uninit_mark_deleted: false,
uninit_mark_path,
timeline_path,
}
}
@@ -244,7 +207,7 @@ impl<'t> TimelineUninitMark<'t> {
}
}
impl Drop for TimelineUninitMark<'_> {
impl Drop for TimelineUninitMark {
fn drop(&mut self) {
if !self.uninit_mark_deleted {
if self.timeline_path.exists() {
@@ -263,11 +226,5 @@ impl Drop for TimelineUninitMark<'_> {
}
}
}
self.owning_tenant
.timelines_creating
.lock()
.unwrap()
.remove(&self.timeline_id);
}
}

View File

@@ -30,7 +30,6 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
};
use pageserver_api::shard::TenantShardId;
use std::future::Future;
use std::num::NonZeroU64;
use std::ops::ControlFlow;
@@ -42,7 +41,7 @@ use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TimelineId;
use utils::id::TenantTimelineId;
use self::connection_manager::ConnectionManagerStatus;
@@ -61,8 +60,7 @@ pub struct WalReceiverConf {
}
pub struct WalReceiver {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
timeline: TenantTimelineId,
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
}
@@ -73,7 +71,7 @@ impl WalReceiver {
mut broker_client: BrokerClientChannel,
ctx: &RequestContext,
) -> Self {
let tenant_shard_id = timeline.tenant_shard_id;
let tenant_id = timeline.tenant_shard_id.tenant_id;
let timeline_id = timeline.timeline_id;
let walreceiver_ctx =
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -83,9 +81,9 @@ impl WalReceiver {
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverManager,
Some(timeline.tenant_shard_id),
Some(tenant_id),
Some(timeline_id),
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -119,12 +117,11 @@ impl WalReceiver {
*loop_status.write().unwrap() = None;
Ok(())
}
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
);
Self {
tenant_shard_id,
timeline_id,
timeline: TenantTimelineId::new(tenant_id, timeline_id),
manager_status,
}
}
@@ -132,8 +129,8 @@ impl WalReceiver {
pub async fn stop(self) {
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
Some(self.timeline.tenant_id),
Some(self.timeline.timeline_id),
)
.await;
}

View File

@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverConnectionPoller,
Some(timeline.tenant_shard_id),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
"walreceiver connection",
false,

View File

@@ -654,7 +654,6 @@ pub fn init(num_slots: usize) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;

View File

@@ -458,10 +458,8 @@ impl<'a> WalIngest<'a> {
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
// do not materialize null pages because they will most likely soon be replaced with real data
&& blk.bimg_len != 0
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
@@ -2191,7 +2189,7 @@ mod tests {
.load()
.await;
let tline = tenant
.bootstrap_timeline_test(TIMELINE_ID, pg_version, None, &ctx)
.bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
.await
.unwrap();

View File

@@ -61,7 +61,6 @@ thiserror.workspace = true
tls-listener.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
@@ -78,6 +77,7 @@ postgres-protocol.workspace = true
smol_str.workspace = true
workspace_hack.workspace = true
tokio-util.workspace = true
[dev-dependencies]
rcgen.workspace = true

View File

@@ -62,9 +62,6 @@ pub enum AuthErrorImpl {
Please add it to the allowed list in the Neon console."
)]
IpAddressNotAllowed,
#[error("Too many connections to this endpoint. Please try again later.")]
TooManyConnections,
}
#[derive(Debug, Error)]
@@ -83,10 +80,6 @@ impl AuthError {
pub fn ip_address_not_allowed() -> Self {
AuthErrorImpl::IpAddressNotAllowed.into()
}
pub fn too_many_connections() -> Self {
AuthErrorImpl::TooManyConnections.into()
}
}
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -109,7 +102,6 @@ impl UserFacingError for AuthError {
MissingEndpointName => self.to_string(),
Io(_) => "Internal error".to_string(),
IpAddressNotAllowed => self.to_string(),
TooManyConnections => self.to_string(),
}
}
}

View File

@@ -166,7 +166,7 @@ impl TryFrom<ClientCredentials> for ComputeUserInfo {
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
api: &impl console::Api,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: ClientCredentials,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
@@ -235,7 +235,7 @@ async fn auth_quirks(
/// only if authentication was successful.
async fn auth_and_wake_compute(
api: &impl console::Api,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: ClientCredentials,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
@@ -314,7 +314,7 @@ impl<'a> BackendType<'a, ClientCredentials> {
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
@@ -387,7 +387,7 @@ impl<'a> BackendType<'a, ClientCredentials> {
impl BackendType<'_, ComputeUserInfo> {
pub async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
use BackendType::*;
match self {
@@ -404,7 +404,7 @@ impl BackendType<'_, ComputeUserInfo> {
/// The link auth flow doesn't support this, so we return [`None`] in that case.
pub async fn wake_compute(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
use BackendType::*;

View File

@@ -3,7 +3,7 @@
use crate::{
auth::password_hack::parse_endpoint_param,
error::UserFacingError,
proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
@@ -140,7 +140,7 @@ impl ClientCredentials {
let cache_key = format!(
"{}{}",
project.as_deref().unwrap_or(""),
neon_options_str(params)
neon_options(params).unwrap_or("".to_string())
)
.into();
@@ -406,7 +406,10 @@ mod tests {
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("project"));
assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
assert_eq!(
creds.cache_key,
"projectneon_endpoint_type:read_write neon_lsn:0/2"
);
Ok(())
}

View File

@@ -8,7 +8,6 @@ use std::{net::SocketAddr, sync::Arc};
use futures::future::Either;
use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context};
@@ -21,7 +20,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use utils::{project_git_version, sentry_init::init_sentry};
use tracing::{error, info, Instrument};
use tracing::{error, info, warn, Instrument};
project_git_version!(GIT_VERSION);
@@ -152,39 +151,63 @@ async fn task_main(
// will be inherited by all accepted client sockets.
socket2::SockRef::from(&listener).set_keepalive(true)?;
let connections = tokio_util::task::task_tracker::TaskTracker::new();
let mut connections = tokio::task::JoinSet::new();
while let Some(accept_result) =
run_until_cancelled(listener.accept(), &cancellation_token).await
{
let (socket, peer_addr) = accept_result?;
loop {
tokio::select! {
accept_result = listener.accept() => {
let (socket, peer_addr) = accept_result?;
let session_id = uuid::Uuid::new_v4();
let tls_config = Arc::clone(&tls_config);
let dest_suffix = Arc::clone(&dest_suffix);
let session_id = uuid::Uuid::new_v4();
let tls_config = Arc::clone(&tls_config);
let dest_suffix = Arc::clone(&dest_suffix);
connections.spawn(
async move {
socket
.set_nodelay(true)
.context("failed to set socket option")?;
connections.spawn(
async move {
socket
.set_nodelay(true)
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
error!("per-client task finished with an error: {e:#}");
})
.instrument(tracing::info_span!("handle_client", ?session_id))
);
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
error!("per-client task finished with an error: {e:#}");
})
.instrument(tracing::info_span!("handle_client", ?session_id)),
);
// Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
// If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
// This only counts for this loop and it will be enabled again on next `select!`.
//
// Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
// When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
// not get called again, even if there are more connections to remove.
Some(res) = connections.join_next() => {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
}
_ = cancellation_token.cancelled() => {
drop(listener);
break;
}
}
}
connections.close();
drop(listener);
connections.wait().await;
// Drain connections
info!("waiting for all client connections to finish");
while let Some(res) = connections.join_next().await {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
}
info!("all client connections have finished");
Ok(())
}
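// Standalone sketch of the `select!` precondition pattern the comment above warns
// about (assumes tokio + tokio_util, as already used in this file): the pattern
// `Some(res) = set.join_next()` fails to match only when the set is empty, whereas a
// stricter pattern like `Some(Err(e)) = set.join_next()` also fails to match (and is
// disabled for that `select!` call) whenever a task finishes successfully.
async fn reap_until_cancelled(
    mut set: tokio::task::JoinSet<()>,
    token: tokio_util::sync::CancellationToken,
) {
    loop {
        tokio::select! {
            Some(res) = set.join_next() => {
                if let Err(e) = res {
                    if !e.is_panic() && !e.is_cancelled() {
                        tracing::warn!("unexpected error from joined task: {e:?}");
                    }
                }
            }
            _ = token.cancelled() => break,
        }
    }
}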

View File

@@ -7,8 +7,6 @@ use proxy::console;
use proxy::console::provider::AllowedIpsCache;
use proxy::console::provider::NodeInfoCache;
use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::usage_metrics;
@@ -16,7 +14,6 @@ use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use std::pin::pin;
use std::sync::Arc;
use std::{borrow::Cow, net::SocketAddr};
use tokio::net::TcpListener;
use tokio::task::JoinSet;
@@ -115,12 +112,6 @@ struct ProxyCliArgs {
/// Timeout for the rate limiter. If it doesn't manage to acquire a permit within this time, it will return an error.
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
rate_limiter_timeout: tokio::time::Duration,
/// Endpoint rate limiter max number of requests per second.
///
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
#[clap(long, default_value_t = 100)]
initial_limit: usize,
@@ -163,8 +154,6 @@ async fn main() -> anyhow::Result<()> {
let proxy_listener = TcpListener::bind(proxy_address).await?;
let cancellation_token = CancellationToken::new();
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
// client facing tasks. these will exit on error or on cancellation
// cancellation returns Ok(())
let mut client_tasks = JoinSet::new();
@@ -172,7 +161,6 @@ async fn main() -> anyhow::Result<()> {
config,
proxy_listener,
cancellation_token.clone(),
endpoint_rate_limiter.clone(),
));
// TODO: rename the argument to something like serverless.
@@ -186,7 +174,6 @@ async fn main() -> anyhow::Result<()> {
config,
serverless_listener,
cancellation_token.clone(),
endpoint_rate_limiter.clone(),
));
}
@@ -321,10 +308,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,
};
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let config = Box::leak(Box::new(ProxyConfig {
tls_config,
auth_backend,
@@ -334,35 +317,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
authentication_config,
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
endpoint_rps_limit,
}));
Ok(config)
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use clap::Parser;
use proxy::rate_limiter::RateBucketInfo;
#[test]
fn parse_endpoint_rps_limit() {
let config = super::ProxyCliArgs::parse_from([
"proxy",
"--endpoint-rps-limit",
"100@1s",
"--endpoint-rps-limit",
"20@30s",
]);
assert_eq!(
config.endpoint_rps_limit,
vec![
RateBucketInfo::new(100, Duration::from_secs(1)),
RateBucketInfo::new(20, Duration::from_secs(30)),
]
);
}
}

View File

@@ -1,13 +1,9 @@
use crate::{
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::UserFacingError,
proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
error::UserFacingError, proxy::is_neon_param,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use metrics::IntCounterPairGuard;
use pq_proto::StartupMessageParams;
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
@@ -227,8 +223,6 @@ pub struct PostgresConnection {
pub params: std::collections::HashMap<String, String>,
/// Query cancellation token.
pub cancel_closure: CancelClosure,
_guage: IntCounterPairGuard,
}
impl ConnCfg {
@@ -237,7 +231,6 @@ impl ConnCfg {
&self,
allow_self_signed_compute: bool,
timeout: Duration,
proto: &'static str,
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
@@ -271,7 +264,6 @@ impl ConnCfg {
stream,
params,
cancel_closure,
_guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
};
Ok(connection)
@@ -283,7 +275,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
#[allow(unstable_name_collisions)]
let options: String = params
.options_raw()?
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
.filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();

View File

@@ -1,4 +1,4 @@
use crate::{auth, rate_limiter::RateBucketInfo};
use crate::auth;
use anyhow::{bail, ensure, Context, Ok};
use rustls::{sign, Certificate, PrivateKey};
use sha2::{Digest, Sha256};
@@ -20,7 +20,6 @@ pub struct ProxyConfig {
pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub endpoint_rps_limit: Vec<RateBucketInfo>,
}
#[derive(Debug)]
@@ -33,8 +32,6 @@ pub struct TlsConfig {
pub config: Arc<rustls::ServerConfig>,
pub common_names: Option<HashSet<String>>,
pub cert_resolver: Arc<CertResolver>,
pub handshake_timeout: Duration,
pub max_handshaking: usize,
}
pub struct HttpConfig {
@@ -100,8 +97,6 @@ pub fn configure_tls(
config,
common_names: Some(common_names),
cert_resolver,
handshake_timeout: tls_listener::DEFAULT_HANDSHAKE_TIMEOUT,
max_handshaking: tls_listener::DEFAULT_MAX_HANDSHAKES,
})
}

View File

@@ -196,23 +196,12 @@ pub mod errors {
}
/// Extra query params we'd like to pass to the console.
pub struct ConsoleReqExtra {
pub struct ConsoleReqExtra<'a> {
/// A unique identifier for a connection.
pub session_id: uuid::Uuid,
/// Name of client application, if set.
pub application_name: String,
pub options: Vec<(String, String)>,
}
impl ConsoleReqExtra {
// https://swagger.io/docs/specification/serialization/ DeepObject format
// paramName[prop1]=value1&paramName[prop2]=value2&....
pub fn options_as_deep_object(&self) -> Vec<(String, String)> {
self.options
.iter()
.map(|(k, v)| (format!("options[{}]", k), v.to_string()))
.collect()
}
pub application_name: Option<&'a str>,
pub options: Option<&'a str>,
}
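// Minimal sketch of the DeepObject encoding removed above, using hypothetical option
// values: each `(k, v)` pair becomes `("options[k]", v)` before being handed to the
// request builder's `.query(..)`.
#[test]
fn options_deep_object_sketch() {
    let options = vec![
        ("lsn".to_string(), "0/2".to_string()),
        ("endpoint_type".to_string(), "read_write".to_string()),
    ];
    let deep: Vec<(String, String)> = options
        .iter()
        .map(|(k, v)| (format!("options[{k}]"), v.to_string()))
        .collect();
    assert_eq!(deep[0].0, "options[lsn]");
    assert_eq!(deep[1].0, "options[endpoint_type]");
}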
/// Auth secret which is managed by the cloud.
@@ -259,20 +248,20 @@ pub trait Api {
/// Get the client's auth secret for authentication.
async fn get_auth_info(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<AuthInfo, errors::GetAuthInfoError>;
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}

View File

@@ -144,7 +144,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_auth_info(
&self,
_extra: &ConsoleReqExtra,
_extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
self.do_get_auth_info(creds).await
@@ -152,7 +152,7 @@ impl super::Api for Api {
async fn get_allowed_ips(
&self,
_extra: &ConsoleReqExtra,
_extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
@@ -161,7 +161,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
_extra: &ConsoleReqExtra,
_extra: &ConsoleReqExtra<'_>,
_creds: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
self.do_wake_compute()

View File

@@ -48,7 +48,7 @@ impl Api {
async fn do_get_auth_info(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
let request_id = uuid::Uuid::new_v4().to_string();
@@ -60,9 +60,9 @@ impl Api {
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", extra.session_id)])
.query(&[
("application_name", extra.application_name.as_str()),
("project", creds.endpoint.as_str()),
("role", creds.inner.user.as_str()),
("application_name", extra.application_name),
("project", Some(&creds.endpoint)),
("role", Some(&creds.inner.user)),
])
.build()?;
@@ -101,28 +101,23 @@ impl Api {
async fn do_wake_compute(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> {
let request_id = uuid::Uuid::new_v4().to_string();
async {
let mut request_builder = self
let request = self
.endpoint
.get("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", extra.session_id)])
.query(&[
("application_name", extra.application_name.as_str()),
("project", creds.endpoint.as_str()),
]);
request_builder = if extra.options.is_empty() {
request_builder
} else {
request_builder.query(&extra.options_as_deep_object())
};
let request = request_builder.build()?;
("application_name", extra.application_name),
("project", Some(&creds.endpoint)),
("options", extra.options),
])
.build()?;
info!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
@@ -161,7 +156,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_auth_info(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
self.do_get_auth_info(extra, creds).await
@@ -169,7 +164,7 @@ impl super::Api for Api {
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
let key: &str = &creds.endpoint;
@@ -192,7 +187,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
creds: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
let key: &str = &creds.inner.cache_key;

View File

@@ -9,7 +9,6 @@ use crate::{
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
http::StatusCode,
protocol2::WithClientIp,
rate_limiter::EndpointRateLimiter,
stream::{PqStream, Stream},
usage_metrics::{Ids, USAGE_METRICS},
};
@@ -17,10 +16,7 @@ use anyhow::{bail, Context};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
use metrics::{
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
IntCounterPairVec, IntCounterVec,
};
use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
use once_cell::sync::{Lazy, OnceCell};
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use prometheus::{
@@ -28,7 +24,7 @@ use prometheus::{
IntGaugeVec,
};
use regex::Regex;
use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc};
use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
use tokio::{
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
time,
@@ -47,10 +43,17 @@ const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
const ERR_PROTO_VIOLATION: &str = "protocol violation";
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
pub static NUM_DB_CONNECTIONS_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_opened_db_connections_total",
"Number of opened connections to a database.",
&["protocol"],
)
.unwrap()
});
pub static NUM_DB_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_db_connections_total",
"Number of closed connections to a database.",
&["protocol"],
@@ -58,10 +61,17 @@ pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
.unwrap()
});
pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
pub static NUM_CLIENT_CONNECTION_OPENED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_opened_client_connections_total",
"Number of opened connections from a client.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CLIENT_CONNECTION_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_client_connections_total",
"Number of closed connections from a client.",
&["protocol"],
@@ -69,10 +79,17 @@ pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
.unwrap()
});
pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
pub static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_accepted_connections_total",
"Number of client connections accepted.",
&["protocol"],
)
.unwrap()
});
pub static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_closed_connections_total",
"Number of client connections closed.",
&["protocol"],
@@ -154,7 +171,7 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
pub struct LatencyTimer {
// time since the stopwatch was started
start: Option<time::Instant>,
start: Option<Instant>,
// accumulated time on the stopwatch
accumulated: std::time::Duration,
// label data
@@ -171,7 +188,7 @@ pub struct LatencyTimerPause<'a> {
impl LatencyTimer {
pub fn new(protocol: &'static str) -> Self {
Self {
start: Some(time::Instant::now()),
start: Some(Instant::now()),
accumulated: std::time::Duration::ZERO,
protocol,
cache_miss: false,
@@ -205,7 +222,7 @@ impl LatencyTimer {
impl Drop for LatencyTimerPause<'_> {
fn drop(&mut self) {
// start the stopwatch again
self.timer.start = Some(time::Instant::now());
self.timer.start = Some(Instant::now());
}
}
@@ -260,26 +277,10 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
.unwrap()
});
pub async fn run_until_cancelled<F: std::future::Future>(
f: F,
cancellation_token: &CancellationToken,
) -> Option<F::Output> {
match futures::future::select(
std::pin::pin!(f),
std::pin::pin!(cancellation_token.cancelled()),
)
.await
{
futures::future::Either::Left((f, _)) => Some(f),
futures::future::Either::Right(((), _)) => None,
}
}
pub async fn task_main(
config: &'static ProxyConfig,
listener: tokio::net::TcpListener,
cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("proxy has shut down");
@@ -289,65 +290,71 @@ pub async fn task_main(
// will be inherited by all accepted client sockets.
socket2::SockRef::from(&listener).set_keepalive(true)?;
let connections = tokio_util::task::task_tracker::TaskTracker::new();
let mut connections = tokio::task::JoinSet::new();
let cancel_map = Arc::new(CancelMap::default());
while let Some(accept_result) =
run_until_cancelled(listener.accept(), &cancellation_token).await
{
let (socket, peer_addr) = accept_result?;
loop {
tokio::select! {
accept_result = listener.accept() => {
let (socket, peer_addr) = accept_result?;
let session_id = uuid::Uuid::new_v4();
let cancel_map = Arc::clone(&cancel_map);
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
let session_id = uuid::Uuid::new_v4();
let cancel_map = Arc::clone(&cancel_map);
connections.spawn(
async move {
info!("accepted postgres client connection");
connections.spawn(
async move {
info!("accepted postgres client connection");
let mut socket = WithClientIp::new(socket);
let mut peer_addr = peer_addr;
if let Some(ip) = socket.wait_for_addr().await? {
peer_addr = ip;
tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
} else if config.require_client_ip {
bail!("missing required client IP");
}
let mut socket = WithClientIp::new(socket);
let mut peer_addr = peer_addr;
if let Some(ip) = socket.wait_for_addr().await? {
peer_addr = ip;
tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
} else if config.require_client_ip {
bail!("missing required client IP");
}
socket
.inner
.set_nodelay(true)
.context("failed to set socket option")?;
socket
.inner
.set_nodelay(true)
.context("failed to set socket option")?;
handle_client(
config,
&cancel_map,
session_id,
socket,
ClientMode::Tcp,
peer_addr.ip(),
endpoint_rate_limiter,
)
.await
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr.ip()).await
}
.instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
.unwrap_or_else(move |e| {
// Acknowledge that the task has finished with an error.
error!(?session_id, "per-client task finished with an error: {e:#}");
}),
);
}
.instrument(info_span!(
"handle_client",
?session_id,
peer_addr = tracing::field::Empty
))
.unwrap_or_else(move |e| {
// Acknowledge that the task has finished with an error.
error!(?session_id, "per-client task finished with an error: {e:#}");
}),
);
// Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
// If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
// This only counts for this loop and it will be enabled again on next `select!`.
//
// Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
// When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
// not get called again, even if there are more connections to remove.
Some(res) = connections.join_next() => {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
}
_ = cancellation_token.cancelled() => {
drop(listener);
break;
}
}
}
connections.close();
drop(listener);
// Drain connections
connections.wait().await;
while let Some(res) = connections.join_next().await {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
}
Ok(())
}
@@ -402,7 +409,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
stream: S,
mode: ClientMode,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
info!(
protocol = mode.protocol_label(),
@@ -410,12 +416,16 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
);
let proto = mode.protocol_label();
let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
NUM_CLIENT_CONNECTION_OPENED_COUNTER
.with_label_values(&[proto])
.guard();
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.inc();
NUM_CONNECTIONS_ACCEPTED_COUNTER
.with_label_values(&[proto])
.guard();
.inc();
scopeguard::defer! {
NUM_CLIENT_CONNECTION_CLOSED_COUNTER.with_label_values(&[proto]).inc();
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
}
let tls = config.tls_config.as_ref();
@@ -447,7 +457,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
&params,
session_id,
mode.allow_self_signed_compute(config),
endpoint_rate_limiter,
);
cancel_map
.with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
@@ -467,14 +476,9 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
// Client may try upgrading to each protocol only once
let (mut tried_ssl, mut tried_gss) = (false, false);
let handshake_timeout = tls
.map(|tls| tls.handshake_timeout)
.unwrap_or(tls_listener::DEFAULT_HANDSHAKE_TIMEOUT);
let deadline = time::Instant::now() + handshake_timeout;
let mut stream = PqStream::new(Stream::from_raw(stream));
loop {
let msg = tokio::time::timeout_at(deadline, stream.read_startup_packet()).await??;
let msg = stream.read_startup_packet().await?;
info!("received {msg:?}");
use FeStartupPacket::*;
@@ -500,9 +504,7 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
if !read_buf.is_empty() {
bail!("data is sent before server replied with EncryptionResponse");
}
let tls_stream =
tokio::time::timeout_at(deadline, raw.upgrade(tls.to_server_config()))
.await??;
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
let (_, tls_server_end_point) = tls
.cert_resolver
@@ -569,13 +571,12 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
async fn connect_to_compute_once(
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
proto: &'static str,
) -> Result<PostgresConnection, compute::ConnectionError> {
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute, timeout, proto)
.connect(allow_self_signed_compute, timeout)
.await
}
@@ -596,7 +597,6 @@ pub trait ConnectMechanism {
pub struct TcpMechanism<'a> {
/// KV-dictionary with PostgreSQL connection params.
pub params: &'a StartupMessageParams,
pub proto: &'static str,
}
#[async_trait]
@@ -610,7 +610,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
connect_to_compute_once(node_info, timeout, self.proto).await
connect_to_compute_once(node_info, timeout).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -665,7 +665,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
pub async fn connect_to_compute<M: ConnectMechanism>(
mechanism: &M,
mut node_info: console::CachedNodeInfo,
extra: &console::ConsoleReqExtra,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
mut latency_timer: LatencyTimer,
) -> Result<M::Connection, M::Error>
@@ -922,8 +922,6 @@ struct Client<'a, S> {
session_id: uuid::Uuid,
/// Allow self-signed certificates (for testing).
allow_self_signed_compute: bool,
/// Rate limiter for endpoints
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
}
impl<'a, S> Client<'a, S> {
@@ -934,7 +932,6 @@ impl<'a, S> Client<'a, S> {
params: &'a StartupMessageParams,
session_id: uuid::Uuid,
allow_self_signed_compute: bool,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
Self {
stream,
@@ -942,7 +939,6 @@ impl<'a, S> Client<'a, S> {
params,
session_id,
allow_self_signed_compute,
endpoint_rate_limiter,
}
}
}
@@ -964,29 +960,17 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
params,
session_id,
allow_self_signed_compute,
endpoint_rate_limiter,
} = self;
// check rate limit
if let Some(ep) = creds.get_endpoint() {
if !endpoint_rate_limiter.check(ep) {
return stream
.throw_error(auth::AuthError::too_many_connections())
.await;
}
}
let console_options = neon_options(params);
let proto = mode.protocol_label();
let extra = console::ConsoleReqExtra {
session_id, // aka this connection's id
application_name: format!(
"{}/{}",
params.get("application_name").unwrap_or_default(),
proto
),
options: neon_options(params),
application_name: params.get("application_name"),
options: console_options.as_deref(),
};
let mut latency_timer = LatencyTimer::new(proto);
let mut latency_timer = LatencyTimer::new(mode.protocol_label());
let user = creds.get_user().to_owned();
let auth_result = match creds
@@ -1015,7 +999,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
let aux = node_info.aux.clone();
let mut node = connect_to_compute(
&TcpMechanism { params, proto },
&TcpMechanism { params },
node_info,
&extra,
&creds,
@@ -1024,6 +1008,14 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
.or_else(|e| stream.throw_error(e))
.await?;
let proto = mode.protocol_label();
NUM_DB_CONNECTIONS_OPENED_COUNTER
.with_label_values(&[proto])
.inc();
scopeguard::defer! {
NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
}
prepare_client_connection(&node, session, &mut stream).await?;
// Before proxy passing, forward to compute whatever data is left in the
// PqStream input buffer. Normally there is none, but our serverless npm
@@ -1035,29 +1027,26 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
}
}
pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> {
pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
#[allow(unstable_name_collisions)]
match params.options_raw() {
Some(options) => options.filter_map(neon_option).collect(),
None => vec![],
}
}
pub fn neon_options_str(params: &StartupMessageParams) -> String {
#[allow(unstable_name_collisions)]
neon_options(params)
.iter()
.map(|(k, v)| format!("{}:{}", k, v))
let options: String = params
.options_raw()?
.filter(|opt| is_neon_param(opt))
.sorted() // we sort it to use as cache key
.intersperse(" ".to_owned())
.collect()
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
// Don't even bother with empty options.
if options.is_empty() {
return None;
}
Some(options)
}
pub fn neon_option(bytes: &str) -> Option<(String, String)> {
pub fn is_neon_param(bytes: &str) -> bool {
static RE: OnceCell<Regex> = OnceCell::new();
let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());
RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
let cap = re.captures(bytes)?;
let (_, [k, v]) = cap.extract();
Some((k.to_owned(), v.to_owned()))
RE.get().unwrap().is_match(bytes)
}
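// Quick illustration of the behaviour change above, with assumed inputs: the new
// regex only checks for the `neon_<key>:` prefix and no longer captures the value,
// so callers that need the key/value now keep the whole option string.
#[test]
fn is_neon_param_sketch() {
    assert!(is_neon_param("neon_lsn:0/2"));
    assert!(is_neon_param("neon_endpoint_type:read_write"));
    assert!(!is_neon_param("application_name:psql"));
    assert!(!is_neon_param("neon_lsn")); // no trailing colon, not matched
}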

View File

@@ -85,8 +85,6 @@ fn generate_tls_config<'a>(
config,
common_names,
cert_resolver: Arc::new(cert_resolver),
handshake_timeout: tls_listener::DEFAULT_HANDSHAKE_TIMEOUT,
max_handshaking: tls_listener::DEFAULT_MAX_HANDSHAKES,
}
};
@@ -486,14 +484,14 @@ fn helper_create_connect_info(
mechanism: &TestConnectMechanism,
) -> (
CachedNodeInfo,
console::ConsoleReqExtra,
console::ConsoleReqExtra<'static>,
auth::BackendType<'_, ComputeUserInfo>,
) {
let cache = helper_create_cached_node_info();
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: "TEST".into(),
options: vec![],
application_name: Some("TEST"),
options: None,
};
let creds = auth::BackendType::Test(mechanism);
(cache, extra, creds)

View File

@@ -4,4 +4,3 @@ mod limiter;
pub use aimd::Aimd;
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
pub use limiter::Limiter;
pub use limiter::{EndpointRateLimiter, RateBucketInfo};

View File

@@ -33,6 +33,39 @@ impl Aimd {
min_utilisation_threshold: config.aimd_min_utilisation_threshold,
}
}
pub fn decrease_factor(self, factor: f32) -> Self {
assert!((0.5..1.0).contains(&factor));
Self {
decrease_factor: factor,
..self
}
}
pub fn increase_by(self, increase: usize) -> Self {
assert!(increase > 0);
Self {
increase_by: increase,
..self
}
}
pub fn with_max_limit(self, max: usize) -> Self {
assert!(max > 0);
Self {
max_limit: max,
..self
}
}
/// A threshold below which the limit won't be increased. 0.5 = 50%.
pub fn with_min_utilisation_threshold(self, min_util: f32) -> Self {
assert!(min_util > 0. && min_util < 1.);
Self {
min_utilisation_threshold: min_util,
..self
}
}
}
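// Hypothetical usage of the builder-style setters added above, starting from some
// already-constructed `aimd: Aimd`; the chosen values respect the asserted ranges.
fn tune(aimd: Aimd) -> Aimd {
    aimd.decrease_factor(0.75)
        .increase_by(10)
        .with_max_limit(500)
        .with_min_utilisation_threshold(0.5)
}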
#[async_trait]

View File

@@ -1,19 +1,13 @@
use std::{
collections::hash_map::RandomState,
hash::BuildHasher,
sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex,
Arc,
},
time::Duration,
};
use anyhow::bail;
use dashmap::DashMap;
use itertools::Itertools;
use rand::{rngs::StdRng, Rng, SeedableRng};
use smol_str::SmolStr;
use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
use tokio::time::{timeout, Duration, Instant};
use tokio::time::{timeout, Instant};
use tracing::info;
use super::{
@@ -21,180 +15,6 @@ use super::{
RateLimiterConfig,
};
// Simple per-endpoint rate limiter.
//
// Check that number of connections to the endpoint is below `max_rps` rps.
// Purposefully ignore user name and database name as clients can reconnect
// with different names, so we'll end up sending some http requests to
// the control plane.
//
// We may also save quite a lot of CPU (I think) by bailing out right after we
// see SNI, before doing the TLS handshake. User-side error messages in that case
// do not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
// I went with a more expensive way that yields user-friendlier error messages.
pub struct EndpointRateLimiter<Rand = StdRng, Hasher = RandomState> {
map: DashMap<SmolStr, Vec<RateBucket>, Hasher>,
info: &'static [RateBucketInfo],
access_count: AtomicUsize,
rand: Mutex<Rand>,
}
#[derive(Clone, Copy)]
struct RateBucket {
start: Instant,
count: u32,
}
impl RateBucket {
fn should_allow_request(&mut self, info: &RateBucketInfo, now: Instant) -> bool {
if now - self.start < info.interval {
self.count < info.max_rpi
} else {
// bucket expired, reset
self.count = 0;
self.start = now;
true
}
}
fn inc(&mut self) {
self.count += 1;
}
}
#[derive(Clone, Copy, PartialEq)]
pub struct RateBucketInfo {
pub interval: Duration,
// requests per interval
pub max_rpi: u32,
}
impl std::fmt::Display for RateBucketInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let rps = self.max_rpi * 1000 / self.interval.as_millis() as u32;
write!(f, "{rps}@{}", humantime::format_duration(self.interval))
}
}
impl std::fmt::Debug for RateBucketInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{self}")
}
}
impl std::str::FromStr for RateBucketInfo {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let Some((max_rps, interval)) = s.split_once('@') else {
bail!("invalid rate info")
};
let max_rps = max_rps.parse()?;
let interval = humantime::parse_duration(interval)?;
Ok(Self::new(max_rps, interval))
}
}
impl RateBucketInfo {
pub const DEFAULT_SET: [Self; 3] = [
Self::new(300, Duration::from_secs(1)),
Self::new(200, Duration::from_secs(60)),
Self::new(100, Duration::from_secs(600)),
];
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
info.sort_unstable_by_key(|info| info.interval);
let invalid = info
.iter()
.tuple_windows()
.find(|(a, b)| a.max_rpi > b.max_rpi);
if let Some((a, b)) = invalid {
bail!(
"invalid endpoint RPS limits. {b} allows fewer requests per bucket than {a} ({} vs {})",
b.max_rpi,
a.max_rpi,
);
}
Ok(())
}
pub const fn new(max_rps: u32, interval: Duration) -> Self {
Self {
interval,
max_rpi: max_rps * interval.as_millis() as u32 / 1000,
}
}
}
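// Worked example of the rps -> requests-per-interval conversion in `new`, using the
// values from DEFAULT_SET above:
//   300 rps @ 1s   -> max_rpi = 300 * 1000   / 1000 =   300
//   200 rps @ 60s  -> max_rpi = 200 * 60000  / 1000 = 12000
//   100 rps @ 600s -> max_rpi = 100 * 600000 / 1000 = 60000
// `validate` sorts by interval and then requires max_rpi to be non-decreasing as the
// interval grows, which DEFAULT_SET satisfies (300 <= 12000 <= 60000).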
impl EndpointRateLimiter {
pub fn new(info: &'static [RateBucketInfo]) -> Self {
Self::new_with_rand_and_hasher(info, StdRng::from_entropy(), RandomState::new())
}
}
impl<R: Rng, S: BuildHasher + Clone> EndpointRateLimiter<R, S> {
fn new_with_rand_and_hasher(info: &'static [RateBucketInfo], rand: R, hasher: S) -> Self {
info!(buckets = ?info, "endpoint rate limiter");
Self {
info,
map: DashMap::with_hasher_and_shard_amount(hasher, 64),
access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request
rand: Mutex::new(rand),
}
}
/// Check that number of connections to the endpoint is below `max_rps` rps.
pub fn check(&self, endpoint: SmolStr) -> bool {
// do a partial GC every 2k requests. This cleans up ~ 1/64th of the map.
// worst case memory usage is about:
// = 2 * 2048 * 64 * (48B + 72B)
// = 30MB
if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
self.do_gc();
}
let now = Instant::now();
let mut entry = self.map.entry(endpoint).or_insert_with(|| {
vec![
RateBucket {
start: now,
count: 0,
};
self.info.len()
]
});
let should_allow_request = entry
.iter_mut()
.zip(self.info)
.all(|(bucket, info)| bucket.should_allow_request(info, now));
if should_allow_request {
// only increment the bucket counts if the request will actually be accepted
entry.iter_mut().for_each(RateBucket::inc);
}
should_allow_request
}
/// Clean the map. Simple strategy: remove all entries in a random shard.
/// At worst, we'll double the effective max_rps during the cleanup.
/// But that way deletion does not acquire a mutex on each entry access.
pub fn do_gc(&self) {
info!(
"cleaning up endpoint rate limiter, current size = {}",
self.map.len()
);
let n = self.map.shards().len();
// this lock is ok as the periodic cycle of do_gc makes this very unlikely to collide
// (impossible, in fact, unless we have 2048 threads)
let shard = self.rand.lock().unwrap().gen_range(0..n);
self.map.shards()[shard].write().clear();
}
}
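For readers skimming the diff, here is a minimal, self-contained sketch of the two-phase check that `EndpointRateLimiter::check` performs above: every bucket must admit the request before any counter is incremented, so a rejected request never consumes budget in the shorter buckets. The names and types below are simplified stand-ins, not the crate's API (no DashMap sharding or GC).

use std::time::{Duration, Instant};

#[derive(Clone, Copy)]
struct Bucket { start: Instant, count: u32 }

// Simplified stand-in for the check performed by `EndpointRateLimiter::check`:
// all buckets must allow the request before any of them is incremented.
fn check(buckets: &mut [Bucket], limits: &[(Duration, u32)], now: Instant) -> bool {
    let allowed = buckets.iter_mut().zip(limits).all(|(b, &(interval, max))| {
        if now - b.start < interval {
            b.count < max
        } else {
            // bucket expired: reset and allow
            b.count = 0;
            b.start = now;
            true
        }
    });
    if allowed {
        buckets.iter_mut().for_each(|b| b.count += 1);
    }
    allowed
}

fn main() {
    let now = Instant::now();
    let limits = [(Duration::from_secs(1), 2), (Duration::from_secs(60), 3)];
    let mut buckets = [Bucket { start: now, count: 0 }; 2];
    assert!(check(&mut buckets, &limits, now));
    assert!(check(&mut buckets, &limits, now));
    // a third request within the same second is rejected by the 2-per-second bucket
    assert!(!check(&mut buckets, &limits, now));
}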
/// Limits the number of concurrent jobs.
///
/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the
@@ -233,6 +53,7 @@ pub struct Token<'t> {
#[derive(Debug, Clone, Copy)]
pub struct LimiterState {
limit: usize,
available: usize,
in_flight: usize,
}
@@ -410,7 +231,11 @@ impl Limiter {
pub fn state(&self) -> LimiterState {
let limit = self.limits.load(Ordering::Relaxed);
let in_flight = self.in_flight.load(Ordering::Relaxed);
LimiterState { limit, in_flight }
LimiterState {
limit,
available: limit.saturating_sub(in_flight),
in_flight,
}
}
}
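A side note on the `available` field introduced above: `saturating_sub` is presumably used because `limit` and `in_flight` are read as two separate atomics, so `in_flight` can momentarily exceed `limit`; the clamped form avoids underflow. A trivial sketch, assuming nothing beyond the standard library and a hypothetical helper name:

// Hypothetical helper mirroring the `available` computation in `state()`.
fn available(limit: usize, in_flight: usize) -> usize {
    limit.saturating_sub(in_flight)
}

fn main() {
    assert_eq!(available(10, 3), 7);
    // e.g. the limit was lowered while jobs were still running
    assert_eq!(available(2, 5), 0);
}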
@@ -423,6 +248,13 @@ impl<'t> Token<'t> {
}
}
#[cfg(test)]
pub fn set_latency(&mut self, latency: Duration) {
use std::ops::Sub;
self.start = Instant::now().sub(latency);
}
pub fn forget(&mut self) {
if let Some(permit) = self.permit.take() {
permit.forget();
@@ -441,6 +273,10 @@ impl LimiterState {
pub fn limit(&self) -> usize {
self.limit
}
/// The amount of concurrency available to use.
pub fn available(&self) -> usize {
self.available
}
/// The number of jobs in flight.
pub fn in_flight(&self) -> usize {
self.in_flight
@@ -488,16 +324,12 @@ impl reqwest_middleware::Middleware for Limiter {
#[cfg(test)]
mod tests {
use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration};
use std::{pin::pin, task::Context, time::Duration};
use futures::{task::noop_waker_ref, Future};
use rand::SeedableRng;
use rustc_hash::FxHasher;
use smol_str::SmolStr;
use tokio::time;
use super::{EndpointRateLimiter, Limiter, Outcome};
use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm};
use super::{Limiter, Outcome};
use crate::rate_limiter::RateLimitAlgorithm;
#[tokio::test]
async fn it_works() {
@@ -606,105 +438,4 @@ mod tests {
limiter.release(token1, None).await;
limiter.release(token2, None).await;
}
#[test]
fn rate_bucket_rpi() {
let rate_bucket = RateBucketInfo::new(50, Duration::from_secs(5));
assert_eq!(rate_bucket.max_rpi, 50 * 5);
let rate_bucket = RateBucketInfo::new(50, Duration::from_millis(500));
assert_eq!(rate_bucket.max_rpi, 50 / 2);
}
#[test]
fn rate_bucket_parse() {
let rate_bucket: RateBucketInfo = "100@10s".parse().unwrap();
assert_eq!(rate_bucket.interval, Duration::from_secs(10));
assert_eq!(rate_bucket.max_rpi, 100 * 10);
assert_eq!(rate_bucket.to_string(), "100@10s");
let rate_bucket: RateBucketInfo = "100@1m".parse().unwrap();
assert_eq!(rate_bucket.interval, Duration::from_secs(60));
assert_eq!(rate_bucket.max_rpi, 100 * 60);
assert_eq!(rate_bucket.to_string(), "100@1m");
}
#[test]
fn default_rate_buckets() {
let mut defaults = RateBucketInfo::DEFAULT_SET;
RateBucketInfo::validate(&mut defaults[..]).unwrap();
}
#[test]
#[should_panic = "invalid endpoint RPS limits. 10@10s allows fewer requests per bucket than 300@1s (100 vs 300)"]
fn rate_buckets_validate() {
let mut rates: Vec<RateBucketInfo> = ["300@1s", "10@10s"]
.into_iter()
.map(|s| s.parse().unwrap())
.collect();
RateBucketInfo::validate(&mut rates).unwrap();
}
#[tokio::test]
async fn test_rate_limits() {
let mut rates: Vec<RateBucketInfo> = ["100@1s", "20@30s"]
.into_iter()
.map(|s| s.parse().unwrap())
.collect();
RateBucketInfo::validate(&mut rates).unwrap();
let limiter = EndpointRateLimiter::new(Vec::leak(rates));
let endpoint = SmolStr::from("ep-my-endpoint-1234");
time::pause();
for _ in 0..100 {
assert!(limiter.check(endpoint.clone()));
}
// more connections fail
assert!(!limiter.check(endpoint.clone()));
// fail even after 500ms as it's in the same bucket
time::advance(time::Duration::from_millis(500)).await;
assert!(!limiter.check(endpoint.clone()));
// after a full 1s, 100 requests are allowed again
time::advance(time::Duration::from_millis(500)).await;
for _ in 1..6 {
for _ in 0..100 {
assert!(limiter.check(endpoint.clone()));
}
time::advance(time::Duration::from_millis(1000)).await;
}
// more connections after 600 will exceed the 20rps@30s limit
assert!(!limiter.check(endpoint.clone()));
// will still fail before the 30 second limit
time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
assert!(!limiter.check(endpoint.clone()));
// after the full 30 seconds, 100 requests are allowed again
time::advance(time::Duration::from_millis(1)).await;
for _ in 0..100 {
assert!(limiter.check(endpoint.clone()));
}
}
#[tokio::test]
async fn test_rate_limits_gc() {
// fixed seeded random/hasher to ensure that the test is not flaky
let rand = rand::rngs::StdRng::from_seed([1; 32]);
let hasher = BuildHasherDefault::<FxHasher>::default();
let limiter = EndpointRateLimiter::new_with_rand_and_hasher(
&RateBucketInfo::DEFAULT_SET,
rand,
hasher,
);
for i in 0..1_000_000 {
limiter.check(format!("{i}").into());
}
assert!(limiter.map.len() < 150_000);
}
}

View File

@@ -8,14 +8,11 @@ mod websocket;
use anyhow::bail;
use hyper::StatusCode;
use metrics::IntCounterPairGuard;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE;
use crate::rate_limiter::EndpointRateLimiter;
use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
use crate::{cancellation::CancelMap, config::ProxyConfig};
use futures::StreamExt;
use hyper::{
@@ -29,6 +26,7 @@ use hyper::{
use std::net::IpAddr;
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;
use tokio::net::TcpListener;
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, warn, Instrument};
@@ -38,7 +36,6 @@ pub async fn task_main(
config: &'static ProxyConfig,
ws_listener: TcpListener,
cancellation_token: CancellationToken,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
scopeguard::defer! {
info!("websocket server has shut down");
@@ -58,15 +55,14 @@ pub async fn task_main(
}
});
// let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
let tls_config = match config.tls_config.as_ref() {
Some(config) => config,
let tls_config = config.tls_config.as_ref().map(|cfg| cfg.to_server_config());
let tls_acceptor: tokio_rustls::TlsAcceptor = match tls_config {
Some(config) => config.into(),
None => {
warn!("TLS config is missing, WebSocket Secure server will not be started");
return Ok(());
}
};
let tls_acceptor: tokio_rustls::TlsAcceptor = tls_config.to_server_config().into();
let mut addr_incoming = AddrIncoming::from_listener(ws_listener)?;
let _ = addr_incoming.set_nodelay(true);
@@ -74,20 +70,14 @@ pub async fn task_main(
incoming: addr_incoming,
};
let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
ws_connections.close(); // allows `ws_connections.wait()` to complete
let tls_listener = tls_listener::builder(tls_acceptor)
.handshake_timeout(tls_config.handshake_timeout)
.listen(addr_incoming)
.filter(|conn| {
if let Err(err) = conn {
error!("failed to accept TLS connection for websockets: {err:?}");
ready(false)
} else {
ready(true)
}
});
let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
if let Err(err) = conn {
error!("failed to accept TLS connection for websockets: {err:?}");
ready(false)
} else {
ready(true)
}
});
let make_svc = hyper::service::make_service_fn(
|stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
@@ -96,8 +86,6 @@ pub async fn task_main(
let remote_addr = io.inner.remote_addr();
let sni_name = tls.server_name().map(|s| s.to_string());
let conn_pool = conn_pool.clone();
let ws_connections = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
async move {
let peer_addr = match client_addr {
@@ -109,8 +97,6 @@ pub async fn task_main(
move |req: Request<Body>| {
let sni_name = sni_name.clone();
let conn_pool = conn_pool.clone();
let ws_connections = ws_connections.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
async move {
let cancel_map = Arc::new(CancelMap::default());
@@ -120,12 +106,10 @@ pub async fn task_main(
req,
config,
conn_pool,
ws_connections,
cancel_map,
session_id,
sni_name,
peer_addr.ip(),
endpoint_rate_limiter,
)
.instrument(info_span!(
"serverless",
@@ -145,25 +129,27 @@ pub async fn task_main(
.with_graceful_shutdown(cancellation_token.cancelled())
.await?;
// await websocket connections
ws_connections.wait().await;
Ok(())
}
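The hunk above uses the `TaskTracker` graceful-shutdown pattern: closing the tracker up front lets `wait()` resolve as soon as every task spawned through it has finished. A minimal sketch of the same pattern, assuming the `tokio` and `tokio-util` (with its `rt` feature) crates:

use tokio_util::task::TaskTracker;

// Sketch of the pattern used in `task_main` above, not the proxy's actual code.
#[tokio::main]
async fn main() {
    let tracker = TaskTracker::new();
    tracker.close(); // `wait()` can now complete once tracked tasks are done

    for i in 0..3 {
        tracker.spawn(async move {
            // stand-in for a websocket connection handler
            println!("connection {i} served");
        });
    }

    // graceful shutdown: wait for all in-flight tasks
    tracker.wait().await;
}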
struct MetricService<S> {
inner: S,
_gauge: IntCounterPairGuard,
}
impl<S> MetricService<S> {
fn new(inner: S) -> MetricService<S> {
MetricService {
inner,
_gauge: NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&["http"])
.guard(),
}
NUM_CLIENT_CONNECTION_OPENED_COUNTER
.with_label_values(&["http"])
.inc();
MetricService { inner }
}
}
impl<S> Drop for MetricService<S> {
fn drop(&mut self) {
NUM_CLIENT_CONNECTION_CLOSED_COUNTER
.with_label_values(&["http"])
.inc();
}
}
@@ -184,17 +170,14 @@ where
}
}
#[allow(clippy::too_many_arguments)]
async fn request_handler(
mut request: Request<Body>,
config: &'static ProxyConfig,
conn_pool: Arc<conn_pool::GlobalConnPool>,
ws_connections: TaskTracker,
cancel_map: Arc<CancelMap>,
session_id: uuid::Uuid,
sni_hostname: Option<String>,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Result<Response<Body>, ApiError> {
let host = request
.headers()
@@ -210,7 +193,7 @@ async fn request_handler(
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
.map_err(|e| ApiError::BadRequest(e.into()))?;
ws_connections.spawn(
tokio::spawn(
async move {
if let Err(e) = websocket::serve_websocket(
websocket,
@@ -219,7 +202,6 @@ async fn request_handler(
session_id,
host,
peer_addr,
endpoint_rate_limiter,
)
.await
{
@@ -247,7 +229,7 @@ async fn request_handler(
.header("Access-Control-Allow-Origin", "*")
.header(
"Access-Control-Allow-Headers",
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In, Neon-Batch-Read-Only, Neon-Batch-Isolation-Level",
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
)
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code

View File

@@ -24,7 +24,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console,
proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
proxy::{
neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
NUM_DB_CONNECTIONS_OPENED_COUNTER,
},
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
};
use crate::{compute, config};
@@ -34,7 +37,7 @@ use crate::proxy::ConnectMechanism;
use tracing::{error, warn, Span};
use tracing::{info, info_span, Instrument};
pub const APP_NAME: &str = "/sql_over_http";
pub const APP_NAME: &str = "sql_over_http";
const MAX_CONNS_PER_ENDPOINT: usize = 20;
#[derive(Debug, Clone)]
@@ -429,8 +432,8 @@ async fn connect_to_compute(
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: APP_NAME.to_string(),
options: console_options,
application_name: Some(APP_NAME),
options: console_options.as_deref(),
};
// TODO(anna): this is a bit of a hack; consider using the console notification listener.
if !config.disable_ip_check_for_http {
@@ -474,11 +477,6 @@ async fn connect_to_compute_once(
.connect_timeout(timeout)
.connect(tokio_postgres::NoTls)
.await?;
let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&["http"])
.guard();
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
let (tx, mut rx) = tokio::sync::watch::channel(session);
@@ -494,7 +492,10 @@ async fn connect_to_compute_once(
tokio::spawn(
async move {
let _conn_gauge = conn_gauge;
NUM_DB_CONNECTIONS_OPENED_COUNTER.with_label_values(&["http"]).inc();
scopeguard::defer! {
NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
}
poll_fn(move |cx| {
if matches!(rx.has_changed(), Ok(true)) {
session = *rx.borrow_and_update();

View File

@@ -29,7 +29,7 @@ use utils::http::error::ApiError;
use utils::http::json::json_response;
use crate::config::HttpConfig;
use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE;
use crate::proxy::{NUM_CONNECTIONS_ACCEPTED_COUNTER, NUM_CONNECTIONS_CLOSED_COUNTER};
use super::conn_pool::ConnInfo;
use super::conn_pool::GlobalConnPool;
@@ -303,9 +303,12 @@ async fn handle_inner(
session_id: uuid::Uuid,
peer_addr: IpAddr,
) -> anyhow::Result<Response<Body>> {
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
NUM_CONNECTIONS_ACCEPTED_COUNTER
.with_label_values(&["http"])
.guard();
.inc();
scopeguard::defer! {
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&["http"]).inc();
}
//
// Determine the destination and connection params

View File

@@ -3,7 +3,6 @@ use crate::{
config::ProxyConfig,
error::io_error,
proxy::{handle_client, ClientMode},
rate_limiter::EndpointRateLimiter,
};
use bytes::{Buf, Bytes};
use futures::{Sink, Stream};
@@ -14,7 +13,6 @@ use pin_project_lite::pin_project;
use std::{
net::IpAddr,
pin::Pin,
sync::Arc,
task::{ready, Context, Poll},
};
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
@@ -27,15 +25,15 @@ use sync_wrapper::SyncWrapper;
pin_project! {
/// This is a wrapper around a [`WebSocketStream`] that
/// implements [`AsyncRead`] and [`AsyncWrite`].
pub struct WebSocketRw<S = Upgraded> {
pub struct WebSocketRw {
#[pin]
stream: SyncWrapper<WebSocketStream<S>>,
stream: SyncWrapper<WebSocketStream<Upgraded>>,
bytes: Bytes,
}
}
impl<S> WebSocketRw<S> {
pub fn new(stream: WebSocketStream<S>) -> Self {
impl WebSocketRw {
pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
Self {
stream: stream.into(),
bytes: Bytes::new(),
@@ -43,7 +41,7 @@ impl<S> WebSocketRw<S> {
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
impl AsyncWrite for WebSocketRw {
fn poll_write(
self: Pin<&mut Self>,
cx: &mut Context<'_>,
@@ -69,7 +67,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
impl AsyncRead for WebSocketRw {
fn poll_read(
mut self: Pin<&mut Self>,
cx: &mut Context<'_>,
@@ -86,7 +84,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncRead for WebSocketRw<S> {
}
}
impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
impl AsyncBufRead for WebSocketRw {
fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
// Please refer to poll_fill_buf's documentation.
const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
@@ -136,7 +134,6 @@ pub async fn serve_websocket(
session_id: uuid::Uuid,
hostname: Option<String>,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
handle_client(
@@ -146,65 +143,7 @@ pub async fn serve_websocket(
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },
peer_addr,
endpoint_rate_limiter,
)
.await?;
Ok(())
}
#[cfg(test)]
mod tests {
use std::pin::pin;
use futures::{SinkExt, StreamExt};
use hyper_tungstenite::{
tungstenite::{protocol::Role, Message},
WebSocketStream,
};
use tokio::{
io::{duplex, AsyncReadExt, AsyncWriteExt},
task::JoinSet,
};
use super::WebSocketRw;
#[tokio::test]
async fn websocket_stream_wrapper_happy_path() {
let (stream1, stream2) = duplex(1024);
let mut js = JoinSet::new();
js.spawn(async move {
let mut client = WebSocketStream::from_raw_socket(stream1, Role::Client, None).await;
client
.send(Message::Binary(b"hello world".to_vec()))
.await
.unwrap();
let message = client.next().await.unwrap().unwrap();
assert_eq!(message, Message::Binary(b"websockets are cool".to_vec()));
client.close(None).await.unwrap();
});
js.spawn(async move {
let mut rw = pin!(WebSocketRw::new(
WebSocketStream::from_raw_socket(stream2, Role::Server, None).await
));
let mut buf = vec![0; 1024];
let n = rw.read(&mut buf).await.unwrap();
assert_eq!(&buf[..n], b"hello world");
rw.write_all(b"websockets are cool").await.unwrap();
rw.flush().await.unwrap();
let n = rw.read_to_end(&mut buf).await.unwrap();
assert_eq!(n, 0);
});
js.join_next().await.unwrap().unwrap();
js.join_next().await.unwrap().unwrap();
}
}

View File

@@ -142,8 +142,6 @@ pub(crate) async fn branch_cleanup_and_check_errors(
.collect();
if !orphan_layers.is_empty() {
// An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report
// these as a hint that there is something worth cleaning up here.
result.warnings.push(format!(
"index_part.json does not contain layers from S3: {:?}",
orphan_layers

View File

@@ -87,7 +87,7 @@ impl S3Target {
new_self.prefix_in_bucket = format!("/{}/", new_segment);
} else {
if new_self.prefix_in_bucket.ends_with('/') {
new_self.prefix_in_bucket.pop();
let _ = new_self.prefix_in_bucket.pop();
}
new_self.prefix_in_bucket =
[&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
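A standalone sketch of the join step in this hunk, assuming the delimiter is "/" as is typical for S3 listings: a trailing delimiter is trimmed first so the join does not produce a doubled "//", and the trailing empty element keeps the result delimiter-terminated. The function name below is hypothetical.

// Hypothetical helper mirroring the trim-then-join step above.
fn join_segment(mut prefix: String, segment: &str, delimiter: &str) -> String {
    if prefix.ends_with(delimiter) {
        let _ = prefix.pop();
    }
    [prefix.as_str(), segment, ""].join(delimiter)
}

fn main() {
    assert_eq!(join_segment("tenants/".to_string(), "abc", "/"), "tenants/abc/");
    assert_eq!(join_segment("tenants".to_string(), "abc", "/"), "tenants/abc/");
}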

View File

@@ -57,7 +57,7 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await {
Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
@@ -70,17 +70,6 @@ async fn main() -> anyhow::Result<()> {
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
// Strictly speaking an empty bucket is a valid bucket, but if someone ran the
// scrubber they were likely expecting to scan something, and if we see no timelines
// at all then it's likely due to a configuration issue such as a bad prefix
Err(anyhow::anyhow!(
"No timelines found in bucket {} prefix {}",
bucket_config.bucket,
bucket_config
.prefix_in_bucket
.unwrap_or("<none>".to_string())
))
} else {
Ok(())
}

View File

@@ -20,6 +20,14 @@ pub struct MetadataSummary {
with_warnings: HashSet<TenantTimelineId>,
with_garbage: HashSet<TenantTimelineId>,
indices_by_version: HashMap<usize, usize>,
indices_with_generation: usize,
indices_without_generation: usize,
/// Timelines whose metadata and/or object keys couldn't even be parsed: extremely damaged
invalid_count: usize,
/// Timelines with just an initdb archive, left behind after deletion.
relic_count: usize,
layer_count: MinMaxHisto,
timeline_size_bytes: MinMaxHisto,
@@ -39,6 +47,8 @@ impl MinMaxHisto {
fn new() -> Self {
Self {
histo: histogram::Histogram::builder()
// Accommodate tenant sizes up to 32TiB
.maximum_value(32 * 1024 * 1024 * 1024 * 1024)
.build()
.expect("Bad histogram params"),
min: u64::MAX,
@@ -90,6 +100,10 @@ impl MetadataSummary {
with_warnings: HashSet::new(),
with_garbage: HashSet::new(),
indices_by_version: HashMap::new(),
indices_with_generation: 0,
indices_without_generation: 0,
invalid_count: 0,
relic_count: 0,
layer_count: MinMaxHisto::new(),
timeline_size_bytes: MinMaxHisto::new(),
layer_size_bytes: MinMaxHisto::new(),
@@ -111,24 +125,35 @@ impl MetadataSummary {
fn update_data(&mut self, data: &S3TimelineBlobData) {
self.count += 1;
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation: _,
s3_layers: _,
} = &data.blob_data
{
*self
.indices_by_version
.entry(index_part.get_version())
.or_insert(0) += 1;
match &data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers: _,
} => {
*self
.indices_by_version
.entry(index_part.get_version())
.or_insert(0) += 1;
if let Err(e) = self.update_histograms(index_part) {
// Value out of range? Warn that the results are untrustworthy
tracing::warn!(
"Error updating histograms, summary stats may be wrong: {}",
e
);
// These statistics exist to track the transition to generations. By early 2024 there should be zero
// generation-less timelines in the field and this check can be removed.
if index_part_generation.is_none() {
self.indices_without_generation += 1;
} else {
self.indices_with_generation += 1;
}
if let Err(e) = self.update_histograms(index_part) {
// Value out of range? Warn that the results are untrustworthy
tracing::warn!(
"Error updating histograms, summary stats may be wrong: {}",
e
);
}
}
BlobDataParseResult::Incorrect(_) => self.invalid_count += 1,
BlobDataParseResult::Relic => self.relic_count += 1,
}
}
@@ -156,7 +181,10 @@ impl MetadataSummary {
With errors: {1}
With warnings: {2}
With garbage: {3}
Invalid: {9}
Relics: {10}
Index versions: {version_summary}
Indices with/without generations: {7}/{8}
Timeline size bytes: {4}
Layer size bytes: {5}
Timeline layer count: {6}
@@ -168,16 +196,16 @@ Timeline layer count: {6}
self.timeline_size_bytes.oneline(),
self.layer_size_bytes.oneline(),
self.layer_count.oneline(),
self.indices_with_generation,
self.indices_without_generation,
self.invalid_count,
self.relic_count
)
}
pub fn is_fatal(&self) -> bool {
!self.with_errors.is_empty()
}
pub fn is_empty(&self) -> bool {
self.count == 0
}
}
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.

View File

@@ -11,7 +11,7 @@ use tracing::{debug, info, info_span, Instrument};
use crate::auth::check_permission;
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
use crate::metrics::{TrafficMetrics, PG_QUERIES_GAUGE};
use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
use crate::safekeeper::Term;
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
@@ -210,7 +210,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
let cmd = parse_cmd(query_string)?;
let cmd_str = cmd_to_string(&cmd);
let _guard = PG_QUERIES_GAUGE.with_label_values(&[cmd_str]).guard();
PG_QUERIES_RECEIVED.with_label_values(&[cmd_str]).inc();
scopeguard::defer! {
PG_QUERIES_FINISHED.with_label_values(&[cmd_str]).inc();
}
info!("got query {:?}", query_string);

View File

@@ -11,8 +11,7 @@ use futures::Future;
use metrics::{
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
proto::MetricFamily,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec, Gauge,
IntCounter, IntCounterPairVec, IntCounterVec, IntGaugeVec,
register_int_counter, register_int_counter_vec, Gauge, IntCounter, IntCounterVec, IntGaugeVec,
};
use once_cell::sync::Lazy;
@@ -90,10 +89,16 @@ pub static BROKER_PULLED_UPDATES: Lazy<IntCounterVec> = Lazy::new(|| {
)
.expect("Failed to register safekeeper_broker_pulled_updates_total counter")
});
pub static PG_QUERIES_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
pub static PG_QUERIES_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"safekeeper_pg_queries_received_total",
"Number of queries received through pg protocol",
&["query"]
)
.expect("Failed to register safekeeper_pg_queries_received_total counter")
});
pub static PG_QUERIES_FINISHED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"safekeeper_pg_queries_finished_total",
"Number of queries finished through pg protocol",
&["query"]

View File

@@ -28,7 +28,6 @@ import jwt
import psycopg2
import pytest
import requests
import toml
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
@@ -57,7 +56,6 @@ from fixtures.remote_storage import (
RemoteStorageKind,
RemoteStorageUser,
S3Storage,
default_remote_storage,
remote_storage_to_toml_inline_table,
)
from fixtures.types import Lsn, TenantId, TimelineId
@@ -437,7 +435,7 @@ class NeonEnvBuilder:
# Pageserver remote storage
self.pageserver_remote_storage = pageserver_remote_storage
# Safekeepers remote storage
self.safekeepers_remote_storage: Optional[RemoteStorage] = None
self.sk_remote_storage: Optional[RemoteStorage] = None
self.broker = broker
self.run_id = run_id
@@ -470,7 +468,7 @@ class NeonEnvBuilder:
# Cannot create more than one environment from one builder
assert self.env is None, "environment already initialized"
if default_remote_storage_if_missing and self.pageserver_remote_storage is None:
self.enable_pageserver_remote_storage(default_remote_storage())
self.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
self.env = NeonEnv(self)
return self.env
@@ -507,66 +505,6 @@ class NeonEnvBuilder:
return env
def from_repo_dir(
self,
repo_dir: Path,
neon_binpath: Optional[Path] = None,
pg_distrib_dir: Optional[Path] = None,
) -> NeonEnv:
"""
A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir.
"""
# Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests
self.neon_binpath = neon_binpath or self.neon_binpath
self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir
# Get the initial tenant and timeline from the snapshot config
snapshot_config_toml = repo_dir / "config"
with snapshot_config_toml.open("r") as f:
snapshot_config = toml.load(f)
self.initial_tenant = TenantId(snapshot_config["default_tenant_id"])
self.initial_timeline = TimelineId(
dict(snapshot_config["branch_name_mappings"][DEFAULT_BRANCH_NAME])[
str(self.initial_tenant)
]
)
self.env = self.init_configs()
for ps_dir in repo_dir.glob("pageserver_*"):
tenants_from_dir = ps_dir / "tenants"
tenants_to_dir = self.repo_dir / ps_dir.name / "tenants"
log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}")
shutil.copytree(tenants_from_dir, tenants_to_dir)
for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"):
sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name
log.info(f"Copying safekeeper directory {sk_from_dir} to {sk_to_dir}")
sk_to_dir.rmdir()
shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid"))
shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
shutil.copytree(
repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
)
if (attachments_json := Path(repo_dir / "attachments.json")).exists():
shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name)
# Update the config with info about tenants and timelines
with (self.repo_dir / "config").open("r") as f:
config = toml.load(f)
config["default_tenant_id"] = snapshot_config["default_tenant_id"]
config["branch_name_mappings"] = snapshot_config["branch_name_mappings"]
with (self.repo_dir / "config").open("w") as f:
toml.dump(config, f)
return self.env
def enable_scrub_on_exit(self):
"""
Call this if you would like the fixture to automatically run
@@ -595,11 +533,9 @@ class NeonEnvBuilder:
self.pageserver_remote_storage = ret
def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind):
assert (
self.safekeepers_remote_storage is None
), "safekeepers_remote_storage already configured"
assert self.sk_remote_storage is None, "sk_remote_storage already configured"
self.safekeepers_remote_storage = self._configure_and_create_remote_storage(
self.sk_remote_storage = self._configure_and_create_remote_storage(
kind, RemoteStorageUser.SAFEKEEPER
)
@@ -652,7 +588,7 @@ class NeonEnvBuilder:
directory_to_clean.rmdir()
def cleanup_remote_storage(self):
for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]:
for x in [self.pageserver_remote_storage, self.sk_remote_storage]:
if isinstance(x, S3Storage):
x.do_cleanup()
@@ -756,7 +692,7 @@ class NeonEnv:
self.pageservers: List[NeonPageserver] = []
self.broker = config.broker
self.pageserver_remote_storage = config.pageserver_remote_storage
self.safekeepers_remote_storage = config.safekeepers_remote_storage
self.safekeepers_remote_storage = config.sk_remote_storage
self.pg_version = config.pg_version
# Binary path for pageserver, safekeeper, etc
self.neon_binpath = config.neon_binpath
@@ -781,17 +717,25 @@ class NeonEnv:
self.attachment_service = None
# Create a config file corresponding to the options
cfg: Dict[str, Any] = {
"default_tenant_id": str(self.initial_tenant),
"broker": {
"listen_addr": self.broker.listen_addr(),
},
"pageservers": [],
"safekeepers": [],
}
toml = textwrap.dedent(
f"""
default_tenant_id = '{config.initial_tenant}'
"""
)
if self.control_plane_api is not None:
cfg["control_plane_api"] = self.control_plane_api
toml += textwrap.dedent(
f"""
control_plane_api = '{self.control_plane_api}'
"""
)
toml += textwrap.dedent(
f"""
[broker]
listen_addr = '{self.broker.listen_addr()}'
"""
)
# Create config for pageserver
http_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
@@ -804,24 +748,26 @@ class NeonEnv:
http=self.port_distributor.get_port(),
)
ps_cfg: Dict[str, Any] = {
"id": ps_id,
"listen_pg_addr": f"localhost:{pageserver_port.pg}",
"listen_http_addr": f"localhost:{pageserver_port.http}",
"pg_auth_type": pg_auth_type,
"http_auth_type": http_auth_type,
}
toml += textwrap.dedent(
f"""
[[pageservers]]
id={ps_id}
listen_pg_addr = 'localhost:{pageserver_port.pg}'
listen_http_addr = 'localhost:{pageserver_port.http}'
pg_auth_type = '{pg_auth_type}'
http_auth_type = '{http_auth_type}'
"""
)
# Create a corresponding NeonPageserver object
self.pageservers.append(
NeonPageserver(
self,
ps_id,
port=pageserver_port,
config_override=self.pageserver_config_override,
config_override=config.pageserver_config_override,
)
)
cfg["pageservers"].append(ps_cfg)
# Create config and a Safekeeper object for each safekeeper
for i in range(1, config.num_safekeepers + 1):
port = SafekeeperPort(
@@ -830,22 +776,32 @@ class NeonEnv:
http=self.port_distributor.get_port(),
)
id = config.safekeepers_id_start + i # assign ids sequentially
sk_cfg: Dict[str, Any] = {
"id": id,
"pg_port": port.pg,
"pg_tenant_only_port": port.pg_tenant_only,
"http_port": port.http,
"sync": config.safekeepers_enable_fsync,
}
toml += textwrap.dedent(
f"""
[[safekeepers]]
id = {id}
pg_port = {port.pg}
pg_tenant_only_port = {port.pg_tenant_only}
http_port = {port.http}
sync = {'true' if config.safekeepers_enable_fsync else 'false'}"""
)
if config.auth_enabled:
sk_cfg["auth_enabled"] = True
if self.safekeepers_remote_storage is not None:
sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table()
self.safekeepers.append(Safekeeper(env=self, id=id, port=port))
cfg["safekeepers"].append(sk_cfg)
toml += textwrap.dedent(
"""
auth_enabled = true
"""
)
if config.sk_remote_storage is not None:
toml += textwrap.dedent(
f"""
remote_storage = "{remote_storage_to_toml_inline_table(config.sk_remote_storage)}"
"""
)
safekeeper = Safekeeper(env=self, id=id, port=port)
self.safekeepers.append(safekeeper)
log.info(f"Config: {cfg}")
self.neon_cli.init(cfg)
log.info(f"Config: {toml}")
self.neon_cli.init(toml)
def start(self):
# Start up broker, pageserver and all safekeepers
@@ -1331,10 +1287,10 @@ class NeonCli(AbstractNeonCli):
def init(
self,
config: Dict[str, Any],
config_toml: str,
) -> "subprocess.CompletedProcess[str]":
with tempfile.NamedTemporaryFile(mode="w+") as tmp:
tmp.write(toml.dumps(config))
tmp.write(config_toml)
tmp.flush()
cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version]
@@ -1772,16 +1728,11 @@ class NeonPageserver(PgProtocol):
@property
def workdir(self) -> Path:
return self.env.repo_dir / f"pageserver_{self.id}"
return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}"))
def assert_no_errors(self):
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
log.warning(f"Skipping log check: {logfile} does not exist")
return
with logfile.open("r") as f:
errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
errors = scan_pageserver_log_for_errors(logfile, self.allowed_errors)
for _lineno, error in errors:
log.info(f"not allowed error: {error.strip()}")
@@ -1805,10 +1756,7 @@ class NeonPageserver(PgProtocol):
def log_contains(self, pattern: str) -> Optional[str]:
"""Check that the pageserver log contains a line that matches the given regex"""
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
log.warning(f"Skipping log check: {logfile} does not exist")
return None
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
contains_re = re.compile(pattern)
@@ -1817,11 +1765,14 @@ class NeonPageserver(PgProtocol):
# no guarantee it is already present in the log file. This hasn't
# been a problem in practice; our python tests are not fast enough
# to hit that race condition.
with logfile.open("r") as f:
for line in f:
if contains_re.search(line):
# found it!
return line
while True:
line = logfile.readline()
if not line:
break
if contains_re.search(line):
# found it!
return line
return None
@@ -1844,38 +1795,16 @@ class NeonPageserver(PgProtocol):
client = self.http_client()
return client.tenant_detach(tenant_id)
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
# This API is only for use when generations are enabled
assert self.env.attachment_service is not None
if config["mode"].startswith("Attached") and "generation" not in config:
config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id)
client = self.http_client()
return client.tenant_location_conf(tenant_id, config, **kwargs)
def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]:
path = self.tenant_dir(tenant_id) / "config-v1"
log.info(f"Reading location conf from {path}")
bytes = open(path, "r").read()
try:
decoded: dict[str, Any] = toml.loads(bytes)
return decoded
except:
log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}")
raise
def tenant_create(
self,
tenant_id: TenantId,
conf: Optional[Dict[str, Any]] = None,
auth_token: Optional[str] = None,
generation: Optional[int] = None,
) -> TenantId:
if generation is None:
generation = self.maybe_get_generation(tenant_id)
client = self.http_client(auth_token=auth_token)
return client.tenant_create(tenant_id, conf, generation=generation)
return client.tenant_create(
tenant_id, conf, generation=self.maybe_get_generation(tenant_id)
)
def tenant_load(self, tenant_id: TenantId):
client = self.http_client()
@@ -2799,7 +2728,6 @@ class EndpointFactory:
lsn: Optional[Lsn] = None,
hot_standby: bool = False,
config_lines: Optional[List[str]] = None,
pageserver_id: Optional[int] = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -2819,7 +2747,6 @@ class EndpointFactory:
lsn=lsn,
hot_standby=hot_standby,
config_lines=config_lines,
pageserver_id=pageserver_id,
)
def stop_all(self) -> "EndpointFactory":
@@ -2945,7 +2872,7 @@ class Safekeeper:
tli_dir = self.timeline_dir(tenant_id, timeline_id)
segments = []
for _, _, filenames in os.walk(tli_dir):
segments.extend([f for f in filenames if not f.startswith("safekeeper.control")])
segments.extend([f for f in filenames if f != "safekeeper.control"])
segments.sort()
return segments
@@ -3166,7 +3093,7 @@ def pytest_addoption(parser: Parser):
SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
r"config|metadata|.+\.(?:toml|pid|json|sql)"
)
@@ -3427,6 +3354,8 @@ def parse_project_git_version_output(s: str) -> str:
The information is generated by utils::project_git_version!
"""
import re
res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s)
if res and (commit := res.group(2)):
return commit

View File

@@ -150,7 +150,7 @@ class PageserverHttpClient(requests.Session):
# (this may change in future if we do fault injection of a kind that causes
# requests TCP flows to stick)
read=False,
backoff_factor=0.2,
backoff_factor=0,
status_forcelist=[503],
allowed_methods=None,
remove_headers_on_redirect=[],
@@ -277,23 +277,6 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
self.verbose_error(res)
def tenant_location_conf(
self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
params = {}
if flush_ms is not None:
params["flush_ms"] = str(flush_ms)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
json=body,
params=params,
)
self.verbose_error(res)
def tenant_delete(self, tenant_id: TenantId):
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
@@ -322,10 +305,6 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def tenant_heatmap_upload(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
self.verbose_error(res)
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(

View File

@@ -9,14 +9,12 @@ from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import boto3
import toml
from mypy_boto3_s3 import S3Client
from fixtures.log_helper import log
from fixtures.types import TenantId, TimelineId
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
TENANT_HEATMAP_FILE_NAME = "heatmap-v1.json"
@enum.unique
@@ -134,18 +132,8 @@ class LocalFsStorage:
with self.index_path(tenant_id, timeline_id).open("r") as f:
return json.load(f)
def heatmap_path(self, tenant_id: TenantId) -> Path:
return self.tenant_path(tenant_id) / TENANT_HEATMAP_FILE_NAME
def heatmap_content(self, tenant_id):
with self.heatmap_path(tenant_id).open("r") as f:
return json.load(f)
def to_toml_inline_table(self) -> str:
rv = {
"local_path": str(self.root),
}
return toml.TomlEncoder().dump_inline_table(rv)
return f"local_path='{self.root}'"
def cleanup(self):
# no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files
@@ -186,18 +174,18 @@ class S3Storage:
)
def to_toml_inline_table(self) -> str:
rv = {
"bucket_name": self.bucket_name,
"bucket_region": self.bucket_region,
}
s = [
f"bucket_name='{self.bucket_name}'",
f"bucket_region='{self.bucket_region}'",
]
if self.prefix_in_bucket is not None:
rv["prefix_in_bucket"] = self.prefix_in_bucket
s.append(f"prefix_in_bucket='{self.prefix_in_bucket}'")
if self.endpoint is not None:
rv["endpoint"] = self.endpoint
s.append(f"endpoint='{self.endpoint}'")
return toml.TomlEncoder().dump_inline_table(rv)
return ",".join(s)
def do_cleanup(self):
if not self.cleanup:
@@ -384,16 +372,9 @@ def s3_storage() -> RemoteStorageKind:
return RemoteStorageKind.MOCK_S3
def default_remote_storage() -> RemoteStorageKind:
"""
The remote storage kind used in tests that do not specify a preference
"""
return RemoteStorageKind.LOCAL_FS
# serialize as toml inline table
def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str:
if not isinstance(remote_storage, (LocalFsStorage, S3Storage)):
raise Exception("invalid remote storage type")
return remote_storage.to_toml_inline_table()
return f"{{{remote_storage.to_toml_inline_table()}}}"

View File

@@ -1,148 +0,0 @@
from typing import Optional
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import TenantId, TimelineId
class Workload:
"""
This is not a general purpose load generator: it exists for storage tests that need to inject some
high level types of storage work via the postgres interface:
- layer writes (`write_rows`)
- work for compaction (`churn_rows`)
- reads, checking we get the right data (`validate`)
"""
def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
self.env = env
self.tenant_id = tenant_id
self.timeline_id = timeline_id
self.table = "foo"
self.expect_rows = 0
self.churn_cursor = 0
self._endpoint: Optional[Endpoint] = None
def endpoint(self, pageserver_id: int) -> Endpoint:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
"main",
tenant_id=self.tenant_id,
pageserver_id=pageserver_id,
endpoint_id="ep-workload",
)
self._endpoint.start(pageserver_id=pageserver_id)
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
connstring = self._endpoint.safe_psql(
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
)
log.info(f"Workload.endpoint: connstr={connstring}")
return self._endpoint
def __del__(self):
if self._endpoint is not None:
self._endpoint.stop()
def init(self, pageserver_id: int):
endpoint = self.endpoint(pageserver_id)
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
last_flush_lsn_upload(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def write_rows(self, n, pageserver_id):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
self.expect_rows += n
dummy_value = "blah"
endpoint.safe_psql(
f"""
INSERT INTO {self.table} (id, val)
SELECT g, '{dummy_value}'
FROM generate_series({start}, {end}) g
"""
)
return last_flush_lsn_upload(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def churn_rows(self, n, pageserver_id, upload=True):
assert self.expect_rows >= n
max_iters = 10
endpoint = self.endpoint(pageserver_id)
todo = n
i = 0
while todo > 0:
i += 1
if i > max_iters:
raise RuntimeError("oops")
start = self.churn_cursor % self.expect_rows
n_iter = min((self.expect_rows - start), todo)
todo -= n_iter
end = start + n_iter - 1
log.info(
f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}"
)
assert end < self.expect_rows
self.churn_cursor += n_iter
dummy_value = "blah"
endpoint.safe_psql_many(
[
f"""
INSERT INTO {self.table} (id, val)
SELECT g, '{dummy_value}'
FROM generate_series({start}, {end}) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
f"VACUUM {self.table}",
]
)
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = self.env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
def validate(self, pageserver_id):
endpoint = self.endpoint(pageserver_id)
result = endpoint.safe_psql_many(
[
"select clear_buffer_cache()",
f"""
SELECT COUNT(*) FROM {self.table}
""",
]
)
log.info(f"validate({self.expect_rows}): {result}")
assert result == [[("",)], [(self.expect_rows,)]]

View File

@@ -55,20 +55,9 @@ def measure_recovery_time(env: NeonCompare):
# Delete the Tenant in the pageserver: this will drop local and remote layers, such that
# when we "create" the Tenant again, we will replay the WAL from the beginning.
#
# This is a "weird" thing to do, and can confuse the attachment service as we're re-using
# the same tenant ID for a tenant that is logically different from the pageserver's point
# of view, but the same as far as the safekeeper/WAL is concerned. To work around that,
# we will explicitly create the tenant in the same generation that it was previously
# attached in.
assert env.env.attachment_service is not None
attach_status = env.env.attachment_service.inspect(tenant_id=env.tenant)
assert attach_status is not None
(attach_gen, _) = attach_status
client.tenant_delete(env.tenant)
wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)
env.env.pageserver.tenant_create(tenant_id=env.tenant)
# Measure recovery time
with env.record_duration("wal_recovery"):

View File

@@ -163,7 +163,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"gc_feedback": True,
"gc_horizon": 23 * (1024 * 1024),
"gc_period": "2h 13m",
"heatmap_period": "10m",
"image_creation_threshold": 7,
"pitr_interval": "1m",
"lagging_wal_timeout": "23m",

View File

@@ -92,9 +92,8 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"]
)
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
pageserver_token_old = env.auth_keys.generate_pageserver_token()
pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
@@ -146,9 +145,9 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
neon_env_builder.auth_enabled = True
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"]
)
env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
pageserver_token_old = env.auth_keys.generate_pageserver_token()
pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)

View File

@@ -14,9 +14,8 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
)
env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
# Branch at the point where only 100 rows were inserted
branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")

View File

@@ -1,7 +1,8 @@
import random
import threading
import time
from typing import List
from queue import SimpleQueue
from typing import Any, Dict, List, Union
import pytest
from fixtures.log_helper import log
@@ -147,11 +148,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.extend(
[
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading",
]
env.pageserver.allowed_errors.append(
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
)
env.pageserver.allowed_errors.append(
".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading"
)
ps_http = env.pageserver.http_client()
@@ -238,6 +239,92 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder
t.join()
def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: NeonEnvBuilder):
"""
If 'activate only after upload' behaviour is in use, then retried creation requests can end up competing.
"""
env = neon_env_builder.init_configs()
env.start()
env.pageserver.allowed_errors.append(
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
)
env.pageserver.allowed_errors.append(
".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory"
)
ps_http = env.pageserver.http_client()
# pause all uploads
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
env.pageserver.tenant_create(env.initial_tenant)
def start_creating_timeline():
ps_http.timeline_create(
env.pg_version, env.initial_tenant, env.initial_timeline, timeout=60
)
create_root = threading.Thread(target=start_creating_timeline)
branch_id = TimelineId.generate()
queue: SimpleQueue[Union[Dict[Any, Any], Exception]] = SimpleQueue()
barrier = threading.Barrier(3)
def try_branch():
barrier.wait()
barrier.wait()
try:
ret = ps_http.timeline_create(
env.pg_version,
env.initial_tenant,
branch_id,
ancestor_timeline_id=env.initial_timeline,
timeout=5,
)
queue.put(ret)
except Exception as e:
queue.put(e)
threads = [threading.Thread(target=try_branch) for _ in range(2)]
try:
create_root.start()
for t in threads:
t.start()
wait_until_paused(env, "before-upload-index-pausable")
barrier.wait()
ps_http.configure_failpoints(("before-upload-index-pausable", "off"))
barrier.wait()
# now both requests race to branch; only one can win because they take gc_cs, Tenant::timelines or marker files
first = queue.get()
second = queue.get()
log.info(first)
log.info(second)
(succeeded, failed) = (first, second) if isinstance(second, Exception) else (second, first)
assert isinstance(failed, Exception)
assert isinstance(succeeded, Dict)
# there are multiple valid status codes:
# - Timeline x/y already exists
# - whatever 409 response says, but that is a subclass of PageserverApiException
assert isinstance(failed, PageserverApiException)
assert succeeded["state"] == "Active"
finally:
# we might still have the failpoint active
env.pageserver.stop(immediate=True)
for t in threads:
t.join()
create_root.join()
def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
"""
Check that a timeline is deleted locally on subsequent restart if it was never successfully uploaded during creation.

View File

@@ -1,25 +1,30 @@
import copy
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import List, Optional
from typing import Any, List, Optional
import pytest
import toml
import toml # TODO: replace with tomllib for Python >= 3.11
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonCli,
NeonEnvBuilder,
PgBin,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
timeline_delete_wait_completed,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import RemoteStorageKind
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser
from fixtures.types import Lsn
from pytest import FixtureRequest
#
# A test suite that helps to prevent unintentionally breaking backward or forward compatibility between Neon releases.
@@ -32,8 +37,8 @@ from fixtures.types import Lsn
# If the breakage is intentional, the test can be xfailed by setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true.
#
# The file contains a couple of helper functions:
# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files).
# - check_neon_works performs the test itself, feel free to add more checks there.
# - dump_differs compares two SQL dumps and writes the diff to a file.
#
#
# How to run `test_backward_compatibility` locally:
@@ -41,7 +46,6 @@ from fixtures.types import Lsn
# export DEFAULT_PG_VERSION=15
# export BUILD_TYPE=release
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
#
# # Build previous version of binaries and create a data snapshot:
# rm -rf pg_install target
@@ -55,7 +59,8 @@ from fixtures.types import Lsn
# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc`
#
# # Run backward compatibility test
# ./scripts/pytest -k test_backward_compatibility
# COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} \
# ./scripts/pytest -k test_backward_compatibility
#
#
# How to run `test_forward_compatibility` locally:
@@ -63,8 +68,6 @@ from fixtures.types import Lsn
# export DEFAULT_PG_VERSION=15
# export BUILD_TYPE=release
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install
#
# # Build previous version of binaries and store them somewhere:
# rm -rf pg_install target
@@ -81,7 +84,9 @@ from fixtures.types import Lsn
# ./scripts/pytest -k test_create_snapshot
#
# # Run forward compatibility test
# ./scripts/pytest -k test_forward_compatibility
# COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} \
# COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install \
# ./scripts/pytest -k test_forward_compatibility
#
check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
@@ -150,9 +155,13 @@ def test_create_snapshot(
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_backward_compatibility(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
port_distributor: PortDistributor,
test_output_dir: Path,
neon_binpath: Path,
pg_distrib_dir: Path,
pg_version: PgVersion,
request: FixtureRequest,
):
"""
Test that the new binaries can read old data
@@ -168,15 +177,23 @@ def test_backward_compatibility(
)
try:
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
neon_env_builder.start()
# Copy the snapshot to current directory, and prepare for the test
prepare_snapshot(
from_dir=compatibility_snapshot_dir,
to_dir=test_output_dir / "compatibility_snapshot",
port_distributor=port_distributor,
)
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
test_output_dir / "compatibility_snapshot" / "repo",
neon_binpath,
neon_binpath,
pg_distrib_dir,
pg_version,
port_distributor,
test_output_dir,
pg_bin,
request,
)
except Exception:
if breaking_changes_allowed:
@@ -195,10 +212,12 @@ def test_backward_compatibility(
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
def test_forward_compatibility(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
top_output_dir: Path,
port_distributor: PortDistributor,
pg_version: PgVersion,
request: FixtureRequest,
neon_binpath: Path,
):
"""
Test that the old binaries can read new data
@@ -225,19 +244,24 @@ def test_forward_compatibility(
)
try:
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(
compatibility_snapshot_dir / "repo",
neon_binpath=compatibility_neon_bin,
# Copy the snapshot to current directory, and prepare for the test
prepare_snapshot(
from_dir=compatibility_snapshot_dir,
to_dir=test_output_dir / "compatibility_snapshot",
port_distributor=port_distributor,
pg_distrib_dir=compatibility_postgres_distrib_dir,
)
neon_env_builder.start()
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
test_output_dir / "compatibility_snapshot" / "repo",
compatibility_neon_bin,
neon_binpath,
compatibility_postgres_distrib_dir,
pg_version,
port_distributor,
test_output_dir,
PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version),
request,
)
except Exception:
if breaking_changes_allowed:
@@ -252,45 +276,193 @@ def test_forward_compatibility(
), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):
ep = env.endpoints.create_start("main")
pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
def prepare_snapshot(
from_dir: Path,
to_dir: Path,
port_distributor: PortDistributor,
pg_distrib_dir: Optional[Path] = None,
):
assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory"
assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql"
connstr = ep.connstr()
log.info(f"Copying snapshot from {from_dir} to {to_dir}")
shutil.copytree(from_dir, to_dir)
repo_dir = to_dir / "repo"
snapshot_config_toml = repo_dir / "config"
snapshot_config = toml.load(snapshot_config_toml)
# Remove old logs to avoid confusion in test artifacts
for logfile in repo_dir.glob("**/*.log"):
logfile.unlink()
# Remove old computes in 'endpoints'. Old versions of the control plane used a directory
# called "pgdatadirs". Delete it, too.
if (repo_dir / "endpoints").exists():
shutil.rmtree(repo_dir / "endpoints")
if (repo_dir / "pgdatadirs").exists():
shutil.rmtree(repo_dir / "pgdatadirs")
os.mkdir(repo_dir / "endpoints")
# Update paths and ports in config files
legacy_pageserver_toml = repo_dir / "pageserver.toml"
legacy_bundle = os.path.exists(legacy_pageserver_toml)
path_to_config: dict[Path, dict[Any, Any]] = {}
if legacy_bundle:
os.mkdir(repo_dir / "pageserver_1")
path_to_config[repo_dir / "pageserver_1" / "pageserver.toml"] = toml.load(
legacy_pageserver_toml
)
os.remove(legacy_pageserver_toml)
os.rename(repo_dir / "tenants", repo_dir / "pageserver_1" / "tenants")
else:
for ps_conf in snapshot_config["pageservers"]:
config_path = repo_dir / f"pageserver_{ps_conf['id']}" / "pageserver.toml"
path_to_config[config_path] = toml.load(config_path)
# For each pageserver config, edit it and rewrite
for config_path, pageserver_config in path_to_config.items():
pageserver_config["remote_storage"]["local_path"] = str(
LocalFsStorage.component_path(repo_dir, RemoteStorageUser.PAGESERVER)
)
for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"):
pageserver_config[param] = port_distributor.replace_with_new_port(
pageserver_config[param]
)
# We don't use authentication in compatibility tests
# so just remove authentication related settings.
pageserver_config.pop("pg_auth_type", None)
pageserver_config.pop("http_auth_type", None)
if pg_distrib_dir:
pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
with config_path.open("w") as f:
toml.dump(pageserver_config, f)
# neon_local config doesn't have to be backward compatible. If we're using a dump from before
# it supported multiple pageservers, fix it up.
if "pageservers" not in snapshot_config:
snapshot_config["pageservers"] = [snapshot_config["pageserver"]]
del snapshot_config["pageserver"]
for param in ("listen_http_addr", "listen_pg_addr"):
for pageserver in snapshot_config["pageservers"]:
pageserver[param] = port_distributor.replace_with_new_port(pageserver[param])
snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port(
snapshot_config["broker"]["listen_addr"]
)
for sk in snapshot_config["safekeepers"]:
for param in ("http_port", "pg_port", "pg_tenant_only_port"):
sk[param] = port_distributor.replace_with_new_port(sk[param])
if pg_distrib_dir:
snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
with snapshot_config_toml.open("w") as f:
toml.dump(snapshot_config, f)
# Ensure that snapshot doesn't contain references to the original path
rv = subprocess.run(
[
"grep",
"--recursive",
"--binary-file=without-match",
"--files-with-matches",
"test_create_snapshot/repo",
str(repo_dir),
],
capture_output=True,
text=True,
)
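# grep exits non-zero when no file matches, so a non-zero return code below
# means nothing in the copied snapshot still references the original path.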
assert (
rv.returncode != 0
), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}"
def check_neon_works(
repo_dir: Path,
neon_target_binpath: Path,
neon_current_binpath: Path,
pg_distrib_dir: Path,
pg_version: PgVersion,
port_distributor: PortDistributor,
test_output_dir: Path,
pg_bin: PgBin,
request: FixtureRequest,
):
snapshot_config_toml = repo_dir / "config"
snapshot_config = toml.load(snapshot_config_toml)
snapshot_config["neon_distrib_dir"] = str(neon_target_binpath)
snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir)
with (snapshot_config_toml).open("w") as f:
toml.dump(snapshot_config, f)
# TODO: replace with NeonEnvBuilder / NeonEnv
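# type() with three arguments creates an anonymous class on the fly; the class
# object itself is then used as a plain attribute bag carrying the fields that
# NeonCli reads from a NeonEnv.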
config: Any = type("NeonEnvStub", (object,), {})
config.rust_log_override = None
config.repo_dir = repo_dir
config.pg_version = pg_version
config.initial_tenant = snapshot_config["default_tenant_id"]
config.pg_distrib_dir = pg_distrib_dir
config.remote_storage = None
config.sk_remote_storage = None
# Use the "target" binaries to launch the storage nodes
config_target = config
config_target.neon_binpath = neon_target_binpath
# We are using maybe-old binaries for neon services, but want to use current
# binaries for test utilities like neon_local
config_target.neon_local_binpath = neon_current_binpath
cli_target = NeonCli(config_target)
# And the current binaries to launch computes
snapshot_config["neon_distrib_dir"] = str(neon_current_binpath)
with (snapshot_config_toml).open("w") as f:
toml.dump(snapshot_config, f)
config_current = copy.copy(config)
config_current.neon_binpath = neon_current_binpath
cli_current = NeonCli(config_current)
cli_target.raw_cli(["start"])
request.addfinalizer(lambda: cli_target.raw_cli(["stop"]))
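# addfinalizer registers a teardown callback with pytest; finalizers run in
# reverse registration order when the test finishes, so the services started
# above are stopped even if a later assertion fails.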
pg_port = port_distributor.get_port()
http_port = port_distributor.get_port()
cli_current.endpoint_create(
branch_name="main", pg_port=pg_port, http_port=http_port, endpoint_id="ep-main"
)
cli_current.endpoint_start("ep-main")
request.addfinalizer(lambda: cli_current.endpoint_stop("ep-main"))
connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres"
pg_bin.run_capture(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]
)
initial_dump_differs = dump_differs(
sql_dump_path,
repo_dir.parent / "dump.sql",
test_output_dir / "dump.sql",
test_output_dir / "dump.filediff",
)
# Check that project can be recovered from WAL
# loosely based on https://www.notion.so/neondatabase/Storage-Recovery-from-WAL-d92c0aac0ebf40df892b938045d7d720
pageserver_http = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
pg_version = env.pg_version
# Delete all files from local_fs_remote_storage except initdb.tar.zst,
# which is required for `timeline_create` with `existing_initdb_timeline_id`.
#
# TODO: switch to Path.walk() in Python 3.12
# for dirpath, _dirnames, filenames in (repo_dir / "local_fs_remote_storage").walk():
for dirpath, _dirnames, filenames in os.walk(repo_dir / "local_fs_remote_storage"):
for filename in filenames:
if filename != "initdb.tar.zst":
(Path(dirpath) / filename).unlink()
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
pageserver_http.timeline_create(
pg_version=pg_version,
tenant_id=tenant_id,
new_timeline_id=timeline_id,
existing_initdb_timeline_id=timeline_id,
tenant_id = snapshot_config["default_tenant_id"]
timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id]
pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1]
pageserver_http = PageserverHttpClient(
port=pageserver_port,
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled
)
shutil.rmtree(repo_dir / "local_fs_remote_storage")
timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id)
pageserver_http.timeline_create(pg_version, tenant_id, timeline_id)
pg_bin.run_capture(
["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"]
)
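# The dumps produced in this test are compared with dump_differs (see the next
# hunk). A minimal sketch of that comparison pattern, assuming a textual `diff`
# binary on PATH; the real dump_differs additionally decides whether a produced
# diff is allowed (see its docstring below).
import subprocess
from pathlib import Path

def dumps_differ_sketch(first: Path, second: Path, output: Path) -> bool:
    """Diff two SQL dumps, write the diff to `output`, return True if they differ."""
    with output.open("w") as stdout:
        res = subprocess.run(
            ["diff", "--unified", str(first), str(second)],
            stdout=stdout,
            check=False,
        )
    return res.returncode != 0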
@@ -322,11 +494,6 @@ def dump_differs(
Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False).
"""
if not first.exists():
raise FileNotFoundError(f"{first} doesn't exist")
if not second.exists():
raise FileNotFoundError(f"{second} doesn't exist")
with output.open("w") as stdout:
res = subprocess.run(
[

test_runner/regress/test_config.py Normal file → Executable file
View File

View File

@@ -35,11 +35,6 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
# Because this test does a rapid series of restarts of the same node, it's possible that
# we are restarted again before we can clean up deletion lists from the previous generation,
# resulting in a subsequent startup logging a warning.
env.pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
for _ in range(5):
with pytest.raises(subprocess.SubprocessError):
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])

View File

@@ -99,13 +99,12 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
]
)
env.pageserver.allowed_errors.extend(
[
# FIXME: we should clean up pageserver to not print this
".*exited with error: unexpected message type: CopyData.*",
# FIXME: Is this expected?
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*",
]
# FIXME: we should clean up pageserver to not print this
env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*")
# FIXME: Is this expected?
env.pageserver.allowed_errors.append(
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
)
def import_tar(base, wal):

View File

@@ -236,30 +236,3 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
assert vanilla_pg.safe_psql(
"select sum(somedata) from replication_example"
) == endpoint.safe_psql("select sum(somedata) from replication_example")
#
# Check that slots are not inherited in a branch
#
def test_slots_and_branching(neon_simple_env: NeonEnv):
env = neon_simple_env
tenant, timeline = env.neon_cli.create_tenant()
env.pageserver.http_client()
main_branch = env.endpoints.create_start("main", tenant_id=tenant)
main_cur = main_branch.connect().cursor()
# Create table and insert some data
main_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")
wait_for_last_flush_lsn(env, main_branch, tenant, timeline)
# Create branch ws.
env.neon_cli.create_branch("ws", "main", tenant_id=tenant)
ws_branch = env.endpoints.create_start("ws", tenant_id=tenant)
log.info("postgres is running on 'ws' branch")
# Check that we can create slot with the same name
ws_cur = ws_branch.connect().cursor()
ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')")

View File

@@ -5,6 +5,7 @@ import time
from collections import defaultdict
from typing import Any, DefaultDict, Dict, Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -18,7 +19,7 @@ from fixtures.pageserver.utils import (
wait_for_upload,
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
from fixtures.types import Lsn
from fixtures.utils import query_scalar, wait_until
@@ -44,7 +45,13 @@ def get_num_downloaded_layers(client: PageserverHttpClient):
# If you have a large relation, check that the pageserver downloads parts of it as
# required by queries.
#
def test_ondemand_download_large_rel(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_ondemand_download_large_rel(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
# thinking about using a shared environment? the test assumes that global
# metrics are for single tenant.
env = neon_env_builder.init_start(
@@ -138,7 +145,13 @@ def test_ondemand_download_large_rel(neon_env_builder: NeonEnvBuilder):
# If you have a relation with a long history of updates, the pageserver downloads the layer
# files containing the history as needed by timetravel queries.
#
def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_ondemand_download_timetravel(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
# thinking about using a shared environment? the test assumes that global
# metrics are for single tenant.
@@ -216,7 +229,8 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder):
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
# Wait until generated image layers are uploaded to S3
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
if remote_storage_kind is not None:
wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, timeline_id)
env.pageserver.stop()

View File

@@ -23,20 +23,14 @@ from fixtures.neon_fixtures import (
PgBin,
S3Scrubber,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_tenant_state,
list_prefix,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pageserver.utils import list_prefix
from fixtures.remote_storage import (
RemoteStorageKind,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import print_gc_result, wait_until
from fixtures.workload import Workload
# A tenant configuration that is convenient for generating uploads and deletions
# without a large amount of postgres traffic.
@@ -99,10 +93,7 @@ def generate_uploads_and_deletions(
)
assert tenant_id is not None
assert timeline_id is not None
# We are waiting for uploads as well as local flush, in order to avoid leaving the system
# in a state where there are "future layers" in remote storage that will generate deletions
# after a restart.
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
ps_http.timeline_checkpoint(tenant_id, timeline_id)
# Compaction should generate some GC-eligible layers
@@ -569,91 +560,3 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
read_all(env, tenant_id, timeline_id)
evict_all_layers(env, tenant_id, timeline_id)
read_all(env, tenant_id, timeline_id)
def test_multi_attach(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 3
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
pageservers = env.pageservers
http_clients = list([p.http_client() for p in pageservers])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# We will intentionally create situations where stale deletions happen from non-latest-generation
# nodes when the tenant is multiply-attached
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
# Initially, the tenant will be attached to the first pageserver (first is default in our test harness)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException):
http_clients[1].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException):
http_clients[2].timeline_detail(tenant_id, timeline_id)
workload = Workload(env, tenant_id, timeline_id)
workload.init(pageservers[0].id)
workload.write_rows(1000, pageservers[0].id)
# Attach the tenant to the other two pageservers
pageservers[1].tenant_attach(env.initial_tenant)
pageservers[2].tenant_attach(env.initial_tenant)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
# Now they all have it attached
_details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients])
_detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
_detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
# The endpoint can use any pageserver to service its reads
for pageserver in pageservers:
workload.validate(pageserver.id)
# If we write some more data, all the nodes can see it, including stale ones
wrote_lsn = workload.write_rows(1000, pageservers[0].id)
for ps_http in http_clients:
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn)
# ...and indeed endpoints can see it via any of the pageservers
for pageserver in pageservers:
workload.validate(pageserver.id)
# Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage
for ps_http in http_clients:
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn)
# Now, the contents of remote storage will be a set of layers from each pageserver, but with unique
# generation numbers
# TODO: validate remote storage contents
# Stop all pageservers
for ps in pageservers:
ps.stop()
# Returning to a normal healthy state: all pageservers will start, but only the one most
# recently attached via the control plane will re-attach on startup
for ps in pageservers:
ps.start()
with pytest.raises(PageserverApiException):
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException):
_detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
_detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
# All data we wrote while multi-attached remains readable
workload.validate(pageservers[2].id)

View File

@@ -64,13 +64,13 @@ def test_metric_collection(
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.extend(
[
".*metrics endpoint refused the sent metrics*",
# we have a fast rate of calculation, these can happen at shutdown
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
]
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
tenant_id = env.initial_tenant
@@ -212,13 +212,13 @@ def test_metric_collection_cleans_up_tempfile(
pageserver_http = env.pageserver.http_client()
# httpserver is shut down before pageserver during passing run
env.pageserver.allowed_errors.extend(
[
".*metrics endpoint refused the sent metrics*",
# we have a fast rate of calculation, these can happen at shutdown
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*",
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes",
]
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
# we have a fast rate of calculation, these can happen at shutdown
env.pageserver.allowed_errors.append(
".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*"
)
env.pageserver.allowed_errors.append(
".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes"
)
tenant_id = env.initial_tenant

View File

@@ -1,375 +0,0 @@
import random
from typing import Any, Dict, Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
# A tenant configuration that is convenient for generating uploads and deletions
# without a large amount of postgres traffic.
TENANT_CONF = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_target_size": f"{128 * 1024}",
"compaction_threshold": "1",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
}
def evict_random_layers(
rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
):
"""
Evict 50% of the layers on a pageserver
"""
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
client = pageserver.http_client()
for layer in initial_local_layers:
if "ephemeral" in layer.name or "temp_download" in layer.name:
continue
if rng.choice([True, False]):
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
@pytest.mark.parametrize("seed", [1, 2, 3])
def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
"""
Issue many location configuration changes, ensure that tenants
remain readable & we don't get any unexpected errors. We should
have no ERROR in the log, and no 500s in the API.
The location_config API is intentionally designed so that all destination
states are valid, so that we may test it in this way: the API should always
work as long as the tenant exists.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 3
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
pageservers = env.pageservers
list([p.http_client() for p in pageservers])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# We will make no effort to avoid stale attachments
for ps in env.pageservers:
ps.allowed_errors.extend(
[
".*Dropped remote consistent LSN updates.*",
".*Dropping stale deletions.*",
# page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
".*query handler.*Tenant.*not found.*",
# page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
".*query handler.*Tenant.*not active.*",
]
)
# these can happen if we shut down at just the right time; to be fixed as part of #5172.
message = ".*duplicated L1 layer layer=.*"
ps.allowed_errors.append(message)
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
workload.write_rows(256, env.pageservers[0].id)
# We use a fixed seed to make the test reproducible: we want a randomly
# chosen order, but not to change the order every time we run the test.
rng = random.Random(seed)
initial_generation = 1
last_state = {
env.pageservers[0].id: ("AttachedSingle", initial_generation),
env.pageservers[1].id: ("Detached", None),
env.pageservers[2].id: ("Detached", None),
}
latest_attached = env.pageservers[0].id
for _i in range(0, 64):
# Pick a pageserver
pageserver = rng.choice(env.pageservers)
# Pick a pseudorandom state
modes = [
"AttachedSingle",
"AttachedMulti",
"AttachedStale",
"Secondary",
"Detached",
"_Evictions",
"_Restart",
]
mode = rng.choice(modes)
last_state_ps = last_state[pageserver.id]
if mode == "_Evictions":
if last_state_ps[0].startswith("Attached"):
log.info(f"Action: evictions on pageserver {pageserver.id}")
evict_random_layers(rng, pageserver, tenant_id, timeline_id)
else:
log.info(
f"Action: skipping evictions on pageserver {pageserver.id}, is not attached"
)
elif mode == "_Restart":
log.info(f"Action: restarting pageserver {pageserver.id}")
pageserver.stop()
pageserver.start()
if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
log.info("Entering postgres...")
workload.churn_rows(rng.randint(128, 256), pageserver.id)
workload.validate(pageserver.id)
elif last_state_ps[0].startswith("Attached"):
# The `attachment_service` will only re-attach on startup when a pageserver was the
# holder of the latest generation: otherwise the pageserver will revert to detached
# state if it was running attached with a stale generation
last_state[pageserver.id] = ("Detached", None)
else:
secondary_conf: Optional[Dict[str, Any]] = None
if mode == "Secondary":
secondary_conf = {"warm": rng.choice([True, False])}
location_conf: Dict[str, Any] = {
"mode": mode,
"secondary_conf": secondary_conf,
"tenant_conf": {},
}
log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}")
# Select a generation number
if mode.startswith("Attached"):
if last_state_ps[1] is not None:
if rng.choice([True, False]):
# Move between attached states, staying in the same generation
generation = last_state_ps[1]
else:
# Switch generations, while also jumping between attached states
generation = env.attachment_service.attach_hook_issue(
tenant_id, pageserver.id
)
latest_attached = pageserver.id
else:
generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id)
latest_attached = pageserver.id
else:
generation = None
location_conf["generation"] = generation
pageserver.tenant_location_configure(tenant_id, location_conf)
last_state[pageserver.id] = (mode, generation)
if mode.startswith("Attached"):
# This is a basic test: we are validating that the endpoint works properly _between_
# configuration changes. A stronger test would be to validate that clients see
# no errors while we are making the changes.
workload.churn_rows(
rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale"
)
workload.validate(pageserver.id)
# Attach all pageservers
for ps in env.pageservers:
location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
ps.tenant_location_configure(tenant_id, location_conf)
# Confirm that all are readable
for ps in env.pageservers:
workload.validate(ps.id)
# Detach all pageservers
for ps in env.pageservers:
location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}}
ps.tenant_location_configure(tenant_id, location_conf)
# Confirm that all local disk state was removed on detach
# TODO
def test_live_migration(neon_env_builder: NeonEnvBuilder):
"""
Test the sequence of location states that are used in a live migration.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
pageserver_a = env.pageservers[0]
pageserver_b = env.pageservers[1]
initial_generation = 1
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
workload.write_rows(256, env.pageservers[0].id)
# Make the destination a secondary location
pageserver_b.tenant_location_configure(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
},
)
workload.churn_rows(64, pageserver_a.id, upload=False)
# Set origin attachment to stale
log.info("Setting origin to AttachedStale")
pageserver_a.tenant_location_configure(
tenant_id,
{
"mode": "AttachedStale",
"secondary_conf": None,
"tenant_conf": {},
"generation": initial_generation,
},
flush_ms=5000,
)
migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id)
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
assert migrated_generation == initial_generation + 1
# Writes and reads still work in AttachedStale.
workload.validate(pageserver_a.id)
# TODO: call into secondary mode API hooks to do an upload/download sync
# Generate some more dirty writes: we expect the origin to ingest WAL
# while in AttachedStale
workload.churn_rows(64, pageserver_a.id, upload=False)
workload.validate(pageserver_a.id)
# Attach the destination
log.info("Setting destination to AttachedMulti")
pageserver_b.tenant_location_configure(
tenant_id,
{
"mode": "AttachedMulti",
"secondary_conf": None,
"tenant_conf": {},
"generation": migrated_generation,
},
)
# Wait for destination LSN to catch up with origin
origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[
"last_record_lsn"
]
def caught_up():
destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[
"last_record_lsn"
]
log.info(
f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}"
)
assert destination_lsn >= origin_lsn
wait_until(100, 0.1, caught_up)
# The destination should accept writes
workload.churn_rows(64, pageserver_b.id)
# Dual attached: both are readable.
workload.validate(pageserver_a.id)
workload.validate(pageserver_b.id)
# Revert the origin to secondary
log.info("Setting origin to Secondary")
pageserver_a.tenant_location_configure(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
},
)
workload.churn_rows(64, pageserver_b.id)
# Put the destination into final state
pageserver_b.tenant_location_configure(
tenant_id,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": {},
"generation": migrated_generation,
},
)
workload.churn_rows(64, pageserver_b.id)
workload.validate(pageserver_b.id)
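# The caught_up() check above is polled via fixtures.utils.wait_until. A
# minimal sketch of such a polling helper, assuming the
# (number_of_iterations, interval, func) signature used throughout this file;
# the real implementation may differ in details such as logging.
import time
from typing import Callable, Optional

def wait_until_sketch(number_of_iterations: int, interval: float, func: Callable[[], None]) -> None:
    """Retry `func` until it stops raising, sleeping `interval` seconds between attempts."""
    last_exc: Optional[Exception] = None
    for _ in range(number_of_iterations):
        try:
            func()
            return
        except Exception as e:
            last_exc = e
            time.sleep(interval)
    raise RuntimeError(f"timed out waiting for condition: {last_exc}")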
def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
"""
Test that heatmap uploads reflect the layers present for the tenant's timelines.
"""
env = neon_env_builder.init_start() # initial_tenant_conf=TENANT_CONF)
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Write some data so that we have some layers
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
# Write some layers and upload a heatmap
workload.write_rows(256, env.pageservers[0].id)
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
def validate_heatmap(heatmap):
assert len(heatmap["timelines"]) == 1
assert heatmap["timelines"][0]["timeline_id"] == str(timeline_id)
assert len(heatmap["timelines"][0]["layers"]) > 0
layers = heatmap["timelines"][0]["layers"]
# Each layer appears at most once
assert len(set(layer["name"] for layer in layers)) == len(layers)
# Download and inspect the heatmap that the pageserver uploaded
heatmap_first = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_first}")
validate_heatmap(heatmap_first)
# Do some more I/O to generate more layers
workload.churn_rows(64, env.pageservers[0].id)
env.pageserver.http_client().tenant_heatmap_upload(tenant_id)
# Ensure that another heatmap upload includes the new layers
heatmap_second = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.info(f"Read back heatmap: {heatmap_second}")
assert heatmap_second != heatmap_first
validate_heatmap(heatmap_second)
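# For orientation, a toy payload shaped like what validate_heatmap() above
# expects. The field names are inferred from those assertions only; the real
# heatmap produced by the pageserver carries more metadata per layer.
_example_heatmap = {
    "timelines": [
        {
            "timeline_id": "<timeline id as hex string>",
            "layers": [
                {"name": "<layer file name>"},
            ],
        }
    ],
}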

View File

@@ -73,20 +73,19 @@ def test_remote_storage_backup_and_restore(
##### First start, insert data and upload it to the remote storage
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[
# FIXME: Is this expected?
".*marking .* as locally complete, while it doesnt exist in remote index.*",
".*No timelines to attach received.*",
".*Failed to get local tenant state.*",
# FIXME retry downloads without throwing errors
".*failed to load remote timeline.*",
# we have a bunch of pytest.raises for these below
".*tenant .*? already exists, state:.*",
".*tenant directory already exists.*",
".*simulated failure of remote operation.*",
]
# FIXME: Is this expected?
env.pageserver.allowed_errors.append(
".*marking .* as locally complete, while it doesnt exist in remote index.*"
)
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
# FIXME retry downloads without throwing errors
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
# we have a bunch of pytest.raises for these below
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
env.pageserver.allowed_errors.append(".*tenant directory already exists.*")
env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*")
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")

View File

@@ -314,10 +314,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
assert not config_path.exists(), "detach did not remove config file"
# The re-attach's increment of the generation number may invalidate deletion queue
# updates in flight from the previous attachment.
env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
env.pageserver.tenant_attach(tenant_id)
wait_until(
number_of_iterations=5,

View File

@@ -23,18 +23,23 @@ from fixtures.pageserver.utils import (
wait_until_tenant_active,
wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
from fixtures.remote_storage import (
RemoteStorageKind,
available_remote_storages,
available_s3_storages,
)
from fixtures.types import TenantId
from fixtures.utils import run_pg_bench_small, wait_until
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_delete_smoke(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_bin: PgBin,
):
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
@@ -73,15 +78,16 @@ def test_tenant_delete_smoke(
run_pg_bench_small(pg_bin, endpoint.connstr())
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
parent = timeline
@@ -94,15 +100,16 @@ def test_tenant_delete_smoke(
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
# Deletion updates the tenant count: the one default tenant remains
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
@@ -142,7 +149,9 @@ FAILPOINTS_BEFORE_BACKGROUND = [
def combinations():
result = []
remotes = available_s3_storages()
remotes = [RemoteStorageKind.MOCK_S3]
if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
remotes.append(RemoteStorageKind.REAL_S3)
for remote_storage_kind in remotes:
for delete_failpoint in FAILPOINTS:
@@ -156,8 +165,8 @@ def combinations():
return result
@pytest.mark.parametrize("check", list(Check))
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
@pytest.mark.parametrize("check", list(Check))
def test_delete_tenant_exercise_crash_safety_failpoints(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
@@ -205,15 +214,16 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
run_pg_bench_small(pg_bin, endpoint.connstr())
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
ps_http.configure_failpoints((failpoint, "return"))
@@ -266,23 +276,25 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
assert not tenant_dir.exists()
# Check remote is empty
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
allowed_postfix="initdb.tar.zst",
)
if remote_storage_kind in available_s3_storages():
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
allowed_postfix="initdb.tar.zst",
)
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_delete_is_resumed_on_attach(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
pg_bin: PgBin,
):
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
@@ -302,15 +314,16 @@ def test_tenant_delete_is_resumed_on_attach(
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
# sanity check, data should be there
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
# failpoint before we remove index_part from s3
failpoint = "timeline-delete-before-index-delete"
@@ -341,15 +354,16 @@ def test_tenant_delete_is_resumed_on_attach(
iterations=iterations,
)
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
assert_prefix_not_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
reason = tenant_info["state"]["data"]["reason"]
# failpoint may not be the only error in the stack
@@ -375,16 +389,17 @@ def test_tenant_delete_is_resumed_on_attach(
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
if remote_storage_kind in available_s3_storages():
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
@@ -395,13 +410,13 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
env.start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.extend(
[
# happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
".*Timeline got dropped without initializing, cleaning its files",
# the response hit_pausable_failpoint_and_later_fail
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn",
]
# happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero
env.pageserver.allowed_errors.append(
".*Timeline got dropped without initializing, cleaning its files"
)
# the response hit_pausable_failpoint_and_later_fail
env.pageserver.allowed_errors.append(
f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn"
)
env.pageserver.tenant_create(env.initial_tenant)

View File

@@ -21,6 +21,7 @@ from fixtures.pageserver.utils import (
)
from fixtures.remote_storage import (
RemoteStorageKind,
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
@@ -58,11 +59,16 @@ class ReattachMode(str, enum.Enum):
# Basic detach and re-attach test
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@pytest.mark.parametrize(
"mode",
[ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
)
def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
def test_tenant_reattach(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
):
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
# Exercise retry code path by making all uploads and downloads fail for the
# first time. The retries print INFO-messages to the log; we will check
# that they are present after the test.
@@ -181,13 +187,16 @@ num_rows = 100000
#
# I don't know what's causing that...
@pytest.mark.skip(reason="fixme")
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_tenant_reattach_while_busy(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
updates_started = 0
updates_finished = 0
updates_to_perform = 0
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
# Run random UPDATEs on test table. On failure, try again.
@@ -307,14 +316,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
bogus_timeline_id = TimelineId.generate()
pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
env.pageserver.allowed_errors.extend(
[
# the error will be printed to the log too
".*gc target timeline does not exist.*",
# Timelines get stopped during detach, ignore the gc calls that error, witnessing that
".*InternalServerError\\(timeline is Stopping.*",
]
)
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
# Timelines get stopped during detach, ignore the gc calls that error, witnessing that
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*")
# Detach while running manual GC.
# It should wait for manual GC to finish because it runs in a task associated with the tenant.
@@ -434,9 +439,13 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
should not be present in pageserver's memory"
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_detach_while_attaching(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
##### First start, insert secret data and upload it to the remote storage
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
