Compare commits

..

7 Commits

Author SHA1 Message Date
Alex Chi Z
37e25ab51e fix tests
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-24 13:58:24 -04:00
Alex Chi Z
f1deefc077 fix tests
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-23 12:10:33 -04:00
Alex Chi Z
d4fc271766 fix unit test
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-22 16:48:56 -04:00
Alex Chi Z
21db1bc2f0 metadata v3
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-22 15:52:16 -04:00
Alex Chi Z
21addb827b fix unit test
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-22 15:42:09 -04:00
Alex Chi Z
4b23692615 adapt to metadataupdate interface
Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-22 14:19:25 -04:00
Alex Chi Z
667376a9c4 feat(pageserver): add aux_file_v2 feature flag
Signed-off-by: Alex Chi Z <chi@neon.tech>

add tests

Signed-off-by: Alex Chi Z <chi@neon.tech>
2024-04-22 14:14:02 -04:00
59 changed files with 1599 additions and 1479 deletions

171
Cargo.lock generated
View File

@@ -595,7 +595,7 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"hyper-rustls",
"once_cell",
"pin-project-lite",
"pin-utils",
@@ -1780,18 +1780,6 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "enum-as-inner"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a"
dependencies = [
"heck 0.4.1",
"proc-macro2",
"quote",
"syn 2.0.52",
]
[[package]]
name = "enum-map"
version = "2.5.0"
@@ -1983,9 +1971,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8"
dependencies = [
"percent-encoding",
]
@@ -2344,51 +2332,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
[[package]]
name = "hickory-proto"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07698b8420e2f0d6447a436ba999ec85d8fbf2a398bbd737b82cac4a2e96e512"
dependencies = [
"async-trait",
"cfg-if",
"data-encoding",
"enum-as-inner",
"futures-channel",
"futures-io",
"futures-util",
"idna 0.4.0",
"ipnet",
"once_cell",
"rand 0.8.5",
"thiserror",
"tinyvec",
"tokio",
"tracing",
"url",
]
[[package]]
name = "hickory-resolver"
version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28757f23aa75c98f254cf0405e6d8c25b831b32921b050a66692427679b1f243"
dependencies = [
"cfg-if",
"futures-util",
"hickory-proto",
"ipconfig",
"lru-cache",
"once_cell",
"parking_lot 0.12.1",
"rand 0.8.5",
"resolv-conf",
"smallvec",
"thiserror",
"tokio",
"tracing",
]
[[package]]
name = "histogram"
version = "0.7.4"
@@ -2582,23 +2525,6 @@ dependencies = [
"tokio-rustls 0.24.0",
]
[[package]]
name = "hyper-rustls"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "399c78f9338483cb7e630c8474b07268983c6bd5acee012e4211f9f7bb21b070"
dependencies = [
"futures-util",
"http 0.2.9",
"hyper 0.14.26",
"log",
"rustls 0.22.4",
"rustls-native-certs 0.7.0",
"rustls-pki-types",
"tokio",
"tokio-rustls 0.25.0",
]
[[package]]
name = "hyper-timeout"
version = "0.4.1"
@@ -2686,19 +2612,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
[[package]]
name = "idna"
version = "0.4.0"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c"
dependencies = [
"unicode-bidi",
"unicode-normalization",
]
[[package]]
name = "idna"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6"
dependencies = [
"unicode-bidi",
"unicode-normalization",
@@ -2803,18 +2719,6 @@ dependencies = [
"libc",
]
[[package]]
name = "ipconfig"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f"
dependencies = [
"socket2 0.5.5",
"widestring",
"windows-sys 0.48.0",
"winreg",
]
[[package]]
name = "ipnet"
version = "2.9.0"
@@ -2956,12 +2860,6 @@ version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "linked-hash-map"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
@@ -2996,15 +2894,6 @@ version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "lru-cache"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31e24f1ad8321ca0e8a1e0ac13f23cb668e6f5466c2c57319f6a5cf1cc8e3b1c"
dependencies = [
"linked-hash-map",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
@@ -3769,7 +3658,6 @@ dependencies = [
"tokio-util",
"toml_edit",
"tracing",
"twox-hash",
"url",
"utils",
"walkdir",
@@ -4000,9 +3888,9 @@ dependencies = [
[[package]]
name = "percent-encoding"
version = "2.3.1"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
[[package]]
name = "petgraph"
@@ -4111,7 +3999,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4124,7 +4012,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"native-tls",
"tokio",
@@ -4135,7 +4023,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -4154,7 +4042,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4404,7 +4292,6 @@ dependencies = [
"aws-config",
"aws-sdk-iam",
"aws-sigv4",
"aws-smithy-runtime",
"aws-types",
"base64 0.13.1",
"bstr",
@@ -4422,7 +4309,6 @@ dependencies = [
"hashbrown 0.13.2",
"hashlink",
"hex",
"hickory-resolver",
"hmac",
"hostname",
"http 1.1.0",
@@ -4430,7 +4316,6 @@ dependencies = [
"humantime",
"hyper 0.14.26",
"hyper 1.2.0",
"hyper-rustls 0.25.0",
"hyper-tungstenite",
"hyper-util",
"ipnet",
@@ -4499,12 +4384,6 @@ dependencies = [
"x509-parser",
]
[[package]]
name = "quick-error"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quick-xml"
version = "0.31.0"
@@ -4806,7 +4685,7 @@ dependencies = [
"http 0.2.9",
"http-body 0.4.5",
"hyper 0.14.26",
"hyper-rustls 0.24.0",
"hyper-rustls",
"hyper-tls",
"ipnet",
"js-sys",
@@ -4892,16 +4771,6 @@ dependencies = [
"tracing-opentelemetry",
]
[[package]]
name = "resolv-conf"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52e44394d2086d010551b14b53b1f24e31647570cd1deb0379e2c21b329aba00"
dependencies = [
"hostname",
"quick-error",
]
[[package]]
name = "retry-policies"
version = "0.1.2"
@@ -6295,7 +6164,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
dependencies = [
"async-trait",
"byteorder",
@@ -6825,12 +6694,12 @@ dependencies = [
[[package]]
name = "url"
version = "2.5.0"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643"
dependencies = [
"form_urlencoded",
"idna 0.5.0",
"idna",
"percent-encoding",
"serde",
]
@@ -7162,12 +7031,6 @@ dependencies = [
"once_cell",
]
[[package]]
name = "widestring"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "653f141f39ec16bba3c5abe400a0c60da7468261cc2cbf36805022876bc721a8"
[[package]]
name = "winapi"
version = "0.3.9"
@@ -7510,8 +7373,6 @@ dependencies = [
"tower",
"tracing",
"tracing-core",
"unicode-bidi",
"unicode-normalization",
"url",
"uuid",
"zeroize",

View File

@@ -57,7 +57,6 @@ aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0"
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-smithy-runtime = "1.1.8"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
@@ -195,11 +194,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -239,7 +238,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }

View File

@@ -434,6 +434,11 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
try_enable_aux_file_v2: settings
.remove("try_enable_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'try_enable_aux_file_v2' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -552,6 +557,11 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
try_enable_aux_file_v2: settings
.remove("try_enable_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'try_enable_aux_file_v2' as bool")?,
}
};

View File

@@ -1,10 +1,8 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::ops::RangeInclusive;
use std::{fmt, ops::Range};
use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -23,81 +21,9 @@ pub struct Key {
pub field6: u32,
}
/// The storage key size.
pub const KEY_SIZE: usize = 18;
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 is a metadata key.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x81;
/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x82;
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX
}
impl Key {
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key(&self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key(key: &[u8]) -> Self {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub fn metadata_key_range() -> RangeInclusive<Self> {
Key {
field1: METADATA_KEY_BEGIN_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..=Key {
field1: u8::MAX,
field2: u16::MAX as u32,
field3: u32::MAX,
field4: u32::MAX,
field5: u8::MAX,
field6: u32::MAX,
}
}
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
@@ -122,11 +48,11 @@ impl Key {
}
}
pub const fn next(&self) -> Key {
pub fn next(&self) -> Key {
self.add(1)
}
pub const fn add(&self, x: u32) -> Key {
pub fn add(&self, x: u32) -> Key {
let mut key = *self;
let r = key.field6.overflowing_add(x);
@@ -155,8 +81,6 @@ impl Key {
key
}
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -168,8 +92,6 @@ impl Key {
}
}
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -553,14 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key)
key != AUX_FILES_KEY
}
#[inline(always)]
@@ -636,14 +556,11 @@ impl std::str::FromStr for Key {
mod tests {
use std::str::FromStr;
use crate::key::is_metadata_key_slice;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
use super::AUX_KEY_PREFIX;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -659,16 +576,4 @@ mod tests {
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
#[test]
fn test_metadata_keys() {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
}

View File

@@ -94,13 +94,12 @@ impl KeySpace {
/// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges.
/// Returns the removed keyspace
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
_ => {
// self is empty
return KeySpace::default();
return;
}
};
@@ -113,37 +112,30 @@ impl KeySpace {
.skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start);
let mut removed_accum = KeySpaceRandomAccum::new();
for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
removed_accum.add_range(range.clone());
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
removed_accum.add_range(self.ranges[overlap_at].clone());
self.ranges.remove(overlap_at);
}
}
}
removed_accum.to_keyspace()
}
pub fn start(&self) -> Option<Key> {
@@ -561,16 +553,7 @@ mod tests {
Key::from_i128(11)..Key::from_i128(13),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -600,17 +583,7 @@ mod tests {
Key::from_i128(14)..Key::from_i128(17),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(5),
Key::from_i128(8)..Key::from_i128(10),
Key::from_i128(14)..Key::from_i128(15),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -637,11 +610,7 @@ mod tests {
Key::from_i128(15)..Key::from_i128(17),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace::default();
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -668,17 +637,7 @@ mod tests {
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(9)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(19),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![

View File

@@ -303,6 +303,7 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub try_enable_aux_file_v2: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -429,7 +430,6 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
#[serde(skip_serializing_if = "Option::is_none")]
pub tenant_id: Option<TenantShardId>,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -579,6 +579,9 @@ pub struct TimelineInfo {
pub state: TimelineState,
pub walreceiver_status: String,
/// Whether aux file v2 is enabled
pub aux_file_v2: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -134,11 +134,6 @@ impl RemotePath {
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}
pub fn add_trailing_slash(&self) -> Self {
// Unwrap safety inputs are guararnteed to be valid UTF-8
Self(format!("{}/", self.0).try_into().unwrap())
}
}
/// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -162,21 +157,47 @@ pub struct Listing {
/// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)]
pub trait RemoteStorage: Send + Sync + 'static {
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
///
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
/// from the absolute root of the bucket.
///
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
/// returned in `keys` ().
///
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
/// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
/// so this method doesnt need to.
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::WithDelimiter, None, cancel)
.await?
.prefixes;
Ok(result)
}
/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
/// Note: This is subtely different than list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
///
/// max_keys limits max number of keys returned; None means unlimited.
async fn list_files(
&self,
prefix: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
.await?
.keys;
Ok(result)
}
async fn list(
&self,
prefix: Option<&RemotePath>,
@@ -315,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
//
// max_keys limits max number of keys returned; None means unlimited.
pub async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
}
}
// lists common *prefixes*, if any of files
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,

View File

@@ -5,9 +5,11 @@
//! volume is mounted to the local FS.
use std::{
collections::HashSet,
borrow::Cow,
future::Future,
io::ErrorKind,
num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime, UNIX_EPOCH},
};
@@ -20,11 +22,11 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -91,47 +93,7 @@ impl LocalFs {
#[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
use std::{future::Future, pin::Pin};
fn get_all_files<'a, P>(
directory_path: P,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
tracing::debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
paths.extend(get_all_files(&entry_path).await?.into_iter())
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
Ok(get_all_files(&self.storage_root)
Ok(get_all_files(&self.storage_root, true)
.await?
.into_iter()
.map(|path| {
@@ -158,14 +120,6 @@ impl LocalFs {
// S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
// If there's no trailing slash, we have to start looking from one above: even if
// `initial_dir` is a directory, we should still list any prefixes in the parent
// that start with the same string.
if !full_path.to_string().ends_with('/') {
initial_dir.pop();
}
loop {
// Did we make it to the root?
if initial_dir.parent().is_none() {
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
let op = async {
let mut result = Listing::default();
// Filter out directories: in S3 directories don't exist, only the keys within them do.
let keys = self
.list_recursive(prefix)
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
let keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let ListingMode::NoDelimiter = mode {
result.keys = keys;
} else {
let mut prefixes = HashSet::new();
for key in keys {
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
let relative_key = if let Some(prefix) = prefix {
let mut prefix = prefix.clone();
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
// end up with full file/dir names.
let prefix_full_local_path = prefix.with_base(&self.storage_root);
let has_slash = prefix.0.to_string().ends_with('/');
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
prefix
} else {
prefix.0.pop();
prefix
};
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
} else {
key
};
let relative_key = format!("{}", relative_key);
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
let first_part = relative_key
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
.next()
.unwrap()
.to_owned();
prefixes.insert(first_part);
} else {
result
.keys
.push(RemotePath::from_string(&relative_key).unwrap());
}
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
result.prefixes = prefixes
.into_iter()
.map(|s| RemotePath::from_string(&s).unwrap())
.collect();
}
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
Ok(result)
};
@@ -611,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
path_with_suffix_extension(original_path, "metadata")
}
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
} else {
paths.push(entry_path)
}
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
let target_dir = match target_file_path.parent() {
Some(parent_dir) => parent_dir,
@@ -930,18 +923,13 @@ mod fs_tests {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
let child_sibling =
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
let listing = storage
.list(None, ListingMode::NoDelimiter, None, &cancel)
.await?;
assert!(listing.prefixes.is_empty());
assert_eq!(
listing.keys.into_iter().collect::<HashSet<_>>(),
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
);
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
// Delimiter: should only go one deep
let listing = storage
@@ -954,25 +942,7 @@ mod fs_tests {
);
assert!(listing.keys.is_empty());
// Delimiter & prefix with a trailing slash
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(
listing.keys,
[RemotePath::from_string("uncle").unwrap()].to_vec()
);
assert_eq!(
listing.prefixes,
[RemotePath::from_string("parent").unwrap()].to_vec()
);
// Delimiter and prefix without a trailing slash
// Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -981,66 +951,12 @@ mod fs_tests {
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
Ok(())
}
#[tokio::test]
async fn list_part_component() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
// a freeform prefix.
let _child_a =
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
let _child_b =
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
let mut found_prefixes = listing.prefixes.clone();
found_prefixes.sort();
assert_eq!(
found_prefixes,
[
RemotePath::from_string("tenant").unwrap(),
RemotePath::from_string("tenant-01").unwrap(),
]
.to_vec()
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
.to_vec()
);
assert_eq!(listing.keys, [uncle.clone()].to_vec());
Ok(())
}

View File

@@ -178,7 +178,10 @@ impl S3Bucket {
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path.get_path().as_str();
let path_string = path
.get_path()
.as_str()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + path_string,
None => path_string.to_string(),
@@ -468,11 +471,16 @@ impl RemoteStorage for S3Bucket {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| {
self.prefix_in_bucket.clone().map(|mut s| {
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
s
})
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
let _permit = self.permit(kind, cancel).await?;
@@ -541,15 +549,11 @@ impl RemoteStorage for S3Bucket {
}
}
// S3 gives us prefixes like "foo/", we return them like "foo"
result.prefixes.extend(prefixes.iter().filter_map(|o| {
Some(
self.s3_object_to_relative_path(
o.prefix()?
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
),
)
}));
result.prefixes.extend(
prefixes
.iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);
continuation_token = match response.next_continuation_token {
Some(new_token) => Some(new_token),
@@ -1046,22 +1050,22 @@ mod tests {
Some("/test/prefix/"),
];
let expected_outputs = [
vec!["", "some/path", "some/path/"],
vec!["/", "/some/path", "/some/path/"],
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
];

View File

@@ -107,6 +107,27 @@ impl UnreliableWrapper {
type VoidStorage = crate::LocalFs;
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_prefixes(prefix, cancel).await
}
async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_files(folder, max_keys, cancel).await
}
async fn list(
&self,
prefix: Option<&RemotePath>,

View File

@@ -1,6 +1,5 @@
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::ListingMode;
use remote_storage::RemotePath;
use std::sync::Arc;
use std::{collections::HashSet, num::NonZeroU32};
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?
.prefixes
.list_prefixes(None, &cancel)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
);
let nested_remote_prefixes = test_client
.list(
Some(&base_prefix.add_trailing_slash()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.prefixes
.list_prefixes(Some(&base_prefix), &cancel)
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then performs the following queries:
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_no_delimiter_works(
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list(None, ListingMode::NoDelimiter, None, &cancel)
.list_files(None, None, &cancel)
.await
.context("client list root files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list on root mismatches with the uploads."
"remote storage list_files on root mismatches with the uploads."
);
// Test that max_keys limit works. In total there are about 21 files (see
// upload_simple_remote_data call in test_real_s3.rs).
let limited_root_files = test_client
.list(
None,
ListingMode::NoDelimiter,
Some(NonZeroU32::new(2).unwrap()),
&cancel,
)
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
.await
.context("client list root files failure")?;
assert_eq!(limited_root_files.keys.len(), 2);
assert_eq!(limited_root_files.len(), 2);
let nested_remote_files = test_client
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
.list_files(Some(&base_prefix), None, &cancel)
.await
.context("client list nested files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list on subdirrectory mismatches with the uploads."
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
let prefixes = ctx
.client
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?
.prefixes;
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
assert_eq!(prefixes.len(), 1);

View File

@@ -132,6 +132,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(AzureWithSimpleTestBlobs),
Disabled,

View File

@@ -12,8 +12,8 @@ use anyhow::Context;
use camino::Utf8Path;
use futures_util::StreamExt;
use remote_storage::{
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
RemoteStorageKind, S3Config,
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
client: &Arc<GenericRemoteStorage>,
cancel: &CancellationToken,
) -> anyhow::Result<HashSet<RemotePath>> {
Ok(
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
.await
.context("list root files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>(),
)
Ok(retry(|| client.list_files(None, None, cancel))
.await
.context("list root files failure")?
.into_iter()
.collect::<HashSet<_>>())
}
let cancel = CancellationToken::new();
@@ -297,6 +294,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
// whereas the list_files function is concerned with listing files.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,

View File

@@ -70,7 +70,6 @@ tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
twox-hash.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true

View File

@@ -279,7 +279,7 @@ impl Client {
lazy: bool,
) -> Result<()> {
let req_body = TenantLocationConfigRequest {
tenant_id: None,
tenant_id: Some(tenant_shard_id),
config,
};

View File

@@ -74,6 +74,8 @@ struct MetadataCmd {
prev_record_lsn: Option<Lsn>,
/// Replace latest gc cuttoff
latest_gc_cuttoff: Option<Lsn>,
/// Enable aux file v2 storage
aux_file_v2: Option<bool>,
}
#[derive(Parser)]
@@ -213,12 +215,14 @@ fn handle_metadata(
disk_consistent_lsn,
prev_record_lsn,
latest_gc_cuttoff,
aux_file_v2,
}: &MetadataCmd,
) -> Result<(), anyhow::Error> {
let metadata_bytes = std::fs::read(path)?;
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
println!("Current metadata:\n{meta:?}");
let mut update_meta = false;
// TODO: simplify this part
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
meta = TimelineMetadata::new(
*disk_consistent_lsn,
@@ -228,6 +232,7 @@ fn handle_metadata(
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
meta.aux_file_v2(),
);
update_meta = true;
}
@@ -240,6 +245,7 @@ fn handle_metadata(
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
meta.aux_file_v2(),
);
update_meta = true;
}
@@ -252,6 +258,20 @@ fn handle_metadata(
*latest_gc_cuttoff,
meta.initdb_lsn(),
meta.pg_version(),
meta.aux_file_v2(),
);
update_meta = true;
}
if let Some(aux_file_v2) = aux_file_v2 {
meta = TimelineMetadata::new(
meta.disk_consistent_lsn(),
meta.prev_record_lsn(),
meta.ancestor_timeline(),
meta.ancestor_lsn(),
meta.latest_gc_cutoff_lsn(),
meta.initdb_lsn(),
meta.pg_version(),
*aux_file_v2,
);
update_meta = true;
}

View File

@@ -1,112 +0,0 @@
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
use tracing::warn;
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
let mut key = [0; METADATA_KEY_SIZE];
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
key[0] = AUX_KEY_PREFIX;
key[1] = dir_level1;
key[2] = dir_level2;
key[3..16].copy_from_slice(&hash[0..13]);
Key::from_metadata_key_fixed_size(&key)
}
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
/// Encode the aux file into a fixed-size key.
///
/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path
/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix
/// is roughly based on the first two components of the path, one unique number for one component.
///
/// * pg_logical/mappings -> 0x0101
/// * pg_logical/snapshots -> 0x0102
/// * pg_logical/replorigin_checkpoint -> 0x0103
/// * pg_logical/others -> 0x01FF
/// * pg_replslot/ -> 0x0201
/// * others -> 0xFFFF
///
/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
/// The new file type must have never been written to the storage before. Otherwise, there could be data
/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
pub fn encode_aux_file_key(path: &str) -> Key {
if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
} else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
} else if path == "pg_logical/replorigin_checkpoint" {
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
} else if let Some(fname) = path.strip_prefix("pg_logical/") {
if cfg!(debug_assertions) {
warn!(
"unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
path
);
}
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
} else if let Some(fname) = path.strip_prefix("pg_replslot/") {
aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
} else {
if cfg!(debug_assertions) {
warn!(
"unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
path
);
}
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hash_portable() {
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
// if the algorithm produces the same hash across different environments.
assert_eq!(
305317690835051308206966631765527126151,
twox_hash::xxh3::hash128("test1".as_bytes())
);
assert_eq!(
85104974691013376326742244813280798847,
twox_hash::xxh3::hash128("test/test2".as_bytes())
);
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
}
#[test]
fn test_encoding_portable() {
// To correct retrieve AUX files, the generated keys for the same file must be the same for all versions
// of the page server.
assert_eq!(
"8200000101E5B20C5F8DD5AA3289D6D9EAFA",
encode_aux_file_key("pg_logical/mappings/test1").to_string()
);
assert_eq!(
"820000010239AAC544893139B26F501B97E6",
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
);
assert_eq!(
"820000010300000000000000000000000000",
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
);
assert_eq!(
"82000001FF8635AF2134B7266EC5B4189FD6",
encode_aux_file_key("pg_logical/unsupported").to_string()
);
assert_eq!(
"8200000201772D0E5D71DE14DA86142A1619",
encode_aux_file_key("pg_replslot/test3").to_string()
);
assert_eq!(
"820000FFFF1866EBEB53B807B26A2416F317",
encode_aux_file_key("other_file_not_supported").to_string()
);
}
}

View File

@@ -426,6 +426,10 @@ async fn build_timeline_info_common(
state,
walreceiver_status,
aux_file_v2: timeline
.aux_file_v2
.load(std::sync::atomic::Ordering::SeqCst),
};
Ok(info)
}

View File

@@ -12,7 +12,6 @@ pub mod disk_usage_eviction_task;
pub mod http;
pub mod import_datadir;
pub use pageserver_api::keyspace;
pub mod aux_file;
pub mod metrics;
pub mod page_cache;
pub mod page_service;

View File

@@ -86,20 +86,11 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_read_global",
"Number of layers visited to reconstruct one key",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
)
.expect("failed to define a metric")
});
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_layers_visited_per_vectored_read_global",
"Average number of layers visited to reconstruct one key",
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
"pageserver_read_num_fs_layers",
"Number of persistent layers accessed for processing a read request, including those in the cache",
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
)
.expect("failed to define a metric")
});
@@ -2780,8 +2771,7 @@ pub fn preinitialize_metrics() {
// histograms
[
&READ_NUM_LAYERS_VISITED,
&VEC_READ_NUM_LAYERS_VISITED,
&READ_NUM_FS_LAYERS,
&WAIT_LSN_TIME,
&WAL_REDO_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,

View File

@@ -1206,10 +1206,6 @@ impl PageServerHandler {
))
}
/// Note on "fullbackup":
/// Full basebackups should only be used for debugging purposes.
/// Originally, it was introduced to enable breaking storage format changes,
/// but that is not applicable anymore.
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
async fn handle_basebackup_request<IO>(

View File

@@ -32,7 +32,7 @@ use std::ops::ControlFlow;
use std::ops::Range;
use strum::IntoEnumIterator;
use tokio_util::sync::CancellationToken;
use tracing::{debug, trace, warn};
use tracing::{debug, info, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::vec_map::{VecMap, VecMapOrdering};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -448,11 +448,6 @@ impl Timeline {
// include physical changes from later commits that will be marked
// as aborted, and will need to be vacuumed away.
let commit_lsn = Lsn((low - 1) * 8);
// This maxing operation is for the edge case that the search above did
// set found_smaller to true but it never increased the lsn. Then, low
// is still the old min_lsn the subtraction above could possibly give a value
// below the anchestor_lsn.
let commit_lsn = commit_lsn.max(min_lsn);
match (found_smaller, found_larger) {
(false, false) => {
// This can happen if no commit records have been processed yet, e.g.
@@ -1404,10 +1399,35 @@ impl<'a> DatadirModification<'a> {
Some(Bytes::copy_from_slice(content))
};
// TODO: either ensure we don't flip the flag for users with existing AUX files, or do a check there.
let aux_file_v2 = {
let tline_aux_file_v2 = self
.tline
.aux_file_v2
.load(std::sync::atomic::Ordering::SeqCst);
if tline_aux_file_v2 {
true
} else if self.tline.get_try_enable_aux_file_v2() {
info!(
"enabling aux file v2 support for timeline {}",
self.tline.timeline_id
);
// The next index part upload will have `aux_file_v2` to `true`.
self.tline
.aux_file_v2
.store(true, std::sync::atomic::Ordering::SeqCst);
true
} else {
false
}
};
let _ = aux_file_v2; // keep this unused until the write path is implemented
let n_files;
let mut aux_files = self.tline.aux_files.lock().await;
if let Some(mut dir) = aux_files.dir.take() {
// We already updated aux files in `self`: emit a delta and update our latest value.
// We already updated aux files in `self`: emit a delta and update our latest value
dir.upsert(file_path.clone(), content.clone());
n_files = dir.files.len();
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {

View File

@@ -1346,6 +1346,7 @@ impl Tenant {
initdb_lsn,
initdb_lsn,
pg_version,
false,
);
self.prepare_new_timeline(
new_timeline_id,
@@ -2870,23 +2871,20 @@ impl Tenant {
}
}
let cutoff = timeline
.get_last_record_lsn()
.checked_sub(horizon)
.unwrap_or(Lsn(0));
if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
timeline
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
.await?;
let branchpoints: Vec<Lsn> = all_branchpoints
.range((
Included((timeline_id, Lsn(0))),
Included((timeline_id, Lsn(u64::MAX))),
))
.map(|&x| x.1)
.collect();
timeline
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
.await?;
gc_timelines.push(timeline);
gc_timelines.push(timeline);
}
}
drop(gc_cs);
Ok(gc_timelines)
@@ -3010,6 +3008,7 @@ impl Tenant {
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
src_timeline.initdb_lsn,
src_timeline.pg_version,
src_timeline.aux_file_v2.load(Ordering::SeqCst),
);
let uninitialized_timeline = self
@@ -3213,6 +3212,7 @@ impl Tenant {
pgdata_lsn,
pgdata_lsn,
pg_version,
false,
);
let raw_timeline = self
.prepare_new_timeline(
@@ -3664,6 +3664,7 @@ pub(crate) mod harness {
image_layer_creation_check_threshold: Some(
tenant_conf.image_layer_creation_check_threshold,
),
try_enable_aux_file_v2: Some(tenant_conf.try_enable_aux_file_v2),
}
}
}
@@ -3862,7 +3863,6 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use pageserver_api::key::NON_INHERITED_RANGE;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -4662,62 +4662,6 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
let tline = tline.raw_timeline().unwrap();
let mut modification = tline.begin_modification(Lsn(0x1000));
modification.put_file("foo/bar1", b"content1", &ctx).await?;
modification.set_lsn(Lsn(0x1008))?;
modification.put_file("foo/bar2", b"content2", &ctx).await?;
modification.commit(&ctx).await?;
let child_timeline_id = TimelineId::generate();
tenant
.branch_timeline_test(
tline,
child_timeline_id,
Some(tline.get_last_record_lsn()),
&ctx,
)
.await?;
let child_timeline = tenant
.get_timeline(child_timeline_id, true)
.expect("Should have the branched timeline");
let aux_keyspace = KeySpace {
ranges: vec![NON_INHERITED_RANGE],
};
let read_lsn = child_timeline.get_last_record_lsn();
let vectored_res = child_timeline
.get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
.await;
child_timeline
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
.await;
let images = vectored_res?;
let mut key = NON_INHERITED_RANGE.start;
while key < NON_INHERITED_RANGE.end {
assert!(matches!(
images[&key],
Err(PageReconstructError::MissingKey(_))
));
key = key.next();
}
Ok(())
}
// Test that vectored get handles layer gaps correctly
// by advancing into the next ancestor timeline if required.
//

View File

@@ -369,6 +369,10 @@ pub struct TenantConf {
// How much WAL must be ingested before checking again whether a new image layer is required.
// Expresed in multiples of checkpoint distance.
pub image_layer_creation_check_threshold: u8,
/// Try enable the aux file v2 storage. Once this is set to true and the tenant writes an AUX file, the
/// pageserver will always use v2 for AUX files and setting this flag to false will be a no-op.
pub try_enable_aux_file_v2: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -464,6 +468,10 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
pub image_layer_creation_check_threshold: Option<u8>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub try_enable_aux_file_v2: Option<bool>,
}
impl TenantConfOpt {
@@ -521,6 +529,9 @@ impl TenantConfOpt {
image_layer_creation_check_threshold: self
.image_layer_creation_check_threshold
.unwrap_or(global_conf.image_layer_creation_check_threshold),
try_enable_aux_file_v2: self
.try_enable_aux_file_v2
.unwrap_or(global_conf.try_enable_aux_file_v2),
}
}
}
@@ -562,6 +573,7 @@ impl Default for TenantConf {
lazy_slru_download: false,
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
try_enable_aux_file_v2: false,
}
}
}
@@ -636,6 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
try_enable_aux_file_v2: value.try_enable_aux_file_v2,
}
}
}

View File

@@ -14,10 +14,11 @@ use utils::bin_ser::SerializeError;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;
const METADATA_FORMAT_VERSION: u16 = 5;
/// Previous supported format versions.
const METADATA_OLD_FORMAT_VERSION: u16 = 3;
const METADATA_OLD_FORMAT_VERSION_V2: u16 = 4;
const METADATA_OLD_FORMAT_VERSION_V1: u16 = 3;
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
///
@@ -31,7 +32,7 @@ const METADATA_MAX_SIZE: usize = 512;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimelineMetadata {
hdr: TimelineMetadataHeader,
body: TimelineMetadataBodyV2,
body: TimelineMetadataBodyV3,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -42,6 +43,28 @@ struct TimelineMetadataHeader {
}
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
struct TimelineMetadataBodyV3 {
disk_consistent_lsn: Lsn,
// This is only set if we know it. We track it in memory when the page
// server is running, but we only track the value corresponding to
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
// lot. We only store it in the metadata file when we flush *all* the
// in-memory data so that 'last_record_lsn' is the same as
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
// soon as we reprocess at least one record, we will have a valid
// 'prev_record_lsn' value in memory again. This is only really needed when
// doing a clean shutdown, so that there is no more WAL beyond
// 'disk_consistent_lsn'
prev_record_lsn: Option<Lsn>,
ancestor_timeline: Option<TimelineId>,
ancestor_lsn: Lsn,
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
aux_file_v2: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
struct TimelineMetadataBodyV2 {
disk_consistent_lsn: Lsn,
@@ -84,6 +107,7 @@ struct TimelineMetadataBodyV1 {
}
impl TimelineMetadata {
#[allow(clippy::too_many_arguments)]
pub fn new(
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
@@ -92,6 +116,7 @@ impl TimelineMetadata {
latest_gc_cutoff_lsn: Lsn,
initdb_lsn: Lsn,
pg_version: u32,
aux_file_v2: bool,
) -> Self {
Self {
hdr: TimelineMetadataHeader {
@@ -99,7 +124,7 @@ impl TimelineMetadata {
size: 0,
format_version: METADATA_FORMAT_VERSION,
},
body: TimelineMetadataBodyV2 {
body: TimelineMetadataBodyV3 {
disk_consistent_lsn,
prev_record_lsn,
ancestor_timeline,
@@ -107,6 +132,7 @@ impl TimelineMetadata {
latest_gc_cutoff_lsn,
initdb_lsn,
pg_version,
aux_file_v2,
},
}
}
@@ -115,29 +141,51 @@ impl TimelineMetadata {
let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
// backward compatible only up to this version
ensure!(
hdr.format_version == METADATA_OLD_FORMAT_VERSION,
"unsupported metadata format version {}",
hdr.format_version
);
let body = match hdr.format_version {
METADATA_OLD_FORMAT_VERSION_V2 => {
let metadata_size = hdr.size as usize;
let metadata_size = hdr.size as usize;
let body: TimelineMetadataBodyV2 =
TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
let body: TimelineMetadataBodyV1 =
TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
let body = TimelineMetadataBodyV3 {
disk_consistent_lsn: body.disk_consistent_lsn,
prev_record_lsn: body.prev_record_lsn,
ancestor_timeline: body.ancestor_timeline,
ancestor_lsn: body.ancestor_lsn,
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
initdb_lsn: body.initdb_lsn,
pg_version: body.pg_version,
aux_file_v2: false,
};
let body = TimelineMetadataBodyV2 {
disk_consistent_lsn: body.disk_consistent_lsn,
prev_record_lsn: body.prev_record_lsn,
ancestor_timeline: body.ancestor_timeline,
ancestor_lsn: body.ancestor_lsn,
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
initdb_lsn: body.initdb_lsn,
pg_version: 14, // All timelines created before this version had pg_version 14
hdr.format_version = METADATA_FORMAT_VERSION;
body
}
METADATA_OLD_FORMAT_VERSION_V1 => {
let metadata_size = hdr.size as usize;
let body: TimelineMetadataBodyV1 =
TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
let body = TimelineMetadataBodyV3 {
disk_consistent_lsn: body.disk_consistent_lsn,
prev_record_lsn: body.prev_record_lsn,
ancestor_timeline: body.ancestor_timeline,
ancestor_lsn: body.ancestor_lsn,
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
initdb_lsn: body.initdb_lsn,
pg_version: 14, // All timelines created before this version had pg_version 14
aux_file_v2: false,
};
hdr.format_version = METADATA_FORMAT_VERSION;
body
}
_ => {
anyhow::bail!("unsupported metadata format version {}", hdr.format_version);
}
};
hdr.format_version = METADATA_FORMAT_VERSION;
Ok(Self { hdr, body })
}
@@ -165,7 +213,7 @@ impl TimelineMetadata {
TimelineMetadata::upgrade_timeline_metadata(metadata_bytes)
} else {
let body =
TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
TimelineMetadataBodyV3::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
ensure!(
body.disk_consistent_lsn.is_aligned(),
"disk_consistent_lsn is not aligned"
@@ -219,6 +267,10 @@ impl TimelineMetadata {
self.body.pg_version
}
pub fn aux_file_v2(&self) -> bool {
self.body.aux_file_v2
}
// Checksums make it awkward to build a valid instance by hand. This helper
// provides a TimelineMetadata with a valid checksum in its header.
#[cfg(test)]
@@ -231,6 +283,7 @@ impl TimelineMetadata {
Lsn::from_hex("00000000").unwrap(),
Lsn::from_hex("00000000").unwrap(),
0,
false,
);
let bytes = instance.to_bytes().unwrap();
Self::from_bytes(&bytes).unwrap()
@@ -240,6 +293,7 @@ impl TimelineMetadata {
self.body.disk_consistent_lsn = update.disk_consistent_lsn;
self.body.prev_record_lsn = update.prev_record_lsn;
self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
self.body.aux_file_v2 = update.aux_file_v2;
}
}
@@ -270,6 +324,7 @@ pub(crate) struct MetadataUpdate {
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
latest_gc_cutoff_lsn: Lsn,
aux_file_v2: bool,
}
impl MetadataUpdate {
@@ -277,11 +332,13 @@ impl MetadataUpdate {
disk_consistent_lsn: Lsn,
prev_record_lsn: Option<Lsn>,
latest_gc_cutoff_lsn: Lsn,
aux_file_v2: bool,
) -> Self {
Self {
disk_consistent_lsn,
prev_record_lsn,
latest_gc_cutoff_lsn,
aux_file_v2,
}
}
}
@@ -302,6 +359,7 @@ mod tests {
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
true,
);
let metadata_bytes = original_metadata
@@ -331,7 +389,7 @@ mod tests {
hdr: TimelineMetadataHeader {
checksum: 0,
size: 0,
format_version: METADATA_OLD_FORMAT_VERSION,
format_version: METADATA_OLD_FORMAT_VERSION_V1,
},
body: TimelineMetadataBodyV1 {
disk_consistent_lsn: Lsn(0x200),
@@ -349,7 +407,7 @@ mod tests {
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_OLD_FORMAT_VERSION,
format_version: METADATA_OLD_FORMAT_VERSION_V1,
checksum: crc32c::crc32c(&body_bytes),
};
let hdr_bytes = hdr.ser()?;
@@ -376,12 +434,83 @@ mod tests {
Lsn(0),
Lsn(0),
14, // All timelines created before this version had pg_version 14
false,
);
assert_eq!(
deserialized_metadata.body, expected_metadata.body,
"Metadata of the old version {} should be upgraded to the latest version {}",
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
METADATA_OLD_FORMAT_VERSION_V1, METADATA_FORMAT_VERSION
);
}
// Generate old version metadata and read it with current code.
// Ensure that it is upgraded correctly
#[test]
fn test_metadata_upgrade_v2() {
#[derive(Debug, Clone, PartialEq, Eq)]
struct TimelineMetadataV2 {
hdr: TimelineMetadataHeader,
body: TimelineMetadataBodyV2,
}
let metadata_v2 = TimelineMetadataV2 {
hdr: TimelineMetadataHeader {
checksum: 0,
size: 0,
format_version: METADATA_OLD_FORMAT_VERSION_V2,
},
body: TimelineMetadataBodyV2 {
disk_consistent_lsn: Lsn(0x200),
prev_record_lsn: Some(Lsn(0x100)),
ancestor_timeline: Some(TIMELINE_ID),
ancestor_lsn: Lsn(0),
latest_gc_cutoff_lsn: Lsn(0),
initdb_lsn: Lsn(0),
pg_version: 16,
},
};
impl TimelineMetadataV2 {
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
let body_bytes = self.body.ser()?;
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_OLD_FORMAT_VERSION_V2,
checksum: crc32c::crc32c(&body_bytes),
};
let hdr_bytes = hdr.ser()?;
let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE];
metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes);
metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes);
Ok(metadata_bytes)
}
}
let metadata_bytes = metadata_v2
.to_bytes()
.expect("Should serialize correct metadata to bytes");
// This should deserialize to the latest version format
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
.expect("Should deserialize its own bytes");
let expected_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
16,
false,
);
assert_eq!(
deserialized_metadata.body, expected_metadata.body,
"Metadata of the old version {} should be upgraded to the latest version {}",
METADATA_OLD_FORMAT_VERSION_V2, METADATA_FORMAT_VERSION
);
}
@@ -396,6 +525,7 @@ mod tests {
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
true,
);
let metadata_bytes = original_metadata
.to_bytes()
@@ -449,12 +579,13 @@ mod tests {
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
true,
);
let expected_bytes = vec![
/* bincode length encoding bytes */
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
/* TimelineMetadataHeader */
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
97, 148, 11, 30, 0, 71, 0, 5, // checksum, size, format_version (4 + 2 + 2)
/* TimelineMetadataBodyV2 */
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
@@ -464,6 +595,7 @@ mod tests {
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
0, 0, 0, 15, // pg_version (4 bytes)
1, // aux_file_v2 (1 byte)
/* padding bytes */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -480,7 +612,7 @@ mod tests {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
];
let metadata_ser_bytes = original_metadata.ser().unwrap();
assert_eq!(metadata_ser_bytes, expected_bytes);

View File

@@ -202,9 +202,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use std::time::Duration;
use remote_storage::{
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
};
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use std::ops::DerefMut;
use tracing::{debug, error, info, instrument, warn};
use tracing::{info_span, Instrument};
@@ -1147,7 +1145,7 @@ impl RemoteTimelineClient {
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a listing below, we aren't
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
self.flush_deletion_queue().await?;
@@ -1156,20 +1154,14 @@ impl RemoteTimelineClient {
let remaining = download_retry(
|| async {
self.storage_impl
.list(
Some(&timeline_storage_path),
ListingMode::NoDelimiter,
None,
&cancel,
)
.list_files(Some(&timeline_storage_path), None, &cancel)
.await
},
"list remaining files",
&cancel,
)
.await
.context("list files remaining files")?
.keys;
.context("list files remaining files")?;
// We will delete the current index_part object last, since it acts as a deletion
// marker via its deleted_at attribute
@@ -1860,6 +1852,7 @@ mod tests {
// Any version will do
// but it should be consistent with the one in the tests
crate::DEFAULT_PG_VERSION,
false,
);
// go through serialize + deserialize to fix the header, including checksum

View File

@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
tenant_shard_id: TenantShardId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
let remote_path = remote_timelines_path(&tenant_shard_id);
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
@@ -417,16 +417,11 @@ pub(super) async fn download_index_part(
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
let indices = download_retry(
|| async {
storage
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
.await
},
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
"list index_part files",
cancel,
)
.await?
.keys;
.await?;
// General case logic for which index to use: the latest index whose generation
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md

View File

@@ -118,7 +118,6 @@ pub(crate) struct ValuesReconstructState {
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
keys_done: KeySpaceRandomAccum,
layers_visited: u32,
}
impl ValuesReconstructState {
@@ -126,7 +125,6 @@ impl ValuesReconstructState {
Self {
keys: HashMap::new(),
keys_done: KeySpaceRandomAccum::new(),
layers_visited: 0,
}
}
@@ -140,14 +138,6 @@ impl ValuesReconstructState {
}
}
pub(crate) fn on_layer_visited(&mut self) {
self.layers_visited += 1;
}
pub(crate) fn get_layers_visited(&self) -> u32 {
self.layers_visited
}
/// Update the state collected for a given key.
/// Returns true if this was the last value needed for the key and false otherwise.
///

View File

@@ -62,7 +62,7 @@ impl BackgroundLoopKind {
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
loop_kind: BackgroundLoopKind,
_ctx: &RequestContext,
) -> tokio::sync::SemaphorePermit<'static> {
) -> impl Drop {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
.with_label_values(&[loop_kind.as_static_str()])
.guard();

View File

@@ -16,7 +16,7 @@ use enumset::EnumSet;
use fail::fail_point;
use once_cell::sync::Lazy;
use pageserver_api::{
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
key::AUX_FILES_KEY,
keyspace::KeySpaceAccum,
models::{
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
@@ -40,7 +40,6 @@ use utils::{
vec_map::VecMap,
};
use std::ops::{Deref, Range};
use std::pin::pin;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -54,6 +53,10 @@ use std::{
cmp::{max, min, Ordering},
ops::ControlFlow,
};
use std::{
ops::{Deref, Range},
sync::atomic::AtomicBool,
};
use crate::deletion_queue::DeletionQueueClient;
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
@@ -382,6 +385,9 @@ pub struct Timeline {
/// Keep aux directory cache to avoid it's reconstruction on each update
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
/// Indicate whether aux file v2 storage is enabled.
pub(crate) aux_file_v2: AtomicBool,
}
pub struct WalReceiverInfo {
@@ -943,13 +949,7 @@ impl Timeline {
Err(MissingKey(MissingKeyError {
stuck_at_lsn: false,
..
})) if !NON_INHERITED_RANGE.contains(&key) => {
// The vectored read path handles non inherited keys specially.
// If such a a key cannot be reconstructed from the current timeline,
// the vectored read path returns a key level error as opposed to a top
// level error.
return Err(GetVectoredError::MissingKey(key));
}
})) => return Err(GetVectoredError::MissingKey(key)),
_ => {
values.insert(key, block);
key = key.next();
@@ -973,7 +973,6 @@ impl Timeline {
.await?;
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
let layers_visited = reconstruct_state.get_layers_visited();
for (key, res) in reconstruct_state.keys {
match res {
Err(err) => {
@@ -988,12 +987,6 @@ impl Timeline {
}
}
// Note that this is an approximation. Tracking the exact number of layers visited
// per key requires virtually unbounded memory usage and is inefficient
// (i.e. segment tree tracking each range queried from a layer)
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
.observe(layers_visited as f64 / results.len() as f64);
Ok(results)
}
@@ -1750,6 +1743,14 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
// Private functions
impl Timeline {
pub(crate) fn get_try_enable_aux_file_v2(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
.tenant_conf
.try_enable_aux_file_v2
.unwrap_or(self.conf.default_tenant_conf.try_enable_aux_file_v2)
}
pub(crate) fn get_lazy_slru_download(&self) -> bool {
let tenant_conf = self.tenant_conf.load();
tenant_conf
@@ -2000,6 +2001,8 @@ impl Timeline {
dir: None,
n_deltas: 0,
}),
aux_file_v2: AtomicBool::new(false),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2149,6 +2152,11 @@ impl Timeline {
let shard = self.get_shard_index();
let this = self.myself.upgrade().expect("&self method holds the arc");
if let Some(ref index_part) = index_part {
self.aux_file_v2
.store(index_part.metadata.aux_file_v2(), AtomicOrdering::SeqCst);
}
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
move || {
let _g = span.entered();
@@ -2820,7 +2828,7 @@ impl Timeline {
let mut timeline = self;
let mut read_count = scopeguard::guard(0, |cnt| {
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
});
// For debugging purposes, collect the path of layers that we traversed
@@ -2935,7 +2943,7 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
continue 'outer;
}
@@ -2962,7 +2970,7 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::from(e)),
};
cont_lsn = lsn_floor;
*read_count += 1;
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
continue 'outer;
}
@@ -3037,41 +3045,6 @@ impl Timeline {
.await?;
keyspace.remove_overlapping_with(&completed);
// Do not descend into the ancestor timeline for aux files.
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
// stalling compaction.
// TODO(chi): this will need to be updated for aux files v2 storage
if keyspace.overlaps(&NON_INHERITED_RANGE) {
let removed = keyspace.remove_overlapping_with(&KeySpace {
ranges: vec![NON_INHERITED_RANGE],
});
for range in removed.ranges {
let mut key = range.start;
while key < range.end {
reconstruct_state.on_key_error(
key,
PageReconstructError::MissingKey(MissingKeyError {
stuck_at_lsn: false,
key,
shard: self.shard_identity.get_shard_number(&key),
cont_lsn,
request_lsn,
ancestor_lsn: None,
traversal_path: Vec::default(),
backtrace: if cfg!(test) {
Some(std::backtrace::Backtrace::force_capture())
} else {
None
},
}),
);
key = key.next();
}
}
}
if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
break;
}
@@ -3190,8 +3163,6 @@ impl Timeline {
unmapped_keyspace = keyspace_to_read;
cont_lsn = next_cont_lsn;
reconstruct_state.on_layer_visited();
} else {
break;
}
@@ -3656,6 +3627,7 @@ impl Timeline {
disk_consistent_lsn,
ondisk_prev_record_lsn,
*self.latest_gc_cutoff_lsn.read(),
self.aux_file_v2.load(AtomicOrdering::SeqCst),
);
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -4244,8 +4216,9 @@ impl Timeline {
*self.get_latest_gc_cutoff_lsn()
}
} else {
// No time-based retention was configured. Interpret this as "keep no history".
self.get_last_record_lsn()
// No time-based retention was configured. Set time-based cutoff to
// same as LSN based.
cutoff_horizon
};
// Grab the lock and update the values

View File

@@ -188,10 +188,24 @@ impl Timeline {
) -> ControlFlow<()> {
let now = SystemTime::now();
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
);
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
.await?;
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
match self
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
{
ControlFlow::Break(()) => return ControlFlow::Break(()),
ControlFlow::Continue(()) => (),
}
#[derive(Debug, Default)]
struct EvictionStats {
@@ -316,27 +330,19 @@ impl Timeline {
gate: &GateGuard,
ctx: &RequestContext,
) -> ControlFlow<()> {
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
.await
}
async fn acquire_imitation_permit(
&self,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
BackgroundLoopKind::Eviction,
ctx,
);
tokio::select! {
permit = acquire_permit => ControlFlow::Continue(permit),
_ = cancel.cancelled() => ControlFlow::Break(()),
_ = self.cancel.cancelled() => ControlFlow::Break(()),
}
let _permit = tokio::select! {
permit = acquire_permit => permit,
_ = cancel.cancelled() => return ControlFlow::Break(()),
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
};
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
.await
}
/// If we evict layers but keep cached values derived from those layers, then
@@ -370,7 +376,6 @@ impl Timeline {
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
permit: tokio::sync::SemaphorePermit<'static>,
ctx: &RequestContext,
) -> ControlFlow<()> {
if !self.tenant_shard_id.is_shard_zero() {
@@ -403,28 +408,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let (mut state, _permit) = {
if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
(locked, permit)
} else {
// we might need to wait for a long time here in case of pathological synthetic
// size calculation performance
drop(permit);
let locked = tokio::select! {
locked = tenant.eviction_task_tenant_state.lock() => locked,
_ = self.cancel.cancelled() => {
return ControlFlow::Break(())
},
_ = cancel.cancelled() => {
return ControlFlow::Break(())
}
};
// then reacquire -- this will be bad if there is a lot of traffic, but because we
// released the permit, the overall latency will be much better.
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
(locked, permit)
}
};
let mut state = tenant.eviction_task_tenant_state.lock().await;
match state.last_layer_access_imitation {
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
_ => {

View File

@@ -16,7 +16,6 @@ atomic-take.workspace = true
aws-config.workspace = true
aws-sdk-iam.workspace = true
aws-sigv4.workspace = true
aws-smithy-runtime.workspace = true
aws-types.workspace = true
base64.workspace = true
bstr.workspace = true
@@ -32,21 +31,14 @@ git-version.workspace = true
hashbrown.workspace = true
hashlink.workspace = true
hex.workspace = true
hickory-resolver = "0.24.1"
hmac.workspace = true
hostname.workspace = true
http.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
hyper-rustls = { version = "0.25.0", features = ["rustls-native-certs", "http1", "http2"] }
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
hyper-util = { version = "0.1", features = [
"server",
"http1",
"http2",
"tokio",
] }
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
http-body-util = { version = "0.1" }
ipnet.workspace = true
itertools.workspace = true

View File

@@ -5,10 +5,7 @@ use aws_config::meta::region::RegionProviderChain;
use aws_config::profile::ProfileFileCredentialsProvider;
use aws_config::provider_config::ProviderConfig;
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
use aws_smithy_runtime::client::http::hyper_014::HyperClientBuilder;
use futures::future::Either;
use hyper::client::HttpConnector;
use hyper_rustls::ConfigBuilderExt;
use proxy::auth;
use proxy::auth::backend::AuthRateLimiter;
use proxy::auth::backend::MaybeOwned;
@@ -21,7 +18,6 @@ use proxy::config::HttpConfig;
use proxy::config::ProjectInfoCacheOptions;
use proxy::console;
use proxy::context::parquet::ParquetUploadArgs;
use proxy::dns::Dns;
use proxy::http;
use proxy::http::health_server::AppMetrics;
use proxy::metrics::Metrics;
@@ -37,7 +33,6 @@ use proxy::usage_metrics;
use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use rustls::crypto::CryptoProvider;
use std::net::SocketAddr;
use std::pin::pin;
use std::sync::Arc;
@@ -275,40 +270,8 @@ async fn main() -> anyhow::Result<()> {
info!("Using region: {}", config.aws_region);
let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
let aws_tls_client_config =
rustls::ClientConfig::builder_with_provider(Arc::new(CryptoProvider {
cipher_suites: vec![
// TLS1.3 suites
rustls::crypto::ring::cipher_suite::TLS13_AES_256_GCM_SHA384,
rustls::crypto::ring::cipher_suite::TLS13_AES_128_GCM_SHA256,
// TLS1.2 suites
rustls::crypto::ring::cipher_suite::TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
rustls::crypto::ring::cipher_suite::TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
rustls::crypto::ring::cipher_suite::TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
rustls::crypto::ring::cipher_suite::TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
rustls::crypto::ring::cipher_suite::TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256,
],
..rustls::crypto::ring::default_provider()
}))
.with_safe_default_protocol_versions()
.unwrap()
.with_native_roots()?
.with_no_client_auth();
let provider_conf = ProviderConfig::without_region()
.with_region(region_provider.region().await)
.with_http_client(
HyperClientBuilder::new().build(
hyper_rustls::HttpsConnectorBuilder::new()
.with_tls_config(aws_tls_client_config)
.https_or_http()
.enable_http1()
.enable_http2()
.wrap_connector(HttpConnector::new_with_resolver(config.dns.clone())),
),
);
let provider_conf =
ProviderConfig::without_region().with_region(region_provider.region().await);
let aws_credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
@@ -437,10 +400,10 @@ async fn main() -> anyhow::Result<()> {
if let Some(metrics_config) = &config.metric_collection {
// TODO: Add gc regardles of the metric collection being enabled.
maintenance_tasks.spawn(usage_metrics::task_main(config.dns.clone(), metrics_config));
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
client_tasks.spawn(usage_metrics::task_backup(
&metrics_config.backup_metric_collection_config,
cancellation_token.clone(),
cancellation_token,
));
}
@@ -460,10 +423,7 @@ async fn main() -> anyhow::Result<()> {
let cache = api.caches.endpoints_cache.clone();
let con = regional_redis_client;
let span = tracing::info_span!("endpoints_cache");
maintenance_tasks.spawn(
async move { cache.do_read(con, cancellation_token.clone()).await }
.instrument(span),
);
maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span));
}
}
}
@@ -534,8 +494,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
bail!("dynamic rate limiter should be disabled");
}
let dns = Dns::new();
let auth_backend = match &args.auth_backend {
AuthBackend::Console => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
@@ -576,7 +534,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
tokio::spawn(locks.garbage_collect_worker());
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client(dns.clone()));
let endpoint = http::Endpoint::new(url, http::new_client());
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
@@ -620,7 +578,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
RateBucketInfo::validate(&mut redis_rps_limit)?;
let config = Box::leak(Box::new(ProxyConfig {
dns,
tls_config,
auth_backend,
metric_collection,

View File

@@ -4,7 +4,6 @@ use std::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Duration,
};
use dashmap::DashSet;
@@ -14,7 +13,6 @@ use redis::{
};
use serde::Deserialize;
use tokio::sync::Mutex;
use tokio_util::sync::CancellationToken;
use tracing::info;
use crate::{
@@ -113,22 +111,16 @@ impl EndpointsCache {
pub async fn do_read(
&self,
mut con: ConnectionWithCredentialsProvider,
cancellation_token: CancellationToken,
) -> anyhow::Result<Infallible> {
let mut last_id = "0-0".to_string();
loop {
self.ready.store(false, Ordering::Release);
if let Err(e) = con.connect().await {
tracing::error!("error connecting to redis: {:?}", e);
self.ready.store(false, Ordering::Release);
continue;
}
if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
tracing::error!("error reading from redis: {:?}", e);
self.ready.store(false, Ordering::Release);
}
if cancellation_token.is_cancelled() {
info!("cancellation token is cancelled, exiting");
tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
// 1 week.
}
tokio::time::sleep(self.config.retry_interval).await;
}

View File

@@ -3,21 +3,17 @@ use crate::{
cancellation::CancelClosure,
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
context::RequestMonitoring,
dns::Dns,
error::{ReportableError, UserFacingError},
metrics::{Metrics, NumDbConnectionsGuard},
proxy::neon_option,
};
use futures::TryFutureExt;
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::{
tls::{MakeTlsConnect, NoTlsError},
Connection, SocketConfig,
};
use tokio_postgres::tls::MakeTlsConnect;
use tracing::{error, info, warn};
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
@@ -37,9 +33,6 @@ pub enum ConnectionError {
#[error("{COULD_NOT_CONNECT}: {0}")]
WakeComputeError(#[from] WakeComputeError),
#[error("{COULD_NOT_CONNECT}: {0}")]
TlsNotSupported(#[from] NoTlsError),
}
impl UserFacingError for ConnectionError {
@@ -77,7 +70,6 @@ impl ReportableError for ConnectionError {
ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
ConnectionError::TlsNotSupported(_) => crate::error::ErrorKind::Compute,
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
}
}
@@ -173,42 +165,20 @@ impl std::ops::DerefMut for ConnCfg {
impl ConnCfg {
/// Establish a raw TCP connection to the compute node.
async fn connect_raw(
&self,
dns: &Dns,
timeout: Duration,
) -> io::Result<(SocketAddr, TcpStream, &str)> {
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
use tokio_postgres::config::Host;
// wrap TcpStream::connect with timeout
let connect_with_timeout = |host, port| async move {
let addrs = dns
.resolve(host)
.await
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
let timeout = timeout / addrs.len() as u32;
let mut last_err = None;
for addr in addrs {
match tokio::time::timeout(timeout, TcpStream::connect((addr, port))).await {
Ok(Ok(stream)) => return Ok(stream),
Ok(Err(e)) => last_err = Some(e),
Err(_) => {
last_err = Some(io::Error::new(
io::ErrorKind::TimedOut,
format!("exceeded connection timeout {timeout:?}"),
))
}
};
}
Err(last_err.unwrap_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidInput,
"could not resolve to any address",
)
}))
let connect_with_timeout = |host, port| {
tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
move |res| match res {
Ok(tcpstream_connect_res) => tcpstream_connect_res,
Err(_) => Err(io::Error::new(
io::ErrorKind::TimedOut,
format!("exceeded connection timeout {timeout:?}"),
)),
},
)
};
let connect_once = |host, port| {
@@ -265,11 +235,12 @@ impl ConnCfg {
}
}
type TlsStream = postgres_native_tls::TlsStream<TcpStream>;
pub struct PostgresConnection {
/// Socket connected to a compute node.
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<TcpStream, TlsStream>,
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
tokio::net::TcpStream,
postgres_native_tls::TlsStream<tokio::net::TcpStream>,
>,
/// PostgreSQL connection parameters.
pub params: std::collections::HashMap<String, String>,
/// Query cancellation token.
@@ -282,38 +253,26 @@ pub struct PostgresConnection {
impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect_managed<Tls: MakeTlsConnect<TcpStream>>(
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
dns: &Dns,
allow_self_signed_compute: bool,
aux: MetricsAuxInfo,
timeout: Duration,
mut tls: Tls,
) -> Result<
(
SocketAddr,
tokio_postgres::Client,
Connection<TcpStream, Tls::Stream>,
),
ConnectionError,
>
where
ConnectionError: From<Tls::Error>,
{
let (socket_addr, stream, host) = self.connect_raw(dns, timeout).await?;
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
let tls = MakeTlsConnect::<TcpStream>::make_tls_connect(&mut tls, host)?;
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
.build()
.unwrap();
let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
// connect_raw() will not use TLS if sslmode is "disable"
let (mut client, connection) = self.0.connect_raw(stream, tls).await?;
let (client, connection) = self.0.connect_raw(stream, tls).await?;
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
client.set_socket_config(SocketConfig {
host: tokio_postgres::config::Host::Tcp(host.to_owned()),
port: socket_addr.port(),
socket_addr: tokio_postgres::SocketAddr::Tcp(socket_addr),
connect_timeout: None,
keepalive: None,
});
let stream = connection.stream.into_inner();
info!(
cold_start_info = ctx.cold_start_info.as_str(),
@@ -321,28 +280,6 @@ impl ConnCfg {
self.0.get_ssl_mode()
);
Ok((socket_addr, client, connection))
}
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
dns: &Dns,
allow_self_signed_compute: bool,
aux: MetricsAuxInfo,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
.build()
.unwrap();
let mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
let (socket_addr, client, connection) =
self.connect_managed(ctx, dns, timeout, mk_tls).await?;
let stream = connection.stream.into_inner();
// This is very ugly but as of now there's no better way to
// extract the connection parameters from tokio-postgres' connection.
// TODO: solve this problem in a more elegant manner (e.g. the new library).

View File

@@ -1,6 +1,5 @@
use crate::{
auth::{self, backend::AuthRateLimiter},
dns::Dns,
rate_limiter::RateBucketInfo,
serverless::GlobalConnPoolOptions,
};
@@ -22,7 +21,6 @@ use tracing::{error, info};
use x509_parser::oid_registry;
pub struct ProxyConfig {
pub dns: Dns,
pub tls_config: Option<TlsConfig>,
pub auth_backend: auth::BackendType<'static, (), ()>,
pub metric_collection: Option<MetricCollectionConfig>,

View File

@@ -12,7 +12,6 @@ use crate::{
compute,
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
context::RequestMonitoring,
dns::Dns,
intern::ProjectIdInt,
metrics::ApiLockMetrics,
scram, EndpointCacheKey,
@@ -303,13 +302,11 @@ impl NodeInfo {
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
dns: &Dns,
timeout: Duration,
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
self.config
.connect(
ctx,
dns,
self.allow_self_signed_compute,
self.aux.clone(),
timeout,

View File

@@ -1,96 +0,0 @@
//! Async dns resolvers
use std::{
net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr},
sync::Arc,
};
use aws_sdk_iam::error::BoxError;
use hickory_resolver::{error::ResolveError, proto::rr::RData};
use hyper::client::connect::dns::Name;
use reqwest::dns::Addrs;
use tokio::time::Instant;
use tracing::trace;
#[derive(Clone)]
pub struct Dns {
resolver: Arc<hickory_resolver::TokioAsyncResolver>,
}
impl Default for Dns {
fn default() -> Self {
Self::new()
}
}
impl Dns {
pub fn new() -> Self {
let (config, options) =
hickory_resolver::system_conf::read_system_conf().expect("could not read resolv.conf");
let resolver = Arc::new(hickory_resolver::TokioAsyncResolver::tokio(config, options));
Self { resolver }
}
pub async fn resolve(&self, name: &str) -> Result<Vec<IpAddr>, ResolveError> {
let start = Instant::now();
// try to parse the host as a regular IP address first
if let Ok(addr) = name.parse::<Ipv4Addr>() {
return Ok(vec![IpAddr::V4(addr)]);
}
if let Ok(addr) = name.parse::<Ipv6Addr>() {
return Ok(vec![IpAddr::V6(addr)]);
}
let res = self.resolver.lookup_ip(name).await;
let resolve_duration = start.elapsed();
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
Ok(res?
.as_lookup()
.records()
.iter()
.filter_map(|r| r.data())
.filter_map(|rdata| match rdata {
RData::A(ip) => Some(IpAddr::from(ip.0)),
RData::AAAA(ip) => Some(IpAddr::from(ip.0)),
_ => None,
})
.collect())
}
}
impl hyper::service::Service<Name> for Dns {
type Response = Addrs;
type Error = BoxError;
type Future = reqwest::dns::Resolving;
fn poll_ready(
&mut self,
_cx: &mut std::task::Context<'_>,
) -> std::task::Poll<Result<(), Self::Error>> {
std::task::Poll::Ready(Ok(()))
}
fn call(&mut self, req: Name) -> Self::Future {
reqwest::dns::Resolve::resolve(self, req)
}
}
impl reqwest::dns::Resolve for Dns {
fn resolve(&self, name: Name) -> reqwest::dns::Resolving {
let this = self.clone();
Box::pin(async move {
match this.resolve(name.as_str()).await {
Ok(iter) => {
Ok(Box::new(iter.into_iter().map(|ip| SocketAddr::new(ip, 0))) as Box<_>)
}
Err(e) => Err(e.into()),
}
})
}
}

View File

@@ -14,7 +14,6 @@ use tokio::time::Instant;
use tracing::trace;
use crate::{
dns::Dns,
metrics::{ConsoleRequest, Metrics},
url::ApiUrl,
};
@@ -23,9 +22,9 @@ use reqwest_middleware::RequestBuilder;
/// This is the preferred way to create new http clients,
/// because it takes care of observability (OpenTelemetry).
/// We deliberately don't want to replace this with a public static.
pub fn new_client(dns: Dns) -> ClientWithMiddleware {
pub fn new_client() -> ClientWithMiddleware {
let client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(dns))
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.build()
.expect("Failed to create http client");
@@ -35,9 +34,9 @@ pub fn new_client(dns: Dns) -> ClientWithMiddleware {
.build()
}
pub fn new_client_with_timeout(dns: Dns, default_timout: Duration) -> ClientWithMiddleware {
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
let timeout_client = reqwest::ClientBuilder::new()
.dns_resolver(Arc::new(dns))
.dns_resolver(Arc::new(GaiResolver::default()))
.connection_verbose(true)
.timeout(default_timout)
.build()

View File

@@ -14,7 +14,6 @@ pub mod compute;
pub mod config;
pub mod console;
pub mod context;
pub mod dns;
pub mod error;
pub mod http;
pub mod intern;

View File

@@ -307,7 +307,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
ctx,
&TcpMechanism { params: &params },
&user_info,
&config.dns,
mode.allow_self_signed_compute(config),
config.wake_compute_retry_config,
config.connect_to_compute_retry_config,

View File

@@ -4,7 +4,6 @@ use crate::{
config::RetryConfig,
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
context::RequestMonitoring,
dns::Dns,
error::ReportableError,
metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
proxy::{
@@ -45,7 +44,6 @@ pub trait ConnectMechanism {
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
dns: &Dns,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;
@@ -78,11 +76,10 @@ impl ConnectMechanism for TcpMechanism<'_> {
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
dns: &Dns,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
node_info.connect(ctx, dns, timeout).await
node_info.connect(ctx, timeout).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -96,7 +93,6 @@ pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
ctx: &mut RequestMonitoring,
mechanism: &M,
user_info: &B,
dns: &Dns,
allow_self_signed_compute: bool,
wake_compute_retry_config: RetryConfig,
connect_to_compute_retry_config: RetryConfig,
@@ -118,7 +114,7 @@ where
// try once
let err = match mechanism
.connect_once(ctx, dns, &node_info, CONNECT_TIMEOUT)
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
.await
{
Ok(res) => {
@@ -163,7 +159,7 @@ where
num_retries = 1;
loop {
match mechanism
.connect_once(ctx, dns, &node_info, CONNECT_TIMEOUT)
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
.await
{
Ok(res) => {

View File

@@ -15,7 +15,6 @@ use crate::console::caches::NodeInfoCache;
use crate::console::messages::MetricsAuxInfo;
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::dns::Dns;
use crate::error::ErrorKind;
use crate::proxy::retry::retry_after;
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
@@ -454,7 +453,6 @@ impl ConnectMechanism for TestConnectMechanism {
async fn connect_once(
&self,
_ctx: &mut RequestMonitoring,
_dns: &Dns,
_node_info: &console::CachedNodeInfo,
_timeout: std::time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
@@ -560,17 +558,9 @@ async fn connect_to_compute_success() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
config,
config,
)
.await
.unwrap();
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
}
@@ -586,17 +576,9 @@ async fn connect_to_compute_retry() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
config,
config,
)
.await
.unwrap();
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
}
@@ -613,17 +595,9 @@ async fn connect_to_compute_non_retry_1() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
config,
config,
)
.await
.unwrap_err();
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap_err();
mechanism.verify();
}
@@ -640,17 +614,9 @@ async fn connect_to_compute_non_retry_2() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
config,
config,
)
.await
.unwrap();
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
}
@@ -678,7 +644,6 @@ async fn connect_to_compute_non_retry_3() {
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
wake_compute_retry_config,
connect_to_compute_retry_config,
@@ -701,17 +666,9 @@ async fn wake_retry() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
config,
config,
)
.await
.unwrap();
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap();
mechanism.verify();
}
@@ -728,16 +685,8 @@ async fn wake_non_retry() {
max_retries: 5,
backoff_factor: 2.0,
};
connect_to_compute(
&mut ctx,
&mechanism,
&user_info,
&Dns::new(),
false,
config,
config,
)
.await
.unwrap_err();
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
.await
.unwrap_err();
mechanism.verify();
}

View File

@@ -1,19 +1,17 @@
use std::{sync::Arc, time::Duration};
use async_trait::async_trait;
use tokio_postgres::NoTls;
use tracing::{field::display, info};
use crate::{
auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
compute::{self, ConnectionError},
compute,
config::{AuthenticationConfig, ProxyConfig},
console::{
errors::{GetAuthInfoError, WakeComputeError},
CachedNodeInfo,
},
context::RequestMonitoring,
dns::Dns,
error::{ErrorKind, ReportableError, UserFacingError},
proxy::connect_compute::ConnectMechanism,
};
@@ -109,7 +107,6 @@ impl PoolingBackend {
pool: self.pool.clone(),
},
&backend,
&self.config.dns,
false, // do not allow self signed compute for http flow
self.config.wake_compute_retry_config,
self.config.connect_to_compute_retry_config,
@@ -123,7 +120,7 @@ pub enum HttpConnError {
#[error("pooled connection closed at inconsistent state")]
ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
#[error("could not connection to compute")]
ConnectionError(#[from] ConnectionError),
ConnectionError(#[from] tokio_postgres::Error),
#[error("could not get auth info")]
GetAuthInfo(#[from] GetAuthInfoError),
@@ -166,24 +163,23 @@ struct TokioMechanism {
#[async_trait]
impl ConnectMechanism for TokioMechanism {
type Connection = Client<tokio_postgres::Client>;
type ConnectError = ConnectionError;
type ConnectError = tokio_postgres::Error;
type Error = HttpConnError;
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
dns: &Dns,
node_info: &CachedNodeInfo,
timeout: Duration,
) -> Result<Self::Connection, ConnectionError> {
let mut config = node_info.config.clone();
config
) -> Result<Self::Connection, Self::ConnectError> {
let mut config = (*node_info.config).clone();
let config = config
.user(&self.conn_info.user_info.user)
.password(&*self.conn_info.password)
.dbname(&self.conn_info.dbname)
.connect_timeout(timeout);
let (_, client, connection) = config.connect_managed(ctx, dns, timeout, NoTls).await?;
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
Ok(poll_client(

View File

@@ -12,10 +12,9 @@ use std::{
ops::Deref,
sync::atomic::{self, AtomicUsize},
};
use tokio::net::TcpStream;
use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use tokio_util::sync::CancellationToken;
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
@@ -469,7 +468,7 @@ pub fn poll_client<C: ClientInnerExt>(
ctx: &mut RequestMonitoring,
conn_info: ConnInfo,
client: C,
mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
) -> Client<C> {

View File

@@ -37,7 +37,6 @@ use utils::http::error::ApiError;
use crate::auth::backend::ComputeUserInfo;
use crate::auth::endpoint_sni;
use crate::auth::ComputeUserInfoParseError;
use crate::compute::ConnectionError;
use crate::config::ProxyConfig;
use crate::config::TlsConfig;
use crate::context::RequestMonitoring;
@@ -258,9 +257,7 @@ pub async fn handle(
let mut message = e.to_string_client();
let db_error = match &e {
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
ConnectionError::Postgres(e),
))
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
_ => None,
};
@@ -664,9 +661,7 @@ impl QueryData {
// query failed or was cancelled.
Ok(Err(error)) => {
let db_error = match &error {
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
ConnectionError::Postgres(e),
))
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
_ => None,
};

View File

@@ -3,7 +3,6 @@
use crate::{
config::{MetricBackupCollectionConfig, MetricCollectionConfig},
context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
dns::Dns,
http,
intern::{BranchIdInt, EndpointIdInt},
};
@@ -218,13 +217,13 @@ impl Metrics {
pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
pub async fn task_main(dns: Dns, config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
info!("metrics collector config: {config:?}");
scopeguard::defer! {
info!("metrics collector has shut down");
}
let http_client = http::new_client_with_timeout(dns, DEFAULT_HTTP_REPORTING_TIMEOUT);
let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
let mut prev = Utc::now();
@@ -496,7 +495,7 @@ mod tests {
use url::Url;
use super::*;
use crate::{dns::Dns, http, BranchId, EndpointId};
use crate::{http, BranchId, EndpointId};
#[tokio::test]
async fn metrics() {
@@ -526,7 +525,7 @@ mod tests {
tokio::spawn(server);
let metrics = Metrics::default();
let client = http::new_client(Dns::new());
let client = http::new_client();
let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
let now = Utc::now();

View File

@@ -18,7 +18,7 @@ use std::time::Duration;
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
use postgres_ffi::XLogFileName;
use postgres_ffi::{XLogSegNo, PG_TLI};
use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata};
use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
use tokio::fs::File;
use tokio::select;
@@ -601,18 +601,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
backoff::retry(
|| async {
// Do list-delete in batch_size batches to make progress even if there a lot of files.
// Alternatively we could make remote storage list return iterator, but it is more complicated and
// Alternatively we could make list_files return iterator, but it is more complicated and
// I'm not sure deleting while iterating is expected in s3.
loop {
let files = storage
.list(
Some(&remote_path),
ListingMode::NoDelimiter,
Some(batch_size),
&cancel,
)
.await?
.keys;
.list_files(Some(&remote_path), Some(batch_size), &cancel)
.await?;
if files.is_empty() {
return Ok(()); // done
}
@@ -672,9 +666,8 @@ pub async fn copy_s3_segments(
let cancel = CancellationToken::new();
let files = storage
.list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
.await?
.keys;
.list_files(Some(&remote_path), None, &cancel)
.await?;
let uploaded_segments = &files
.iter()

View File

@@ -0,0 +1,730 @@
#
# Script to export tenants from one pageserver and import them into another page server.
#
# Outline of steps:
# 1. Get `(last_lsn, prev_lsn)` from old pageserver
# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
# 3. This tar file might be missing relation files for empty relations, if the pageserver
# is old enough (we didn't always store those). So to recreate them, we start a local
# vanilla postgres on this basebackup and ask it what relations should exist, then touch
# any missing files and re-pack the tar.
# TODO This functionality is no longer needed, so we can delete it later if we don't
# end up using the same utils for the pg 15 upgrade. Not sure.
# 4. We import the patched basebackup into a new pageserver
# 5. We export again via fullbackup, now from the new pageserver and compare the returned
# tar file with the one we imported. This confirms that we imported everything that was
# exported, but doesn't guarantee correctness (what if we didn't **export** everything
# initially?)
# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
#
# For more context on how to use this, see:
# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf
import argparse
import os
import shutil
import subprocess
import tempfile
import time
import uuid
from contextlib import closing
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast
import psycopg2
import requests
from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import parse_dsn
###############################################
### client-side utils copied from test fixtures
###############################################
Env = Dict[str, str]
_global_counter = 0
def global_counter() -> int:
"""A really dumb global counter.
This is useful for giving output files a unique number, so if we run the
same command multiple times we can keep their output separate.
"""
global _global_counter
_global_counter += 1
return _global_counter
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
"""Run a process and capture its output
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
where "cmd" is the name of the program and NNN is an incrementing
counter.
If those files already exist, we will overwrite them.
Returns basepath for files with captured output.
"""
assert isinstance(cmd, list)
base = f"{os.path.basename(cmd[0])}_{global_counter()}"
basepath = os.path.join(capture_dir, base)
stdout_filename = basepath + ".stdout"
stderr_filename = basepath + ".stderr"
with open(stdout_filename, "w") as stdout_f:
with open(stderr_filename, "w") as stderr_f:
print(f'(capturing output to "{base}.stdout")')
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
return basepath
class PgBin:
"""A helper class for executing postgres binaries"""
def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
self.log_dir = log_dir
self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin")
self.env = os.environ.copy()
self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib")
def _fixpath(self, command: List[str]):
if "/" not in command[0]:
command[0] = os.path.join(self.pg_bin_path, command[0])
def _build_env(self, env_add: Optional[Env]) -> Env:
if env_add is None:
return self.env
env = self.env.copy()
env.update(env_add)
return env
def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
"""
Run one of the postgres binaries.
The command should be in list form, e.g. ['pgbench', '-p', '55432']
All the necessary environment variables will be set.
If the first argument (the command name) doesn't include a path (no '/'
characters present), then it will be edited to include the correct path.
If you want stdout/stderr captured to files, use `run_capture` instead.
"""
self._fixpath(command)
print(f'Running command "{" ".join(command)}"')
env = self._build_env(env)
subprocess.run(command, env=env, cwd=cwd, check=True)
def run_capture(
self,
command: List[str],
env: Optional[Env] = None,
cwd: Optional[str] = None,
**kwargs: Any,
) -> str:
"""
Run one of the postgres binaries, with stderr and stdout redirected to a file.
This is just like `run`, but for chatty programs. Returns basepath for files
with captured output.
"""
self._fixpath(command)
print(f'Running command "{" ".join(command)}"')
env = self._build_env(env)
return subprocess_capture(
str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs
)
class PgProtocol:
"""Reusable connection logic"""
def __init__(self, **kwargs):
self.default_options = kwargs
def conn_options(self, **kwargs):
conn_options = self.default_options.copy()
if "dsn" in kwargs:
conn_options.update(parse_dsn(kwargs["dsn"]))
conn_options.update(kwargs)
# Individual statement timeout in seconds. 2 minutes should be
# enough for our tests, but if you need a longer, you can
# change it by calling "SET statement_timeout" after
# connecting.
conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}"
return conn_options
# autocommit=True here by default because that's what we need most of the time
def connect(self, autocommit=True, **kwargs) -> PgConnection:
"""
Connect to the node.
Returns psycopg2's connection object.
This method passes all extra params to connstr.
"""
conn: PgConnection = psycopg2.connect(**self.conn_options(**kwargs))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
"""
Execute query against the node and return all rows.
This method passes all extra params to connstr.
"""
return self.safe_psql_many([query], **kwargs)[0]
def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
"""
Execute queries against the node and return all rows.
This method passes all extra params to connstr.
"""
result: List[List[Any]] = []
with closing(self.connect(**kwargs)) as conn:
with conn.cursor() as cur:
for query in queries:
print(f"Executing query: {query}")
cur.execute(query)
if cur.description is None:
result.append([]) # query didn't return data
else:
result.append(cast(List[Any], cur.fetchall()))
return result
class VanillaPostgres(PgProtocol):
def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
super().__init__(host="localhost", port=port, dbname="postgres")
self.pgdatadir = pgdatadir
self.pg_bin = pg_bin
self.running = False
if init:
self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
self.configure([f"port = {port}\n"])
def configure(self, options: List[str]):
"""Append lines into postgresql.conf file."""
assert not self.running
with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
conf_file.write("\n".join(options))
def start(self, log_path: Optional[str] = None):
assert not self.running
self.running = True
log_path = log_path or os.path.join(self.pgdatadir, "pg.log")
self.pg_bin.run_capture(
["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"]
)
def stop(self):
assert self.running
self.running = False
self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
if self.running:
self.stop()
class NeonPageserverApiException(Exception):
pass
class NeonPageserverHttpClient(requests.Session):
def __init__(self, host, port):
super().__init__()
self.host = host
self.port = port
def verbose_error(self, res: requests.Response):
try:
res.raise_for_status()
except requests.RequestException as e:
try:
msg = res.json()["msg"]
except: # noqa: E722
msg = ""
raise NeonPageserverApiException(msg) from e
def check_status(self):
self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()
def tenant_list(self):
res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
res = self.post(
f"http://{self.host}:{self.port}/v1/tenant",
json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
)
if res.status_code == 409:
if ok_if_exists:
print(f"could not create tenant: already exists for id {new_tenant_id}")
else:
res.raise_for_status()
elif res.status_code == 201:
print(f"created tenant {new_tenant_id}")
else:
self.verbose_error(res)
return new_tenant_id
def timeline_list(self, tenant_id: uuid.UUID):
res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, list)
return res_json
def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true"
)
self.verbose_error(res)
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def lsn_to_hex(num: int) -> str:
"""Convert lsn from int to standard hex notation."""
return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}"
def lsn_from_hex(lsn_hex: str) -> int:
"""Convert lsn from hex notation to int."""
left, right = lsn_hex.split("/")
return (int(left, 16) << 32) + int(right, 16)
def remote_consistent_lsn(
pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID
) -> int:
detail = pageserver_http_client.timeline_detail(tenant, timeline)
lsn_str = detail["remote_consistent_lsn"]
assert isinstance(lsn_str, str)
return lsn_from_hex(lsn_str)
def wait_for_upload(
pageserver_http_client: NeonPageserverHttpClient,
tenant: uuid.UUID,
timeline: uuid.UUID,
lsn: int,
):
"""waits for local timeline upload up to specified lsn"""
for i in range(10):
current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
if current_lsn >= lsn:
return
print(
f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
)
time.sleep(1)
raise Exception(
f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
)
##############
# End of utils
##############
def pack_base(log_dir, restored_dir, output_tar):
"""Create tar file from basebackup, being careful to produce relative filenames."""
tmp_tar_name = "tmp.tar"
tmp_tar_path = os.path.join(restored_dir, tmp_tar_name)
cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir)
# We actually cd into the dir and call tar from there. If we call tar from
# outside we won't encode filenames as relative, and they won't parse well
# on import.
subprocess_capture(log_dir, cmd, cwd=restored_dir)
shutil.move(tmp_tar_path, output_tar)
def reconstruct_paths(log_dir, pg_bin, base_tar, port: int):
"""Reconstruct what relation files should exist in the datadir by querying postgres."""
with tempfile.TemporaryDirectory() as restored_dir:
# Unpack the base tar
subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])
# Start a vanilla postgres from the given datadir and query it to find
# what relfiles should exist, but possibly don't.
with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg:
vanilla_pg.configure([f"port={port}"])
vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))
# Create database based on template0 because we can't connect to template0
query = "create database template0copy template template0"
vanilla_pg.safe_psql(query, user="cloud_admin")
vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")
# Get all databases
query = "select oid, datname from pg_database"
oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
template0_oid = [
oid for (oid, database) in oid_dbname_pairs if database == "template0"
][0]
# Get rel paths for each database
for oid, database in oid_dbname_pairs:
if database == "template0":
# We can't connect to template0
continue
query = "select relname, pg_relation_filepath(oid) from pg_class"
result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
for _relname, filepath in result:
if filepath is not None:
if database == "template0copy":
# Add all template0copy paths to template0
prefix = f"base/{oid}/"
if filepath.startswith(prefix):
suffix = filepath[len(prefix) :]
yield f"base/{template0_oid}/{suffix}"
elif filepath.startswith("global"):
print(f"skipping {database} global file {filepath}")
else:
raise AssertionError
else:
yield filepath
def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
"""Add the appropriate empty files to a basebadkup tar."""
with tempfile.TemporaryDirectory() as restored_dir:
# Unpack the base tar
subprocess_capture(log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir])
# Touch files that don't exist
for path in paths:
absolute_path = os.path.join(restored_dir, path)
exists = os.path.exists(absolute_path)
if not exists:
print(f"File {absolute_path} didn't exist. Creating..")
Path(absolute_path).touch()
# Repackage
pack_base(log_dir, restored_dir, output_tar)
# HACK This is a workaround for exporting from old pageservers that
# can't export empty relations. In this case we need to start
# a vanilla postgres from the exported datadir, and query it
# to see what empty relations are missing, and then create
# those empty files before importing.
def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int):
reconstructed_paths = set(reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port))
touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths)
def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
with closing(psycopg2.connect(pageserver_connstr)) as conn:
conn.autocommit = True
with conn.cursor() as cur:
cmd = f"get_last_record_rlsn {tenant_id} {timeline_id}"
cur.execute(cmd)
res = cur.fetchone()
assert res is not None
prev_lsn = res[0]
last_lsn = res[1]
return last_lsn, prev_lsn
def import_timeline(
args,
psql_path,
pageserver_connstr,
pageserver_http,
tenant_id,
timeline_id,
last_lsn,
prev_lsn,
tar_filename,
pg_version,
):
# Import timelines to new pageserver
import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}"
full_cmd = rf"""cat {tar_filename} | {psql_path} {pageserver_connstr} -c '{import_cmd}' """
stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr")
stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout")
print(f"Running: {full_cmd}")
with open(stdout_filename, "w") as stdout_f:
with open(stderr_filename2, "w") as stderr_f:
print(f"(capturing output to {stdout_filename})")
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
subprocess.run(
full_cmd,
stdout=stdout_f,
stderr=stderr_f,
env=pg_bin._build_env(None),
shell=True,
check=True,
)
print("Done import")
# Wait until pageserver persists the files
wait_for_upload(
pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn)
)
def export_timeline(
args,
psql_path,
pageserver_connstr,
tenant_id,
timeline_id,
last_lsn,
prev_lsn,
tar_filename,
pg_version,
):
# Choose filenames
incomplete_filename = tar_filename + ".incomplete"
stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")
# Construct export command
query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}"
cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query]
# Run export command
print(f"Running: {cmd}")
with open(incomplete_filename, "w") as stdout_f:
with open(stderr_filename, "w") as stderr_f:
print(f"(capturing output to {incomplete_filename})")
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
subprocess.run(
cmd, stdout=stdout_f, stderr=stderr_f, env=pg_bin._build_env(None), check=True
)
# Add missing rels
pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port)
# Log more info
file_size = os.path.getsize(tar_filename)
print(f"Done export: {tar_filename}, size {file_size}")
def main(args: argparse.Namespace):
# any psql version will do here. use current DEFAULT_PG_VERSION = 15
psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql")
old_pageserver_host = args.old_pageserver_host
new_pageserver_host = args.new_pageserver_host
old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
old_http_client.check_status()
old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"
new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
new_http_client.check_status()
new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"
for tenant_id in args.tenants:
print(f"Tenant: {tenant_id}")
timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
print(f"Timelines: {timelines}")
# Create tenant in new pageserver
if args.only_import is False and not args.timelines:
new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)
for timeline in timelines:
# Skip timelines we don't need to export
if args.timelines and timeline["timeline_id"] not in args.timelines:
print(f"Skipping timeline {timeline['timeline_id']}")
continue
# Choose filenames
tar_filename = os.path.join(
args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar"
)
pg_version = timeline["pg_version"]
# Export timeline from old pageserver
if args.only_import is False:
last_lsn, prev_lsn = get_rlsn(
old_pageserver_connstr,
timeline["tenant_id"],
timeline["timeline_id"],
)
export_timeline(
args,
psql_path,
old_pageserver_connstr,
timeline["tenant_id"],
timeline["timeline_id"],
last_lsn,
prev_lsn,
tar_filename,
pg_version,
)
# Import into new pageserver
import_timeline(
args,
psql_path,
new_pageserver_connstr,
new_http_client,
timeline["tenant_id"],
timeline["timeline_id"],
last_lsn,
prev_lsn,
tar_filename,
pg_version,
)
# Re-export and compare
re_export_filename = tar_filename + ".reexport"
export_timeline(
args,
psql_path,
new_pageserver_connstr,
timeline["tenant_id"],
timeline["timeline_id"],
last_lsn,
prev_lsn,
re_export_filename,
pg_version,
)
# Check the size is the same
old_size = (os.path.getsize(tar_filename),)
new_size = (os.path.getsize(re_export_filename),)
if old_size != new_size:
raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
def non_zero_tcp_port(arg: Any):
port = int(arg)
if port < 1 or port > 65535:
raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}")
return port
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--tenant-id",
dest="tenants",
required=True,
nargs="+",
help="Id of the tenant to migrate. You can pass multiple arguments",
)
parser.add_argument(
"--timeline-id",
dest="timelines",
required=False,
nargs="+",
help="Id of the timeline to migrate. You can pass multiple arguments",
)
parser.add_argument(
"--from-host",
dest="old_pageserver_host",
required=True,
help="Host of the pageserver to migrate data from",
)
parser.add_argument(
"--from-http-port",
dest="old_pageserver_http_port",
required=False,
type=int,
default=9898,
help="HTTP port of the pageserver to migrate data from. Default: 9898",
)
parser.add_argument(
"--from-pg-port",
dest="old_pageserver_pg_port",
required=False,
type=int,
default=6400,
help="pg port of the pageserver to migrate data from. Default: 6400",
)
parser.add_argument(
"--to-host",
dest="new_pageserver_host",
required=True,
help="Host of the pageserver to migrate data to",
)
parser.add_argument(
"--to-http-port",
dest="new_pageserver_http_port",
required=False,
default=9898,
type=int,
help="HTTP port of the pageserver to migrate data to. Default: 9898",
)
parser.add_argument(
"--to-pg-port",
dest="new_pageserver_pg_port",
required=False,
default=6400,
type=int,
help="pg port of the pageserver to migrate data to. Default: 6400",
)
parser.add_argument(
"--ignore-tenant-exists",
dest="ok_if_exists",
required=False,
help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.",
)
parser.add_argument(
"--pg-distrib-dir",
dest="pg_distrib_dir",
required=False,
default="/usr/local/",
help="Path where postgres binaries are installed. Default: /usr/local/",
)
parser.add_argument(
"--psql-path",
dest="psql_path",
required=False,
default="/usr/local/v14/bin/psql",
help="Path to the psql binary. Default: /usr/local/v14/bin/psql",
)
parser.add_argument(
"--only-import",
dest="only_import",
required=False,
default=False,
action="store_true",
help="Skip export and tenant creation part",
)
parser.add_argument(
"--work-dir",
dest="work_dir",
required=True,
default=False,
help="directory where temporary tar files are stored",
)
parser.add_argument(
"--tmp-pg-port",
dest="tmp_pg_port",
required=False,
default=55439,
type=non_zero_tcp_port,
help="localhost port to use for temporary postgres instance",
)
args = parser.parse_args()
main(args)

View File

@@ -5,7 +5,6 @@ use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
@@ -246,8 +245,6 @@ async fn async_main() -> anyhow::Result<()> {
};
// After loading secrets & config, but before starting anything else, apply database migrations
Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?;
migration_run(&secrets.database_url)
.await
.context("Running database migrations")?;

View File

@@ -2,7 +2,6 @@ pub(crate) mod split_state;
use std::collections::HashMap;
use std::str::FromStr;
use std::time::Duration;
use std::time::Instant;
use self::split_state::SplitState;
use camino::Utf8Path;
@@ -145,31 +144,6 @@ impl Persistence {
}
}
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
/// database and the storage controller, therefore the database might not be available right away
pub async fn await_connection(
database_url: &str,
timeout: Duration,
) -> Result<(), diesel::ConnectionError> {
let started_at = Instant::now();
loop {
match PgConnection::establish(database_url) {
Ok(_) => {
tracing::info!("Connected to database.");
return Ok(());
}
Err(e) => {
if started_at.elapsed() > timeout {
return Err(e);
} else {
tracing::info!("Database not yet available, waiting... ({e})");
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}
}
}
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where

View File

@@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"pageserver_getpage_reconstruct_seconds_sum",
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
*histogram("pageserver_smgr_query_seconds_global"),
*histogram("pageserver_layers_visited_per_read_global"),
*histogram("pageserver_read_num_fs_layers"),
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
*histogram("pageserver_wait_lsn_seconds"),
*histogram("pageserver_remote_operation_seconds"),

View File

@@ -190,6 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
"trace_read_requests": True,
"walreceiver_connect_timeout": "13m",
"image_layer_creation_check_threshold": 1,
"try_enable_aux_file_v2": True,
}
ps_http = env.pageserver.http_client()

View File

@@ -0,0 +1,63 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
logical_replication_sync,
)
def test_aux_v2_config_switch(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_aux_v2_config_switch", "empty")
endpoint = env.endpoints.create_start(
"test_aux_v2_config_switch", config_lines=["log_statement=all"]
)
with env.pageserver.http_client() as client:
tenant_config = client.tenant_config(tenant_id).effective_config
tenant_config["try_enable_aux_file_v2"] = True
client.set_tenant_config(tenant_id, tenant_config)
# aux file v2 is enabled on the write path
assert not client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[
"aux_file_v2"
]
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("create table t(pk integer primary key, payload integer)")
cur.execute(
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
)
cur.execute("create publication pub1 for table t, replication_example")
# now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils)
# instead of going through the full logical replication process.
vanilla_pg.start()
vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
vanilla_pg.safe_psql(
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
)
connstr = endpoint.connstr().replace("'", "''")
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
# Wait logical replication channel to be established
logical_replication_sync(vanilla_pg, endpoint)
vanilla_pg.stop()
endpoint.stop()
env.pageserver.assert_log_contains("enabling aux file v2 support")
with env.pageserver.http_client() as client:
# aux file v2 flag should be enabled at this point
assert client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["aux_file_v2"]
with env.pageserver.http_client() as client:
tenant_config = client.tenant_config(tenant_id).effective_config
tenant_config["try_enable_aux_file_v2"] = False
client.set_tenant_config(tenant_id, tenant_config)
# the flag should still be enabled
assert client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["aux_file_v2"]
env.pageserver.restart()
with env.pageserver.http_client() as client:
# aux file v2 flag should be persisted
assert client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["aux_file_v2"]

View File

@@ -1,93 +0,0 @@
import os
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.workload import Workload
AGGRESIVE_COMPACTION_TENANT_CONF = {
# Disable gc and compaction. The test runs compaction manually.
"gc_period": "0s",
"compaction_period": "0s",
# Small checkpoint distance to create many layers
"checkpoint_distance": 1024**2,
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 2,
# INC-186: remove when merging the fix
"image_layer_creation_check_threshold": 0,
}
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns
a small number of rows and manually instructs the pageserver to run compaction
between iterations. At the end of the test validate that the average number of
layers visited to gather reconstruct data for a given key is within the empirically
observed bounds.
"""
# Effectively disable the page cache to rely only on image layers
# to shorten reads.
neon_env_builder.pageserver_config_override = """
page_cache_size=10
"""
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
row_count = 10000
churn_rounds = 100
ps_http = env.pageserver.http_client()
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageserver.id)
log.info("Writing initial data ...")
workload.write_rows(row_count, env.pageserver.id)
for i in range(1, churn_rounds + 1):
if i % 10 == 0:
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id)
ps_http.timeline_compact(tenant_id, timeline_id)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)
log.info("Checking layer access metrics ...")
layer_access_metric_names = [
"pageserver_layers_visited_per_read_global_sum",
"pageserver_layers_visited_per_read_global_count",
"pageserver_layers_visited_per_read_global_bucket",
"pageserver_layers_visited_per_vectored_read_global_sum",
"pageserver_layers_visited_per_vectored_read_global_count",
"pageserver_layers_visited_per_vectored_read_global_bucket",
]
metrics = env.pageserver.http_client().get_metrics()
for name in layer_access_metric_names:
layer_access_metrics = metrics.query_all(name)
log.info(f"Got metrics: {layer_access_metrics}")
non_vectored_sum = metrics.query_one("pageserver_layers_visited_per_read_global_sum")
non_vectored_count = metrics.query_one("pageserver_layers_visited_per_read_global_count")
non_vectored_average = non_vectored_sum.value / non_vectored_count.value
vectored_sum = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_sum")
vectored_count = metrics.query_one("pageserver_layers_visited_per_vectored_read_global_count")
vectored_average = vectored_sum.value / vectored_count.value
log.info(f"{non_vectored_average=} {vectored_average=}")
# The upper bound for average number of layer visits below (8)
# was chosen empirically for this workload.
assert non_vectored_average < 8
assert vectored_average < 8

View File

@@ -192,6 +192,7 @@ def test_backward_compatibility(
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
@pytest.xfail
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")

View File

@@ -16,6 +16,7 @@ from fixtures.pageserver.utils import (
wait_for_upload,
wait_tenant_status_404,
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import (
LocalFsStorage,
RemoteStorageKind,
@@ -23,6 +24,7 @@ from fixtures.remote_storage import (
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
query_scalar,
subprocess_capture,
wait_until,
)
@@ -182,14 +184,20 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca
# A minor migration involves no storage breaking changes.
# It is done by attaching the tenant to a new pageserver.
"minor",
# In the unlikely and unfortunate event that we have to break
# the storage format, extend this test with the param below.
# "major",
# A major migration involves exporting a postgres datadir
# basebackup and importing it into the new pageserver.
# This kind of migration can tolerate breaking changes
# to storage format
"major",
],
)
@pytest.mark.parametrize("with_load", ["with_load", "without_load"])
def test_tenant_relocation(
neon_env_builder: NeonEnvBuilder,
port_distributor: PortDistributor,
test_output_dir: Path,
neon_binpath: Path,
base_dir: Path,
method: str,
with_load: str,
):
@@ -291,7 +299,40 @@ def test_tenant_relocation(
current_lsn=current_lsn_second,
)
if method == "minor":
# Migrate either by attaching from s3 or import/export basebackup
if method == "major":
cmd = [
"poetry",
"run",
"python",
str(base_dir / "scripts/export_import_between_pageservers.py"),
"--tenant-id",
str(tenant_id),
"--from-host",
"localhost",
"--from-http-port",
str(origin_http.port),
"--from-pg-port",
str(origin_ps.service_port.pg),
"--to-host",
"localhost",
"--to-http-port",
str(destination_http.port),
"--to-pg-port",
str(destination_ps.service_port.pg),
"--pg-distrib-dir",
str(neon_env_builder.pg_distrib_dir),
"--work-dir",
str(test_output_dir),
"--tmp-pg-port",
str(port_distributor.get_port()),
]
subprocess_capture(test_output_dir, cmd, check=True)
destination_ps.allowed_errors.append(
".*ignored .* unexpected bytes after the tar archive.*"
)
elif method == "minor":
# call to attach timeline to new pageserver
destination_ps.tenant_attach(tenant_id)

View File

@@ -292,12 +292,33 @@ def test_single_branch_get_tenant_size_grows(
Operate on single branch reading the tenants size after each transaction.
"""
# Disable automatic compaction and GC, and set a long PITR interval: we will expect
# size to always increase with writes as all writes remain within the PITR
# Disable automatic gc and compaction.
# The pitr_interval here is quite problematic, so we cannot really use it.
# it'd have to be calibrated per test executing env.
# there was a bug which was hidden if the create table and first batch of
# inserts is larger than gc_horizon. for example 0x20000 here hid the fact
# that there next_gc_cutoff could be smaller than initdb_lsn, which will
# obviously lead to issues when calculating the size.
gc_horizon = 0x3BA00
# it's a bit of a hack, but different versions of postgres have different
# amount of WAL generated for the same amount of data. so we need to
# adjust the gc_horizon accordingly.
if pg_version == PgVersion.V14:
gc_horizon = 0x4A000
elif pg_version == PgVersion.V15:
gc_horizon = 0x3BA00
elif pg_version == PgVersion.V16:
gc_horizon = 210000
else:
raise NotImplementedError(pg_version)
tenant_config = {
"compaction_period": "0s",
"gc_period": "0s",
"pitr_interval": "3600s",
"pitr_interval": "0s",
"gc_horizon": gc_horizon,
}
env = neon_env_builder.init_start(initial_tenant_conf=tenant_config)
@@ -311,6 +332,18 @@ def test_single_branch_get_tenant_size_grows(
size_debug_file = open(test_output_dir / "size_debug.html", "w")
def check_size_change(
current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int
):
if current_lsn - initdb_lsn >= gc_horizon:
assert (
size >= prev_size
), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
else:
assert (
size > prev_size
), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
def get_current_consistent_size(
env: NeonEnv,
endpoint: Endpoint,
@@ -379,6 +412,14 @@ def test_single_branch_get_tenant_size_grows(
)
prev_size = collected_responses[-1][2]
# branch start shouldn't be past gc_horizon yet
# thus the size should grow as we insert more data
# "gc_horizon" is tuned so that it kicks in _after_ the
# insert phase, but before the update phase ends.
assert (
current_lsn - initdb_lsn <= gc_horizon
), "Tuning of GC window is likely out-of-date"
assert size > prev_size
collected_responses.append(("INSERT", current_lsn, size))
@@ -398,7 +439,8 @@ def test_single_branch_get_tenant_size_grows(
)
prev_size = collected_responses[-1][2]
assert size > prev_size
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
collected_responses.append(("UPDATE", current_lsn, size))
@@ -415,7 +457,8 @@ def test_single_branch_get_tenant_size_grows(
)
prev_size = collected_responses[-1][2]
assert size > prev_size
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
collected_responses.append(("DELETE", current_lsn, size))
@@ -426,20 +469,20 @@ def test_single_branch_get_tenant_size_grows(
with endpoint.cursor() as cur:
cur.execute("DROP TABLE t0")
# Dropping the table doesn't reclaim any space
# from the user's point of view, because the DROP transaction is still
# within pitr_interval.
# Without setting a PITR interval, dropping the table doesn't reclaim any space
# from the user's point of view, because the DROP transaction is too small
# to fall out of gc_horizon.
(current_lsn, size) = get_current_consistent_size(
env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
)
assert size >= prev_size
prev_size = size
prev_size = collected_responses[-1][2]
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
# Set a zero PITR interval to allow the DROP to impact the synthetic size
# Set a tiny PITR interval to allow the DROP to impact the synthetic size
# Because synthetic size calculation uses pitr interval when available,
# when our tenant is configured with a tiny pitr interval, dropping a table should
# cause synthetic size to go down immediately
tenant_config["pitr_interval"] = "0s"
tenant_config["pitr_interval"] = "1ms"
env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
(current_lsn, size) = get_current_consistent_size(
env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
@@ -451,6 +494,10 @@ def test_single_branch_get_tenant_size_grows(
# defined by gc_horizon.
collected_responses.append(("DROP", current_lsn, size))
# Should have gone past gc_horizon, otherwise gc_horizon is too large
bytes_written = current_lsn - initdb_lsn
assert bytes_written > gc_horizon
# this isn't too many lines to forget for a while. observed while
# developing these tests that locally the value is a bit more than what we
# get in the ci.

View File

@@ -75,8 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] }
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
unicode-bidi = { version = "0.3" }
unicode-normalization = { version = "0.1" }
url = { version = "2", features = ["serde"] }
uuid = { version = "1", features = ["serde", "v4", "v7"] }
zeroize = { version = "1", features = ["derive"] }