mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-31 12:00:42 +00:00
Compare commits
7 Commits
proxy-asyn
...
skyzh/aux-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
37e25ab51e | ||
|
|
f1deefc077 | ||
|
|
d4fc271766 | ||
|
|
21db1bc2f0 | ||
|
|
21addb827b | ||
|
|
4b23692615 | ||
|
|
667376a9c4 |
171
Cargo.lock
generated
171
Cargo.lock
generated
@@ -595,7 +595,7 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper-rustls 0.24.0",
|
||||
"hyper-rustls",
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
@@ -1780,18 +1780,6 @@ dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enum-as-inner"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ffccbb6966c05b32ef8fbac435df276c4ae4d3dc55a8cd0eb9745e6c12f546a"
|
||||
dependencies = [
|
||||
"heck 0.4.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "enum-map"
|
||||
version = "2.5.0"
|
||||
@@ -1983,9 +1971,9 @@ checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.1"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
|
||||
checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8"
|
||||
dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
@@ -2344,51 +2332,6 @@ version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
|
||||
|
||||
[[package]]
|
||||
name = "hickory-proto"
|
||||
version = "0.24.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07698b8420e2f0d6447a436ba999ec85d8fbf2a398bbd737b82cac4a2e96e512"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"cfg-if",
|
||||
"data-encoding",
|
||||
"enum-as-inner",
|
||||
"futures-channel",
|
||||
"futures-io",
|
||||
"futures-util",
|
||||
"idna 0.4.0",
|
||||
"ipnet",
|
||||
"once_cell",
|
||||
"rand 0.8.5",
|
||||
"thiserror",
|
||||
"tinyvec",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hickory-resolver"
|
||||
version = "0.24.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28757f23aa75c98f254cf0405e6d8c25b831b32921b050a66692427679b1f243"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"futures-util",
|
||||
"hickory-proto",
|
||||
"ipconfig",
|
||||
"lru-cache",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"rand 0.8.5",
|
||||
"resolv-conf",
|
||||
"smallvec",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "histogram"
|
||||
version = "0.7.4"
|
||||
@@ -2582,23 +2525,6 @@ dependencies = [
|
||||
"tokio-rustls 0.24.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-rustls"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "399c78f9338483cb7e630c8474b07268983c6bd5acee012e4211f9f7bb21b070"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"log",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls 0.25.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-timeout"
|
||||
version = "0.4.1"
|
||||
@@ -2686,19 +2612,9 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.4.0"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c"
|
||||
dependencies = [
|
||||
"unicode-bidi",
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
|
||||
checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6"
|
||||
dependencies = [
|
||||
"unicode-bidi",
|
||||
"unicode-normalization",
|
||||
@@ -2803,18 +2719,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipconfig"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f"
|
||||
dependencies = [
|
||||
"socket2 0.5.5",
|
||||
"widestring",
|
||||
"windows-sys 0.48.0",
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
version = "2.9.0"
|
||||
@@ -2956,12 +2860,6 @@ version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linked-hash-map"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
@@ -2996,15 +2894,6 @@ version = "0.4.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||
|
||||
[[package]]
|
||||
name = "lru-cache"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "31e24f1ad8321ca0e8a1e0ac13f23cb668e6f5466c2c57319f6a5cf1cc8e3b1c"
|
||||
dependencies = [
|
||||
"linked-hash-map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "match_cfg"
|
||||
version = "0.1.0"
|
||||
@@ -3769,7 +3658,6 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"twox-hash",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
@@ -4000,9 +3888,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.1"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
|
||||
|
||||
[[package]]
|
||||
name = "petgraph"
|
||||
@@ -4111,7 +3999,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4124,7 +4012,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -4135,7 +4023,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -4154,7 +4042,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4404,7 +4292,6 @@ dependencies = [
|
||||
"aws-config",
|
||||
"aws-sdk-iam",
|
||||
"aws-sigv4",
|
||||
"aws-smithy-runtime",
|
||||
"aws-types",
|
||||
"base64 0.13.1",
|
||||
"bstr",
|
||||
@@ -4422,7 +4309,6 @@ dependencies = [
|
||||
"hashbrown 0.13.2",
|
||||
"hashlink",
|
||||
"hex",
|
||||
"hickory-resolver",
|
||||
"hmac",
|
||||
"hostname",
|
||||
"http 1.1.0",
|
||||
@@ -4430,7 +4316,6 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"hyper 1.2.0",
|
||||
"hyper-rustls 0.25.0",
|
||||
"hyper-tungstenite",
|
||||
"hyper-util",
|
||||
"ipnet",
|
||||
@@ -4499,12 +4384,6 @@ dependencies = [
|
||||
"x509-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.31.0"
|
||||
@@ -4806,7 +4685,7 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"http-body 0.4.5",
|
||||
"hyper 0.14.26",
|
||||
"hyper-rustls 0.24.0",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"ipnet",
|
||||
"js-sys",
|
||||
@@ -4892,16 +4771,6 @@ dependencies = [
|
||||
"tracing-opentelemetry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "resolv-conf"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52e44394d2086d010551b14b53b1f24e31647570cd1deb0379e2c21b329aba00"
|
||||
dependencies = [
|
||||
"hostname",
|
||||
"quick-error",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "retry-policies"
|
||||
version = "0.1.2"
|
||||
@@ -6295,7 +6164,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=socket-config#539ce321bbe1d2cb1c64c2c405c9afa1bb9f6366"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -6825,12 +6694,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.0"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633"
|
||||
checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna 0.5.0",
|
||||
"idna",
|
||||
"percent-encoding",
|
||||
"serde",
|
||||
]
|
||||
@@ -7162,12 +7031,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "widestring"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "653f141f39ec16bba3c5abe400a0c60da7468261cc2cbf36805022876bc721a8"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
@@ -7510,8 +7373,6 @@ dependencies = [
|
||||
"tower",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"unicode-bidi",
|
||||
"unicode-normalization",
|
||||
"url",
|
||||
"uuid",
|
||||
"zeroize",
|
||||
|
||||
13
Cargo.toml
13
Cargo.toml
@@ -57,7 +57,6 @@ aws-sdk-s3 = "1.14"
|
||||
aws-sdk-iam = "1.15.0"
|
||||
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
|
||||
aws-smithy-types = "1.1.4"
|
||||
aws-smithy-runtime = "1.1.8"
|
||||
aws-credential-types = "1.1.4"
|
||||
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
|
||||
aws-types = "1.1.7"
|
||||
@@ -195,11 +194,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -239,7 +238,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="socket-config" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
|
||||
|
||||
@@ -434,6 +434,11 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
try_enable_aux_file_v2: settings
|
||||
.remove("try_enable_aux_file_v2")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'try_enable_aux_file_v2' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -552,6 +557,11 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
try_enable_aux_file_v2: settings
|
||||
.remove("try_enable_aux_file_v2")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'try_enable_aux_file_v2' as bool")?,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use bytes::BufMut;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::RangeInclusive;
|
||||
use std::{fmt, ops::Range};
|
||||
|
||||
use crate::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
@@ -23,81 +21,9 @@ pub struct Key {
|
||||
pub field6: u32,
|
||||
}
|
||||
|
||||
/// The storage key size.
|
||||
pub const KEY_SIZE: usize = 18;
|
||||
|
||||
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
|
||||
/// See [`Key::to_i128`] for more information on the encoding.
|
||||
pub const METADATA_KEY_SIZE: usize = 16;
|
||||
|
||||
/// The key prefix start range for the metadata keys. All keys whose first byte is >= 0x80 are metadata keys.
|
||||
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
|
||||
|
||||
/// The (reserved) key prefix of relation sizes.
|
||||
pub const RELATION_SIZE_PREFIX: u8 = 0x81;
|
||||
|
||||
/// The key prefix of AUX file keys.
|
||||
pub const AUX_KEY_PREFIX: u8 = 0x82;
|
||||
|
||||
/// Check if the key falls in the range of metadata keys.
|
||||
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
|
||||
key[0] >= METADATA_KEY_BEGIN_PREFIX
|
||||
}
|
||||
|
||||
impl Key {
|
||||
/// Check if the key falls in the range of metadata keys.
|
||||
pub const fn is_metadata_key(&self) -> bool {
|
||||
self.field1 >= METADATA_KEY_BEGIN_PREFIX
|
||||
}
|
||||
|
||||
/// Encode a metadata key to a storage key.
|
||||
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
|
||||
assert!(is_metadata_key_slice(key), "key not in metadata key range");
|
||||
Key {
|
||||
field1: key[0],
|
||||
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
|
||||
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
|
||||
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
|
||||
field5: key[11],
|
||||
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode a metadata key to a storage key.
|
||||
pub fn from_metadata_key(key: &[u8]) -> Self {
|
||||
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
|
||||
}
|
||||
|
||||
/// Extract a metadata key to a writer. The result should always be 16 bytes.
|
||||
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
|
||||
writer.put_u8(self.field1);
|
||||
assert!(self.field2 <= 0xFFFF);
|
||||
writer.put_u16(self.field2 as u16);
|
||||
writer.put_u32(self.field3);
|
||||
writer.put_u32(self.field4);
|
||||
writer.put_u8(self.field5);
|
||||
writer.put_u32(self.field6);
|
||||
}
|
||||
|
||||
/// Get the range of metadata keys.
|
||||
pub fn metadata_key_range() -> RangeInclusive<Self> {
|
||||
Key {
|
||||
field1: METADATA_KEY_BEGIN_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..=Key {
|
||||
field1: u8::MAX,
|
||||
field2: u16::MAX as u32,
|
||||
field3: u32::MAX,
|
||||
field4: u32::MAX,
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX,
|
||||
}
|
||||
}
|
||||
|
||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||
@@ -122,11 +48,11 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn next(&self) -> Key {
|
||||
pub fn next(&self) -> Key {
|
||||
self.add(1)
|
||||
}
|
||||
|
||||
pub const fn add(&self, x: u32) -> Key {
|
||||
pub fn add(&self, x: u32) -> Key {
|
||||
let mut key = *self;
|
||||
|
||||
let r = key.field6.overflowing_add(x);
|
||||
@@ -155,8 +81,6 @@ impl Key {
|
||||
key
|
||||
}
|
||||
|
||||
/// Convert an 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_metadata_key`] instead.
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -168,8 +92,6 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a key to an 18B slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::extract_metadata_key_to_writer`] instead.
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
BE::write_u32(&mut buf[1..5], self.field2);
|
||||
@@ -553,14 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
!NON_INHERITED_RANGE.contains(&key)
|
||||
key != AUX_FILES_KEY
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -636,14 +556,11 @@ impl std::str::FromStr for Key {
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::key::is_metadata_key_slice;
|
||||
use crate::key::Key;
|
||||
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
|
||||
use super::AUX_KEY_PREFIX;
|
||||
|
||||
#[test]
|
||||
fn display_fromstr_bijection() {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
@@ -659,16 +576,4 @@ mod tests {
|
||||
|
||||
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_keys() {
|
||||
let mut metadata_key = vec![AUX_KEY_PREFIX];
|
||||
metadata_key.extend_from_slice(&[0xFF; 15]);
|
||||
let encoded_key = Key::from_metadata_key(&metadata_key);
|
||||
let mut output_key = Vec::new();
|
||||
encoded_key.extract_metadata_key_to_writer(&mut output_key);
|
||||
assert_eq!(metadata_key, output_key);
|
||||
assert!(encoded_key.is_metadata_key());
|
||||
assert!(is_metadata_key_slice(&metadata_key));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,13 +94,12 @@ impl KeySpace {
|
||||
|
||||
/// Remove all keys in `other` from `self`.
|
||||
/// This can involve splitting or removing of existing ranges.
|
||||
/// Returns the removed keyspace
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||
(Some(start), Some(end)) => (start, end),
|
||||
_ => {
|
||||
// self is empty
|
||||
return KeySpace::default();
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -113,37 +112,30 @@ impl KeySpace {
|
||||
.skip_while(|range| self_start >= range.end)
|
||||
.take_while(|range| self_end > range.start);
|
||||
|
||||
let mut removed_accum = KeySpaceRandomAccum::new();
|
||||
for range in other_ranges {
|
||||
while let Some(overlap_at) = self.overlaps_at(range) {
|
||||
let overlapped = self.ranges[overlap_at].clone();
|
||||
|
||||
if overlapped.start < range.start && overlapped.end <= range.end {
|
||||
// Higher part of the range is completely overlapped.
|
||||
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
|
||||
self.ranges[overlap_at].end = range.start;
|
||||
}
|
||||
if overlapped.start >= range.start && overlapped.end > range.end {
|
||||
// Lower part of the range is completely overlapped.
|
||||
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
|
||||
self.ranges[overlap_at].start = range.end;
|
||||
}
|
||||
if overlapped.start < range.start && overlapped.end > range.end {
|
||||
// Middle part of the range is overlapped.
|
||||
removed_accum.add_range(range.clone());
|
||||
self.ranges[overlap_at].end = range.start;
|
||||
self.ranges
|
||||
.insert(overlap_at + 1, range.end..overlapped.end);
|
||||
}
|
||||
if overlapped.start >= range.start && overlapped.end <= range.end {
|
||||
// Whole range is overlapped
|
||||
removed_accum.add_range(self.ranges[overlap_at].clone());
|
||||
self.ranges.remove(overlap_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
removed_accum.to_keyspace()
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Option<Key> {
|
||||
@@ -561,16 +553,7 @@ mod tests {
|
||||
Key::from_i128(11)..Key::from_i128(13),
|
||||
],
|
||||
};
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(2)..Key::from_i128(3),
|
||||
Key::from_i128(6)..Key::from_i128(7),
|
||||
Key::from_i128(11)..Key::from_i128(12),
|
||||
],
|
||||
};
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
@@ -600,17 +583,7 @@ mod tests {
|
||||
Key::from_i128(14)..Key::from_i128(17),
|
||||
],
|
||||
};
|
||||
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(3)..Key::from_i128(5),
|
||||
Key::from_i128(8)..Key::from_i128(10),
|
||||
Key::from_i128(14)..Key::from_i128(15),
|
||||
],
|
||||
};
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
@@ -637,11 +610,7 @@ mod tests {
|
||||
Key::from_i128(15)..Key::from_i128(17),
|
||||
],
|
||||
};
|
||||
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace::default();
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
@@ -668,17 +637,7 @@ mod tests {
|
||||
let key_space2 = KeySpace {
|
||||
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
|
||||
};
|
||||
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(9)..Key::from_i128(10),
|
||||
Key::from_i128(12)..Key::from_i128(15),
|
||||
Key::from_i128(17)..Key::from_i128(19),
|
||||
],
|
||||
};
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
|
||||
@@ -303,6 +303,7 @@ pub struct TenantConfig {
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
pub try_enable_aux_file_v2: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -429,7 +430,6 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub tenant_id: Option<TenantShardId>,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
@@ -579,6 +579,9 @@ pub struct TimelineInfo {
|
||||
pub state: TimelineState,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
/// Whether aux file v2 is enabled
|
||||
pub aux_file_v2: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
|
||||
@@ -134,11 +134,6 @@ impl RemotePath {
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
|
||||
pub fn add_trailing_slash(&self) -> Self {
|
||||
// Unwrap safety: inputs are guaranteed to be valid UTF-8
|
||||
Self(format!("{}/", self.0).try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||
@@ -162,21 +157,47 @@ pub struct Listing {
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
|
||||
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
|
||||
///
|
||||
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
|
||||
/// from the absolute root of the bucket.
|
||||
///
|
||||
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
|
||||
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
|
||||
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
|
||||
/// returned in `keys`.
|
||||
///
|
||||
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
|
||||
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
|
||||
/// unlimited-size buckets, as the full list of objects is allocated into a monolithic data structure.
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||
/// so this method doesn't need to.
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
}
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtly different from list_prefixes,
|
||||
/// because it is for listing files instead of listing
|
||||
/// names sharing common prefixes.
|
||||
/// For example,
|
||||
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
||||
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
///
|
||||
/// max_keys limits max number of keys returned; None means unlimited.
|
||||
async fn list_files(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
||||
.await?
|
||||
.keys;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -315,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
//
|
||||
// max_keys limits max number of keys returned; None means unlimited.
|
||||
pub async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
// Lists the common *prefixes* of files, if any
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
|
||||
@@ -5,9 +5,11 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
@@ -20,11 +22,11 @@ use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
@@ -91,47 +93,7 @@ impl LocalFs {
|
||||
|
||||
#[cfg(test)]
|
||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
use std::{future::Future, pin::Pin};
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path =
|
||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||
anyhow::Error::msg(format!(
|
||||
"non-Unicode path: {}",
|
||||
pb.to_string_lossy()
|
||||
))
|
||||
})?;
|
||||
if file_type.is_symlink() {
|
||||
tracing::debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(&entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Ok(get_all_files(&self.storage_root)
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
@@ -158,14 +120,6 @@ impl LocalFs {
|
||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
|
||||
// If there's no trailing slash, we have to start looking from one above: even if
|
||||
// `initial_dir` is a directory, we should still list any prefixes in the parent
|
||||
// that start with the same string.
|
||||
if !full_path.to_string().ends_with('/') {
|
||||
initial_dir.pop();
|
||||
}
|
||||
|
||||
loop {
|
||||
// Did we make it to the root?
|
||||
if initial_dir.parent().is_none() {
|
||||
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
|
||||
let op = async {
|
||||
let mut result = Listing::default();
|
||||
|
||||
// Filter out directories: in S3 directories don't exist, only the keys within them do.
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
let keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
result.keys = keys;
|
||||
} else {
|
||||
let mut prefixes = HashSet::new();
|
||||
for key in keys {
|
||||
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
|
||||
let relative_key = if let Some(prefix) = prefix {
|
||||
let mut prefix = prefix.clone();
|
||||
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
|
||||
// end up with full file/dir names.
|
||||
let prefix_full_local_path = prefix.with_base(&self.storage_root);
|
||||
let has_slash = prefix.0.to_string().ends_with('/');
|
||||
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
|
||||
prefix
|
||||
} else {
|
||||
prefix.0.pop();
|
||||
prefix
|
||||
};
|
||||
|
||||
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
|
||||
} else {
|
||||
key
|
||||
};
|
||||
|
||||
let relative_key = format!("{}", relative_key);
|
||||
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
let first_part = relative_key
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
prefixes.insert(first_part);
|
||||
} else {
|
||||
result
|
||||
.keys
|
||||
.push(RemotePath::from_string(&relative_key).unwrap());
|
||||
}
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
result.prefixes = prefixes
|
||||
.into_iter()
|
||||
.map(|s| RemotePath::from_string(&s).unwrap())
|
||||
.collect();
|
||||
}
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
Ok(result)
|
||||
};
|
||||
|
||||
@@ -611,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(original_path, "metadata")
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
recursive: bool,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path =
|
||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||
anyhow::Error::msg(format!(
|
||||
"non-Unicode path: {}",
|
||||
pb.to_string_lossy()
|
||||
))
|
||||
})?;
|
||||
if file_type.is_symlink() {
|
||||
debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
if recursive {
|
||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path)
|
||||
}
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
@@ -930,18 +923,13 @@ mod fs_tests {
|
||||
// No delimiter: should recursively list everything
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||
let child_sibling =
|
||||
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||
|
||||
let listing = storage
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(
|
||||
listing.keys.into_iter().collect::<HashSet<_>>(),
|
||||
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage
|
||||
@@ -954,25 +942,7 @@ mod fs_tests {
|
||||
);
|
||||
assert!(listing.keys.is_empty());
|
||||
|
||||
// Delimiter & prefix with a trailing slash
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
listing.keys,
|
||||
[RemotePath::from_string("uncle").unwrap()].to_vec()
|
||||
);
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("parent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
// Delimiter and prefix without a trailing slash
|
||||
// Delimiter & prefix
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
@@ -981,66 +951,12 @@ mod fs_tests {
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
// Delimiter and prefix that's partway through a path component
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_part_component() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
|
||||
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
|
||||
// a freeform prefix.
|
||||
let _child_a =
|
||||
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
|
||||
let _child_b =
|
||||
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
|
||||
|
||||
// Delimiter and prefix that's partway through a path component
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(
|
||||
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
|
||||
),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
|
||||
let mut found_prefixes = listing.prefixes.clone();
|
||||
found_prefixes.sort();
|
||||
assert_eq!(
|
||||
found_prefixes,
|
||||
[
|
||||
RemotePath::from_string("tenant").unwrap(),
|
||||
RemotePath::from_string("tenant-01").unwrap(),
|
||||
]
|
||||
.to_vec()
|
||||
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
||||
.to_vec()
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -178,7 +178,10 @@ impl S3Bucket {
|
||||
|
||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path.get_path().as_str();
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.as_str()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
match &self.prefix_in_bucket {
|
||||
Some(prefix) => prefix.clone() + "/" + path_string,
|
||||
None => path_string.to_string(),
|
||||
@@ -468,11 +471,16 @@ impl RemoteStorage for S3Bucket {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| {
|
||||
self.prefix_in_bucket.clone().map(|mut s| {
|
||||
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
s
|
||||
})
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
@@ -541,15 +549,11 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
// S3 gives us prefixes like "foo/", we return them like "foo"
|
||||
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
||||
Some(
|
||||
self.s3_object_to_relative_path(
|
||||
o.prefix()?
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
||||
),
|
||||
)
|
||||
}));
|
||||
result.prefixes.extend(
|
||||
prefixes
|
||||
.iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
@@ -1046,22 +1050,22 @@ mod tests {
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = [
|
||||
vec!["", "some/path", "some/path/"],
|
||||
vec!["/", "/some/path", "/some/path/"],
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
"test/prefix/some/path",
|
||||
],
|
||||
];
|
||||
|
||||
|
||||
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_prefixes(prefix, cancel).await
|
||||
}
|
||||
|
||||
async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_files(folder, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::ListingMode;
|
||||
use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashSet, num::NonZeroU32};
|
||||
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?
|
||||
.prefixes
|
||||
.list_prefixes(None, &cancel)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list(
|
||||
Some(&base_prefix.add_trailing_slash()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?
|
||||
.prefixes
|
||||
.list_prefixes(Some(&base_prefix), &cancel)
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn list_no_delimiter_works(
|
||||
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.list_files(None, None, &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list on root mismatches with the uploads."
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
|
||||
// Test that max_keys limit works. In total there are about 21 files (see
|
||||
// upload_simple_remote_data call in test_real_s3.rs).
|
||||
let limited_root_files = test_client
|
||||
.list(
|
||||
None,
|
||||
ListingMode::NoDelimiter,
|
||||
Some(NonZeroU32::new(2).unwrap()),
|
||||
&cancel,
|
||||
)
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?;
|
||||
assert_eq!(limited_root_files.keys.len(), 2);
|
||||
assert_eq!(limited_root_files.len(), 2);
|
||||
|
||||
let nested_remote_files = test_client
|
||||
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
|
||||
.list_files(Some(&base_prefix), None, &cancel)
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list on subdirrectory mismatches with the uploads."
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||
|
||||
let prefixes = ctx
|
||||
.client
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
|
||||
@@ -132,6 +132,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
Enabled(AzureWithSimpleTestBlobs),
|
||||
Disabled,
|
||||
|
||||
@@ -12,8 +12,8 @@ use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::StreamExt;
|
||||
use remote_storage::{
|
||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
|
||||
RemoteStorageKind, S3Config,
|
||||
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(
|
||||
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>(),
|
||||
)
|
||||
Ok(retry(|| client.list_files(None, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
@@ -297,6 +294,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
Enabled(S3WithSimpleTestBlobs),
|
||||
Disabled,
|
||||
|
||||
@@ -70,7 +70,6 @@ tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
twox-hash.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
|
||||
@@ -279,7 +279,7 @@ impl Client {
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest {
|
||||
tenant_id: None,
|
||||
tenant_id: Some(tenant_shard_id),
|
||||
config,
|
||||
};
|
||||
|
||||
|
||||
@@ -74,6 +74,8 @@ struct MetadataCmd {
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
/// Replace latest gc cuttoff
|
||||
latest_gc_cuttoff: Option<Lsn>,
|
||||
/// Enable aux file v2 storage
|
||||
aux_file_v2: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -213,12 +215,14 @@ fn handle_metadata(
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
latest_gc_cuttoff,
|
||||
aux_file_v2,
|
||||
}: &MetadataCmd,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let metadata_bytes = std::fs::read(path)?;
|
||||
let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?;
|
||||
println!("Current metadata:\n{meta:?}");
|
||||
let mut update_meta = false;
|
||||
// TODO: simplify this part
|
||||
if let Some(disk_consistent_lsn) = disk_consistent_lsn {
|
||||
meta = TimelineMetadata::new(
|
||||
*disk_consistent_lsn,
|
||||
@@ -228,6 +232,7 @@ fn handle_metadata(
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
meta.aux_file_v2(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
@@ -240,6 +245,7 @@ fn handle_metadata(
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
meta.aux_file_v2(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
@@ -252,6 +258,20 @@ fn handle_metadata(
|
||||
*latest_gc_cuttoff,
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
meta.aux_file_v2(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
if let Some(aux_file_v2) = aux_file_v2 {
|
||||
meta = TimelineMetadata::new(
|
||||
meta.disk_consistent_lsn(),
|
||||
meta.prev_record_lsn(),
|
||||
meta.ancestor_timeline(),
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
*aux_file_v2,
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
|
||||
@@ -1,112 +0,0 @@
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
|
||||
use tracing::warn;
|
||||
|
||||
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
|
||||
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
|
||||
let mut key = [0; METADATA_KEY_SIZE];
|
||||
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
|
||||
key[0] = AUX_KEY_PREFIX;
|
||||
key[1] = dir_level1;
|
||||
key[2] = dir_level2;
|
||||
key[3..16].copy_from_slice(&hash[0..13]);
|
||||
Key::from_metadata_key_fixed_size(&key)
|
||||
}
|
||||
|
||||
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
|
||||
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
|
||||
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
|
||||
|
||||
/// Encode the aux file into a fixed-size key.
|
||||
///
|
||||
/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
|
||||
/// We have a one-to-one mapping for each of the aux files that we support. We hash the remaining part of the path
|
||||
/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix
|
||||
/// is roughly based on the first two components of the path, one unique number for one component.
|
||||
///
|
||||
/// * pg_logical/mappings -> 0x0101
|
||||
/// * pg_logical/snapshots -> 0x0102
|
||||
/// * pg_logical/replorigin_checkpoint -> 0x0103
|
||||
/// * pg_logical/others -> 0x01FF
|
||||
/// * pg_replslot/ -> 0x0201
|
||||
/// * others -> 0xFFFF
|
||||
///
|
||||
/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
|
||||
/// The new file type must have never been written to the storage before. Otherwise, there could be data
|
||||
/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
|
||||
pub fn encode_aux_file_key(path: &str) -> Key {
|
||||
if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
|
||||
} else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
|
||||
} else if path == "pg_logical/replorigin_checkpoint" {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
|
||||
} else if let Some(fname) = path.strip_prefix("pg_logical/") {
|
||||
if cfg!(debug_assertions) {
|
||||
warn!(
|
||||
"unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
|
||||
path
|
||||
);
|
||||
}
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
|
||||
} else if let Some(fname) = path.strip_prefix("pg_replslot/") {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
|
||||
} else {
|
||||
if cfg!(debug_assertions) {
|
||||
warn!(
|
||||
"unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
|
||||
path
|
||||
);
|
||||
}
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_hash_portable() {
|
||||
// AUX file encoding requires the hash to be portable across all platforms. This test case checks
|
||||
// if the algorithm produces the same hash across different environments.
|
||||
assert_eq!(
|
||||
305317690835051308206966631765527126151,
|
||||
twox_hash::xxh3::hash128("test1".as_bytes())
|
||||
);
|
||||
assert_eq!(
|
||||
85104974691013376326742244813280798847,
|
||||
twox_hash::xxh3::hash128("test/test2".as_bytes())
|
||||
);
|
||||
assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encoding_portable() {
|
||||
// To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
|
||||
// of the page server.
|
||||
assert_eq!(
|
||||
"8200000101E5B20C5F8DD5AA3289D6D9EAFA",
|
||||
encode_aux_file_key("pg_logical/mappings/test1").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"820000010239AAC544893139B26F501B97E6",
|
||||
encode_aux_file_key("pg_logical/snapshots/test2").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"820000010300000000000000000000000000",
|
||||
encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"82000001FF8635AF2134B7266EC5B4189FD6",
|
||||
encode_aux_file_key("pg_logical/unsupported").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"8200000201772D0E5D71DE14DA86142A1619",
|
||||
encode_aux_file_key("pg_replslot/test3").to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
"820000FFFF1866EBEB53B807B26A2416F317",
|
||||
encode_aux_file_key("other_file_not_supported").to_string()
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -426,6 +426,10 @@ async fn build_timeline_info_common(
|
||||
state,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
aux_file_v2: timeline
|
||||
.aux_file_v2
|
||||
.load(std::sync::atomic::Ordering::SeqCst),
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub use pageserver_api::keyspace;
|
||||
pub mod aux_file;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
|
||||
@@ -86,20 +86,11 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_visited_per_read_global",
|
||||
"Number of layers visited to reconstruct one key",
|
||||
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_visited_per_vectored_read_global",
|
||||
"Average number of layers visited to reconstruct one key",
|
||||
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -2780,8 +2771,7 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_LAYERS_VISITED,
|
||||
&VEC_READ_NUM_LAYERS_VISITED,
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
|
||||
@@ -1206,10 +1206,6 @@ impl PageServerHandler {
|
||||
))
|
||||
}
|
||||
|
||||
/// Note on "fullbackup":
|
||||
/// Full basebackups should only be used for debugging purposes.
|
||||
/// Originally, it was introduced to enable breaking storage format changes,
|
||||
/// but that is not applicable anymore.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
|
||||
async fn handle_basebackup_request<IO>(
|
||||
|
||||
@@ -32,7 +32,7 @@ use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::vec_map::{VecMap, VecMapOrdering};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
@@ -448,11 +448,6 @@ impl Timeline {
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
let commit_lsn = Lsn((low - 1) * 8);
|
||||
// This maxing operation is for the edge case that the search above did
|
||||
// set found_smaller to true but it never increased the lsn. Then, low
|
||||
// is still the old min_lsn the subtraction above could possibly give a value
|
||||
// below the ancestor_lsn.
|
||||
let commit_lsn = commit_lsn.max(min_lsn);
|
||||
match (found_smaller, found_larger) {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
@@ -1404,10 +1399,35 @@ impl<'a> DatadirModification<'a> {
|
||||
Some(Bytes::copy_from_slice(content))
|
||||
};
|
||||
|
||||
// TODO: either ensure we don't flip the flag for users with existing AUX files, or do a check there.
|
||||
let aux_file_v2 = {
|
||||
let tline_aux_file_v2 = self
|
||||
.tline
|
||||
.aux_file_v2
|
||||
.load(std::sync::atomic::Ordering::SeqCst);
|
||||
if tline_aux_file_v2 {
|
||||
true
|
||||
} else if self.tline.get_try_enable_aux_file_v2() {
|
||||
info!(
|
||||
"enabling aux file v2 support for timeline {}",
|
||||
self.tline.timeline_id
|
||||
);
|
||||
// The next index part upload will have `aux_file_v2` set to `true`.
|
||||
self.tline
|
||||
.aux_file_v2
|
||||
.store(true, std::sync::atomic::Ordering::SeqCst);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
let _ = aux_file_v2; // keep this unused until the write path is implemented
|
||||
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value.
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
|
||||
@@ -1346,6 +1346,7 @@ impl Tenant {
|
||||
initdb_lsn,
|
||||
initdb_lsn,
|
||||
pg_version,
|
||||
false,
|
||||
);
|
||||
self.prepare_new_timeline(
|
||||
new_timeline_id,
|
||||
@@ -2870,23 +2871,20 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
Included((timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
timeline
|
||||
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
|
||||
.await?;
|
||||
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
Included((timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
timeline
|
||||
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
|
||||
.await?;
|
||||
|
||||
gc_timelines.push(timeline);
|
||||
gc_timelines.push(timeline);
|
||||
}
|
||||
}
|
||||
drop(gc_cs);
|
||||
Ok(gc_timelines)
|
||||
@@ -3010,6 +3008,7 @@ impl Tenant {
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
|
||||
src_timeline.initdb_lsn,
|
||||
src_timeline.pg_version,
|
||||
src_timeline.aux_file_v2.load(Ordering::SeqCst),
|
||||
);
|
||||
|
||||
let uninitialized_timeline = self
|
||||
@@ -3213,6 +3212,7 @@ impl Tenant {
|
||||
pgdata_lsn,
|
||||
pgdata_lsn,
|
||||
pg_version,
|
||||
false,
|
||||
);
|
||||
let raw_timeline = self
|
||||
.prepare_new_timeline(
|
||||
@@ -3664,6 +3664,7 @@ pub(crate) mod harness {
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
try_enable_aux_file_v2: Some(tenant_conf.try_enable_aux_file_v2),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3862,7 +3863,6 @@ mod tests {
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::key::NON_INHERITED_RANGE;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
@@ -4662,62 +4662,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x1008))?;
|
||||
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
|
||||
let child_timeline_id = TimelineId::generate();
|
||||
tenant
|
||||
.branch_timeline_test(
|
||||
tline,
|
||||
child_timeline_id,
|
||||
Some(tline.get_last_record_lsn()),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let child_timeline = tenant
|
||||
.get_timeline(child_timeline_id, true)
|
||||
.expect("Should have the branched timeline");
|
||||
|
||||
let aux_keyspace = KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE],
|
||||
};
|
||||
let read_lsn = child_timeline.get_last_record_lsn();
|
||||
|
||||
let vectored_res = child_timeline
|
||||
.get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
child_timeline
|
||||
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
let images = vectored_res?;
|
||||
let mut key = NON_INHERITED_RANGE.start;
|
||||
while key < NON_INHERITED_RANGE.end {
|
||||
assert!(matches!(
|
||||
images[&key],
|
||||
Err(PageReconstructError::MissingKey(_))
|
||||
));
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test that vectored get handles layer gaps correctly
|
||||
// by advancing into the next ancestor timeline if required.
|
||||
//
|
||||
|
||||
@@ -369,6 +369,10 @@ pub struct TenantConf {
|
||||
// How much WAL must be ingested before checking again whether a new image layer is required.
|
||||
// Expressed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
|
||||
/// Try to enable the aux file v2 storage. Once this is set to true and the tenant writes an AUX file, the
|
||||
/// pageserver will always use v2 for AUX files and setting this flag to false will be a no-op.
|
||||
pub try_enable_aux_file_v2: bool,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
@@ -464,6 +468,10 @@ pub struct TenantConfOpt {
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default)]
|
||||
pub try_enable_aux_file_v2: Option<bool>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -521,6 +529,9 @@ impl TenantConfOpt {
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
try_enable_aux_file_v2: self
|
||||
.try_enable_aux_file_v2
|
||||
.unwrap_or(global_conf.try_enable_aux_file_v2),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -562,6 +573,7 @@ impl Default for TenantConf {
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
try_enable_aux_file_v2: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -636,6 +648,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
try_enable_aux_file_v2: value.try_enable_aux_file_v2,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,10 +14,11 @@ use utils::bin_ser::SerializeError;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
|
||||
|
||||
/// Use special format number to enable backward compatibility.
|
||||
const METADATA_FORMAT_VERSION: u16 = 4;
|
||||
const METADATA_FORMAT_VERSION: u16 = 5;
|
||||
|
||||
/// Previous supported format versions.
|
||||
const METADATA_OLD_FORMAT_VERSION: u16 = 3;
|
||||
const METADATA_OLD_FORMAT_VERSION_V2: u16 = 4;
|
||||
const METADATA_OLD_FORMAT_VERSION_V1: u16 = 3;
|
||||
|
||||
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
///
|
||||
@@ -31,7 +32,7 @@ const METADATA_MAX_SIZE: usize = 512;
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader,
|
||||
body: TimelineMetadataBodyV2,
|
||||
body: TimelineMetadataBodyV3,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -42,6 +43,28 @@ struct TimelineMetadataHeader {
|
||||
}
|
||||
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataBodyV3 {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<TimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
aux_file_v2: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataBodyV2 {
|
||||
disk_consistent_lsn: Lsn,
|
||||
@@ -84,6 +107,7 @@ struct TimelineMetadataBodyV1 {
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
@@ -92,6 +116,7 @@ impl TimelineMetadata {
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
aux_file_v2: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
hdr: TimelineMetadataHeader {
|
||||
@@ -99,7 +124,7 @@ impl TimelineMetadata {
|
||||
size: 0,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
},
|
||||
body: TimelineMetadataBodyV2 {
|
||||
body: TimelineMetadataBodyV3 {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
@@ -107,6 +132,7 @@ impl TimelineMetadata {
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
pg_version,
|
||||
aux_file_v2,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -115,29 +141,51 @@ impl TimelineMetadata {
|
||||
let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
|
||||
|
||||
// backward compatible only up to this version
|
||||
ensure!(
|
||||
hdr.format_version == METADATA_OLD_FORMAT_VERSION,
|
||||
"unsupported metadata format version {}",
|
||||
hdr.format_version
|
||||
);
|
||||
let body = match hdr.format_version {
|
||||
METADATA_OLD_FORMAT_VERSION_V2 => {
|
||||
let metadata_size = hdr.size as usize;
|
||||
|
||||
let metadata_size = hdr.size as usize;
|
||||
let body: TimelineMetadataBodyV2 =
|
||||
TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
|
||||
let body: TimelineMetadataBodyV1 =
|
||||
TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
let body = TimelineMetadataBodyV3 {
|
||||
disk_consistent_lsn: body.disk_consistent_lsn,
|
||||
prev_record_lsn: body.prev_record_lsn,
|
||||
ancestor_timeline: body.ancestor_timeline,
|
||||
ancestor_lsn: body.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: body.initdb_lsn,
|
||||
pg_version: body.pg_version,
|
||||
aux_file_v2: false,
|
||||
};
|
||||
|
||||
let body = TimelineMetadataBodyV2 {
|
||||
disk_consistent_lsn: body.disk_consistent_lsn,
|
||||
prev_record_lsn: body.prev_record_lsn,
|
||||
ancestor_timeline: body.ancestor_timeline,
|
||||
ancestor_lsn: body.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: body.initdb_lsn,
|
||||
pg_version: 14, // All timelines created before this version had pg_version 14
|
||||
hdr.format_version = METADATA_FORMAT_VERSION;
|
||||
body
|
||||
}
|
||||
METADATA_OLD_FORMAT_VERSION_V1 => {
|
||||
let metadata_size = hdr.size as usize;
|
||||
|
||||
let body: TimelineMetadataBodyV1 =
|
||||
TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
|
||||
let body = TimelineMetadataBodyV3 {
|
||||
disk_consistent_lsn: body.disk_consistent_lsn,
|
||||
prev_record_lsn: body.prev_record_lsn,
|
||||
ancestor_timeline: body.ancestor_timeline,
|
||||
ancestor_lsn: body.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: body.initdb_lsn,
|
||||
pg_version: 14, // All timelines created before this version had pg_version 14
|
||||
aux_file_v2: false,
|
||||
};
|
||||
|
||||
hdr.format_version = METADATA_FORMAT_VERSION;
|
||||
body
|
||||
}
|
||||
_ => {
|
||||
anyhow::bail!("unsupported metadata format version {}", hdr.format_version);
|
||||
}
|
||||
};
|
||||
|
||||
hdr.format_version = METADATA_FORMAT_VERSION;
|
||||
|
||||
Ok(Self { hdr, body })
|
||||
}
|
||||
|
||||
@@ -165,7 +213,7 @@ impl TimelineMetadata {
|
||||
TimelineMetadata::upgrade_timeline_metadata(metadata_bytes)
|
||||
} else {
|
||||
let body =
|
||||
TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
TimelineMetadataBodyV3::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
ensure!(
|
||||
body.disk_consistent_lsn.is_aligned(),
|
||||
"disk_consistent_lsn is not aligned"
|
||||
@@ -219,6 +267,10 @@ impl TimelineMetadata {
|
||||
self.body.pg_version
|
||||
}
|
||||
|
||||
pub fn aux_file_v2(&self) -> bool {
|
||||
self.body.aux_file_v2
|
||||
}
|
||||
|
||||
// Checksums make it awkward to build a valid instance by hand. This helper
|
||||
// provides a TimelineMetadata with a valid checksum in its header.
|
||||
#[cfg(test)]
|
||||
@@ -231,6 +283,7 @@ impl TimelineMetadata {
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
Lsn::from_hex("00000000").unwrap(),
|
||||
0,
|
||||
false,
|
||||
);
|
||||
let bytes = instance.to_bytes().unwrap();
|
||||
Self::from_bytes(&bytes).unwrap()
|
||||
@@ -240,6 +293,7 @@ impl TimelineMetadata {
|
||||
self.body.disk_consistent_lsn = update.disk_consistent_lsn;
|
||||
self.body.prev_record_lsn = update.prev_record_lsn;
|
||||
self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
|
||||
self.body.aux_file_v2 = update.aux_file_v2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -270,6 +324,7 @@ pub(crate) struct MetadataUpdate {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
aux_file_v2: bool,
|
||||
}
|
||||
|
||||
impl MetadataUpdate {
|
||||
@@ -277,11 +332,13 @@ impl MetadataUpdate {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
aux_file_v2: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
aux_file_v2,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -302,6 +359,7 @@ mod tests {
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
true,
|
||||
);
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
@@ -331,7 +389,7 @@ mod tests {
|
||||
hdr: TimelineMetadataHeader {
|
||||
checksum: 0,
|
||||
size: 0,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION_V1,
|
||||
},
|
||||
body: TimelineMetadataBodyV1 {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
@@ -349,7 +407,7 @@ mod tests {
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION_V1,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
let hdr_bytes = hdr.ser()?;
|
||||
@@ -376,12 +434,83 @@ mod tests {
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
14, // All timelines created before this version had pg_version 14
|
||||
false,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata.body, expected_metadata.body,
|
||||
"Metadata of the old version {} should be upgraded to the latest version {}",
|
||||
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
|
||||
METADATA_OLD_FORMAT_VERSION_V1, METADATA_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
|
||||
// Generate old version metadata and read it with current code.
|
||||
// Ensure that it is upgraded correctly
|
||||
#[test]
|
||||
fn test_metadata_upgrade_v2() {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct TimelineMetadataV2 {
|
||||
hdr: TimelineMetadataHeader,
|
||||
body: TimelineMetadataBodyV2,
|
||||
}
|
||||
|
||||
let metadata_v2 = TimelineMetadataV2 {
|
||||
hdr: TimelineMetadataHeader {
|
||||
checksum: 0,
|
||||
size: 0,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION_V2,
|
||||
},
|
||||
body: TimelineMetadataBodyV2 {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
latest_gc_cutoff_lsn: Lsn(0),
|
||||
initdb_lsn: Lsn(0),
|
||||
pg_version: 16,
|
||||
},
|
||||
};
|
||||
|
||||
impl TimelineMetadataV2 {
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let body_bytes = self.body.ser()?;
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION_V2,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
let hdr_bytes = hdr.ser()?;
|
||||
let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE];
|
||||
metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes);
|
||||
metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes);
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
let metadata_bytes = metadata_v2
|
||||
.to_bytes()
|
||||
.expect("Should serialize correct metadata to bytes");
|
||||
|
||||
// This should deserialize to the latest version format
|
||||
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
let expected_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
16,
|
||||
false,
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata.body, expected_metadata.body,
|
||||
"Metadata of the old version {} should be upgraded to the latest version {}",
|
||||
METADATA_OLD_FORMAT_VERSION_V2, METADATA_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
|
||||
@@ -396,6 +525,7 @@ mod tests {
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
true,
|
||||
);
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
@@ -449,12 +579,13 @@ mod tests {
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
true,
|
||||
);
|
||||
let expected_bytes = vec![
|
||||
/* bincode length encoding bytes */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
|
||||
/* TimelineMetadataHeader */
|
||||
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
|
||||
97, 148, 11, 30, 0, 71, 0, 5, // checksum, size, format_version (4 + 2 + 2)
|
||||
/* TimelineMetadataBodyV3 */
|
||||
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
|
||||
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
|
||||
@@ -464,6 +595,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
|
||||
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
|
||||
0, 0, 0, 15, // pg_version (4 bytes)
|
||||
1, // aux_file_v2 (1 byte)
|
||||
/* padding bytes */
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
@@ -480,7 +612,7 @@ mod tests {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
];
|
||||
let metadata_ser_bytes = original_metadata.ser().unwrap();
|
||||
assert_eq!(metadata_ser_bytes, expected_bytes);
|
||||
|
||||
@@ -202,9 +202,7 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{
|
||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
|
||||
};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use std::ops::DerefMut;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -1147,7 +1145,7 @@ impl RemoteTimelineClient {
|
||||
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
||||
|
||||
// Execute all pending deletions, so that when we proceed to do a listing below, we aren't
|
||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||
// taking the burden of listing all the layers that we already know we should delete.
|
||||
self.flush_deletion_queue().await?;
|
||||
|
||||
@@ -1156,20 +1154,14 @@ impl RemoteTimelineClient {
|
||||
let remaining = download_retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list(
|
||||
Some(&timeline_storage_path),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.list_files(Some(&timeline_storage_path), None, &cancel)
|
||||
.await
|
||||
},
|
||||
"list remaining files",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.context("list files remaining files")?
|
||||
.keys;
|
||||
.context("list files remaining files")?;
|
||||
|
||||
// We will delete the current index_part object last, since it acts as a deletion
|
||||
// marker via its deleted_at attribute
|
||||
@@ -1860,6 +1852,7 @@ mod tests {
|
||||
// Any version will do
|
||||
// but it should be consistent with the one in the tests
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
false,
|
||||
);
|
||||
|
||||
// go through serialize + deserialize to fix the header, including checksum
|
||||
|
||||
@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
|
||||
tenant_shard_id: TenantShardId,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
|
||||
let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
|
||||
let remote_path = remote_timelines_path(&tenant_shard_id);
|
||||
|
||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
@@ -417,16 +417,11 @@ pub(super) async fn download_index_part(
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
|| async {
|
||||
storage
|
||||
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.await
|
||||
},
|
||||
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
|
||||
"list index_part files",
|
||||
cancel,
|
||||
)
|
||||
.await?
|
||||
.keys;
|
||||
.await?;
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
|
||||
@@ -118,7 +118,6 @@ pub(crate) struct ValuesReconstructState {
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
layers_visited: u32,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
@@ -126,7 +125,6 @@ impl ValuesReconstructState {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
layers_visited: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,14 +138,6 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self) {
|
||||
self.layers_visited += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_visited(&self) -> u32 {
|
||||
self.layers_visited
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
|
||||
@@ -62,7 +62,7 @@ impl BackgroundLoopKind {
|
||||
pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
|
||||
loop_kind: BackgroundLoopKind,
|
||||
_ctx: &RequestContext,
|
||||
) -> tokio::sync::SemaphorePermit<'static> {
|
||||
) -> impl Drop {
|
||||
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
|
||||
.with_label_values(&[loop_kind.as_static_str()])
|
||||
.guard();
|
||||
|
||||
@@ -16,7 +16,7 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
|
||||
key::AUX_FILES_KEY,
|
||||
keyspace::KeySpaceAccum,
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
@@ -40,7 +40,6 @@ use utils::{
|
||||
vec_map::VecMap,
|
||||
};
|
||||
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
@@ -54,6 +53,10 @@ use std::{
|
||||
cmp::{max, min, Ordering},
|
||||
ops::ControlFlow,
|
||||
};
|
||||
use std::{
|
||||
ops::{Deref, Range},
|
||||
sync::atomic::AtomicBool,
|
||||
};
|
||||
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||
@@ -382,6 +385,9 @@ pub struct Timeline {
|
||||
|
||||
/// Keep aux directory cache to avoid its reconstruction on each update
|
||||
pub(crate) aux_files: tokio::sync::Mutex<AuxFilesState>,
|
||||
|
||||
/// Indicate whether aux file v2 storage is enabled.
|
||||
pub(crate) aux_file_v2: AtomicBool,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -943,13 +949,7 @@ impl Timeline {
|
||||
Err(MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
..
|
||||
})) if !NON_INHERITED_RANGE.contains(&key) => {
|
||||
// The vectored read path handles non inherited keys specially.
|
||||
// If such a key cannot be reconstructed from the current timeline,
|
||||
// the vectored read path returns a key level error as opposed to a top
|
||||
// level error.
|
||||
return Err(GetVectoredError::MissingKey(key));
|
||||
}
|
||||
})) => return Err(GetVectoredError::MissingKey(key)),
|
||||
_ => {
|
||||
values.insert(key, block);
|
||||
key = key.next();
|
||||
@@ -973,7 +973,6 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
|
||||
let layers_visited = reconstruct_state.get_layers_visited();
|
||||
for (key, res) in reconstruct_state.keys {
|
||||
match res {
|
||||
Err(err) => {
|
||||
@@ -988,12 +987,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// Note that this is an approximation. Tracking the exact number of layers visited
|
||||
// per key requires virtually unbounded memory usage and is inefficient
|
||||
// (i.e. segment tree tracking each range queried from a layer)
|
||||
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
|
||||
.observe(layers_visited as f64 / results.len() as f64);
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
@@ -1750,6 +1743,14 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
|
||||
|
||||
// Private functions
|
||||
impl Timeline {
|
||||
pub(crate) fn get_try_enable_aux_file_v2(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
.tenant_conf
|
||||
.try_enable_aux_file_v2
|
||||
.unwrap_or(self.conf.default_tenant_conf.try_enable_aux_file_v2)
|
||||
}
|
||||
|
||||
pub(crate) fn get_lazy_slru_download(&self) -> bool {
|
||||
let tenant_conf = self.tenant_conf.load();
|
||||
tenant_conf
|
||||
@@ -2000,6 +2001,8 @@ impl Timeline {
|
||||
dir: None,
|
||||
n_deltas: 0,
|
||||
}),
|
||||
|
||||
aux_file_v2: AtomicBool::new(false),
|
||||
};
|
||||
result.repartition_threshold =
|
||||
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
|
||||
@@ -2149,6 +2152,11 @@ impl Timeline {
|
||||
let shard = self.get_shard_index();
|
||||
let this = self.myself.upgrade().expect("&self method holds the arc");
|
||||
|
||||
if let Some(ref index_part) = index_part {
|
||||
self.aux_file_v2
|
||||
.store(index_part.metadata.aux_file_v2(), AtomicOrdering::SeqCst);
|
||||
}
|
||||
|
||||
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
|
||||
move || {
|
||||
let _g = span.entered();
|
||||
@@ -2820,7 +2828,7 @@ impl Timeline {
|
||||
let mut timeline = self;
|
||||
|
||||
let mut read_count = scopeguard::guard(0, |cnt| {
|
||||
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
|
||||
crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
|
||||
});
|
||||
|
||||
// For debugging purposes, collect the path of layers that we traversed
|
||||
@@ -2935,7 +2943,7 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
|
||||
traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -2962,7 +2970,7 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
|
||||
traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
@@ -3037,41 +3045,6 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
keyspace.remove_overlapping_with(&completed);
|
||||
|
||||
// Do not descend into the ancestor timeline for aux files.
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
// TODO(chi): this will need to be updated for aux files v2 storage
|
||||
if keyspace.overlaps(&NON_INHERITED_RANGE) {
|
||||
let removed = keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE],
|
||||
});
|
||||
|
||||
for range in removed.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path: Vec::default(),
|
||||
backtrace: if cfg!(test) {
|
||||
Some(std::backtrace::Backtrace::force_capture())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}),
|
||||
);
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
|
||||
break;
|
||||
}
|
||||
@@ -3190,8 +3163,6 @@ impl Timeline {
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -3656,6 +3627,7 @@ impl Timeline {
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.aux_file_v2.load(AtomicOrdering::SeqCst),
|
||||
);
|
||||
|
||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||
@@ -4244,8 +4216,9 @@ impl Timeline {
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
}
|
||||
} else {
|
||||
// No time-based retention was configured. Interpret this as "keep no history".
|
||||
self.get_last_record_lsn()
|
||||
// No time-based retention was configured. Set time-based cutoff to
|
||||
// same as LSN based.
|
||||
cutoff_horizon
|
||||
};
|
||||
|
||||
// Grab the lock and update the values
|
||||
|
||||
@@ -188,10 +188,24 @@ impl Timeline {
|
||||
) -> ControlFlow<()> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
|
||||
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
BackgroundLoopKind::Eviction,
|
||||
ctx,
|
||||
);
|
||||
|
||||
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
|
||||
.await?;
|
||||
let _permit = tokio::select! {
|
||||
permit = acquire_permit => permit,
|
||||
_ = cancel.cancelled() => return ControlFlow::Break(()),
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
match self
|
||||
.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
{
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
@@ -316,27 +330,19 @@ impl Timeline {
|
||||
gate: &GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
|
||||
|
||||
self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn acquire_imitation_permit(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
|
||||
let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
|
||||
BackgroundLoopKind::Eviction,
|
||||
ctx,
|
||||
);
|
||||
|
||||
tokio::select! {
|
||||
permit = acquire_permit => ControlFlow::Continue(permit),
|
||||
_ = cancel.cancelled() => ControlFlow::Break(()),
|
||||
_ = self.cancel.cancelled() => ControlFlow::Break(()),
|
||||
}
|
||||
let _permit = tokio::select! {
|
||||
permit = acquire_permit => permit,
|
||||
_ = cancel.cancelled() => return ControlFlow::Break(()),
|
||||
_ = self.cancel.cancelled() => return ControlFlow::Break(()),
|
||||
};
|
||||
|
||||
self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
|
||||
.await
|
||||
}
|
||||
|
||||
/// If we evict layers but keep cached values derived from those layers, then
|
||||
@@ -370,7 +376,6 @@ impl Timeline {
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
gate: &GateGuard,
|
||||
permit: tokio::sync::SemaphorePermit<'static>,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
@@ -403,28 +408,7 @@ impl Timeline {
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let (mut state, _permit) = {
|
||||
if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
|
||||
(locked, permit)
|
||||
} else {
|
||||
// we might need to wait for a long time here in case of pathological synthetic
|
||||
// size calculation performance
|
||||
drop(permit);
|
||||
let locked = tokio::select! {
|
||||
locked = tenant.eviction_task_tenant_state.lock() => locked,
|
||||
_ = self.cancel.cancelled() => {
|
||||
return ControlFlow::Break(())
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
return ControlFlow::Break(())
|
||||
}
|
||||
};
|
||||
// then reacquire -- this will be bad if there is a lot of traffic, but because we
|
||||
// released the permit, the overall latency will be much better.
|
||||
let permit = self.acquire_imitation_permit(cancel, ctx).await?;
|
||||
(locked, permit)
|
||||
}
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
|
||||
_ => {
|
||||
|
||||
@@ -16,7 +16,6 @@ atomic-take.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-iam.workspace = true
|
||||
aws-sigv4.workspace = true
|
||||
aws-smithy-runtime.workspace = true
|
||||
aws-types.workspace = true
|
||||
base64.workspace = true
|
||||
bstr.workspace = true
|
||||
@@ -32,21 +31,14 @@ git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
hashlink.workspace = true
|
||||
hex.workspace = true
|
||||
hickory-resolver = "0.24.1"
|
||||
hmac.workspace = true
|
||||
hostname.workspace = true
|
||||
http.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper-tungstenite.workspace = true
|
||||
hyper.workspace = true
|
||||
hyper-rustls = { version = "0.25.0", features = ["rustls-native-certs", "http1", "http2"] }
|
||||
hyper1 = { package = "hyper", version = "1.2", features = ["server"] }
|
||||
hyper-util = { version = "0.1", features = [
|
||||
"server",
|
||||
"http1",
|
||||
"http2",
|
||||
"tokio",
|
||||
] }
|
||||
hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
|
||||
http-body-util = { version = "0.1" }
|
||||
ipnet.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
@@ -5,10 +5,7 @@ use aws_config::meta::region::RegionProviderChain;
|
||||
use aws_config::profile::ProfileFileCredentialsProvider;
|
||||
use aws_config::provider_config::ProviderConfig;
|
||||
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
|
||||
use aws_smithy_runtime::client::http::hyper_014::HyperClientBuilder;
|
||||
use futures::future::Either;
|
||||
use hyper::client::HttpConnector;
|
||||
use hyper_rustls::ConfigBuilderExt;
|
||||
use proxy::auth;
|
||||
use proxy::auth::backend::AuthRateLimiter;
|
||||
use proxy::auth::backend::MaybeOwned;
|
||||
@@ -21,7 +18,6 @@ use proxy::config::HttpConfig;
|
||||
use proxy::config::ProjectInfoCacheOptions;
|
||||
use proxy::console;
|
||||
use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::dns::Dns;
|
||||
use proxy::http;
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
use proxy::metrics::Metrics;
|
||||
@@ -37,7 +33,6 @@ use proxy::usage_metrics;
|
||||
use anyhow::bail;
|
||||
use proxy::config::{self, ProxyConfig};
|
||||
use proxy::serverless;
|
||||
use rustls::crypto::CryptoProvider;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::pin;
|
||||
use std::sync::Arc;
|
||||
@@ -275,40 +270,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
info!("Using region: {}", config.aws_region);
|
||||
|
||||
let region_provider = RegionProviderChain::default_provider().or_else(&*config.aws_region); // Replace with your Redis region if needed
|
||||
|
||||
let aws_tls_client_config =
|
||||
rustls::ClientConfig::builder_with_provider(Arc::new(CryptoProvider {
|
||||
cipher_suites: vec![
|
||||
// TLS1.3 suites
|
||||
rustls::crypto::ring::cipher_suite::TLS13_AES_256_GCM_SHA384,
|
||||
rustls::crypto::ring::cipher_suite::TLS13_AES_128_GCM_SHA256,
|
||||
// TLS1.2 suites
|
||||
rustls::crypto::ring::cipher_suite::TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
|
||||
rustls::crypto::ring::cipher_suite::TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
|
||||
rustls::crypto::ring::cipher_suite::TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
|
||||
rustls::crypto::ring::cipher_suite::TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
|
||||
rustls::crypto::ring::cipher_suite::TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256,
|
||||
],
|
||||
..rustls::crypto::ring::default_provider()
|
||||
}))
|
||||
.with_safe_default_protocol_versions()
|
||||
.unwrap()
|
||||
.with_native_roots()?
|
||||
.with_no_client_auth();
|
||||
|
||||
let provider_conf = ProviderConfig::without_region()
|
||||
.with_region(region_provider.region().await)
|
||||
.with_http_client(
|
||||
HyperClientBuilder::new().build(
|
||||
hyper_rustls::HttpsConnectorBuilder::new()
|
||||
.with_tls_config(aws_tls_client_config)
|
||||
.https_or_http()
|
||||
.enable_http1()
|
||||
.enable_http2()
|
||||
.wrap_connector(HttpConnector::new_with_resolver(config.dns.clone())),
|
||||
),
|
||||
);
|
||||
|
||||
let provider_conf =
|
||||
ProviderConfig::without_region().with_region(region_provider.region().await);
|
||||
let aws_credentials_provider = {
|
||||
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
|
||||
CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new())
|
||||
@@ -437,10 +400,10 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
if let Some(metrics_config) = &config.metric_collection {
|
||||
// TODO: Add gc regardles of the metric collection being enabled.
|
||||
maintenance_tasks.spawn(usage_metrics::task_main(config.dns.clone(), metrics_config));
|
||||
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
|
||||
client_tasks.spawn(usage_metrics::task_backup(
|
||||
&metrics_config.backup_metric_collection_config,
|
||||
cancellation_token.clone(),
|
||||
cancellation_token,
|
||||
));
|
||||
}
|
||||
|
||||
@@ -460,10 +423,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let cache = api.caches.endpoints_cache.clone();
|
||||
let con = regional_redis_client;
|
||||
let span = tracing::info_span!("endpoints_cache");
|
||||
maintenance_tasks.spawn(
|
||||
async move { cache.do_read(con, cancellation_token.clone()).await }
|
||||
.instrument(span),
|
||||
);
|
||||
maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -534,8 +494,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
bail!("dynamic rate limiter should be disabled");
|
||||
}
|
||||
|
||||
let dns = Dns::new();
|
||||
|
||||
let auth_backend = match &args.auth_backend {
|
||||
AuthBackend::Console => {
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
@@ -576,7 +534,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
tokio::spawn(locks.garbage_collect_worker());
|
||||
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let endpoint = http::Endpoint::new(url, http::new_client(dns.clone()));
|
||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
|
||||
@@ -620,7 +578,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
RateBucketInfo::validate(&mut redis_rps_limit)?;
|
||||
|
||||
let config = Box::leak(Box::new(ProxyConfig {
|
||||
dns,
|
||||
tls_config,
|
||||
auth_backend,
|
||||
metric_collection,
|
||||
|
||||
12
proxy/src/cache/endpoints.rs
vendored
12
proxy/src/cache/endpoints.rs
vendored
@@ -4,7 +4,6 @@ use std::{
|
||||
atomic::{AtomicBool, Ordering},
|
||||
Arc,
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use dashmap::DashSet;
|
||||
@@ -14,7 +13,6 @@ use redis::{
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
@@ -113,22 +111,16 @@ impl EndpointsCache {
|
||||
pub async fn do_read(
|
||||
&self,
|
||||
mut con: ConnectionWithCredentialsProvider,
|
||||
cancellation_token: CancellationToken,
|
||||
) -> anyhow::Result<Infallible> {
|
||||
let mut last_id = "0-0".to_string();
|
||||
loop {
|
||||
self.ready.store(false, Ordering::Release);
|
||||
if let Err(e) = con.connect().await {
|
||||
tracing::error!("error connecting to redis: {:?}", e);
|
||||
self.ready.store(false, Ordering::Release);
|
||||
continue;
|
||||
}
|
||||
if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
|
||||
tracing::error!("error reading from redis: {:?}", e);
|
||||
self.ready.store(false, Ordering::Release);
|
||||
}
|
||||
if cancellation_token.is_cancelled() {
|
||||
info!("cancellation token is cancelled, exiting");
|
||||
tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
|
||||
// 1 week.
|
||||
}
|
||||
tokio::time::sleep(self.config.retry_interval).await;
|
||||
}
|
||||
|
||||
@@ -3,21 +3,17 @@ use crate::{
|
||||
cancellation::CancelClosure,
|
||||
console::{errors::WakeComputeError, messages::MetricsAuxInfo},
|
||||
context::RequestMonitoring,
|
||||
dns::Dns,
|
||||
error::{ReportableError, UserFacingError},
|
||||
metrics::{Metrics, NumDbConnectionsGuard},
|
||||
proxy::neon_option,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use futures::{FutureExt, TryFutureExt};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::{io, net::SocketAddr, time::Duration};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::{
|
||||
tls::{MakeTlsConnect, NoTlsError},
|
||||
Connection, SocketConfig,
|
||||
};
|
||||
use tokio_postgres::tls::MakeTlsConnect;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
|
||||
@@ -37,9 +33,6 @@ pub enum ConnectionError {
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
WakeComputeError(#[from] WakeComputeError),
|
||||
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
TlsNotSupported(#[from] NoTlsError),
|
||||
}
|
||||
|
||||
impl UserFacingError for ConnectionError {
|
||||
@@ -77,7 +70,6 @@ impl ReportableError for ConnectionError {
|
||||
ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::TlsNotSupported(_) => crate::error::ErrorKind::Compute,
|
||||
ConnectionError::WakeComputeError(e) => e.get_error_kind(),
|
||||
}
|
||||
}
|
||||
@@ -173,42 +165,20 @@ impl std::ops::DerefMut for ConnCfg {
|
||||
|
||||
impl ConnCfg {
|
||||
/// Establish a raw TCP connection to the compute node.
|
||||
async fn connect_raw(
|
||||
&self,
|
||||
dns: &Dns,
|
||||
timeout: Duration,
|
||||
) -> io::Result<(SocketAddr, TcpStream, &str)> {
|
||||
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> {
|
||||
use tokio_postgres::config::Host;
|
||||
|
||||
// wrap TcpStream::connect with timeout
|
||||
let connect_with_timeout = |host, port| async move {
|
||||
let addrs = dns
|
||||
.resolve(host)
|
||||
.await
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
|
||||
|
||||
let timeout = timeout / addrs.len() as u32;
|
||||
|
||||
let mut last_err = None;
|
||||
for addr in addrs {
|
||||
match tokio::time::timeout(timeout, TcpStream::connect((addr, port))).await {
|
||||
Ok(Ok(stream)) => return Ok(stream),
|
||||
Ok(Err(e)) => last_err = Some(e),
|
||||
Err(_) => {
|
||||
last_err = Some(io::Error::new(
|
||||
io::ErrorKind::TimedOut,
|
||||
format!("exceeded connection timeout {timeout:?}"),
|
||||
))
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
Err(last_err.unwrap_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"could not resolve to any address",
|
||||
)
|
||||
}))
|
||||
let connect_with_timeout = |host, port| {
|
||||
tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
|
||||
move |res| match res {
|
||||
Ok(tcpstream_connect_res) => tcpstream_connect_res,
|
||||
Err(_) => Err(io::Error::new(
|
||||
io::ErrorKind::TimedOut,
|
||||
format!("exceeded connection timeout {timeout:?}"),
|
||||
)),
|
||||
},
|
||||
)
|
||||
};
|
||||
|
||||
let connect_once = |host, port| {
|
||||
@@ -265,11 +235,12 @@ impl ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
type TlsStream = postgres_native_tls::TlsStream<TcpStream>;
|
||||
|
||||
pub struct PostgresConnection {
|
||||
/// Socket connected to a compute node.
|
||||
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<TcpStream, TlsStream>,
|
||||
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
|
||||
tokio::net::TcpStream,
|
||||
postgres_native_tls::TlsStream<tokio::net::TcpStream>,
|
||||
>,
|
||||
/// PostgreSQL connection parameters.
|
||||
pub params: std::collections::HashMap<String, String>,
|
||||
/// Query cancellation token.
|
||||
@@ -282,38 +253,26 @@ pub struct PostgresConnection {
|
||||
|
||||
impl ConnCfg {
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect_managed<Tls: MakeTlsConnect<TcpStream>>(
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dns: &Dns,
|
||||
allow_self_signed_compute: bool,
|
||||
aux: MetricsAuxInfo,
|
||||
timeout: Duration,
|
||||
mut tls: Tls,
|
||||
) -> Result<
|
||||
(
|
||||
SocketAddr,
|
||||
tokio_postgres::Client,
|
||||
Connection<TcpStream, Tls::Stream>,
|
||||
),
|
||||
ConnectionError,
|
||||
>
|
||||
where
|
||||
ConnectionError: From<Tls::Error>,
|
||||
{
|
||||
let (socket_addr, stream, host) = self.connect_raw(dns, timeout).await?;
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
|
||||
|
||||
let tls = MakeTlsConnect::<TcpStream>::make_tls_connect(&mut tls, host)?;
|
||||
let tls_connector = native_tls::TlsConnector::builder()
|
||||
.danger_accept_invalid_certs(allow_self_signed_compute)
|
||||
.build()
|
||||
.unwrap();
|
||||
let mut mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
|
||||
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
|
||||
|
||||
// connect_raw() will not use TLS if sslmode is "disable"
|
||||
let (mut client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
let (client, connection) = self.0.connect_raw(stream, tls).await?;
|
||||
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
|
||||
|
||||
client.set_socket_config(SocketConfig {
|
||||
host: tokio_postgres::config::Host::Tcp(host.to_owned()),
|
||||
port: socket_addr.port(),
|
||||
socket_addr: tokio_postgres::SocketAddr::Tcp(socket_addr),
|
||||
connect_timeout: None,
|
||||
keepalive: None,
|
||||
});
|
||||
let stream = connection.stream.into_inner();
|
||||
|
||||
info!(
|
||||
cold_start_info = ctx.cold_start_info.as_str(),
|
||||
@@ -321,28 +280,6 @@ impl ConnCfg {
|
||||
self.0.get_ssl_mode()
|
||||
);
|
||||
|
||||
Ok((socket_addr, client, connection))
|
||||
}
|
||||
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dns: &Dns,
|
||||
allow_self_signed_compute: bool,
|
||||
aux: MetricsAuxInfo,
|
||||
timeout: Duration,
|
||||
) -> Result<PostgresConnection, ConnectionError> {
|
||||
let tls_connector = native_tls::TlsConnector::builder()
|
||||
.danger_accept_invalid_certs(allow_self_signed_compute)
|
||||
.build()
|
||||
.unwrap();
|
||||
let mk_tls = postgres_native_tls::MakeTlsConnector::new(tls_connector);
|
||||
|
||||
let (socket_addr, client, connection) =
|
||||
self.connect_managed(ctx, dns, timeout, mk_tls).await?;
|
||||
let stream = connection.stream.into_inner();
|
||||
|
||||
// This is very ugly but as of now there's no better way to
|
||||
// extract the connection parameters from tokio-postgres' connection.
|
||||
// TODO: solve this problem in a more elegant manner (e.g. the new library).
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use crate::{
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
dns::Dns,
|
||||
rate_limiter::RateBucketInfo,
|
||||
serverless::GlobalConnPoolOptions,
|
||||
};
|
||||
@@ -22,7 +21,6 @@ use tracing::{error, info};
|
||||
use x509_parser::oid_registry;
|
||||
|
||||
pub struct ProxyConfig {
|
||||
pub dns: Dns,
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
pub auth_backend: auth::BackendType<'static, (), ()>,
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
|
||||
@@ -12,7 +12,6 @@ use crate::{
|
||||
compute,
|
||||
config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
|
||||
context::RequestMonitoring,
|
||||
dns::Dns,
|
||||
intern::ProjectIdInt,
|
||||
metrics::ApiLockMetrics,
|
||||
scram, EndpointCacheKey,
|
||||
@@ -303,13 +302,11 @@ impl NodeInfo {
|
||||
pub async fn connect(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dns: &Dns,
|
||||
timeout: Duration,
|
||||
) -> Result<compute::PostgresConnection, compute::ConnectionError> {
|
||||
self.config
|
||||
.connect(
|
||||
ctx,
|
||||
dns,
|
||||
self.allow_self_signed_compute,
|
||||
self.aux.clone(),
|
||||
timeout,
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
//! Async dns resolvers
|
||||
|
||||
use std::{
|
||||
net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use aws_sdk_iam::error::BoxError;
|
||||
use hickory_resolver::{error::ResolveError, proto::rr::RData};
|
||||
use hyper::client::connect::dns::Name;
|
||||
use reqwest::dns::Addrs;
|
||||
use tokio::time::Instant;
|
||||
use tracing::trace;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Dns {
|
||||
resolver: Arc<hickory_resolver::TokioAsyncResolver>,
|
||||
}
|
||||
|
||||
impl Default for Dns {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl Dns {
|
||||
pub fn new() -> Self {
|
||||
let (config, options) =
|
||||
hickory_resolver::system_conf::read_system_conf().expect("could not read resolv.conf");
|
||||
|
||||
let resolver = Arc::new(hickory_resolver::TokioAsyncResolver::tokio(config, options));
|
||||
|
||||
Self { resolver }
|
||||
}
|
||||
|
||||
pub async fn resolve(&self, name: &str) -> Result<Vec<IpAddr>, ResolveError> {
|
||||
let start = Instant::now();
|
||||
|
||||
// try to parse the host as a regular IP address first
|
||||
if let Ok(addr) = name.parse::<Ipv4Addr>() {
|
||||
return Ok(vec![IpAddr::V4(addr)]);
|
||||
}
|
||||
|
||||
if let Ok(addr) = name.parse::<Ipv6Addr>() {
|
||||
return Ok(vec![IpAddr::V6(addr)]);
|
||||
}
|
||||
|
||||
let res = self.resolver.lookup_ip(name).await;
|
||||
|
||||
let resolve_duration = start.elapsed();
|
||||
trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
|
||||
|
||||
Ok(res?
|
||||
.as_lookup()
|
||||
.records()
|
||||
.iter()
|
||||
.filter_map(|r| r.data())
|
||||
.filter_map(|rdata| match rdata {
|
||||
RData::A(ip) => Some(IpAddr::from(ip.0)),
|
||||
RData::AAAA(ip) => Some(IpAddr::from(ip.0)),
|
||||
_ => None,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
impl hyper::service::Service<Name> for Dns {
|
||||
type Response = Addrs;
|
||||
type Error = BoxError;
|
||||
type Future = reqwest::dns::Resolving;
|
||||
|
||||
fn poll_ready(
|
||||
&mut self,
|
||||
_cx: &mut std::task::Context<'_>,
|
||||
) -> std::task::Poll<Result<(), Self::Error>> {
|
||||
std::task::Poll::Ready(Ok(()))
|
||||
}
|
||||
|
||||
fn call(&mut self, req: Name) -> Self::Future {
|
||||
reqwest::dns::Resolve::resolve(self, req)
|
||||
}
|
||||
}
|
||||
|
||||
impl reqwest::dns::Resolve for Dns {
|
||||
fn resolve(&self, name: Name) -> reqwest::dns::Resolving {
|
||||
let this = self.clone();
|
||||
Box::pin(async move {
|
||||
match this.resolve(name.as_str()).await {
|
||||
Ok(iter) => {
|
||||
Ok(Box::new(iter.into_iter().map(|ip| SocketAddr::new(ip, 0))) as Box<_>)
|
||||
}
|
||||
Err(e) => Err(e.into()),
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -14,7 +14,6 @@ use tokio::time::Instant;
|
||||
use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
dns::Dns,
|
||||
metrics::{ConsoleRequest, Metrics},
|
||||
url::ApiUrl,
|
||||
};
|
||||
@@ -23,9 +22,9 @@ use reqwest_middleware::RequestBuilder;
|
||||
/// This is the preferred way to create new http clients,
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client(dns: Dns) -> ClientWithMiddleware {
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(dns))
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
.build()
|
||||
.expect("Failed to create http client");
|
||||
@@ -35,9 +34,9 @@ pub fn new_client(dns: Dns) -> ClientWithMiddleware {
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn new_client_with_timeout(dns: Dns, default_timout: Duration) -> ClientWithMiddleware {
|
||||
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
|
||||
let timeout_client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(dns))
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
.timeout(default_timout)
|
||||
.build()
|
||||
|
||||
@@ -14,7 +14,6 @@ pub mod compute;
|
||||
pub mod config;
|
||||
pub mod console;
|
||||
pub mod context;
|
||||
pub mod dns;
|
||||
pub mod error;
|
||||
pub mod http;
|
||||
pub mod intern;
|
||||
|
||||
@@ -307,7 +307,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
ctx,
|
||||
&TcpMechanism { params: ¶ms },
|
||||
&user_info,
|
||||
&config.dns,
|
||||
mode.allow_self_signed_compute(config),
|
||||
config.wake_compute_retry_config,
|
||||
config.connect_to_compute_retry_config,
|
||||
|
||||
@@ -4,7 +4,6 @@ use crate::{
|
||||
config::RetryConfig,
|
||||
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
|
||||
context::RequestMonitoring,
|
||||
dns::Dns,
|
||||
error::ReportableError,
|
||||
metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
|
||||
proxy::{
|
||||
@@ -45,7 +44,6 @@ pub trait ConnectMechanism {
|
||||
async fn connect_once(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dns: &Dns,
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError>;
|
||||
@@ -78,11 +76,10 @@ impl ConnectMechanism for TcpMechanism<'_> {
|
||||
async fn connect_once(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dns: &Dns,
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<PostgresConnection, Self::Error> {
|
||||
node_info.connect(ctx, dns, timeout).await
|
||||
node_info.connect(ctx, timeout).await
|
||||
}
|
||||
|
||||
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
|
||||
@@ -96,7 +93,6 @@ pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
||||
ctx: &mut RequestMonitoring,
|
||||
mechanism: &M,
|
||||
user_info: &B,
|
||||
dns: &Dns,
|
||||
allow_self_signed_compute: bool,
|
||||
wake_compute_retry_config: RetryConfig,
|
||||
connect_to_compute_retry_config: RetryConfig,
|
||||
@@ -118,7 +114,7 @@ where
|
||||
|
||||
// try once
|
||||
let err = match mechanism
|
||||
.connect_once(ctx, dns, &node_info, CONNECT_TIMEOUT)
|
||||
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
|
||||
.await
|
||||
{
|
||||
Ok(res) => {
|
||||
@@ -163,7 +159,7 @@ where
|
||||
num_retries = 1;
|
||||
loop {
|
||||
match mechanism
|
||||
.connect_once(ctx, dns, &node_info, CONNECT_TIMEOUT)
|
||||
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
|
||||
.await
|
||||
{
|
||||
Ok(res) => {
|
||||
|
||||
@@ -15,7 +15,6 @@ use crate::console::caches::NodeInfoCache;
|
||||
use crate::console::messages::MetricsAuxInfo;
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::dns::Dns;
|
||||
use crate::error::ErrorKind;
|
||||
use crate::proxy::retry::retry_after;
|
||||
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
|
||||
@@ -454,7 +453,6 @@ impl ConnectMechanism for TestConnectMechanism {
|
||||
async fn connect_once(
|
||||
&self,
|
||||
_ctx: &mut RequestMonitoring,
|
||||
_dns: &Dns,
|
||||
_node_info: &console::CachedNodeInfo,
|
||||
_timeout: std::time::Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
@@ -560,17 +558,9 @@ async fn connect_to_compute_success() {
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
config,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -586,17 +576,9 @@ async fn connect_to_compute_retry() {
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
config,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -613,17 +595,9 @@ async fn connect_to_compute_non_retry_1() {
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
config,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -640,17 +614,9 @@ async fn connect_to_compute_non_retry_2() {
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
config,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -678,7 +644,6 @@ async fn connect_to_compute_non_retry_3() {
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
wake_compute_retry_config,
|
||||
connect_to_compute_retry_config,
|
||||
@@ -701,17 +666,9 @@ async fn wake_retry() {
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
config,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -728,16 +685,8 @@ async fn wake_non_retry() {
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
&Dns::new(),
|
||||
false,
|
||||
config,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use tokio_postgres::NoTls;
|
||||
use tracing::{field::display, info};
|
||||
|
||||
use crate::{
|
||||
auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
|
||||
compute::{self, ConnectionError},
|
||||
compute,
|
||||
config::{AuthenticationConfig, ProxyConfig},
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
CachedNodeInfo,
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
dns::Dns,
|
||||
error::{ErrorKind, ReportableError, UserFacingError},
|
||||
proxy::connect_compute::ConnectMechanism,
|
||||
};
|
||||
@@ -109,7 +107,6 @@ impl PoolingBackend {
|
||||
pool: self.pool.clone(),
|
||||
},
|
||||
&backend,
|
||||
&self.config.dns,
|
||||
false, // do not allow self signed compute for http flow
|
||||
self.config.wake_compute_retry_config,
|
||||
self.config.connect_to_compute_retry_config,
|
||||
@@ -123,7 +120,7 @@ pub enum HttpConnError {
|
||||
#[error("pooled connection closed at inconsistent state")]
|
||||
ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError<uuid::Uuid>),
|
||||
#[error("could not connection to compute")]
|
||||
ConnectionError(#[from] ConnectionError),
|
||||
ConnectionError(#[from] tokio_postgres::Error),
|
||||
|
||||
#[error("could not get auth info")]
|
||||
GetAuthInfo(#[from] GetAuthInfoError),
|
||||
@@ -166,24 +163,23 @@ struct TokioMechanism {
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TokioMechanism {
|
||||
type Connection = Client<tokio_postgres::Client>;
|
||||
type ConnectError = ConnectionError;
|
||||
type ConnectError = tokio_postgres::Error;
|
||||
type Error = HttpConnError;
|
||||
|
||||
async fn connect_once(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
dns: &Dns,
|
||||
node_info: &CachedNodeInfo,
|
||||
timeout: Duration,
|
||||
) -> Result<Self::Connection, ConnectionError> {
|
||||
let mut config = node_info.config.clone();
|
||||
config
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
let config = config
|
||||
.user(&self.conn_info.user_info.user)
|
||||
.password(&*self.conn_info.password)
|
||||
.dbname(&self.conn_info.dbname)
|
||||
.connect_timeout(timeout);
|
||||
|
||||
let (_, client, connection) = config.connect_managed(ctx, dns, timeout, NoTls).await?;
|
||||
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
|
||||
|
||||
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
|
||||
Ok(poll_client(
|
||||
|
||||
@@ -12,10 +12,9 @@ use std::{
|
||||
ops::Deref,
|
||||
sync::atomic::{self, AtomicUsize},
|
||||
};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::time::Instant;
|
||||
use tokio_postgres::tls::NoTlsStream;
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
@@ -469,7 +468,7 @@ pub fn poll_client<C: ClientInnerExt>(
|
||||
ctx: &mut RequestMonitoring,
|
||||
conn_info: ConnInfo,
|
||||
client: C,
|
||||
mut connection: tokio_postgres::Connection<TcpStream, NoTlsStream>,
|
||||
mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
|
||||
conn_id: uuid::Uuid,
|
||||
aux: MetricsAuxInfo,
|
||||
) -> Client<C> {
|
||||
|
||||
@@ -37,7 +37,6 @@ use utils::http::error::ApiError;
|
||||
use crate::auth::backend::ComputeUserInfo;
|
||||
use crate::auth::endpoint_sni;
|
||||
use crate::auth::ComputeUserInfoParseError;
|
||||
use crate::compute::ConnectionError;
|
||||
use crate::config::ProxyConfig;
|
||||
use crate::config::TlsConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
@@ -258,9 +257,7 @@ pub async fn handle(
|
||||
|
||||
let mut message = e.to_string_client();
|
||||
let db_error = match &e {
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
|
||||
ConnectionError::Postgres(e),
|
||||
))
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
@@ -664,9 +661,7 @@ impl QueryData {
|
||||
// query failed or was cancelled.
|
||||
Ok(Err(error)) => {
|
||||
let db_error = match &error {
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(
|
||||
ConnectionError::Postgres(e),
|
||||
))
|
||||
SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e))
|
||||
| SqlOverHttpError::Postgres(e) => e.as_db_error(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
use crate::{
|
||||
config::{MetricBackupCollectionConfig, MetricCollectionConfig},
|
||||
context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
dns::Dns,
|
||||
http,
|
||||
intern::{BranchIdInt, EndpointIdInt},
|
||||
};
|
||||
@@ -218,13 +217,13 @@ impl Metrics {
|
||||
|
||||
pub static USAGE_METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
|
||||
|
||||
pub async fn task_main(dns: Dns, config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
|
||||
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<Infallible> {
|
||||
info!("metrics collector config: {config:?}");
|
||||
scopeguard::defer! {
|
||||
info!("metrics collector has shut down");
|
||||
}
|
||||
|
||||
let http_client = http::new_client_with_timeout(dns, DEFAULT_HTTP_REPORTING_TIMEOUT);
|
||||
let http_client = http::new_client_with_timeout(DEFAULT_HTTP_REPORTING_TIMEOUT);
|
||||
let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
|
||||
|
||||
let mut prev = Utc::now();
|
||||
@@ -496,7 +495,7 @@ mod tests {
|
||||
use url::Url;
|
||||
|
||||
use super::*;
|
||||
use crate::{dns::Dns, http, BranchId, EndpointId};
|
||||
use crate::{http, BranchId, EndpointId};
|
||||
|
||||
#[tokio::test]
|
||||
async fn metrics() {
|
||||
@@ -526,7 +525,7 @@ mod tests {
|
||||
tokio::spawn(server);
|
||||
|
||||
let metrics = Metrics::default();
|
||||
let client = http::new_client(Dns::new());
|
||||
let client = http::new_client();
|
||||
let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
|
||||
let now = Utc::now();
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ use std::time::Duration;
|
||||
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||
use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata};
|
||||
use tokio::fs::File;
|
||||
|
||||
use tokio::select;
|
||||
@@ -601,18 +601,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
|
||||
backoff::retry(
|
||||
|| async {
|
||||
// Do list-delete in batch_size batches to make progress even if there a lot of files.
|
||||
// Alternatively we could make remote storage list return iterator, but it is more complicated and
|
||||
// Alternatively we could make list_files return iterator, but it is more complicated and
|
||||
// I'm not sure deleting while iterating is expected in s3.
|
||||
loop {
|
||||
let files = storage
|
||||
.list(
|
||||
Some(&remote_path),
|
||||
ListingMode::NoDelimiter,
|
||||
Some(batch_size),
|
||||
&cancel,
|
||||
)
|
||||
.await?
|
||||
.keys;
|
||||
.list_files(Some(&remote_path), Some(batch_size), &cancel)
|
||||
.await?;
|
||||
if files.is_empty() {
|
||||
return Ok(()); // done
|
||||
}
|
||||
@@ -672,9 +666,8 @@ pub async fn copy_s3_segments(
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let files = storage
|
||||
.list(Some(&remote_path), ListingMode::NoDelimiter, None, &cancel)
|
||||
.await?
|
||||
.keys;
|
||||
.list_files(Some(&remote_path), None, &cancel)
|
||||
.await?;
|
||||
|
||||
let uploaded_segments = &files
|
||||
.iter()
|
||||
|
||||
730
scripts/export_import_between_pageservers.py
Executable file
730
scripts/export_import_between_pageservers.py
Executable file
@@ -0,0 +1,730 @@
|
||||
#
|
||||
# Script to export tenants from one pageserver and import them into another pageserver.
|
||||
#
|
||||
# Outline of steps:
|
||||
# 1. Get `(last_lsn, prev_lsn)` from old pageserver
|
||||
# 2. Get `fullbackup` from old pageserver, which creates a basebackup tar file
|
||||
# 3. This tar file might be missing relation files for empty relations, if the pageserver
|
||||
# is old enough (we didn't always store those). So to recreate them, we start a local
|
||||
# vanilla postgres on this basebackup and ask it what relations should exist, then touch
|
||||
# any missing files and re-pack the tar.
|
||||
# TODO This functionality is no longer needed, so we can delete it later if we don't
|
||||
# end up using the same utils for the pg 15 upgrade. Not sure.
|
||||
# 4. We import the patched basebackup into a new pageserver
|
||||
# 5. We export again via fullbackup, now from the new pageserver and compare the returned
|
||||
# tar file with the one we imported. This confirms that we imported everything that was
|
||||
# exported, but doesn't guarantee correctness (what if we didn't **export** everything
|
||||
# initially?)
|
||||
# 6. We wait for the new pageserver's remote_consistent_lsn to catch up
|
||||
#
|
||||
# For more context on how to use this, see:
|
||||
# https://www.notion.so/neondatabase/Storage-format-migration-9a8eba33ccf8417ea8cf50e6a0c542cf
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, cast
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from psycopg2.extensions import parse_dsn
|
||||
|
||||
###############################################
|
||||
### client-side utils copied from test fixtures
|
||||
###############################################
|
||||
|
||||
Env = Dict[str, str]
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
|
||||
def global_counter() -> int:
    """Bump the module-wide counter and return its new value.

    The result serves as a unique number for output files, so that running
    the same command several times keeps each run's output separate.
    """
    global _global_counter
    _global_counter = _global_counter + 1
    return _global_counter
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
    """Run ``cmd`` with its stdout and stderr redirected into files.

    The files live in ``capture_dir`` and are named "cmd_NNN.stdout" and
    "cmd_NNN.stderr", where "cmd" is the basename of ``cmd[0]`` and NNN comes
    from ``global_counter()``. Files that already exist are overwritten.
    Extra keyword arguments are forwarded to ``subprocess.run``.

    Returns the shared path prefix of the two files (no extension).
    """
    assert isinstance(cmd, list)
    base = f"{os.path.basename(cmd[0])}_{global_counter()}"
    basepath = os.path.join(capture_dir, base)
    stdout_path = basepath + ".stdout"
    stderr_path = basepath + ".stderr"

    with open(stdout_path, "w") as out_f, open(stderr_path, "w") as err_f:
        print(f'(capturing output to "{base}.stdout")')
        subprocess.run(cmd, **kwargs, stdout=out_f, stderr=err_f)

    return basepath
|
||||
|
||||
|
||||
class PgBin:
    """Runs postgres binaries taken from one version of a postgres distribution."""

    def __init__(self, log_dir: Path, pg_distrib_dir, pg_version):
        version_dir = os.path.join(str(pg_distrib_dir), f"v{pg_version}")
        self.log_dir = log_dir
        self.pg_bin_path = os.path.join(version_dir, "bin")
        # Start from the current process environment and point the dynamic
        # loader at this version's lib directory.
        self.env = os.environ.copy()
        self.env["LD_LIBRARY_PATH"] = os.path.join(version_dir, "lib")

    def _fixpath(self, command: List[str]):
        """Prefix ``command[0]`` with the bin directory unless it already contains a '/'.

        The list is modified in place.
        """
        program = command[0]
        if "/" in program:
            return
        command[0] = os.path.join(self.pg_bin_path, program)

    def _build_env(self, env_add: Optional[Env]) -> Env:
        """Return the environment for a child process.

        With ``env_add`` set, the result is a copy of the base environment
        updated with those entries; with ``None`` the base environment object
        itself is returned (not a copy).
        """
        if env_add is not None:
            merged = self.env.copy()
            merged.update(env_add)
            return merged
        return self.env

    def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None):
        """
        Execute a postgres binary and raise if it exits with a non-zero status.

        ``command`` is a list such as ['pgbench', '-p', '55432']. When its first
        element has no '/' in it, it is rewritten to point into this version's
        bin directory. ``env`` entries are layered on top of the prepared
        environment. Output is not captured; use `run_capture` for that.
        """
        self._fixpath(command)
        print(f'Running command "{" ".join(command)}"')
        child_env = self._build_env(env)
        subprocess.run(command, env=child_env, cwd=cwd, check=True)

    def run_capture(
        self,
        command: List[str],
        env: Optional[Env] = None,
        cwd: Optional[str] = None,
        **kwargs: Any,
    ) -> str:
        """
        Same as `run`, but stdout and stderr are written to files under ``log_dir``.

        Intended for chatty programs. Returns the base path of the files that
        hold the captured output.
        """
        self._fixpath(command)
        print(f'Running command "{" ".join(command)}"')
        child_env = self._build_env(env)
        capture_dir = str(self.log_dir)
        return subprocess_capture(
            capture_dir, command, env=child_env, cwd=cwd, check=True, **kwargs
        )
|
||||
|
||||
|
||||
class PgProtocol:
    """Connection helpers shared by anything reachable over the postgres protocol."""

    def __init__(self, **kwargs):
        # The keyword arguments become the baseline connection options.
        self.default_options = kwargs

    def conn_options(self, **kwargs):
        """Combine the default options with per-call overrides into one dict.

        If a "dsn" keyword is present it is parsed first and its fields are
        applied, then all keywords (including "dsn" itself) are applied on top.
        """
        conn_options = self.default_options.copy()
        if "dsn" in kwargs:
            parsed_dsn = parse_dsn(kwargs["dsn"])
            conn_options.update(parsed_dsn)
        conn_options.update(kwargs)

        # Every statement gets a timeout of 2 minutes, prepended to whatever
        # "options" string was supplied. Callers needing longer can issue
        # "SET statement_timeout" once connected.
        conn_options["options"] = f"-cstatement_timeout=120s {conn_options.get('options', '')}"

        return conn_options

    # autocommit is on by default because that is what is wanted most of the time
    def connect(self, autocommit=True, **kwargs) -> PgConnection:
        """
        Open a psycopg2 connection to the node and return it.

        All extra keyword arguments go through `conn_options`.
        """
        options = self.conn_options(**kwargs)
        conn: PgConnection = psycopg2.connect(**options)

        # WARNING: this setting affects *all* tests!
        conn.autocommit = autocommit
        return conn

    def safe_psql(self, query: str, **kwargs: Any) -> List[Tuple[Any, ...]]:
        """
        Run a single query against the node and return all of its rows.

        All extra keyword arguments are used as connection options.
        """
        per_query_rows = self.safe_psql_many([query], **kwargs)
        return per_query_rows[0]

    def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]:
        """
        Run several queries on one connection and return the rows of each.

        The result has one entry per query, in order; a query that produces no
        result set contributes an empty list. All extra keyword arguments are
        used as connection options.
        """
        result: List[List[Any]] = []
        with closing(self.connect(**kwargs)) as conn, conn.cursor() as cur:
            for query in queries:
                print(f"Executing query: {query}")
                cur.execute(query)

                # cur.description is None when the statement returned no data
                rows = [] if cur.description is None else cast(List[Any], cur.fetchall())
                result.append(rows)
        return result
|
||||
|
||||
|
||||
class VanillaPostgres(PgProtocol):
    """A plain postgres instance in ``pgdatadir``, driven through ``initdb`` and ``pg_ctl``.

    Usable as a context manager: leaving the ``with`` block stops the server
    if it is still marked as running.
    """

    def __init__(self, pgdatadir: Path, pg_bin: PgBin, port: int, init=True):
        """Optionally run ``initdb`` in ``pgdatadir``, then record ``port`` in its config."""
        super().__init__(host="localhost", port=port, dbname="postgres")
        self.pgdatadir = pgdatadir
        self.pg_bin = pg_bin
        self.running = False
        if init:
            self.pg_bin.run_capture(["initdb", "-D", str(pgdatadir)])
        self.configure([f"port = {port}\n"])

    def configure(self, options: List[str]):
        """Append lines into postgresql.conf file.

        Each option is written on its own newline-terminated line. Must not be
        called while the server is marked as running.
        """
        assert not self.running
        conf_path = os.path.join(self.pgdatadir, "postgresql.conf")
        with open(conf_path, "a") as conf_file:
            # Begin on a fresh line: the existing file (e.g. one unpacked from
            # a basebackup) may not end with a newline, and without this the
            # first option would be glued onto its last setting.
            conf_file.write("\n")
            for option in options:
                # Terminate every option so that a later configure() call
                # cannot continue the same line.
                conf_file.write(option.rstrip("\n") + "\n")

    def start(self, log_path: Optional[str] = None):
        """Start the server with ``pg_ctl -w``, logging to ``log_path``.

        ``log_path`` defaults to "pg.log" inside the data directory.
        """
        assert not self.running
        # Marked running before pg_ctl is invoked, so __exit__ will still
        # attempt a stop if the start command raises.
        self.running = True

        log_path = log_path or os.path.join(self.pgdatadir, "pg.log")

        self.pg_bin.run_capture(
            ["pg_ctl", "-w", "-D", str(self.pgdatadir), "-l", log_path, "start"]
        )

    def stop(self):
        """Stop the server with ``pg_ctl -w``; it must currently be marked as running."""
        assert self.running
        self.running = False
        self.pg_bin.run_capture(["pg_ctl", "-w", "-D", str(self.pgdatadir), "stop"])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        if self.running:
            self.stop()
|
||||
|
||||
|
||||
class NeonPageserverApiException(Exception):
    """Raised when a pageserver HTTP API request comes back with an error status."""
|
||||
|
||||
|
||||
class NeonPageserverHttpClient(requests.Session):
    """HTTP session bound to one pageserver's management API (``/v1/...``)."""

    def __init__(self, host, port):
        super().__init__()
        self.host = host
        self.port = port

    def verbose_error(self, res: requests.Response):
        """Raise ``NeonPageserverApiException`` if ``res`` has an error status.

        The exception message is the "msg" field of the JSON response body
        when one can be extracted, otherwise an empty string. The underlying
        ``requests`` exception is attached as the cause.
        """
        try:
            res.raise_for_status()
        except requests.RequestException as e:
            try:
                msg = res.json()["msg"]
            except Exception:
                # The body may not be JSON, or may lack "msg"; fall back to an
                # empty message rather than hiding the HTTP error. Unlike a
                # bare ``except``, this lets KeyboardInterrupt/SystemExit through.
                msg = ""
            raise NeonPageserverApiException(msg) from e

    def check_status(self):
        """Request ``/v1/status`` and raise if the response has an error status."""
        self.get(f"http://{self.host}:{self.port}/v1/status").raise_for_status()

    def tenant_list(self):
        """Return the JSON list served at ``/v1/tenant``."""
        res = self.get(f"http://{self.host}:{self.port}/v1/tenant")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
        """Create a tenant with the given id and return that id.

        A 409 response is tolerated (and reported on stdout) when
        ``ok_if_exists`` is true; otherwise it raises via ``raise_for_status``.
        Any response other than 201 or 409 goes through ``verbose_error``.
        """
        res = self.post(
            f"http://{self.host}:{self.port}/v1/tenant",
            json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
        )

        if res.status_code == 409:
            if ok_if_exists:
                print(f"could not create tenant: already exists for id {new_tenant_id}")
            else:
                res.raise_for_status()
        elif res.status_code == 201:
            print(f"created tenant {new_tenant_id}")
        else:
            self.verbose_error(res)

        return new_tenant_id

    def timeline_list(self, tenant_id: uuid.UUID):
        """Return the JSON list of timelines of ``tenant_id``."""
        res = self.get(f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline")
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, list)
        return res_json

    def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]:
        """Return the JSON detail object of one timeline.

        The request asks for the non-incremental logical size to be included.
        """
        # Use the configured host like every other method; a hard-coded
        # "localhost" here would query the wrong machine for a remote pageserver.
        res = self.get(
            f"http://{self.host}:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=true"
        )
        self.verbose_error(res)
        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json
|
||||
|
||||
|
||||
def lsn_to_hex(num: int) -> str:
    """Render an integer LSN as "HI/LO": the bits above the low 32, then the low 32 bits, in uppercase hex."""
    upper = num >> 32
    lower = num & 0xFFFFFFFF
    return f"{upper:X}/{lower:X}"
|
||||
|
||||
|
||||
def lsn_from_hex(lsn_hex: str) -> int:
    """Parse an LSN written as two hex numbers joined by "/" into one integer."""
    upper_text, lower_text = lsn_hex.split("/")
    upper = int(upper_text, 16)
    lower = int(lower_text, 16)
    return (upper << 32) + lower
|
||||
|
||||
|
||||
def remote_consistent_lsn(
    pageserver_http_client: NeonPageserverHttpClient, tenant: uuid.UUID, timeline: uuid.UUID
) -> int:
    """Read ``remote_consistent_lsn`` from the timeline's detail and return it as an integer."""
    lsn_str = pageserver_http_client.timeline_detail(tenant, timeline)["remote_consistent_lsn"]
    assert isinstance(lsn_str, str)
    return lsn_from_hex(lsn_str)
|
||||
|
||||
|
||||
def wait_for_upload(
    pageserver_http_client: NeonPageserverHttpClient,
    tenant: uuid.UUID,
    timeline: uuid.UUID,
    lsn: int,
):
    """Poll until the timeline's remote_consistent_lsn reaches ``lsn``.

    Checks up to 10 times, sleeping one second after each unsuccessful check,
    and raises ``Exception`` if the target is still not reached afterwards.
    """
    for i in range(10):
        current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
        if current_lsn < lsn:
            print(
                f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}"
            )
            time.sleep(1)
            continue
        # Target reached (or passed): nothing more to wait for.
        return

    raise Exception(
        f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}"
    )
|
||||
|
||||
|
||||
##############
|
||||
# End of utils
|
||||
##############
|
||||
|
||||
|
||||
def pack_base(log_dir, restored_dir, output_tar):
    """Create tar file from basebackup, being careful to produce relative filenames.

    Packs every entry of ``restored_dir`` into a temporary tar inside that
    directory and then moves it to ``output_tar``. tar's output is captured
    under ``log_dir``.

    Raises ``subprocess.CalledProcessError`` if tar exits with a non-zero
    status, so a failed or partial archive is never moved into place.
    """
    tmp_tar_name = "tmp.tar"
    tmp_tar_path = os.path.join(restored_dir, tmp_tar_name)
    cmd = ["tar", "-cf", tmp_tar_name] + os.listdir(restored_dir)
    # We actually cd into the dir and call tar from there. If we call tar from
    # outside we won't encode filenames as relative, and they won't parse well
    # on import.
    subprocess_capture(log_dir, cmd, cwd=restored_dir, check=True)
    shutil.move(tmp_tar_path, output_tar)
|
||||
|
||||
|
||||
def reconstruct_paths(log_dir, pg_bin, base_tar, port: int):
    """Reconstruct what relation files should exist in the datadir by querying postgres.

    This is a generator. It unpacks ``base_tar`` into a temporary directory,
    starts a vanilla postgres on it (listening on ``port``), and yields the
    file path of every relation as reported by ``pg_relation_filepath``.

    Because the yields happen inside the ``with`` blocks, the temporary
    directory and the postgres instance stay alive until the generator is
    exhausted or closed.

    Raises ``AssertionError`` for a template0copy relation whose path is
    neither under its own ``base/<oid>/`` directory nor starts with "global".
    """
    with tempfile.TemporaryDirectory() as restored_dir:
        # Unpack the base tar
        subprocess_capture(log_dir, ["tar", "-xf", base_tar, "-C", restored_dir])

        # Start a vanilla postgres from the given datadir and query it to find
        # what relfiles should exist, but possibly don't.
        with VanillaPostgres(Path(restored_dir), pg_bin, port, init=False) as vanilla_pg:
            vanilla_pg.configure([f"port={port}"])
            vanilla_pg.start(log_path=os.path.join(log_dir, "tmp_pg.log"))

            # Create database based on template0 because we can't connect to template0
            query = "create database template0copy template template0"
            vanilla_pg.safe_psql(query, user="cloud_admin")
            vanilla_pg.safe_psql("CHECKPOINT", user="cloud_admin")

            # Get all databases
            query = "select oid, datname from pg_database"
            oid_dbname_pairs = vanilla_pg.safe_psql(query, user="cloud_admin")
            # Remember template0's oid so the paths found in its copy can be
            # rewritten to point at template0's directory.
            template0_oid = [
                oid for (oid, database) in oid_dbname_pairs if database == "template0"
            ][0]

            # Get rel paths for each database
            for oid, database in oid_dbname_pairs:
                if database == "template0":
                    # We can't connect to template0
                    continue

                query = "select relname, pg_relation_filepath(oid) from pg_class"
                result = vanilla_pg.safe_psql(query, user="cloud_admin", dbname=database)
                for _relname, filepath in result:
                    # Rows with no file path are skipped.
                    if filepath is not None:
                        if database == "template0copy":
                            # Add all template0copy paths to template0
                            prefix = f"base/{oid}/"
                            if filepath.startswith(prefix):
                                suffix = filepath[len(prefix) :]
                                yield f"base/{template0_oid}/{suffix}"
                            elif filepath.startswith("global"):
                                # Not yielded here; the same path is yielded when
                                # it is seen through the other databases.
                                print(f"skipping {database} global file {filepath}")
                            else:
                                raise AssertionError
                        else:
                            yield filepath
|
||||
|
||||
|
||||
def touch_missing_rels(log_dir, corrupt_tar, output_tar, paths):
    """Add the appropriate empty files to a basebackup tar.

    Unpacks ``corrupt_tar`` into a temporary directory, creates an empty file
    for every entry of ``paths`` that does not exist there, and repacks the
    directory into ``output_tar``. ``paths`` are joined onto the unpacked
    directory, so they are expected to be relative to it.

    Raises ``subprocess.CalledProcessError`` if tar fails to unpack
    ``corrupt_tar``; otherwise an incomplete extraction would be silently
    repackaged as the result.
    """
    with tempfile.TemporaryDirectory() as restored_dir:
        # Unpack the base tar
        subprocess_capture(
            log_dir, ["tar", "-xf", corrupt_tar, "-C", restored_dir], check=True
        )

        # Touch files that don't exist
        for path in paths:
            absolute_path = os.path.join(restored_dir, path)
            if not os.path.exists(absolute_path):
                print(f"File {absolute_path} didn't exist. Creating..")
                Path(absolute_path).touch()

        # Repackage
        pack_base(log_dir, restored_dir, output_tar)
|
||||
|
||||
|
||||
# HACK This is a workaround for exporting from old pageservers that
|
||||
# can't export empty relations. In this case we need to start
|
||||
# a vanilla postgres from the exported datadir, and query it
|
||||
# to see what empty relations are missing, and then create
|
||||
# those empty files before importing.
|
||||
def add_missing_rels(base_tar, output_tar, log_dir, pg_bin, tmp_pg_port: int):
    """Write ``output_tar``: ``base_tar`` plus empty files for relations missing from it.

    The expected relation paths come from a temporary postgres started on
    ``tmp_pg_port``.
    """
    path_iter = reconstruct_paths(log_dir, pg_bin, base_tar, tmp_pg_port)
    # Collect every path into a set before the tar is patched.
    reconstructed_paths = set(path_iter)
    touch_missing_rels(log_dir, base_tar, output_tar, reconstructed_paths)
|
||||
|
||||
|
||||
def get_rlsn(pageserver_connstr, tenant_id, timeline_id):
    """Run ``get_last_record_rlsn`` on the pageserver for one timeline.

    Returns ``(last_lsn, prev_lsn)``. Note that the reply row carries the two
    values in the opposite order: column 0 is taken as prev_lsn and column 1
    as last_lsn.
    """
    with closing(psycopg2.connect(pageserver_connstr)) as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(f"get_last_record_rlsn {tenant_id} {timeline_id}")
            res = cur.fetchone()
            assert res is not None
            prev_lsn, last_lsn = res[0], res[1]

    return last_lsn, prev_lsn
|
||||
|
||||
|
||||
def import_timeline(
    args,
    psql_path,
    pageserver_connstr,
    pageserver_http,
    tenant_id,
    timeline_id,
    last_lsn,
    prev_lsn,
    tar_filename,
    pg_version,
):
    """Import a basebackup tar into a pageserver and wait for it to be uploaded.

    Pipes ``tar_filename`` into ``psql -c 'import basebackup ...'`` run against
    ``pageserver_connstr``. psql's stdout and stderr are written to
    ``import_<tenant>_<timeline>.stdout`` / ``.stderr`` in ``args.work_dir``.
    Afterwards polls ``pageserver_http`` until remote_consistent_lsn reaches
    ``last_lsn``.

    Raises ``subprocess.CalledProcessError`` if the pipeline exits with a
    non-zero status.

    NOTE(review): ``last_lsn`` is passed for both LSN arguments of the import
    command and ``prev_lsn`` is not used -- confirm this is intended.
    """
    # Local import so this function brings its own dependency into scope.
    import shlex

    # Import timelines to new pageserver
    import_cmd = f"import basebackup {tenant_id} {timeline_id} {last_lsn} {last_lsn} {pg_version}"
    # The pipeline runs through the shell, so quote every interpolated piece:
    # a path or connection string containing spaces or shell metacharacters
    # would otherwise be split or interpreted by the shell.
    full_cmd = " ".join(
        [
            "cat",
            shlex.quote(tar_filename),
            "|",
            shlex.quote(psql_path),
            shlex.quote(pageserver_connstr),
            "-c",
            shlex.quote(import_cmd),
        ]
    )

    stderr_filename2 = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stderr")
    stdout_filename = os.path.join(args.work_dir, f"import_{tenant_id}_{timeline_id}.stdout")

    print(f"Running: {full_cmd}")

    with open(stdout_filename, "w") as stdout_f, open(stderr_filename2, "w") as stderr_f:
        print(f"(capturing output to {stdout_filename})")
        # PgBin is used only for the environment it prepares for child processes.
        pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)
        subprocess.run(
            full_cmd,
            stdout=stdout_f,
            stderr=stderr_f,
            env=pg_bin._build_env(None),
            shell=True,
            check=True,
        )

    print("Done import")

    # Wait until pageserver persists the files
    wait_for_upload(
        pageserver_http, uuid.UUID(tenant_id), uuid.UUID(timeline_id), lsn_from_hex(last_lsn)
    )
|
||||
|
||||
|
||||
def export_timeline(
    args,
    psql_path,
    pageserver_connstr,
    tenant_id,
    timeline_id,
    last_lsn,
    prev_lsn,
    tar_filename,
    pg_version,
):
    """Export one timeline from a pageserver into ``tar_filename``.

    Runs a ``fullbackup`` query through psql, storing its stdout in
    ``tar_filename + ".incomplete"`` and its stderr in
    ``<tenant>_<timeline>.stderr`` under ``args.work_dir``. The incomplete tar
    is then passed through ``add_missing_rels`` to produce ``tar_filename``,
    whose size is printed at the end.

    Raises ``subprocess.CalledProcessError`` if psql exits with a non-zero status.
    """
    # One PgBin serves both the psql environment and add_missing_rels.
    pg_bin = PgBin(args.work_dir, args.pg_distrib_dir, pg_version)

    # Choose filenames
    incomplete_filename = tar_filename + ".incomplete"
    stderr_filename = os.path.join(args.work_dir, f"{tenant_id}_{timeline_id}.stderr")

    # Construct export command
    query = f"fullbackup {tenant_id} {timeline_id} {last_lsn} {prev_lsn}"
    cmd = [psql_path, "--no-psqlrc", pageserver_connstr, "-c", query]

    # Run export command
    print(f"Running: {cmd}")
    with open(incomplete_filename, "w") as stdout_f, open(stderr_filename, "w") as stderr_f:
        print(f"(capturing output to {incomplete_filename})")
        child_env = pg_bin._build_env(None)
        subprocess.run(cmd, stdout=stdout_f, stderr=stderr_f, env=child_env, check=True)

    # Add missing rels
    add_missing_rels(incomplete_filename, tar_filename, args.work_dir, pg_bin, args.tmp_pg_port)

    # Log more info
    file_size = os.path.getsize(tar_filename)
    print(f"Done export: {tar_filename}, size {file_size}")
|
||||
|
||||
|
||||
def main(args: argparse.Namespace):
    """Move timelines from one pageserver to another and verify the copy.

    For every tenant in ``args.tenants`` the timelines are listed on the old
    pageserver.  Each selected timeline is exported to a tar file in
    ``args.work_dir`` (unless ``--only-import`` is given, in which case that
    tar file must already exist), imported into the new pageserver,
    re-exported from the new pageserver, and the sizes of the two tar files
    are compared.

    Raises:
        AssertionError: if the re-exported tar file has a different size than
            the original export.
    """
    # any psql version will do here. use current DEFAULT_PG_VERSION = 15
    psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql")

    old_pageserver_host = args.old_pageserver_host
    new_pageserver_host = args.new_pageserver_host

    old_http_client = NeonPageserverHttpClient(old_pageserver_host, args.old_pageserver_http_port)
    old_http_client.check_status()
    old_pageserver_connstr = f"postgresql://{old_pageserver_host}:{args.old_pageserver_pg_port}"

    new_http_client = NeonPageserverHttpClient(new_pageserver_host, args.new_pageserver_http_port)
    new_http_client.check_status()
    new_pageserver_connstr = f"postgresql://{new_pageserver_host}:{args.new_pageserver_pg_port}"

    for tenant_id in args.tenants:
        print(f"Tenant: {tenant_id}")
        timelines = old_http_client.timeline_list(uuid.UUID(tenant_id))
        print(f"Timelines: {timelines}")

        # Create tenant in new pageserver
        if not args.only_import and not args.timelines:
            new_http_client.tenant_create(uuid.UUID(tenant_id), args.ok_if_exists)

        for timeline in timelines:
            # Skip timelines we don't need to export
            if args.timelines and timeline["timeline_id"] not in args.timelines:
                print(f"Skipping timeline {timeline['timeline_id']}")
                continue

            # Choose filenames
            tar_filename = os.path.join(
                args.work_dir, f"{timeline['tenant_id']}_{timeline['timeline_id']}.tar"
            )

            pg_version = timeline["pg_version"]

            # The LSNs are needed by the import and the re-export as well, so
            # they are looked up even with --only-import.  Previously they were
            # only assigned inside the export branch, which made --only-import
            # fail with UnboundLocalError.  The old pageserver is contacted
            # above in either mode, so this adds no new requirement.
            last_lsn, prev_lsn = get_rlsn(
                old_pageserver_connstr,
                timeline["tenant_id"],
                timeline["timeline_id"],
            )

            # Export timeline from old pageserver
            if not args.only_import:
                export_timeline(
                    args,
                    psql_path,
                    old_pageserver_connstr,
                    timeline["tenant_id"],
                    timeline["timeline_id"],
                    last_lsn,
                    prev_lsn,
                    tar_filename,
                    pg_version,
                )

            # Import into new pageserver
            import_timeline(
                args,
                psql_path,
                new_pageserver_connstr,
                new_http_client,
                timeline["tenant_id"],
                timeline["timeline_id"],
                last_lsn,
                prev_lsn,
                tar_filename,
                pg_version,
            )

            # Re-export and compare
            re_export_filename = tar_filename + ".reexport"
            export_timeline(
                args,
                psql_path,
                new_pageserver_connstr,
                timeline["tenant_id"],
                timeline["timeline_id"],
                last_lsn,
                prev_lsn,
                re_export_filename,
                pg_version,
            )

            # Check the size is the same.  Plain ints (not 1-tuples) so the
            # error message shows the byte counts directly.
            old_size = os.path.getsize(tar_filename)
            new_size = os.path.getsize(re_export_filename)
            if old_size != new_size:
                raise AssertionError(f"Sizes don't match old: {old_size} new: {new_size}")
|
||||
|
||||
|
||||
def non_zero_tcp_port(arg: Any):
    """Convert *arg* to an int and require it to be in the range 1..65535.

    Intended as an argparse ``type=`` callable.

    Raises:
        argparse.ArgumentTypeError: if the converted value is outside 1..65535.
        ValueError: propagated from ``int()`` when *arg* is not an integer literal.
    """
    port = int(arg)
    if 1 <= port <= 65535:
        return port
    raise argparse.ArgumentTypeError(f"invalid tcp port: {arg}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: describe the options, parse them, run main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tenant-id",
        dest="tenants",
        required=True,
        nargs="+",
        help="Id of the tenant to migrate. You can pass multiple arguments",
    )
    parser.add_argument(
        "--timeline-id",
        dest="timelines",
        required=False,
        nargs="+",
        help="Id of the timeline to migrate. You can pass multiple arguments",
    )
    parser.add_argument(
        "--from-host",
        dest="old_pageserver_host",
        required=True,
        help="Host of the pageserver to migrate data from",
    )
    parser.add_argument(
        "--from-http-port",
        dest="old_pageserver_http_port",
        required=False,
        type=int,
        default=9898,
        help="HTTP port of the pageserver to migrate data from. Default: 9898",
    )
    parser.add_argument(
        "--from-pg-port",
        dest="old_pageserver_pg_port",
        required=False,
        type=int,
        default=6400,
        help="pg port of the pageserver to migrate data from. Default: 6400",
    )
    parser.add_argument(
        "--to-host",
        dest="new_pageserver_host",
        required=True,
        help="Host of the pageserver to migrate data to",
    )
    parser.add_argument(
        "--to-http-port",
        dest="new_pageserver_http_port",
        required=False,
        default=9898,
        type=int,
        help="HTTP port of the pageserver to migrate data to. Default: 9898",
    )
    parser.add_argument(
        "--to-pg-port",
        dest="new_pageserver_pg_port",
        required=False,
        default=6400,
        type=int,
        help="pg port of the pageserver to migrate data to. Default: 6400",
    )
    # The help text describes a switch, but without nargs/action argparse
    # demanded a value.  nargs="?" with const=True makes the bare flag work
    # while still accepting a value, so invocations that passed one keep
    # working; the value when the option is absent stays None.
    parser.add_argument(
        "--ignore-tenant-exists",
        dest="ok_if_exists",
        required=False,
        nargs="?",
        const=True,
        help="Ignore error if we are trying to create the tenant that already exists. It can be dangerous if existing tenant already contains some data.",
    )
    parser.add_argument(
        "--pg-distrib-dir",
        dest="pg_distrib_dir",
        required=False,
        default="/usr/local/",
        help="Path where postgres binaries are installed. Default: /usr/local/",
    )
    # NOTE(review): main() builds its psql path from --pg-distrib-dir and does
    # not read this option; kept so existing command lines still parse.
    parser.add_argument(
        "--psql-path",
        dest="psql_path",
        required=False,
        default="/usr/local/v14/bin/psql",
        help="Path to the psql binary. Default: /usr/local/v14/bin/psql",
    )
    parser.add_argument(
        "--only-import",
        dest="only_import",
        required=False,
        default=False,
        action="store_true",
        help="Skip export and tenant creation part",
    )
    # required=True means a default can never be used, so none is given.
    parser.add_argument(
        "--work-dir",
        dest="work_dir",
        required=True,
        help="directory where temporary tar files are stored",
    )
    parser.add_argument(
        "--tmp-pg-port",
        dest="tmp_pg_port",
        required=False,
        default=55439,
        type=non_zero_tcp_port,
        help="localhost port to use for temporary postgres instance",
    )
    args = parser.parse_args()
    main(args)
|
||||
@@ -5,7 +5,6 @@ use diesel::Connection;
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use metrics::BuildInfo;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_controller::http::make_router;
|
||||
use storage_controller::metrics::preinitialize_metrics;
|
||||
use storage_controller::persistence::Persistence;
|
||||
@@ -246,8 +245,6 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
};
|
||||
|
||||
// After loading secrets & config, but before starting anything else, apply database migrations
|
||||
Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?;
|
||||
|
||||
migration_run(&secrets.database_url)
|
||||
.await
|
||||
.context("Running database migrations")?;
|
||||
|
||||
@@ -2,7 +2,6 @@ pub(crate) mod split_state;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use self::split_state::SplitState;
|
||||
use camino::Utf8Path;
|
||||
@@ -145,31 +144,6 @@ impl Persistence {
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
|
||||
/// database and the storage controller, therefore the database might not be available right away
|
||||
pub async fn await_connection(
|
||||
database_url: &str,
|
||||
timeout: Duration,
|
||||
) -> Result<(), diesel::ConnectionError> {
|
||||
let started_at = Instant::now();
|
||||
loop {
|
||||
match PgConnection::establish(database_url) {
|
||||
Ok(_) => {
|
||||
tracing::info!("Connected to database.");
|
||||
return Ok(());
|
||||
}
|
||||
Err(e) => {
|
||||
if started_at.elapsed() > timeout {
|
||||
return Err(e);
|
||||
} else {
|
||||
tracing::info!("Database not yet available, waiting... ({e})");
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wraps `with_conn` in order to collect latency and error metrics
|
||||
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
|
||||
where
|
||||
|
||||
@@ -129,7 +129,7 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
*histogram("pageserver_smgr_query_seconds_global"),
|
||||
*histogram("pageserver_layers_visited_per_read_global"),
|
||||
*histogram("pageserver_read_num_fs_layers"),
|
||||
*histogram("pageserver_getpage_get_reconstruct_data_seconds"),
|
||||
*histogram("pageserver_wait_lsn_seconds"),
|
||||
*histogram("pageserver_remote_operation_seconds"),
|
||||
|
||||
@@ -190,6 +190,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
|
||||
"trace_read_requests": True,
|
||||
"walreceiver_connect_timeout": "13m",
|
||||
"image_layer_creation_check_threshold": 1,
|
||||
"try_enable_aux_file_v2": True,
|
||||
}
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
63
test_runner/regress/test_aux_files.py
Normal file
63
test_runner/regress/test_aux_files.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
logical_replication_sync,
|
||||
)
|
||||
|
||||
|
||||
def test_aux_v2_config_switch(neon_simple_env: NeonEnv, vanilla_pg):
    """
    Check how the tenant setting `try_enable_aux_file_v2` relates to the
    timeline's `aux_file_v2` flag:

    - right after the setting is turned on, the flag is still off;
    - after logical replication to a vanilla postgres has run, the pageserver
      log contains "enabling aux file v2 support" and the flag is on;
    - turning the setting off again leaves the flag on;
    - the flag is still on after a pageserver restart.
    """
    env = neon_simple_env
    branch_name = "test_aux_v2_config_switch"

    tenant_id = env.initial_tenant
    timeline_id = env.neon_cli.create_branch(branch_name, "empty")
    endpoint = env.endpoints.create_start(branch_name, config_lines=["log_statement=all"])

    def aux_v2_flag(client):
        # Current value of the timeline's aux_file_v2 flag
        detail = client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)
        return detail["aux_file_v2"]

    def set_try_enable(client, enabled):
        # Read the effective tenant config, flip the one setting, write it back
        effective = client.tenant_config(tenant_id).effective_config
        effective["try_enable_aux_file_v2"] = enabled
        client.set_tenant_config(tenant_id, effective)

    with env.pageserver.http_client() as client:
        set_try_enable(client, True)
        # the setting alone does not turn the timeline flag on
        assert not aux_v2_flag(client)

    publisher_cur = endpoint.connect().cursor()
    publisher_cur.execute("create table t(pk integer primary key, payload integer)")
    publisher_cur.execute(
        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
    )
    publisher_cur.execute("create publication pub1 for table t, replication_example")

    # Start the subscriber; aux files get created at this point.
    # TODO: find better ways of testing aux files (i.e., neon_test_utils)
    # instead of going through the full logical replication process.
    vanilla_pg.start()
    vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
    vanilla_pg.safe_psql(
        "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
    )
    # Double single quotes so the connstr can sit inside a SQL string literal
    connstr = endpoint.connstr().replace("'", "''")
    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")

    # Wait for the logical replication channel to be established
    logical_replication_sync(vanilla_pg, endpoint)
    vanilla_pg.stop()
    endpoint.stop()

    env.pageserver.assert_log_contains("enabling aux file v2 support")

    with env.pageserver.http_client() as client:
        # the flag should be on at this point
        assert aux_v2_flag(client)

    with env.pageserver.http_client() as client:
        set_try_enable(client, False)
        # switching the setting off must not switch the flag off
        assert aux_v2_flag(client)

    env.pageserver.restart()

    with env.pageserver.http_client() as client:
        # the flag should be persisted across the restart
        assert aux_v2_flag(client)
|
||||
@@ -1,93 +0,0 @@
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.workload import Workload
|
||||
|
||||
# Tenant configuration passed to init_start() by the compaction smoke test:
# periodic gc/compaction are switched off and the size thresholds are kept
# small so that the workload produces many layers to compact.
AGGRESIVE_COMPACTION_TENANT_CONF = {
    # Disable gc and compaction. The test runs compaction manually.
    "gc_period": "0s",
    "compaction_period": "0s",
    # Small checkpoint distance to create many layers
    "checkpoint_distance": 1024**2,
    # Compact small layers
    "compaction_target_size": 1024**2,
    "image_creation_threshold": 2,
    # INC-186: remove when merging the fix
    "image_layer_creation_check_threshold": 0,
}
|
||||
|
||||
|
||||
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
    """
    Smoke test that compaction takes effect. A small set of rows is churned
    repeatedly and the pageserver is told to compact after every round. At the
    end, the average number of layers visited per read (both the plain and the
    vectored metric) must stay below an empirically chosen bound.
    """

    # Keep the page cache tiny so that reads are only shortened by image layers.
    neon_env_builder.pageserver_config_override = """
page_cache_size=10
"""

    env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    row_count = 10000
    churn_rounds = 100

    ps_http = env.pageserver.http_client()

    workload = Workload(env, tenant_id, timeline_id)
    workload.init(env.pageserver.id)

    log.info("Writing initial data ...")
    workload.write_rows(row_count, env.pageserver.id)

    for round_no in range(1, churn_rounds + 1):
        # Only report progress every tenth round
        if round_no % 10 == 0:
            log.info(f"Running churn round {round_no}/{churn_rounds} ...")

        workload.churn_rows(row_count, env.pageserver.id)
        ps_http.timeline_compact(tenant_id, timeline_id)

    log.info("Validating at workload end ...")
    workload.validate(env.pageserver.id)

    log.info("Checking layer access metrics ...")

    metrics = env.pageserver.http_client().get_metrics()

    # Dump every series of both histograms to the log for debugging
    for metric_name in (
        "pageserver_layers_visited_per_read_global_sum",
        "pageserver_layers_visited_per_read_global_count",
        "pageserver_layers_visited_per_read_global_bucket",
        "pageserver_layers_visited_per_vectored_read_global_sum",
        "pageserver_layers_visited_per_vectored_read_global_count",
        "pageserver_layers_visited_per_vectored_read_global_bucket",
    ):
        layer_access_metrics = metrics.query_all(metric_name)
        log.info(f"Got metrics: {layer_access_metrics}")

    def histogram_mean(sum_name, count_name):
        # Mean of a histogram: its _sum sample divided by its _count sample
        total = metrics.query_one(sum_name)
        observations = metrics.query_one(count_name)
        return total.value / observations.value

    non_vectored_average = histogram_mean(
        "pageserver_layers_visited_per_read_global_sum",
        "pageserver_layers_visited_per_read_global_count",
    )
    vectored_average = histogram_mean(
        "pageserver_layers_visited_per_vectored_read_global_sum",
        "pageserver_layers_visited_per_vectored_read_global_count",
    )

    log.info(f"{non_vectored_average=} {vectored_average=}")

    # The upper bound (8) on the average number of layer visits
    # was chosen empirically for this workload.
    assert non_vectored_average < 8
    assert vectored_average < 8
|
||||
@@ -192,6 +192,7 @@ def test_backward_compatibility(
|
||||
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
|
||||
|
||||
|
||||
@pytest.xfail
|
||||
@check_ondisk_data_compatibility_if_enabled
|
||||
@pytest.mark.xdist_group("compatibility")
|
||||
@pytest.mark.order(after="test_create_snapshot")
|
||||
|
||||
@@ -16,6 +16,7 @@ from fixtures.pageserver.utils import (
|
||||
wait_for_upload,
|
||||
wait_tenant_status_404,
|
||||
)
|
||||
from fixtures.port_distributor import PortDistributor
|
||||
from fixtures.remote_storage import (
|
||||
LocalFsStorage,
|
||||
RemoteStorageKind,
|
||||
@@ -23,6 +24,7 @@ from fixtures.remote_storage import (
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import (
|
||||
query_scalar,
|
||||
subprocess_capture,
|
||||
wait_until,
|
||||
)
|
||||
|
||||
@@ -182,14 +184,20 @@ def post_migration_check(endpoint: Endpoint, sum_before_migration: int, old_loca
|
||||
# A minor migration involves no storage breaking changes.
|
||||
# It is done by attaching the tenant to a new pageserver.
|
||||
"minor",
|
||||
# In the unlikely and unfortunate event that we have to break
|
||||
# the storage format, extend this test with the param below.
|
||||
# "major",
|
||||
# A major migration involves exporting a postgres datadir
|
||||
# basebackup and importing it into the new pageserver.
|
||||
# This kind of migration can tolerate breaking changes
|
||||
# to storage format
|
||||
"major",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("with_load", ["with_load", "without_load"])
|
||||
def test_tenant_relocation(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
port_distributor: PortDistributor,
|
||||
test_output_dir: Path,
|
||||
neon_binpath: Path,
|
||||
base_dir: Path,
|
||||
method: str,
|
||||
with_load: str,
|
||||
):
|
||||
@@ -291,7 +299,40 @@ def test_tenant_relocation(
|
||||
current_lsn=current_lsn_second,
|
||||
)
|
||||
|
||||
if method == "minor":
|
||||
# Migrate either by attaching from s3 or import/export basebackup
|
||||
if method == "major":
|
||||
cmd = [
|
||||
"poetry",
|
||||
"run",
|
||||
"python",
|
||||
str(base_dir / "scripts/export_import_between_pageservers.py"),
|
||||
"--tenant-id",
|
||||
str(tenant_id),
|
||||
"--from-host",
|
||||
"localhost",
|
||||
"--from-http-port",
|
||||
str(origin_http.port),
|
||||
"--from-pg-port",
|
||||
str(origin_ps.service_port.pg),
|
||||
"--to-host",
|
||||
"localhost",
|
||||
"--to-http-port",
|
||||
str(destination_http.port),
|
||||
"--to-pg-port",
|
||||
str(destination_ps.service_port.pg),
|
||||
"--pg-distrib-dir",
|
||||
str(neon_env_builder.pg_distrib_dir),
|
||||
"--work-dir",
|
||||
str(test_output_dir),
|
||||
"--tmp-pg-port",
|
||||
str(port_distributor.get_port()),
|
||||
]
|
||||
subprocess_capture(test_output_dir, cmd, check=True)
|
||||
|
||||
destination_ps.allowed_errors.append(
|
||||
".*ignored .* unexpected bytes after the tar archive.*"
|
||||
)
|
||||
elif method == "minor":
|
||||
# call to attach timeline to new pageserver
|
||||
destination_ps.tenant_attach(tenant_id)
|
||||
|
||||
|
||||
@@ -292,12 +292,33 @@ def test_single_branch_get_tenant_size_grows(
|
||||
Operate on single branch reading the tenants size after each transaction.
|
||||
"""
|
||||
|
||||
# Disable automatic compaction and GC, and set a long PITR interval: we will expect
|
||||
# size to always increase with writes as all writes remain within the PITR
|
||||
# Disable automatic gc and compaction.
|
||||
# The pitr_interval here is quite problematic, so we cannot really use it.
|
||||
# it'd have to be calibrated per test executing env.
|
||||
|
||||
# there was a bug which was hidden if the create table and first batch of
|
||||
# inserts is larger than gc_horizon. for example 0x20000 here hid the fact
|
||||
# that next_gc_cutoff could be smaller than initdb_lsn, which will
|
||||
# obviously lead to issues when calculating the size.
|
||||
gc_horizon = 0x3BA00
|
||||
|
||||
# it's a bit of a hack, but different versions of postgres have different
|
||||
# amount of WAL generated for the same amount of data. so we need to
|
||||
# adjust the gc_horizon accordingly.
|
||||
if pg_version == PgVersion.V14:
|
||||
gc_horizon = 0x4A000
|
||||
elif pg_version == PgVersion.V15:
|
||||
gc_horizon = 0x3BA00
|
||||
elif pg_version == PgVersion.V16:
|
||||
gc_horizon = 210000
|
||||
else:
|
||||
raise NotImplementedError(pg_version)
|
||||
|
||||
tenant_config = {
|
||||
"compaction_period": "0s",
|
||||
"gc_period": "0s",
|
||||
"pitr_interval": "3600s",
|
||||
"pitr_interval": "0s",
|
||||
"gc_horizon": gc_horizon,
|
||||
}
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=tenant_config)
|
||||
@@ -311,6 +332,18 @@ def test_single_branch_get_tenant_size_grows(
|
||||
|
||||
size_debug_file = open(test_output_dir / "size_debug.html", "w")
|
||||
|
||||
def check_size_change(
    current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int
):
    """Assert that `size` relates to `prev_size` as expected for this LSN.

    Once `current_lsn - initdb_lsn` has reached `gc_horizon`, the size only
    has to be at least `prev_size`; before that it must be strictly larger.
    """
    wal_since_initdb = current_lsn - initdb_lsn

    if wal_since_initdb >= gc_horizon:
        assert (
            size >= prev_size
        ), f"tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
        return

    assert (
        size > prev_size
    ), f"tenant_size should grow, because we continue to add WAL to initial snapshot size (Currently at: {current_lsn}, Init at: {initdb_lsn})"
|
||||
|
||||
def get_current_consistent_size(
|
||||
env: NeonEnv,
|
||||
endpoint: Endpoint,
|
||||
@@ -379,6 +412,14 @@ def test_single_branch_get_tenant_size_grows(
|
||||
)
|
||||
|
||||
prev_size = collected_responses[-1][2]
|
||||
|
||||
# branch start shouldn't be past gc_horizon yet
|
||||
# thus the size should grow as we insert more data
|
||||
# "gc_horizon" is tuned so that it kicks in _after_ the
|
||||
# insert phase, but before the update phase ends.
|
||||
assert (
|
||||
current_lsn - initdb_lsn <= gc_horizon
|
||||
), "Tuning of GC window is likely out-of-date"
|
||||
assert size > prev_size
|
||||
|
||||
collected_responses.append(("INSERT", current_lsn, size))
|
||||
@@ -398,7 +439,8 @@ def test_single_branch_get_tenant_size_grows(
|
||||
)
|
||||
|
||||
prev_size = collected_responses[-1][2]
|
||||
assert size > prev_size
|
||||
|
||||
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
|
||||
|
||||
collected_responses.append(("UPDATE", current_lsn, size))
|
||||
|
||||
@@ -415,7 +457,8 @@ def test_single_branch_get_tenant_size_grows(
|
||||
)
|
||||
|
||||
prev_size = collected_responses[-1][2]
|
||||
assert size > prev_size
|
||||
|
||||
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
|
||||
|
||||
collected_responses.append(("DELETE", current_lsn, size))
|
||||
|
||||
@@ -426,20 +469,20 @@ def test_single_branch_get_tenant_size_grows(
|
||||
with endpoint.cursor() as cur:
|
||||
cur.execute("DROP TABLE t0")
|
||||
|
||||
# Dropping the table doesn't reclaim any space
|
||||
# from the user's point of view, because the DROP transaction is still
|
||||
# within pitr_interval.
|
||||
# Without setting a PITR interval, dropping the table doesn't reclaim any space
|
||||
# from the user's point of view, because the DROP transaction is too small
|
||||
# to fall out of gc_horizon.
|
||||
(current_lsn, size) = get_current_consistent_size(
|
||||
env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
|
||||
)
|
||||
assert size >= prev_size
|
||||
prev_size = size
|
||||
prev_size = collected_responses[-1][2]
|
||||
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
|
||||
|
||||
# Set a zero PITR interval to allow the DROP to impact the synthetic size
|
||||
# Set a tiny PITR interval to allow the DROP to impact the synthetic size
|
||||
# Because synthetic size calculation uses pitr interval when available,
|
||||
# when our tenant is configured with a tiny pitr interval, dropping a table should
|
||||
# cause synthetic size to go down immediately
|
||||
tenant_config["pitr_interval"] = "0s"
|
||||
tenant_config["pitr_interval"] = "1ms"
|
||||
env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config)
|
||||
(current_lsn, size) = get_current_consistent_size(
|
||||
env, endpoint, size_debug_file, http_client, tenant_id, timeline_id
|
||||
@@ -451,6 +494,10 @@ def test_single_branch_get_tenant_size_grows(
|
||||
# defined by gc_horizon.
|
||||
collected_responses.append(("DROP", current_lsn, size))
|
||||
|
||||
# Should have gone past gc_horizon, otherwise gc_horizon is too large
|
||||
bytes_written = current_lsn - initdb_lsn
|
||||
assert bytes_written > gc_horizon
|
||||
|
||||
# this isn't too many lines to forget for a while. observed while
|
||||
# developing these tests that locally the value is a bit more than what we
|
||||
# get in the ci.
|
||||
|
||||
@@ -75,8 +75,6 @@ tonic = { version = "0.9", features = ["tls-roots"] }
|
||||
tower = { version = "0.4", default-features = false, features = ["balance", "buffer", "limit", "log", "timeout", "util"] }
|
||||
tracing = { version = "0.1", features = ["log"] }
|
||||
tracing-core = { version = "0.1" }
|
||||
unicode-bidi = { version = "0.3" }
|
||||
unicode-normalization = { version = "0.1" }
|
||||
url = { version = "2", features = ["serde"] }
|
||||
uuid = { version = "1", features = ["serde", "v4", "v7"] }
|
||||
zeroize = { version = "1", features = ["derive"] }
|
||||
|
||||
Reference in New Issue
Block a user