mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-04 00:40:38 +00:00
Compare commits
23 Commits
density-ba
...
layer_map_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5edb0ccfa7 | ||
|
|
2bc1324bed | ||
|
|
aabca55d7e | ||
|
|
1c3636d848 | ||
|
|
0c16ad8591 | ||
|
|
0b673c12d7 | ||
|
|
7a333cfb12 | ||
|
|
f7ec33970a | ||
|
|
98d0a0d242 | ||
|
|
f74080cbad | ||
|
|
55c184fcd7 | ||
|
|
fd18692dfb | ||
|
|
a4be54d21f | ||
|
|
6b6570b580 | ||
|
|
7704caa3ac | ||
|
|
a44e5eda14 | ||
|
|
5c865f46ba | ||
|
|
a3d7ad2d52 | ||
|
|
36f048d6b0 | ||
|
|
58fb6fe861 | ||
|
|
20b1e26e74 | ||
|
|
8ba1699937 | ||
|
|
a9bd05760f |
6
.github/ansible/deploy.yaml
vendored
6
.github/ansible/deploy.yaml
vendored
@@ -117,7 +117,8 @@
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
@@ -186,6 +187,7 @@
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
|
||||
curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
2
.github/ansible/staging.us-east-2.hosts.yaml
vendored
2
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -29,6 +29,8 @@ storage:
|
||||
ansible_host: i-0565a8b4008aa3f40
|
||||
pageserver-2.us-east-2.aws.neon.build:
|
||||
ansible_host: i-01e31cdf7e970586a
|
||||
pageserver-3.us-east-2.aws.neon.build:
|
||||
ansible_host: i-0602a0291365ef7cc
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,7 @@
|
||||
/pg_install
|
||||
/target
|
||||
/tmp_check
|
||||
/tmp_check_cli
|
||||
__pycache__/
|
||||
test_output/
|
||||
.vscode
|
||||
|
||||
205
Cargo.lock
generated
205
Cargo.lock
generated
@@ -37,11 +37,6 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "amplify_num"
|
||||
version = "0.4.1"
|
||||
source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
@@ -66,6 +61,15 @@ dependencies = [
|
||||
"backtrace",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "archery"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02"
|
||||
dependencies = [
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "asn1-rs"
|
||||
version = "0.5.1"
|
||||
@@ -137,15 +141,6 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic-polyfill"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28"
|
||||
dependencies = [
|
||||
"critical-section",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
@@ -606,6 +601,15 @@ version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitmaps"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "031043d04099746d8db04daf1fa424b2bc8bd69d92b25962dcde24da39ab64a2"
|
||||
dependencies = [
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
version = "0.10.3"
|
||||
@@ -629,9 +633,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.11.1"
|
||||
version = "3.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba"
|
||||
checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
@@ -750,13 +754,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.0.32"
|
||||
version = "4.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39"
|
||||
checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"clap_derive",
|
||||
"clap_lex 0.3.0",
|
||||
"clap_lex 0.3.1",
|
||||
"is-terminal",
|
||||
"once_cell",
|
||||
"strsim",
|
||||
@@ -765,9 +769,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.0.21"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014"
|
||||
checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
@@ -787,9 +791,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.3.0"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8"
|
||||
checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade"
|
||||
dependencies = [
|
||||
"os_str_bytes",
|
||||
]
|
||||
@@ -832,7 +836,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"futures",
|
||||
"hyper",
|
||||
"notify",
|
||||
@@ -887,7 +891,7 @@ name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"comfy-table",
|
||||
"git-version",
|
||||
"nix",
|
||||
@@ -988,12 +992,6 @@ dependencies = [
|
||||
"itertools",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "critical-section"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.6"
|
||||
@@ -1030,12 +1028,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.11"
|
||||
version = "0.8.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
|
||||
checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1506,15 +1503,6 @@ version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "hash32"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
@@ -1530,19 +1518,6 @@ dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.7.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743"
|
||||
dependencies = [
|
||||
"atomic-polyfill",
|
||||
"hash32",
|
||||
"rustc_version",
|
||||
"spin 0.9.4",
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.0"
|
||||
@@ -1762,6 +1737,20 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "im"
|
||||
version = "15.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0acd33ff0285af998aaf9b57342af478078f53492322fafc47450e09397e0e9"
|
||||
dependencies = [
|
||||
"bitmaps",
|
||||
"rand_core",
|
||||
"rand_xoshiro",
|
||||
"sized-chunks",
|
||||
"typenum",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.2"
|
||||
@@ -1804,9 +1793,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.3"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c"
|
||||
checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys",
|
||||
@@ -1916,12 +1905,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb"
|
||||
|
||||
[[package]]
|
||||
name = "link-cplusplus"
|
||||
version = "1.0.8"
|
||||
@@ -2067,9 +2050,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.26.1"
|
||||
version = "0.26.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694"
|
||||
checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
@@ -2081,9 +2064,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "7.1.2"
|
||||
version = "7.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c"
|
||||
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"minimal-lexical",
|
||||
@@ -2154,7 +2137,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2230,14 +2212,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"amplify_num",
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"close_fds",
|
||||
"const_format",
|
||||
"consumption_metrics",
|
||||
@@ -2252,6 +2233,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"humantime-serde",
|
||||
"hyper",
|
||||
"im",
|
||||
"itertools",
|
||||
"metrics",
|
||||
"nix",
|
||||
@@ -2269,7 +2251,7 @@ dependencies = [
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rstar",
|
||||
"rpds",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2581,9 +2563,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.49"
|
||||
version = "1.0.50"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5"
|
||||
checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -2683,7 +2665,7 @@ dependencies = [
|
||||
"bstr",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"consumption_metrics",
|
||||
"futures",
|
||||
"git-version",
|
||||
@@ -2742,14 +2724,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.4"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
"rand_hc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2772,10 +2753,10 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.3.1"
|
||||
name = "rand_xoshiro"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
|
||||
checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
]
|
||||
@@ -2930,7 +2911,7 @@ dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"spin 0.5.2",
|
||||
"spin",
|
||||
"untrusted",
|
||||
"web-sys",
|
||||
"winapi",
|
||||
@@ -2950,14 +2931,12 @@ dependencies = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rstar"
|
||||
version = "0.9.3"
|
||||
name = "rpds"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa"
|
||||
checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000"
|
||||
dependencies = [
|
||||
"heapless",
|
||||
"num-traits",
|
||||
"smallvec",
|
||||
"archery",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3018,9 +2997,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.6"
|
||||
version = "0.36.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549"
|
||||
checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
@@ -3093,7 +3072,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"fs2",
|
||||
@@ -3448,6 +3427,16 @@ version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||
|
||||
[[package]]
|
||||
name = "sized-chunks"
|
||||
version = "0.6.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16d69225bde7a69b235da73377861095455d298f2b970996eec25ddbb42b3d1e"
|
||||
dependencies = [
|
||||
"bitmaps",
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.7"
|
||||
@@ -3479,21 +3468,6 @@ version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||
|
||||
[[package]]
|
||||
name = "spin"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "static_assertions"
|
||||
version = "1.1.0"
|
||||
@@ -3507,7 +3481,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"bytes",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"const_format",
|
||||
"futures",
|
||||
"futures-core",
|
||||
@@ -3639,9 +3613,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.1.3"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
|
||||
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
@@ -3749,9 +3723,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.24.1"
|
||||
version = "1.24.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae"
|
||||
checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"bytes",
|
||||
@@ -4183,9 +4157,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
|
||||
|
||||
[[package]]
|
||||
name = "ureq"
|
||||
version = "2.6.1"
|
||||
version = "2.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566"
|
||||
checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d"
|
||||
dependencies = [
|
||||
"base64 0.13.1",
|
||||
"log",
|
||||
@@ -4226,6 +4200,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
"atty",
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
@@ -4287,7 +4262,7 @@ name = "wal_craft"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
@@ -4534,7 +4509,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.0.32",
|
||||
"clap 4.1.1",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"fail",
|
||||
|
||||
@@ -69,7 +69,7 @@ rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
routerify = "3"
|
||||
rstar = "0.9.3"
|
||||
rpds = "0.12.0"
|
||||
rustls = "0.20"
|
||||
rustls-pemfile = "1"
|
||||
rustls-split = "0.3"
|
||||
@@ -107,9 +107,6 @@ x509-parser = "0.14"
|
||||
env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## TODO switch when the new release is made
|
||||
amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
|
||||
@@ -252,7 +252,7 @@ impl ComputeNode {
|
||||
// If connection fails,
|
||||
// it may be the old node with `zenith_admin` superuser.
|
||||
//
|
||||
// In this case we need to connect with old `zenith_admin`name
|
||||
// In this case we need to connect with old `zenith_admin` name
|
||||
// and create new user. We cannot simply rename connected user,
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||
@@ -278,6 +278,7 @@ impl ComputeNode {
|
||||
Ok(client) => client,
|
||||
};
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
handle_roles(&self.spec, &mut client)?;
|
||||
handle_databases(&self.spec, &mut client)?;
|
||||
handle_role_deletions(self, &mut client)?;
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
pub const DEFAULT_LOG_LEVEL: &str = "info";
|
||||
pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres";
|
||||
// From Postgres docs:
|
||||
// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified
|
||||
// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM
|
||||
// (see below), then SCRAM-based authentication will automatically be chosen instead.
|
||||
// https://www.postgresql.org/docs/15/auth-password.html
|
||||
//
|
||||
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
|
||||
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";
|
||||
|
||||
@@ -130,8 +130,8 @@ impl Role {
|
||||
/// Serialize a list of role parameters into a Postgres-acceptable
|
||||
/// string of arguments.
|
||||
pub fn to_pg_options(&self) -> String {
|
||||
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails.
|
||||
// For now we do not use generic `options` for roles. Once used, add
|
||||
// XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane.
|
||||
// For now, we do not use generic `options` for roles. Once used, add
|
||||
// `self.options.as_pg_options()` somewhere here.
|
||||
let mut params: String = "LOGIN".to_string();
|
||||
|
||||
|
||||
@@ -152,8 +152,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
{
|
||||
RoleAction::Update
|
||||
} else if let Some(pg_pwd) = &r.encrypted_password {
|
||||
// Check whether password changed or not (trim 'md5:' prefix first)
|
||||
if pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap() {
|
||||
// Check whether password changed or not (trim 'md5' prefix first if any)
|
||||
//
|
||||
// This is a backward compatibility hack, which comes from the times when we were using
|
||||
// md5 for everyone and hashes were stored in the console db without md5 prefix. So when
|
||||
// role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix,
|
||||
// but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix.
|
||||
// Here is the only place so far where we compare hashes, so it seems to be the best candidate
|
||||
// to place this compatibility layer.
|
||||
let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") {
|
||||
stripped
|
||||
} else {
|
||||
pg_pwd
|
||||
};
|
||||
if pg_pwd != *role.encrypted_password.as_ref().unwrap() {
|
||||
RoleAction::Update
|
||||
} else {
|
||||
RoleAction::None
|
||||
@@ -213,8 +225,20 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
|
||||
if let Some(ops) = &node.spec.delta_operations {
|
||||
// First, reassign all dependent objects to db owners.
|
||||
info!("reassigning dependent objects of to-be-deleted roles");
|
||||
|
||||
// Fetch existing roles. We could've exported and used `existing_roles` from
|
||||
// `handle_roles()`, but we only make this list there before creating new roles.
|
||||
// Which is probably fine as we never create to-be-deleted roles, but that'd
|
||||
// just look a bit untidy. Anyway, the entire `pg_roles` should be in shared
|
||||
// buffers already, so this shouldn't be a big deal.
|
||||
let mut xact = client.transaction()?;
|
||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||
xact.commit()?;
|
||||
|
||||
for op in ops {
|
||||
if op.action == "delete_role" {
|
||||
// Check that role is still present in Postgres, as this could be a
|
||||
// restart with the same spec after role deletion.
|
||||
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) {
|
||||
reassign_owned_objects(node, &op.name)?;
|
||||
}
|
||||
}
|
||||
|
||||
1
control_plane/.gitignore
vendored
Normal file
1
control_plane/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
tmp_check/
|
||||
@@ -44,18 +44,17 @@ impl TenantState {
|
||||
/// A state of a timeline in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TimelineState {
|
||||
/// Timeline is fully operational. If the containing Tenant is Active, the timeline's
|
||||
/// background jobs are running otherwise they will be launched when the tenant is activated.
|
||||
/// The timeline is recognized by the pageserver but is not yet operational.
|
||||
/// In particular, the walreceiver connection loop is not running for this timeline.
|
||||
/// It will eventually transition to state Active or Broken.
|
||||
Loading,
|
||||
/// The timeline is fully operational.
|
||||
/// It can be queried, and the walreceiver connection loop is running.
|
||||
Active,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate.
|
||||
/// The status indicates, that the timeline could eventually go back to Active automatically:
|
||||
/// for example, if the owning tenant goes back to Active again.
|
||||
Suspended,
|
||||
/// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
|
||||
/// automatically become Active after certain events: only a management call can change this status.
|
||||
/// The timeline was previously Loading or Active but is shutting down.
|
||||
/// It cannot transition back into any other state.
|
||||
Stopping,
|
||||
/// A timeline is recognized by the pageserver, but can no longer be used for
|
||||
/// any operations, because it failed to be activated.
|
||||
/// The timeline is broken and not operational (previous states: Loading or Active).
|
||||
Broken,
|
||||
}
|
||||
|
||||
|
||||
@@ -134,22 +134,25 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
op: Cow<'static, str>,
|
||||
lsn: u64,
|
||||
size: Option<u64>,
|
||||
) where
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
assert!(lsn > lastseg.end_lsn);
|
||||
|
||||
let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lsn,
|
||||
start_size: lastseg.end_size.unwrap(),
|
||||
start_size,
|
||||
end_size: size,
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
@@ -158,6 +161,8 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
|
||||
@@ -167,21 +172,24 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
op: Cow<'static, str>,
|
||||
lsn_bytes: u64,
|
||||
size_bytes: i64,
|
||||
) where
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
|
||||
let newseg_id = self.segments.len();
|
||||
let lastseg = &mut self.segments[lastseg_id];
|
||||
|
||||
let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
|
||||
|
||||
let newseg = Segment {
|
||||
op,
|
||||
parent: Some(lastseg_id),
|
||||
start_lsn: lastseg.end_lsn,
|
||||
end_lsn: lastseg.end_lsn + lsn_bytes,
|
||||
start_size: lastseg.end_size.unwrap(),
|
||||
end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
|
||||
start_size: last_end_size,
|
||||
end_size: Some((last_end_size as i64 + size_bytes) as u64),
|
||||
children_after: Vec::new(),
|
||||
needed: false,
|
||||
};
|
||||
@@ -189,33 +197,33 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
|
||||
self.segments.push(newseg);
|
||||
*self.branches.get_mut(branch).expect("read already") = newseg_id;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
|
||||
self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
|
||||
}
|
||||
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64);
|
||||
self.modify_branch(branch, "update".into(), bytes, 0i64)
|
||||
}
|
||||
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
|
||||
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
|
||||
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
|
||||
}
|
||||
|
||||
/// Panics if the parent branch cannot be found.
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
||||
@@ -236,7 +244,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
|
||||
pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
|
||||
// Phase 1: Mark all the segments that need to be retained
|
||||
for (_branch, &last_seg_id) in self.branches.iter() {
|
||||
let last_seg = &self.segments[last_seg_id];
|
||||
@@ -261,7 +269,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
self.size_from_snapshot_later(0)
|
||||
}
|
||||
|
||||
fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
|
||||
fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
let seg = &self.segments[seg_id];
|
||||
|
||||
let this_size = seg.end_lsn - seg.start_lsn;
|
||||
@@ -272,10 +280,10 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id);
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id);
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
@@ -286,15 +294,15 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
SegmentSize {
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: if seg.needed { WalNeeded } else { Wal },
|
||||
this_size,
|
||||
children,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
|
||||
fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
|
||||
// If this is needed, then it's time to do the snapshot and continue
|
||||
// with wal method.
|
||||
let seg = &self.segments[seg_id];
|
||||
@@ -305,10 +313,10 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
for &child_id in seg.children_after.iter() {
|
||||
// try each child both ways
|
||||
let child = &self.segments[child_id];
|
||||
let p1 = self.size_from_wal(child_id);
|
||||
let p1 = self.size_from_wal(child_id)?;
|
||||
|
||||
let p = if !child.needed {
|
||||
let p2 = self.size_from_snapshot_later(child_id);
|
||||
let p2 = self.size_from_snapshot_later(child_id)?;
|
||||
if p1.total() < p2.total() {
|
||||
p1
|
||||
} else {
|
||||
@@ -319,12 +327,12 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
};
|
||||
children.push(p);
|
||||
}
|
||||
SegmentSize {
|
||||
Ok(SegmentSize {
|
||||
seg_id,
|
||||
method: WalNeeded,
|
||||
this_size: seg.start_size,
|
||||
children,
|
||||
}
|
||||
})
|
||||
} else {
|
||||
// If any of the direct children are "needed", need to be able to reconstruct here
|
||||
let mut children_needed = false;
|
||||
@@ -339,7 +347,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
let method1 = if !children_needed {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_snapshot_later(*child));
|
||||
children.push(self.size_from_snapshot_later(*child)?);
|
||||
}
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
@@ -355,20 +363,25 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
let method2 = if children_needed || seg.children_after.len() >= 2 {
|
||||
let mut children = Vec::new();
|
||||
for child in seg.children_after.iter() {
|
||||
children.push(self.size_from_wal(*child));
|
||||
children.push(self.size_from_wal(*child)?);
|
||||
}
|
||||
let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
|
||||
Some(SegmentSize {
|
||||
seg_id,
|
||||
method: SnapshotAfter,
|
||||
this_size: seg.end_size.unwrap(),
|
||||
this_size,
|
||||
children,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
match (method1, method2) {
|
||||
(None, None) => panic!(),
|
||||
Ok(match (method1, method2) {
|
||||
(None, None) => anyhow::bail!(
|
||||
"neither method was applicable: children_after={}, children_needed={}",
|
||||
seg.children_after.len(),
|
||||
children_needed
|
||||
),
|
||||
(Some(method), None) => method,
|
||||
(None, Some(method)) => method,
|
||||
(Some(method1), Some(method2)) => {
|
||||
@@ -378,7 +391,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
method2
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,118 +7,118 @@
|
||||
use tenant_size_model::{Segment, SegmentSize, Storage};
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_1() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
(storage.into_segments(), size)
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Main branch only. Some updates on it.
|
||||
fn scenario_2() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child").unwrap();
|
||||
storage.update("child", 1_000);
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
(storage.into_segments(), size)
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Like 2, but more updates on main
|
||||
fn scenario_3() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child").unwrap();
|
||||
storage.update("child", 1_000);
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
(storage.into_segments(), size)
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
// Diverged branches
|
||||
fn scenario_4() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
// Create main branch
|
||||
let mut storage = Storage::new("main");
|
||||
|
||||
// Bulk load 5 GB of data to it
|
||||
storage.insert("main", 5_000);
|
||||
storage.insert("main", 5_000)?;
|
||||
|
||||
// Stream of updates
|
||||
for _ in 0..5 {
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child").unwrap();
|
||||
storage.update("child", 1_000);
|
||||
storage.branch("main", "child")?;
|
||||
storage.update("child", 1_000)?;
|
||||
|
||||
// More updates on parent
|
||||
for _ in 0..8 {
|
||||
storage.update("main", 1_000);
|
||||
storage.update("main", 1_000)?;
|
||||
}
|
||||
|
||||
let size = storage.calculate(1000);
|
||||
let size = storage.calculate(1000)?;
|
||||
|
||||
(storage.into_segments(), size)
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000);
|
||||
storage.branch("a", "b").unwrap();
|
||||
storage.update("b", 4000);
|
||||
storage.update("a", 2000);
|
||||
storage.branch("a", "c").unwrap();
|
||||
storage.insert("c", 4000);
|
||||
storage.insert("a", 2000);
|
||||
storage.insert("a", 5000)?;
|
||||
storage.branch("a", "b")?;
|
||||
storage.update("b", 4000)?;
|
||||
storage.update("a", 2000)?;
|
||||
storage.branch("a", "c")?;
|
||||
storage.insert("c", 4000)?;
|
||||
storage.insert("a", 2000)?;
|
||||
|
||||
let size = storage.calculate(5000);
|
||||
let size = storage.calculate(5000)?;
|
||||
|
||||
(storage.into_segments(), size)
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn scenario_6() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
|
||||
use std::borrow::Cow;
|
||||
|
||||
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
|
||||
@@ -133,18 +133,18 @@ fn scenario_6() -> (Vec<Segment>, SegmentSize) {
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0]).unwrap(); // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
|
||||
storage.branch(&branches[0], branches[1]).unwrap(); // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
|
||||
storage.branch(&branches[0], branches[2]).unwrap(); // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
|
||||
storage.branch(&None, branches[0])?; // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
|
||||
storage.branch(&branches[0], branches[1])?; // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
|
||||
storage.branch(&branches[0], branches[2])?; // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
|
||||
|
||||
let size = storage.calculate(100_000);
|
||||
let size = storage.calculate(100_000)?;
|
||||
|
||||
(storage.into_segments(), size)
|
||||
Ok((storage.into_segments(), size))
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -163,7 +163,8 @@ fn main() {
|
||||
eprintln!("invalid scenario {}", other);
|
||||
std::process::exit(1);
|
||||
}
|
||||
};
|
||||
}
|
||||
.unwrap();
|
||||
|
||||
graphviz_tree(&segments, &size);
|
||||
}
|
||||
@@ -251,7 +252,7 @@ fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
|
||||
|
||||
#[test]
|
||||
fn scenarios_return_same_size() {
|
||||
type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
|
||||
type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
|
||||
let truths: &[(u32, ScenarioFn, _)] = &[
|
||||
(line!(), scenario_1, 8000),
|
||||
(line!(), scenario_2, 9000),
|
||||
@@ -262,7 +263,7 @@ fn scenarios_return_same_size() {
|
||||
];
|
||||
|
||||
for (line, scenario, expected) in truths {
|
||||
let (_, size) = scenario();
|
||||
let (_, size) = scenario().unwrap();
|
||||
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
atty.workspace = true
|
||||
sentry.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
|
||||
@@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
|
||||
let base_logger = tracing_subscriber::fmt()
|
||||
.with_env_filter(env_filter)
|
||||
.with_target(false)
|
||||
.with_ansi(false)
|
||||
.with_ansi(atty::is(atty::Stream::Stdout))
|
||||
.with_writer(std::io::stdout);
|
||||
|
||||
match log_format {
|
||||
|
||||
@@ -11,7 +11,6 @@ default = []
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
amplify_num.workspace = true
|
||||
anyhow.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
@@ -41,7 +40,6 @@ postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
rstar.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
@@ -68,6 +66,8 @@ tenant_size_model.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
rpds.workspace = true
|
||||
im = "15.1.0"
|
||||
|
||||
[dev-dependencies]
|
||||
criterion.workspace = true
|
||||
|
||||
@@ -1,13 +1,12 @@
|
||||
use anyhow::Result;
|
||||
use pageserver::keyspace::{KeyPartitioning, KeySpace};
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::tenant::layer_map::LayerMap;
|
||||
use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState};
|
||||
use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult};
|
||||
use pageserver::tenant::storage_layer::Layer;
|
||||
use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor};
|
||||
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@@ -17,102 +16,35 @@ use utils::lsn::Lsn;
|
||||
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
|
||||
struct DummyDelta {
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl Layer for DummyDelta {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
panic!()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
struct DummyImage {
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
impl Layer for DummyImage {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
// End-bound is exclusive
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
panic!()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {
|
||||
let mut layer_map = LayerMap::<dyn Layer>::default();
|
||||
fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
|
||||
let mut layer_map = LayerMap::<LayerDescriptor>::default();
|
||||
|
||||
let mut min_lsn = Lsn(u64::MAX);
|
||||
let mut max_lsn = Lsn(0);
|
||||
|
||||
let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines();
|
||||
|
||||
let mut updates = layer_map.batch_update();
|
||||
for fname in filenames {
|
||||
let fname = &fname.unwrap();
|
||||
if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
let layer = DummyImage {
|
||||
key_range: imgfilename.key_range,
|
||||
lsn: imgfilename.lsn,
|
||||
let layer = LayerDescriptor {
|
||||
key: imgfilename.key_range,
|
||||
lsn: imgfilename.lsn..(imgfilename.lsn + 1),
|
||||
is_incremental: false,
|
||||
short_id: fname.to_string(),
|
||||
};
|
||||
layer_map.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
min_lsn = min(min_lsn, imgfilename.lsn);
|
||||
max_lsn = max(max_lsn, imgfilename.lsn);
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
||||
let layer = DummyDelta {
|
||||
key_range: deltafilename.key_range,
|
||||
lsn_range: deltafilename.lsn_range.clone(),
|
||||
let layer = LayerDescriptor {
|
||||
key: deltafilename.key_range.clone(),
|
||||
lsn: deltafilename.lsn_range.clone(),
|
||||
is_incremental: true,
|
||||
short_id: fname.to_string(),
|
||||
};
|
||||
layer_map.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
min_lsn = min(min_lsn, deltafilename.lsn_range.start);
|
||||
max_lsn = max(max_lsn, deltafilename.lsn_range.end);
|
||||
} else {
|
||||
@@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<dyn Layer> {
|
||||
|
||||
println!("min: {min_lsn}, max: {max_lsn}");
|
||||
|
||||
updates.flush();
|
||||
layer_map
|
||||
}
|
||||
|
||||
/// Construct a layer map query pattern for benchmarks
|
||||
fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
|
||||
fn uniform_query_pattern(layer_map: &LayerMap<LayerDescriptor>) -> Vec<(Key, Lsn)> {
|
||||
// For each image layer we query one of the pages contained, at LSN right
|
||||
// before the image layer was created. This gives us a somewhat uniform
|
||||
// coverage of both the lsn and key space because image layers have
|
||||
@@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap<dyn Layer>) -> Vec<(Key, Lsn)> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
// Construct a partitioning for testing get_difficulty map when we
|
||||
// don't have an exact result of `collect_keyspace` to work with.
|
||||
fn uniform_key_partitioning(layer_map: &LayerMap<LayerDescriptor>, _lsn: Lsn) -> KeyPartitioning {
|
||||
let mut parts = Vec::new();
|
||||
|
||||
// We add a partition boundary at the start of each image layer,
|
||||
// no matter what lsn range it covers. This is just the easiest
|
||||
// thing to do. A better thing to do would be to get a real
|
||||
// partitioning from some database. Even better, remove the need
|
||||
// for key partitions by deciding where to create image layers
|
||||
// directly based on a coverage-based difficulty map.
|
||||
let mut keys: Vec<_> = layer_map
|
||||
.iter_historic_layers()
|
||||
.filter_map(|l| {
|
||||
if l.is_incremental() {
|
||||
None
|
||||
} else {
|
||||
let kr = l.get_key_range();
|
||||
Some(kr.start.next())
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
keys.sort();
|
||||
|
||||
let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
for key in keys {
|
||||
parts.push(KeySpace {
|
||||
ranges: vec![current_key..key],
|
||||
});
|
||||
current_key = key;
|
||||
}
|
||||
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
|
||||
// Benchmark using metadata extracted from our performance test environment, from
|
||||
// a project where we have run pgbench many times. The pgbench database was initialized
|
||||
// between each test run.
|
||||
@@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) {
|
||||
// Benchmark using metadata extracted from a real project that was taking
|
||||
// too long processing layer map queries.
|
||||
fn bench_from_real_project(c: &mut Criterion) {
|
||||
// TODO consider compressing this file
|
||||
// Init layer map
|
||||
let now = Instant::now();
|
||||
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
// Choose uniformly distributed queries
|
||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
|
||||
|
||||
// Test with uniform query pattern
|
||||
c.bench_function("real_map_uniform_queries", |b| {
|
||||
// Choose inputs for get_difficulty_map
|
||||
let latest_lsn = layer_map
|
||||
.iter_historic_layers()
|
||||
.map(|l| l.get_lsn_range().end)
|
||||
.max()
|
||||
.unwrap();
|
||||
let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
|
||||
|
||||
// Check correctness of get_difficulty_map
|
||||
// TODO put this in a dedicated test outside of this mod
|
||||
{
|
||||
println!("running correctness check");
|
||||
|
||||
let now = Instant::now();
|
||||
let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
|
||||
assert!(result_bruteforce.len() == partitioning.parts.len());
|
||||
println!("Finished bruteforce in {:?}", now.elapsed());
|
||||
|
||||
let now = Instant::now();
|
||||
let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
|
||||
assert!(result_fast.len() == partitioning.parts.len());
|
||||
println!("Finished fast in {:?}", now.elapsed());
|
||||
|
||||
// Assert results are equal. Manually iterate for easier debugging.
|
||||
let zip = std::iter::zip(
|
||||
&partitioning.parts,
|
||||
std::iter::zip(result_bruteforce, result_fast),
|
||||
);
|
||||
for (_part, (bruteforce, fast)) in zip {
|
||||
assert_eq!(bruteforce, fast);
|
||||
}
|
||||
|
||||
println!("No issues found");
|
||||
}
|
||||
|
||||
// Define and name the benchmark function
|
||||
let mut group = c.benchmark_group("real_map");
|
||||
group.bench_function("uniform_queries", |b| {
|
||||
b.iter(|| {
|
||||
for q in queries.clone().into_iter() {
|
||||
layer_map.search(q.0, q.1);
|
||||
}
|
||||
});
|
||||
});
|
||||
group.bench_function("get_difficulty_map", |b| {
|
||||
b.iter(|| {
|
||||
layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
|
||||
});
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
|
||||
fn bench_sequential(c: &mut Criterion) {
|
||||
let mut layer_map: LayerMap<dyn Layer> = LayerMap::default();
|
||||
|
||||
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
|
||||
//
|
||||
// TODO This code is pretty slow and runs even if we're only running other
|
||||
@@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) {
|
||||
// Putting it inside the `bench_function` closure is not a solution
|
||||
// because then it runs multiple times during warmup.
|
||||
let now = Instant::now();
|
||||
let mut layer_map = LayerMap::default();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for i in 0..100_000 {
|
||||
// TODO try inserting a super-wide layer in between every 10 to reflect
|
||||
// what often happens with L1 layers that include non-rel changes.
|
||||
// Maybe do that as a separate test.
|
||||
let i32 = (i as u32) % 100;
|
||||
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
|
||||
let layer = DummyImage {
|
||||
key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
lsn: Lsn(10 * i),
|
||||
let layer = LayerDescriptor {
|
||||
key: zero.add(10 * i32)..zero.add(10 * i32 + 1),
|
||||
lsn: Lsn(i)..Lsn(i + 1),
|
||||
is_incremental: false,
|
||||
short_id: format!("Layer {}", i),
|
||||
};
|
||||
layer_map.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
}
|
||||
|
||||
// Manually measure runtime without criterion because criterion
|
||||
// has a minimum sample size of 10 and I don't want to run it 10 times.
|
||||
println!("Finished init in {:?}", now.elapsed());
|
||||
updates.flush();
|
||||
println!("Finished layer map init in {:?}", now.elapsed());
|
||||
|
||||
// Choose 100 uniformly random queries
|
||||
let rng = &mut StdRng::seed_from_u64(1);
|
||||
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
|
||||
.choose_multiple(rng, 1)
|
||||
.choose_multiple(rng, 100)
|
||||
.copied()
|
||||
.collect();
|
||||
|
||||
// Define and name the benchmark function
|
||||
c.bench_function("sequential_uniform_queries", |b| {
|
||||
// Run the search queries
|
||||
let mut group = c.benchmark_group("sequential");
|
||||
group.bench_function("uniform_queries", |b| {
|
||||
b.iter(|| {
|
||||
for q in queries.clone().into_iter() {
|
||||
layer_map.search(q.0, q.1);
|
||||
}
|
||||
});
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
criterion_group!(group_1, bench_from_captest_env);
|
||||
|
||||
@@ -693,6 +693,11 @@ impl PageServerConf {
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
}
|
||||
|
||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ pub async fn collect_metrics(
|
||||
None,
|
||||
None,
|
||||
"synthetic size calculation",
|
||||
true,
|
||||
false,
|
||||
async move {
|
||||
calculate_synthetic_size_worker(synthetic_size_calculation_interval)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
|
||||
@@ -430,6 +430,13 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: inputs_only
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: boolean
|
||||
description: |
|
||||
When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
|
||||
get:
|
||||
description: |
|
||||
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
|
||||
@@ -449,8 +456,9 @@ paths:
|
||||
format: hex
|
||||
size:
|
||||
type: integer
|
||||
nullable: true
|
||||
description: |
|
||||
Size metric in bytes.
|
||||
Size metric in bytes or null if inputs_only=true was given.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
|
||||
@@ -239,11 +239,7 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
|
||||
request
|
||||
.uri()
|
||||
.query()
|
||||
.map(|v| {
|
||||
url::form_urlencoded::parse(v.as_bytes())
|
||||
.into_owned()
|
||||
.any(|(p, _)| p == param)
|
||||
})
|
||||
.map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
@@ -252,13 +248,12 @@ fn get_query_param(request: &Request<Body>, param_name: &str) -> Result<String,
|
||||
Err(ApiError::BadRequest(anyhow!("empty query in request"))),
|
||||
|v| {
|
||||
url::form_urlencoded::parse(v.as_bytes())
|
||||
.into_owned()
|
||||
.find(|(k, _)| k == param_name)
|
||||
.map_or(
|
||||
Err(ApiError::BadRequest(anyhow!(
|
||||
"no {param_name} specified in query parameters"
|
||||
))),
|
||||
|(_, v)| Ok(v),
|
||||
|(_, v)| Ok(v.into_owned()),
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -282,7 +277,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let timeline_info = build_timeline_info(&timeline, include_non_incremental_logical_size)
|
||||
.await
|
||||
.context("Failed to get local timeline info: {e:#}")
|
||||
.context("get local timeline info")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok::<_, ApiError>(timeline_info)
|
||||
@@ -453,21 +448,39 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
}
|
||||
|
||||
/// HTTP endpoint to query the current tenant_size of a tenant.
|
||||
///
|
||||
/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
|
||||
/// to debug any of the calculations. Requires `tenant_id` request parameter, supports
|
||||
/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
|
||||
/// values.
|
||||
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let inputs_only = if query_param_present(&request, "inputs_only") {
|
||||
get_query_param(&request, "inputs_only")?
|
||||
.parse()
|
||||
.map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))?
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||
// this can be a long operation
|
||||
let inputs = tenant
|
||||
.gather_size_inputs()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
|
||||
let size = if !inputs_only {
|
||||
Some(inputs.calculate().map_err(ApiError::InternalServerError)?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
/// Private response type with the additional "unstable" `inputs` field.
|
||||
///
|
||||
@@ -479,7 +492,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
id: TenantId,
|
||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||
size: u64,
|
||||
///
|
||||
/// Will be none if `?inputs_only=true` was given.
|
||||
size: Option<u64>,
|
||||
inputs: crate::tenant::size::ModelInputs,
|
||||
}
|
||||
|
||||
|
||||
@@ -488,7 +488,7 @@ impl Timeline {
|
||||
let mut buf = self
|
||||
.get(relsize_key, lsn)
|
||||
.await
|
||||
.context("read relation size of {rel:?}")?;
|
||||
.with_context(|| format!("read relation size of {rel:?}"))?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as u64;
|
||||
@@ -1405,15 +1405,15 @@ fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
|
||||
Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: segno,
|
||||
field4: 0,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..Key {
|
||||
field1: 0x01,
|
||||
field2,
|
||||
field3: segno,
|
||||
field4: 0,
|
||||
field3: 1,
|
||||
field4: segno,
|
||||
field5: 1,
|
||||
field6: 0,
|
||||
}
|
||||
|
||||
@@ -37,6 +37,17 @@ impl Key {
|
||||
| self.field6 as i128
|
||||
}
|
||||
|
||||
pub fn from_i128(x: i128) -> Self {
|
||||
Key {
|
||||
field1: ((x >> 120) & 0xf) as u8,
|
||||
field2: ((x >> 104) & 0xFFFF) as u32,
|
||||
field3: (x >> 72) as u32,
|
||||
field4: (x >> 40) as u32,
|
||||
field5: (x >> 32) as u8,
|
||||
field6: x as u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next(&self) -> Key {
|
||||
self.add(1)
|
||||
}
|
||||
|
||||
@@ -183,12 +183,29 @@ pub enum TaskKind {
|
||||
// associated with one later, after receiving a command from the client.
|
||||
PageRequestHandler,
|
||||
|
||||
// Manages the WAL receiver connection for one timeline. It subscribes to
|
||||
// events from storage_broker, decides which safekeeper to connect to. It spawns a
|
||||
// separate WalReceiverConnection task to handle each connection.
|
||||
/// Manages the WAL receiver connection for one timeline.
|
||||
/// It subscribes to events from storage_broker and decides which safekeeper to connect to.
|
||||
/// Once the decision has been made, it establishes the connection using the `tokio-postgres` library.
|
||||
/// There is at most one connection at any given time.
|
||||
///
|
||||
/// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`.
|
||||
/// The `Client` object is what library users use to make requests & get responses.
|
||||
/// Internally, `Client` hands over requests to the `Connection` object.
|
||||
/// The `Connection` object is responsible for speaking the wire protocol.
|
||||
///
|
||||
/// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection.
|
||||
/// That abstraction doesn't use `task_mgr` and hence, has no `TaskKind`.
|
||||
/// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task.
|
||||
///
|
||||
/// Once the connection is established, the `TaskHandle` task creates a
|
||||
/// [`WalReceiverConnection`] task_mgr task that is responsible for polling
|
||||
/// the `Connection` object.
|
||||
/// A `CancellationToken` created by the `TaskHandle` task ensures
|
||||
/// that the [`WalReceiverConnection`] task will cancel soon after the `TaskHandle` is dropped.
|
||||
WalReceiverManager,
|
||||
|
||||
// Handles a connection to a safekeeper, to stream WAL to a timeline.
|
||||
/// The task that polls the `tokio-postgres::Connection` object.
|
||||
/// See the comment on [`WalReceiverManager`].
|
||||
WalReceiverConnection,
|
||||
|
||||
// Garbage collection worker. One per tenant
|
||||
|
||||
@@ -188,7 +188,7 @@ impl UninitializedTimeline<'_> {
|
||||
mut self,
|
||||
timelines: &mut HashMap<TimelineId, Arc<Timeline>>,
|
||||
load_layer_map: bool,
|
||||
launch_wal_receiver: bool,
|
||||
activate: bool,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_id = self.timeline_id;
|
||||
let tenant_id = self.owning_tenant.tenant_id;
|
||||
@@ -221,13 +221,12 @@ impl UninitializedTimeline<'_> {
|
||||
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
|
||||
)
|
||||
})?;
|
||||
new_timeline.set_state(TimelineState::Active);
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
|
||||
new_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
if launch_wal_receiver {
|
||||
new_timeline.launch_wal_receiver();
|
||||
if activate {
|
||||
new_timeline.activate();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1462,8 +1461,7 @@ impl Tenant {
|
||||
tasks::start_background_loops(self.tenant_id);
|
||||
|
||||
for timeline in not_broken_timelines {
|
||||
timeline.set_state(TimelineState::Active);
|
||||
timeline.launch_wal_receiver();
|
||||
timeline.activate();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1487,7 +1485,7 @@ impl Tenant {
|
||||
.values()
|
||||
.filter(|timeline| timeline.current_state() != TimelineState::Broken);
|
||||
for timeline in not_broken_timelines {
|
||||
timeline.set_state(TimelineState::Suspended);
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
}
|
||||
}
|
||||
TenantState::Broken => {
|
||||
@@ -2626,10 +2624,10 @@ where
|
||||
#[cfg(test)]
|
||||
pub mod harness {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::{fs, path::PathBuf};
|
||||
use tempfile::TempDir;
|
||||
use utils::logging;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -2661,6 +2659,8 @@ pub mod harness {
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
|
||||
|
||||
impl From<TenantConf> for TenantConfOpt {
|
||||
fn from(tenant_conf: TenantConf) -> Self {
|
||||
Self {
|
||||
@@ -2681,31 +2681,42 @@ pub mod harness {
|
||||
}
|
||||
}
|
||||
|
||||
/// The harness saves some boilerplate and provides a way to create functional tenant
|
||||
/// without running pageserver binary. It uses temporary directory to store data in it.
|
||||
/// Tempdir gets removed on harness drop.
|
||||
pub struct TenantHarness {
|
||||
// keep the struct to not to remove tmp dir during the test
|
||||
_temp_repo_dir: TempDir,
|
||||
pub struct TenantHarness<'a> {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: TenantId,
|
||||
|
||||
pub lock_guard: (
|
||||
Option<RwLockReadGuard<'a, ()>>,
|
||||
Option<RwLockWriteGuard<'a, ()>>,
|
||||
),
|
||||
}
|
||||
|
||||
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
impl TenantHarness {
|
||||
pub fn new() -> anyhow::Result<Self> {
|
||||
impl<'a> TenantHarness<'a> {
|
||||
pub fn create(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
Self::create_internal(test_name, false)
|
||||
}
|
||||
pub fn create_exclusive(test_name: &'static str) -> anyhow::Result<Self> {
|
||||
Self::create_internal(test_name, true)
|
||||
}
|
||||
fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result<Self> {
|
||||
let lock_guard = if exclusive {
|
||||
(None, Some(LOCK.write().unwrap()))
|
||||
} else {
|
||||
(Some(LOCK.read().unwrap()), None)
|
||||
};
|
||||
|
||||
LOG_HANDLE.get_or_init(|| {
|
||||
logging::init(logging::LogFormat::Test).expect("Failed to init test logging")
|
||||
});
|
||||
|
||||
let temp_repo_dir = tempfile::tempdir()?;
|
||||
// `TempDir` uses a randomly generated subdirectory of a system tmp dir,
|
||||
// so far it's enough to take care of concurrently running tests.
|
||||
let repo_dir = temp_repo_dir.path();
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf());
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
@@ -2723,10 +2734,10 @@ pub mod harness {
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
|
||||
|
||||
Ok(Self {
|
||||
_temp_repo_dir: temp_repo_dir,
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
lock_guard,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -2820,8 +2831,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_basic() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_basic")?.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -2854,8 +2864,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_duplicate_timelines() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("no_duplicate_timelines")?
|
||||
.load()
|
||||
.await;
|
||||
let _ = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -2886,8 +2897,7 @@ mod tests {
|
||||
///
|
||||
#[tokio::test]
|
||||
async fn test_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_branch")?.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -2984,8 +2994,10 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3020,8 +3032,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?
|
||||
.load()
|
||||
.await;
|
||||
|
||||
tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?
|
||||
@@ -3070,8 +3083,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3093,8 +3107,9 @@ mod tests {
|
||||
}
|
||||
#[tokio::test]
|
||||
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")?
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3125,7 +3140,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeline_load() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
const TEST_NAME: &str = "timeline_load";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
{
|
||||
let tenant = harness.load().await;
|
||||
let tline = tenant
|
||||
@@ -3144,7 +3160,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
const TEST_NAME: &str = "timeline_load_with_ancestor";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
// create two timelines
|
||||
{
|
||||
let tenant = harness.load().await;
|
||||
@@ -3182,7 +3199,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn corrupt_metadata() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let tenant = harness.load().await;
|
||||
|
||||
tenant
|
||||
@@ -3223,8 +3241,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_images() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_images")?.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3291,8 +3308,7 @@ mod tests {
|
||||
//
|
||||
#[tokio::test]
|
||||
async fn test_bulk_insert() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_bulk_insert")?.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3336,8 +3352,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_random_updates")?.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3410,8 +3425,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_branches() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_traverse_branches")?
|
||||
.load()
|
||||
.await;
|
||||
let mut tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
@@ -3495,8 +3511,9 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_traverse_ancestors() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_traverse_ancestors")?
|
||||
.load()
|
||||
.await;
|
||||
let mut tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
|
||||
.initialize()?;
|
||||
|
||||
@@ -76,7 +76,7 @@ impl EphemeralFile {
|
||||
})
|
||||
}
|
||||
|
||||
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> {
|
||||
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
|
||||
let mut off = 0;
|
||||
while off < PAGE_SZ {
|
||||
let n = self
|
||||
@@ -277,7 +277,7 @@ impl Drop for EphemeralFile {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> {
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
@@ -332,17 +332,25 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use crate::tenant::harness::TenantHarness;
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
fn harness() -> Result<(TenantHarness, TimelineId), io::Error> {
|
||||
let harness = TenantHarness::new().expect("Failed to create tenant harness");
|
||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(harness.timeline_path(&timeline_id))?;
|
||||
fn harness(
|
||||
test_name: &str,
|
||||
) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
Ok((harness, timeline_id))
|
||||
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?;
|
||||
|
||||
Ok((conf, tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
@@ -359,10 +367,10 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> io::Result<()> {
|
||||
let (harness, timeline_id) = harness()?;
|
||||
fn test_ephemeral_files() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
|
||||
|
||||
let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?;
|
||||
let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
file_a.write_all_at(b"foo", 0)?;
|
||||
assert_eq!("foo", read_string(&file_a, 0, 20)?);
|
||||
@@ -373,7 +381,7 @@ mod tests {
|
||||
// Open a lot of files, enough to cause some page evictions.
|
||||
let mut efiles = Vec::new();
|
||||
for fileno in 0..100 {
|
||||
let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?;
|
||||
let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
|
||||
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
|
||||
efiles.push((fileno, efile));
|
||||
@@ -390,10 +398,10 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_blobs() -> io::Result<()> {
|
||||
let (harness, timeline_id) = harness()?;
|
||||
fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?;
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo")?;
|
||||
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
583
pageserver/src/tenant/layer_map/historic_layer_coverage.rs
Normal file
583
pageserver/src/tenant/layer_map/historic_layer_coverage.rs
Normal file
@@ -0,0 +1,583 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Range;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
use super::layer_coverage::LayerCoverageTuple;
|
||||
|
||||
/// Identity of a layer, and the key under which this module indexes it.
///
/// The type mainly exists so that layers sort by `lsn.start`.
///
/// Key range, LSN range and the image/delta flag are sufficient to pin
/// down a layer: a layer must contain everything inside its range, so two
/// deltas (or two images) covering the same range hold identical content.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct LayerKey {
    // TODO i128 and u64 were picked because they were convenient for
    //      prototyping, testing, and benchmarking. Using the Lsn and Key
    //      types would be preferable if that comes without overhead.
    pub key: Range<i128>,
    pub lsn: Range<u64>,
    pub is_image: bool,
}

impl PartialOrd for LayerKey {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // Defer to the total order so the two impls cannot disagree.
        Some(Ord::cmp(self, other))
    }
}

impl Ord for LayerKey {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // NOTE lsn.start has to be the most significant component.
        // Tuples compare lexicographically, which yields the precedence
        // lsn.start, lsn.end, key.start, key.end, is_image.
        let sort_key = |k: &Self| (k.lsn.start, k.lsn.end, k.key.start, k.key.end, k.is_image);
        sort_key(self).cmp(&sort_key(other))
    }
}
|
||||
|
||||
/// Efficiently queryable layer coverage for each LSN.
|
||||
///
|
||||
/// Allows answering layer map queries very efficiently,
|
||||
/// but doesn't allow retroactive insertion, which is
|
||||
/// sometimes necessary. See BufferedHistoricLayerCoverage.
|
||||
pub struct HistoricLayerCoverage<Value> {
|
||||
/// The latest state
|
||||
head: LayerCoverageTuple<Value>,
|
||||
|
||||
/// All previous states
|
||||
historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
|
||||
}
|
||||
|
||||
impl<T: Clone> Default for HistoricLayerCoverage<T> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Value: Clone> HistoricLayerCoverage<Value> {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
head: LayerCoverageTuple::default(),
|
||||
historic: BTreeMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add a layer
|
||||
///
|
||||
/// Panics if new layer has older lsn.start than an existing layer.
|
||||
/// See BufferedHistoricLayerCoverage for a more general insertion method.
|
||||
pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
|
||||
// It's only a persistent map, not a retroactive one
|
||||
if let Some(last_entry) = self.historic.iter().next_back() {
|
||||
let last_lsn = last_entry.0;
|
||||
if layer_key.lsn.start < *last_lsn {
|
||||
panic!("unexpected retroactive insert");
|
||||
}
|
||||
}
|
||||
|
||||
// Insert into data structure
|
||||
if layer_key.is_image {
|
||||
self.head
|
||||
.image_coverage
|
||||
.insert(layer_key.key, layer_key.lsn.clone(), value);
|
||||
} else {
|
||||
self.head
|
||||
.delta_coverage
|
||||
.insert(layer_key.key, layer_key.lsn.clone(), value);
|
||||
}
|
||||
|
||||
// Remember history. Clone is O(1)
|
||||
self.historic.insert(layer_key.lsn.start, self.head.clone());
|
||||
}
|
||||
|
||||
/// Query at a particular LSN, inclusive
|
||||
pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple<Value>> {
|
||||
match self.historic.range(..=lsn).next_back() {
|
||||
Some((_, v)) => Some(v),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove all entries after a certain LSN (inclusive)
|
||||
pub fn trim(&mut self, begin: &u64) {
|
||||
self.historic.split_off(begin);
|
||||
self.head = self
|
||||
.historic
|
||||
.iter()
|
||||
.rev()
|
||||
.next()
|
||||
.map(|(_, v)| v.clone())
|
||||
.unwrap_or_default();
|
||||
}
|
||||
}
|
||||
|
||||
/// The smallest example of the intended usage.
/// Every layer in this test has height 1.
#[test]
fn test_persistent_simple() {
    let mut map = HistoricLayerCoverage::<String>::new();
    for (key, lsn, name) in [
        (0..5, 100..101, "Layer 1"),
        (3..9, 110..111, "Layer 2"),
        (5..6, 120..121, "Layer 3"),
    ] {
        map.insert(
            LayerKey {
                key,
                lsn,
                is_image: true,
            },
            name.to_string(),
        );
    }

    // Shorthand for the expected query result
    let layer = |name: &str| Some(name.to_string());

    // After Layer 1 insertion
    let version = map.get_version(105).unwrap();
    assert_eq!(version.image_coverage.query(1), layer("Layer 1"));
    assert_eq!(version.image_coverage.query(4), layer("Layer 1"));

    // After Layer 2 insertion
    let version = map.get_version(115).unwrap();
    assert_eq!(version.image_coverage.query(4), layer("Layer 2"));
    assert_eq!(version.image_coverage.query(8), layer("Layer 2"));
    assert_eq!(version.image_coverage.query(11), None);

    // After Layer 3 insertion
    let version = map.get_version(125).unwrap();
    assert_eq!(version.image_coverage.query(4), layer("Layer 2"));
    assert_eq!(version.image_coverage.query(5), layer("Layer 3"));
    assert_eq!(version.image_coverage.query(7), layer("Layer 2"));
}
|
||||
|
||||
/// Boundary checks on both the LSN axis and the key axis for a single layer
#[test]
fn test_off_by_one() {
    let mut map = HistoricLayerCoverage::<String>::new();
    let layer_key = LayerKey {
        key: 3..5,
        lsn: 100..110,
        is_image: true,
    };
    map.insert(layer_key, "Layer 1".to_string());

    let layer_1 = Some("Layer 1".to_string());

    // Check different LSNs
    assert!(map.get_version(99).is_none());
    for lsn in [100, 110] {
        let version = map.get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(4), layer_1);
    }

    // Check different keys
    let version = map.get_version(105).unwrap();
    for (key, expected) in [
        (2, None),
        (3, layer_1.clone()),
        (4, layer_1.clone()),
        (5, None),
    ] {
        assert_eq!(version.image_coverage.query(key), expected);
    }
}
|
||||
|
||||
/// Layers that start or end on the same key must not bleed into each other
#[test]
fn test_key_collision() {
    let mut map = HistoricLayerCoverage::<String>::new();
    for (key, lsn, name) in [
        (3..5, 100..110, "Layer 10"),
        (5..8, 100..110, "Layer 11"),
        (3..4, 200..210, "Layer 20"),
    ] {
        map.insert(
            LayerKey {
                key,
                lsn,
                is_image: true,
            },
            name.to_string(),
        );
    }

    // First check after layer 11, then after layer 20. Only the layer
    // visible at key 3 differs between the two versions.
    for (lsn, layer_at_3) in [(105, "Layer 10"), (205, "Layer 20")] {
        let version = map.get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(2), None);
        assert_eq!(
            version.image_coverage.query(3),
            Some(layer_at_3.to_string())
        );
        assert_eq!(
            version.image_coverage.query(5),
            Some("Layer 11".to_string())
        );
        assert_eq!(
            version.image_coverage.query(7),
            Some("Layer 11".to_string())
        );
        assert_eq!(version.image_coverage.query(8), None);
    }
}
|
||||
|
||||
/// Rectangles with nontrivial height that may overlap each other
#[test]
fn test_persistent_overlapping() {
    let mut map = HistoricLayerCoverage::<String>::new();

    for (key, lsn, name) in [
        // 3 key-disjoint layers with varying LSN ranges
        (1..2, 100..200, "Layer 1"),
        (4..5, 110..200, "Layer 2"),
        (7..8, 120..300, "Layer 3"),
        // Wide and short layer
        (0..9, 130..199, "Layer 4"),
        // Wide layer taller than some
        (0..9, 140..201, "Layer 5"),
        // Wide layer taller than all
        (0..9, 150..301, "Layer 6"),
    ] {
        map.insert(
            LayerKey {
                key,
                lsn,
                is_image: true,
            },
            name.to_string(),
        );
    }

    // Each row: the LSN to query at, then the expected layer for keys
    // 0, 1, 2, 4, 5, 7 and 8 in that order.
    for (lsn, expected) in [
        // After layer 4 insertion
        (
            135,
            [
                "Layer 4", "Layer 1", "Layer 4", "Layer 2", "Layer 4", "Layer 3", "Layer 4",
            ],
        ),
        // After layer 5 insertion
        (
            145,
            [
                "Layer 5", "Layer 5", "Layer 5", "Layer 5", "Layer 5", "Layer 3", "Layer 5",
            ],
        ),
        // After layer 6 insertion
        (
            155,
            [
                "Layer 6", "Layer 6", "Layer 6", "Layer 6", "Layer 6", "Layer 6", "Layer 6",
            ],
        ),
    ] {
        let version = map.get_version(lsn).unwrap();
        for (key, name) in [0, 1, 2, 4, 5, 7, 8].into_iter().zip(expected) {
            assert_eq!(version.image_coverage.query(key), Some(name.to_string()));
        }
    }
}
|
||||
|
||||
/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack
|
||||
/// of support for retroactive insertion by rebuilding the map since the
|
||||
/// change.
|
||||
///
|
||||
/// Why is this needed? We most often insert new layers with newer LSNs,
|
||||
/// but during compaction we create layers with non-latest LSN, and during
|
||||
/// GC we delete historic layers.
|
||||
///
|
||||
/// Even though rebuilding is an expensive (N log N) solution to the problem,
|
||||
/// it's not critical since we do something equally expensive just to decide
|
||||
/// whether or not to create new image layers.
|
||||
/// TODO It's not expensive but it's not great to hold a layer map write lock
|
||||
/// for that long.
|
||||
///
|
||||
/// If this becomes an actual bottleneck, one solution would be to build a
|
||||
/// segment tree that holds PersistentLayerMaps. Though this would mean that
|
||||
/// we take an additional log(N) performance hit for queries, which will probably
|
||||
/// still be more critical.
|
||||
///
|
||||
/// See this for more on persistent and retroactive techniques:
|
||||
/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s
|
||||
pub struct BufferedHistoricLayerCoverage<Value> {
|
||||
/// A persistent layer map that we rebuild when we need to retroactively update
|
||||
historic_coverage: HistoricLayerCoverage<Value>,
|
||||
|
||||
/// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds.
|
||||
buffer: BTreeMap<LayerKey, Option<Value>>,
|
||||
|
||||
/// All current layers. This is not used for search. Only to make rebuilds easier.
|
||||
layers: BTreeMap<LayerKey, Value>,
|
||||
}
|
||||
|
||||
impl<T: std::fmt::Debug> std::fmt::Debug for BufferedHistoricLayerCoverage<T> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("RetroactiveLayerMap")
|
||||
.field("buffer", &self.buffer)
|
||||
.field("layers", &self.layers)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Clone> Default for BufferedHistoricLayerCoverage<T> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
historic_coverage: HistoricLayerCoverage::<Value>::new(),
|
||||
buffer: BTreeMap::new(),
|
||||
layers: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, layer_key: LayerKey, value: Value) {
|
||||
self.buffer.insert(layer_key, Some(value));
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, layer_key: LayerKey) {
|
||||
self.buffer.insert(layer_key, None);
|
||||
}
|
||||
|
||||
pub fn rebuild(&mut self) {
|
||||
// Find the first LSN that needs to be rebuilt
|
||||
let rebuild_since: u64 = match self.buffer.iter().next() {
|
||||
Some((LayerKey { lsn, .. }, _)) => lsn.start,
|
||||
None => return, // No need to rebuild if buffer is empty
|
||||
};
|
||||
|
||||
// Apply buffered updates to self.layers
|
||||
let num_updates = self.buffer.len();
|
||||
self.buffer.retain(|layer_key, layer| {
|
||||
match layer {
|
||||
Some(l) => {
|
||||
self.layers.insert(layer_key.clone(), l.clone());
|
||||
}
|
||||
None => {
|
||||
self.layers.remove(layer_key);
|
||||
}
|
||||
};
|
||||
false
|
||||
});
|
||||
|
||||
// Rebuild
|
||||
let mut num_inserted = 0;
|
||||
self.historic_coverage.trim(&rebuild_since);
|
||||
for (layer_key, layer) in self.layers.range(
|
||||
LayerKey {
|
||||
lsn: rebuild_since..0,
|
||||
key: 0..0,
|
||||
is_image: false,
|
||||
}..,
|
||||
) {
|
||||
self.historic_coverage
|
||||
.insert(layer_key.clone(), layer.clone());
|
||||
num_inserted += 1;
|
||||
}
|
||||
|
||||
// TODO maybe only warn if ratio is at least 10
|
||||
info!(
|
||||
"Rebuilt layer map. Did {} insertions to process a batch of {} updates.",
|
||||
num_inserted, num_updates,
|
||||
)
|
||||
}
|
||||
|
||||
/// Iterate all the layers
|
||||
pub fn iter(&self) -> impl '_ + Iterator<Item = Value> {
|
||||
// NOTE we can actually perform this without rebuilding,
|
||||
// but it's not necessary for now.
|
||||
if !self.buffer.is_empty() {
|
||||
panic!("rebuild pls")
|
||||
}
|
||||
|
||||
self.layers.values().cloned()
|
||||
}
|
||||
|
||||
/// Return a reference to a queryable map, assuming all updates
|
||||
/// have already been processed using self.rebuild()
|
||||
pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage<Value>> {
|
||||
// NOTE we error here instead of implicitly rebuilding because
|
||||
// rebuilding is somewhat expensive.
|
||||
// TODO maybe implicitly rebuild and log/sentry an error?
|
||||
if !self.buffer.is_empty() {
|
||||
anyhow::bail!("rebuild required")
|
||||
}
|
||||
|
||||
Ok(&self.historic_coverage)
|
||||
}
|
||||
}
|
||||
|
||||
/// Regression test: a single delta layer with a very wide key range must be
/// visible at its own `lsn.end`.
#[test]
fn test_retroactive_regression_1() {
    let mut map = BufferedHistoricLayerCoverage::new();

    let layer_key = LayerKey {
        key: 0..21267647932558653966460912964485513215,
        lsn: 23761336..23761457,
        is_image: false,
    };
    map.insert(layer_key, "sdfsdfs".to_string());
    map.rebuild();

    let coverage = map.get().unwrap();
    let version = coverage.get_version(23761457).unwrap();
    let found = version.delta_coverage.query(100);
    assert_eq!(found, Some("sdfsdfs".to_string()));
}
|
||||
|
||||
/// Inserts layers (one of them out of LSN order), checks which layer is
/// visible for key 4 at several LSNs, then removes a layer and checks again.
#[test]
fn test_retroactive_simple() {
    let mut map = BufferedHistoricLayerCoverage::new();

    // Every layer in this test is inserted as an image layer; only the key
    // and LSN ranges differ.
    let image = |key, lsn| LayerKey {
        key,
        lsn,
        is_image: true,
    };

    let inserts = [
        // Some images in increasing LSN order
        (image(0..5, 100..101), "Image 1"),
        (image(3..9, 110..111), "Image 2"),
        (image(4..6, 120..121), "Image 3"),
        (image(8..9, 120..121), "Image 4"),
        // A layer added out of LSN order. Despite its name it is inserted
        // with `is_image: true`, so it shows up in `image_coverage`.
        (image(2..5, 105..106), "Delta 1"),
    ];
    for (layer_key, name) in inserts {
        map.insert(layer_key, name.to_string());
    }

    // Rebuild so we can start querying
    map.rebuild();

    // Query key 4
    assert!(map.get().unwrap().get_version(90).is_none());
    let expectations = [
        (102, "Image 1"),
        (107, "Delta 1"),
        (115, "Image 2"),
        (125, "Image 3"),
    ];
    for (lsn, expected) in expectations {
        let version = map.get().unwrap().get_version(lsn).unwrap();
        assert_eq!(version.image_coverage.query(4), Some(expected.to_string()));
    }

    // Remove Image 3
    map.remove(image(4..6, 120..121));
    map.rebuild();

    // Check deletion worked
    let version = map.get().unwrap().get_version(125).unwrap();
    assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string()));
    assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string()));
}
|
||||
229
pageserver/src/tenant/layer_map/layer_coverage.rs
Normal file
229
pageserver/src/tenant/layer_map/layer_coverage.rs
Normal file
@@ -0,0 +1,229 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use im::OrdMap;
|
||||
use rpds::RedBlackTreeMapSync;
|
||||
|
||||
/// Data structure that can efficiently:
|
||||
/// - find the latest layer by lsn.end at a given key
|
||||
/// - iterate the latest layers in a key range
|
||||
/// - insert layers in non-decreasing lsn.start order
|
||||
///
|
||||
/// The struct is parameterized over Value for easier
|
||||
/// testing, but in practice it's some sort of layer.
|
||||
pub struct LayerCoverage<Value> {
|
||||
/// For every change in coverage (as we sweep the key space)
|
||||
/// we store (lsn.end, value).
|
||||
///
|
||||
/// We use an immutable/persistent tree so that we can keep historic
|
||||
/// versions of this coverage without cloning the whole thing and
|
||||
/// incurring quadratic memory cost. See HistoricLayerCoverage.
|
||||
///
|
||||
/// We use the Sync version of the map because we want Self to
|
||||
/// be Sync. Using nonsync might be faster, if we can work with
|
||||
/// that.
|
||||
nodes: RedBlackTreeMapSync<i128, Option<(u64, Value)>>,
|
||||
im_nodes: OrdMap<i128, Option<(u64, Value)>>,
|
||||
}
|
||||
|
||||
impl<T: Clone> Default for LayerCoverage<T> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl<Value: Clone> LayerCoverage<Value> {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
nodes: RedBlackTreeMapSync::default(),
|
||||
im_nodes: OrdMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to subdivide the key range without changing any values
|
||||
///
|
||||
/// Complexity: O(log N)
|
||||
fn add_node(&mut self, key: i128) {
|
||||
let value = match self.nodes.range(..=key).last() {
|
||||
Some((_, Some(v))) => Some(v.clone()),
|
||||
Some((_, None)) => None,
|
||||
None => None,
|
||||
};
|
||||
self.nodes.insert_mut(key, value);
|
||||
|
||||
let im_value = match self.im_nodes.range(..=key).last() {
|
||||
Some((_, Some(v))) => Some(v.clone()),
|
||||
Some((_, None)) => None,
|
||||
None => None,
|
||||
};
|
||||
self.im_nodes.remove(&key);
|
||||
self.im_nodes.insert(key, im_value);
|
||||
}
|
||||
|
||||
/// Insert a layer.
|
||||
///
|
||||
/// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation.
|
||||
pub fn insert(&mut self, key: Range<i128>, lsn: Range<u64>, value: Value) {
|
||||
// Add nodes at endpoints
|
||||
//
|
||||
// NOTE The order of lines is important. We add nodes at the start
|
||||
// and end of the key range **before updating any nodes** in order
|
||||
// to pin down the current coverage outside of the relevant key range.
|
||||
// Only the coverage inside the layer's key range should change.
|
||||
self.add_node(key.start);
|
||||
self.add_node(key.end);
|
||||
|
||||
// Raise the height where necessary
|
||||
//
|
||||
// NOTE This loop is worst case O(N), but amortized O(log N) in the special
|
||||
// case when rectangles have no height. In practice I don't think we'll see
|
||||
// the kind of layer intersections needed to trigger O(N) behavior. The worst
|
||||
// case is N/2 horizontal layers overlapped with N/2 vertical layers in a
|
||||
// grid pattern.
|
||||
let mut to_update = Vec::new();
|
||||
let mut to_remove = Vec::new();
|
||||
let mut prev_covered = false;
|
||||
for (k, node) in self.nodes.range(key.clone()) {
|
||||
let needs_cover = match node {
|
||||
None => true,
|
||||
Some((h, _)) => h < &lsn.end,
|
||||
};
|
||||
if needs_cover {
|
||||
match prev_covered {
|
||||
true => to_remove.push(*k),
|
||||
false => to_update.push(*k),
|
||||
}
|
||||
}
|
||||
prev_covered = needs_cover;
|
||||
}
|
||||
if !prev_covered {
|
||||
to_remove.push(key.end);
|
||||
}
|
||||
for k in to_update {
|
||||
self.nodes.insert_mut(k, Some((lsn.end, value.clone())));
|
||||
}
|
||||
for k in to_remove {
|
||||
self.nodes.remove_mut(&k);
|
||||
}
|
||||
|
||||
|
||||
let mut to_update = Vec::new();
|
||||
let mut to_remove = Vec::new();
|
||||
let mut prev_covered = false;
|
||||
for (k, node) in self.im_nodes.range(key.clone()) {
|
||||
let needs_cover = match node {
|
||||
None => true,
|
||||
Some((h, _)) => h < &lsn.end,
|
||||
};
|
||||
if needs_cover {
|
||||
match prev_covered {
|
||||
true => to_remove.push(*k),
|
||||
false => to_update.push(*k),
|
||||
}
|
||||
}
|
||||
prev_covered = needs_cover;
|
||||
}
|
||||
if !prev_covered {
|
||||
to_remove.push(key.end);
|
||||
}
|
||||
for k in to_update {
|
||||
self.im_nodes.remove(&k);
|
||||
self.im_nodes.insert(k, Some((lsn.end, value.clone())));
|
||||
}
|
||||
for k in to_remove {
|
||||
self.im_nodes.remove(&k);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
fn get_key_1(&self, key: i128) -> Option<u64> {
|
||||
self.im_nodes
|
||||
.get_prev(&key)?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(k, _)| k.clone())
|
||||
}
|
||||
fn get_key_2(&self, key: i128) -> Option<u64> {
|
||||
self.im_nodes
|
||||
.range(..=key)
|
||||
.rev()
|
||||
.next()?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(k, _)| k.clone())
|
||||
}
|
||||
|
||||
/// Get the latest (by lsn.end) layer at a given key
|
||||
///
|
||||
/// Complexity: O(log N)
|
||||
pub fn query(&self, key: i128) -> Option<Value> {
|
||||
|
||||
let k1 = self.get_key_1(key);
|
||||
let k2 = self.get_key_2(key);
|
||||
assert_eq!(k1, k2);
|
||||
|
||||
|
||||
// self.im_nodes
|
||||
// .get_prev(&key)?
|
||||
// .1
|
||||
// .as_ref()
|
||||
// .map(|(_, v)| v.clone())
|
||||
|
||||
self.im_nodes
|
||||
.range(..=key)
|
||||
.rev()
|
||||
.next()?
|
||||
.1
|
||||
.as_ref()
|
||||
.map(|(_, v)| v.clone())
|
||||
|
||||
// self.nodes
|
||||
// .range(..=key)
|
||||
// .rev()
|
||||
// .next()?
|
||||
// .1
|
||||
// .as_ref()
|
||||
// .map(|(_, v)| v.clone())
|
||||
}
|
||||
|
||||
/// Iterate the changes in layer coverage in a given range. You will likely
|
||||
/// want to start with self.query(key.start), and then follow up with self.range
|
||||
///
|
||||
/// Complexity: O(log N + result_size)
|
||||
pub fn range(&self, key: Range<i128>) -> impl '_ + Iterator<Item = (i128, Option<Value>)> {
|
||||
self.nodes
|
||||
.range(key)
|
||||
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
|
||||
}
|
||||
|
||||
/// O(1) clone
|
||||
pub fn clone(&self) -> Self {
|
||||
Self {
|
||||
nodes: self.nodes.clone(),
|
||||
im_nodes: self.im_nodes.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Image and delta coverage at a specific LSN.
|
||||
pub struct LayerCoverageTuple<Value> {
|
||||
pub image_coverage: LayerCoverage<Value>,
|
||||
pub delta_coverage: LayerCoverage<Value>,
|
||||
}
|
||||
|
||||
impl<T: Clone> Default for LayerCoverageTuple<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
image_coverage: LayerCoverage::default(),
|
||||
delta_coverage: LayerCoverage::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Value: Clone> LayerCoverageTuple<Value> {
|
||||
pub fn clone(&self) -> Self {
|
||||
Self {
|
||||
image_coverage: self.image_coverage.clone(),
|
||||
delta_coverage: self.delta_coverage.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1064,7 +1064,7 @@ mod tests {
|
||||
// Test scheduling
|
||||
#[test]
|
||||
fn upload_scheduling() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("upload_scheduling")?;
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
std::fs::create_dir_all(&timeline_path)?;
|
||||
|
||||
|
||||
@@ -23,7 +23,13 @@ use tracing::*;
|
||||
pub struct ModelInputs {
|
||||
updates: Vec<Update>,
|
||||
retention_period: u64,
|
||||
|
||||
/// Relevant lsns per timeline.
|
||||
///
|
||||
/// This field is not required for deserialization purposes, which is mostly used in tests. The
|
||||
/// LSNs explain the outcome (updates) but are not needed in size calculation.
|
||||
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
|
||||
#[serde(default)]
|
||||
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
|
||||
}
|
||||
|
||||
@@ -32,6 +38,8 @@ pub struct ModelInputs {
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, serde::Serialize, serde::Deserialize)]
|
||||
struct TimelineInputs {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
ancestor_lsn: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
last_record: Lsn,
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
@@ -178,19 +186,20 @@ pub(super) async fn gather_inputs(
|
||||
// our advantage with `?` error handling.
|
||||
let mut joinset = tokio::task::JoinSet::new();
|
||||
|
||||
let timelines = tenant
|
||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||
tenant
|
||||
.refresh_gc_info()
|
||||
.await
|
||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
if timelines.is_empty() {
|
||||
// All timelines are below tenant's gc_horizon; alternative would be to use
|
||||
// Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
|
||||
// missing GcInfo::retain_lsns or having obsolete values for cutoff's.
|
||||
// perhaps the tenant has just been created, and as such doesn't have any data yet
|
||||
return Ok(ModelInputs {
|
||||
updates: vec![],
|
||||
retention_period: 0,
|
||||
timeline_inputs: HashMap::new(),
|
||||
timeline_inputs: HashMap::default(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -201,13 +210,25 @@ pub(super) async fn gather_inputs(
|
||||
|
||||
let mut updates = Vec::new();
|
||||
|
||||
// record the per timline values used to determine `retention_period`
|
||||
// record the per timeline values useful to debug the model inputs, also used to track
|
||||
// ancestor_lsn without keeping a hold of Timeline
|
||||
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
|
||||
|
||||
// used to determine the `retention_period` for the size model
|
||||
let mut max_cutoff_distance = None;
|
||||
|
||||
// mapping from (TimelineId, Lsn) => if this branch point has been handled already via
|
||||
// GcInfo::retain_lsns or if it needs to have its logical_size calculated.
|
||||
let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new();
|
||||
|
||||
for timeline in timelines {
|
||||
if !timeline.is_active() {
|
||||
anyhow::bail!(
|
||||
"timeline {} is not active, cannot calculate tenant_size now",
|
||||
timeline.timeline_id
|
||||
);
|
||||
}
|
||||
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
|
||||
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
|
||||
@@ -273,13 +294,30 @@ pub(super) async fn gather_inputs(
|
||||
|
||||
// all timelines branch from something, because it might be impossible to pinpoint
|
||||
// which is the tenant_size_model's "default" branch.
|
||||
|
||||
let ancestor_lsn = timeline.get_ancestor_lsn();
|
||||
|
||||
updates.push(Update {
|
||||
lsn: timeline.get_ancestor_lsn(),
|
||||
lsn: ancestor_lsn,
|
||||
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
|
||||
timeline_id: timeline.timeline_id,
|
||||
});
|
||||
|
||||
if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() {
|
||||
// refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches
|
||||
// which are over gc_horizon. for example, a "main" branch which never received any
|
||||
// updates apart from initdb will not have branch points recorded.
|
||||
referenced_branch_froms
|
||||
.entry((parent_timeline_id, timeline.get_ancestor_lsn()))
|
||||
.or_default();
|
||||
}
|
||||
|
||||
for (lsn, _kind) in &interesting_lsns {
|
||||
// mark this visited so don't need to re-process this parent
|
||||
*referenced_branch_froms
|
||||
.entry((timeline.timeline_id, *lsn))
|
||||
.or_default() = true;
|
||||
|
||||
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
|
||||
updates.push(Update {
|
||||
lsn: *lsn,
|
||||
@@ -295,22 +333,10 @@ pub(super) async fn gather_inputs(
|
||||
}
|
||||
}
|
||||
|
||||
// all timelines also have an end point if they have made any progress
|
||||
if last_record_lsn > timeline.get_ancestor_lsn()
|
||||
&& !interesting_lsns
|
||||
.iter()
|
||||
.any(|(lsn, _)| lsn == &last_record_lsn)
|
||||
{
|
||||
updates.push(Update {
|
||||
lsn: last_record_lsn,
|
||||
command: Command::EndOfBranch,
|
||||
timeline_id: timeline.timeline_id,
|
||||
});
|
||||
}
|
||||
|
||||
timeline_inputs.insert(
|
||||
timeline.timeline_id,
|
||||
TimelineInputs {
|
||||
ancestor_lsn,
|
||||
last_record: last_record_lsn,
|
||||
// this is not used above, because it might not have updated recently enough
|
||||
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
@@ -321,6 +347,80 @@ pub(super) async fn gather_inputs(
|
||||
);
|
||||
}
|
||||
|
||||
// iterate over discovered branch points and make sure we are getting logical sizes at those
|
||||
// points.
|
||||
for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() {
|
||||
if *handled {
|
||||
continue;
|
||||
}
|
||||
|
||||
let timeline_id = *timeline_id;
|
||||
let lsn = *lsn;
|
||||
|
||||
match timeline_inputs.get(&timeline_id) {
|
||||
Some(inputs) if inputs.ancestor_lsn == lsn => {
|
||||
// we don't need an update at this branch point which is also point where
|
||||
// timeline_id branch was branched from.
|
||||
continue;
|
||||
}
|
||||
Some(_) => {}
|
||||
None => {
|
||||
// we should have this because we have iterated through all of the timelines
|
||||
anyhow::bail!("missing timeline_input for {timeline_id}")
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) {
|
||||
updates.push(Update {
|
||||
lsn,
|
||||
timeline_id,
|
||||
command: Command::Update(*size),
|
||||
});
|
||||
|
||||
needed_cache.insert((timeline_id, lsn));
|
||||
} else {
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.context("find referenced ancestor timeline")?;
|
||||
let parallel_size_calcs = Arc::clone(limit);
|
||||
joinset.spawn(calculate_logical_size(
|
||||
parallel_size_calcs,
|
||||
timeline.clone(),
|
||||
lsn,
|
||||
));
|
||||
|
||||
if let Some(parent_id) = timeline.get_ancestor_timeline_id() {
|
||||
// we should not find new ones because we iterated tenants all timelines
|
||||
anyhow::ensure!(
|
||||
timeline_inputs.contains_key(&parent_id),
|
||||
"discovered new timeline {parent_id} (parent of {timeline_id})"
|
||||
);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch
|
||||
// point. this is needed by the model.
|
||||
for (timeline_id, inputs) in timeline_inputs.iter() {
|
||||
let lsn = inputs.last_record;
|
||||
|
||||
if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) {
|
||||
// this means that the (timeline_id, last_record_lsn) represents a branch point
|
||||
// we do not want to add EndOfBranch updates for these points because it doesn't fit
|
||||
// into the current tenant_size_model.
|
||||
continue;
|
||||
}
|
||||
|
||||
if lsn > inputs.ancestor_lsn {
|
||||
// all timelines also have an end point if they have made any progress
|
||||
updates.push(Update {
|
||||
lsn,
|
||||
command: Command::EndOfBranch,
|
||||
timeline_id: *timeline_id,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let mut have_any_error = false;
|
||||
|
||||
while let Some(res) = joinset.join_next().await {
|
||||
@@ -379,6 +479,7 @@ pub(super) async fn gather_inputs(
|
||||
// handled by the variant order in `Command`.
|
||||
//
|
||||
updates.sort_unstable();
|
||||
|
||||
// And another sort to handle Command::BranchFrom ordering
|
||||
// in case when there are multiple branches at the same LSN.
|
||||
let sorted_updates = sort_updates_in_tree_order(updates)?;
|
||||
@@ -413,10 +514,10 @@ impl ModelInputs {
|
||||
let Lsn(now) = *lsn;
|
||||
match op {
|
||||
Command::Update(sz) => {
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?;
|
||||
}
|
||||
Command::EndOfBranch => {
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, None);
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, None)?;
|
||||
}
|
||||
Command::BranchFrom(parent) => {
|
||||
// This branch command may fail if it cannot find a parent to branch from.
|
||||
@@ -425,7 +526,7 @@ impl ModelInputs {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(storage.calculate(self.retention_period).total_children())
|
||||
Ok(storage.calculate(self.retention_period)?.total_children())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -574,7 +675,10 @@ fn updates_sort() {
|
||||
fn verify_size_for_multiple_branches() {
|
||||
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
|
||||
// it has the stable lsn's
|
||||
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
|
||||
//
|
||||
// timelineinputs have been left out, because those explain the inputs, but don't participate
|
||||
// in further size calculations.
|
||||
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#;
|
||||
|
||||
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
|
||||
|
||||
|
||||
@@ -196,3 +196,50 @@ pub fn downcast_remote_layer(
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for dyn Layer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Layer")
|
||||
.field("short_id", &self.short_id())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds metadata about a layer without any content. Used mostly for testing.
|
||||
pub struct LayerDescriptor {
|
||||
pub key: Range<Key>,
|
||||
pub lsn: Range<Lsn>,
|
||||
pub is_incremental: bool,
|
||||
pub short_id: String,
|
||||
}
|
||||
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key.clone()
|
||||
}
|
||||
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn.clone()
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
fn short_id(&self) -> String {
|
||||
self.short_id.clone()
|
||||
}
|
||||
|
||||
fn dump(&self, _verbose: bool) -> Result<()> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -729,16 +729,24 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn activate(self: &Arc<Self>) {
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_wal_receiver();
|
||||
}
|
||||
|
||||
pub fn set_state(&self, new_state: TimelineState) {
|
||||
match (self.current_state(), new_state) {
|
||||
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
|
||||
debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
|
||||
}
|
||||
(st, TimelineState::Loading) => {
|
||||
error!("ignoring transition from {st:?} into Loading state");
|
||||
}
|
||||
(TimelineState::Broken, _) => {
|
||||
error!("Ignoring state update {new_state:?} for broken tenant");
|
||||
}
|
||||
(TimelineState::Stopping, TimelineState::Active) => {
|
||||
debug!("Not activating a Stopping timeline");
|
||||
error!("Not activating a Stopping timeline");
|
||||
}
|
||||
(_, new_state) => {
|
||||
self.state.send_replace(new_state);
|
||||
@@ -812,7 +820,7 @@ impl Timeline {
|
||||
pg_version: u32,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(TimelineState::Suspended);
|
||||
let (state, _) = watch::channel(TimelineState::Loading);
|
||||
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
@@ -970,6 +978,7 @@ impl Timeline {
|
||||
///
|
||||
pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> {
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let mut num_layers = 0;
|
||||
|
||||
let timer = self.metrics.load_layer_map_histo.start_timer();
|
||||
@@ -1010,7 +1019,7 @@ impl Timeline {
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
|
||||
// Create a DeltaLayer struct for each delta file.
|
||||
@@ -1041,7 +1050,7 @@ impl Timeline {
|
||||
|
||||
trace!("found layer {}", layer.path().display());
|
||||
total_physical_size += file_size;
|
||||
layers.insert_historic(Arc::new(layer));
|
||||
updates.insert_historic(Arc::new(layer));
|
||||
num_layers += 1;
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
@@ -1067,6 +1076,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
updates.flush();
|
||||
layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1);
|
||||
|
||||
info!(
|
||||
@@ -1091,6 +1101,11 @@ impl Timeline {
|
||||
// Are we missing some files that are present in remote storage?
|
||||
// Create RemoteLayer instances for them.
|
||||
let mut local_only_layers = local_layers;
|
||||
|
||||
// We're holding a layer map lock for a while but this
|
||||
// method is only called during init so it's fine.
|
||||
let mut layer_map = self.layers.write().unwrap();
|
||||
let mut updates = layer_map.batch_update();
|
||||
for remote_layer_name in &index_part.timeline_layers {
|
||||
let local_layer = local_only_layers.remove(remote_layer_name);
|
||||
|
||||
@@ -1129,7 +1144,7 @@ impl Timeline {
|
||||
anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
|
||||
} else {
|
||||
self.metrics.resident_physical_size_gauge.sub(local_size);
|
||||
self.layers.write().unwrap().remove_historic(local_layer);
|
||||
updates.remove_historic(local_layer);
|
||||
// fall-through to adding the remote layer
|
||||
}
|
||||
} else {
|
||||
@@ -1171,7 +1186,7 @@ impl Timeline {
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
|
||||
self.layers.write().unwrap().insert_historic(remote_layer);
|
||||
updates.insert_historic(remote_layer);
|
||||
}
|
||||
LayerFileName::Delta(deltafilename) => {
|
||||
// Create a RemoteLayer for the delta file.
|
||||
@@ -1194,13 +1209,14 @@ impl Timeline {
|
||||
&remote_layer_metadata,
|
||||
);
|
||||
let remote_layer = Arc::new(remote_layer);
|
||||
self.layers.write().unwrap().insert_historic(remote_layer);
|
||||
updates.insert_historic(remote_layer);
|
||||
}
|
||||
#[cfg(test)]
|
||||
LayerFileName::Test(_) => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
updates.flush();
|
||||
Ok(local_only_layers)
|
||||
}
|
||||
|
||||
@@ -1392,7 +1408,7 @@ impl Timeline {
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken
|
||||
| TimelineState::Stopping
|
||||
| TimelineState::Suspended => {
|
||||
| TimelineState::Loading => {
|
||||
break format!("aborted because timeline became inactive (new state: {new_state:?})")
|
||||
}
|
||||
}
|
||||
@@ -2099,10 +2115,11 @@ impl Timeline {
|
||||
])?;
|
||||
|
||||
// Add it to the layer map
|
||||
{
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
layers.insert_historic(Arc::new(new_delta));
|
||||
}
|
||||
self.layers
|
||||
.write()
|
||||
.unwrap()
|
||||
.batch_update()
|
||||
.insert_historic(Arc::new(new_delta));
|
||||
|
||||
// update the timeline's physical size
|
||||
let sz = new_delta_path.metadata()?.len();
|
||||
@@ -2166,13 +2183,15 @@ impl Timeline {
|
||||
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||
if img_lsn < lsn {
|
||||
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
||||
let threshold = self.get_image_creation_threshold();
|
||||
let num_deltas =
|
||||
layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
|
||||
|
||||
debug!(
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
if num_deltas >= self.get_image_creation_threshold() {
|
||||
if num_deltas >= threshold {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
@@ -2267,21 +2286,23 @@ impl Timeline {
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
for l in image_layers {
|
||||
let path = l.filename();
|
||||
let metadata = timeline_path
|
||||
.join(path.file_name())
|
||||
.metadata()
|
||||
.context("reading metadata of layer file {path}")?;
|
||||
.with_context(|| format!("reading metadata of layer file {}", path.file_name()))?;
|
||||
|
||||
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
|
||||
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.add(metadata.len());
|
||||
layers.insert_historic(Arc::new(l));
|
||||
updates.insert_historic(Arc::new(l));
|
||||
}
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
timer.stop_and_record();
|
||||
|
||||
@@ -2577,6 +2598,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
||||
for l in new_layers {
|
||||
let new_delta_path = l.path();
|
||||
@@ -2597,7 +2619,7 @@ impl Timeline {
|
||||
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
||||
layers.insert_historic(x);
|
||||
updates.insert_historic(x);
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
@@ -2611,8 +2633,9 @@ impl Timeline {
|
||||
}
|
||||
layer_names_to_delete.push(l.filename());
|
||||
l.delete()?;
|
||||
layers.remove_historic(l);
|
||||
updates.remove_historic(l);
|
||||
}
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
|
||||
// Also schedule the deletions in remote storage
|
||||
@@ -2812,6 +2835,7 @@ impl Timeline {
|
||||
// 3. it doesn't need to be retained for 'retain_lsns';
|
||||
// 4. newer on-disk image layers cover the layer's whole key range
|
||||
//
|
||||
// TODO holding a write lock is too aggressive and avoidable
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
result.layers_total += 1;
|
||||
@@ -2843,6 +2867,8 @@ impl Timeline {
|
||||
// might be referenced by child branches forever.
|
||||
// We can track this in child timeline GC and delete parent layers when
|
||||
// they are no longer needed. This might be complicated with long inheritance chains.
|
||||
//
|
||||
// TODO Vec is not a great choice for `retain_lsns`
|
||||
for retain_lsn in &retain_lsns {
|
||||
// start_lsn is inclusive
|
||||
if &l.get_lsn_range().start <= retain_lsn {
|
||||
@@ -2896,6 +2922,7 @@ impl Timeline {
|
||||
layers_to_remove.push(Arc::clone(&l));
|
||||
}
|
||||
|
||||
let mut updates = layers.batch_update();
|
||||
if !layers_to_remove.is_empty() {
|
||||
// Persist the new GC cutoff value in the metadata file, before
|
||||
// we actually remove anything.
|
||||
@@ -2913,7 +2940,13 @@ impl Timeline {
|
||||
}
|
||||
layer_names_to_delete.push(doomed_layer.filename());
|
||||
doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning?
|
||||
layers.remove_historic(doomed_layer);
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
// Maybe instead discard all layer map historic versions that
|
||||
// won't be needed for page reconstruction for this timeline,
|
||||
// and mark what we can't delete yet as deleted from the layer
|
||||
// map index without actually rebuilding the index.
|
||||
updates.remove_historic(doomed_layer);
|
||||
result.layers_removed += 1;
|
||||
}
|
||||
|
||||
@@ -2925,6 +2958,7 @@ impl Timeline {
|
||||
remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?;
|
||||
}
|
||||
}
|
||||
updates.flush();
|
||||
|
||||
info!(
|
||||
"GC completed removing {} layers, cutoff {}",
|
||||
@@ -3081,11 +3115,13 @@ impl Timeline {
|
||||
// Delta- or ImageLayer in the layer map.
|
||||
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size);
|
||||
let mut layers = self_clone.layers.write().unwrap();
|
||||
let mut updates = layers.batch_update();
|
||||
{
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
layers.remove_historic(l);
|
||||
updates.remove_historic(l);
|
||||
}
|
||||
layers.insert_historic(new_layer);
|
||||
updates.insert_historic(new_layer);
|
||||
updates.flush();
|
||||
drop(layers);
|
||||
|
||||
// Now that we've inserted the download into the layer map,
|
||||
|
||||
@@ -525,13 +525,12 @@ mod tests {
|
||||
})
|
||||
}
|
||||
|
||||
fn test_files<OF, FD>(test_name: &str, openfunc: OF) -> Result<(), Error>
|
||||
fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
where
|
||||
FD: Read + Write + Seek + FileExt,
|
||||
OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
|
||||
{
|
||||
let temp_repo_dir = tempfile::tempdir()?;
|
||||
let testdir = temp_repo_dir.path().join(test_name);
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
let path_a = testdir.join("file_a");
|
||||
@@ -633,8 +632,7 @@ mod tests {
|
||||
const THREADS: usize = 100;
|
||||
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
|
||||
|
||||
let temp_repo_dir = tempfile::tempdir()?;
|
||||
let testdir = temp_repo_dir.path().join("vfile_concurrency");
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency");
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
// Create a test file.
|
||||
|
||||
@@ -1146,8 +1146,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_relsize() -> Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_relsize")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&tline).await?;
|
||||
|
||||
@@ -1324,8 +1323,7 @@ mod tests {
|
||||
// and then created it again within the same layer.
|
||||
#[tokio::test]
|
||||
async fn test_drop_extend() -> Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_drop_extend")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&tline).await?;
|
||||
|
||||
@@ -1378,8 +1376,7 @@ mod tests {
|
||||
// and then extended it again within the same layer.
|
||||
#[tokio::test]
|
||||
async fn test_truncate_extend() -> Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_truncate_extend")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&tline).await?;
|
||||
|
||||
@@ -1500,8 +1497,7 @@ mod tests {
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
#[tokio::test]
|
||||
async fn test_large_rel() -> Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let tenant = harness.load().await;
|
||||
let tenant = TenantHarness::create("test_large_rel")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&tline).await?;
|
||||
|
||||
|
||||
@@ -183,13 +183,23 @@ async fn connection_manager_loop_step(
|
||||
|
||||
new_event = async {
|
||||
loop {
|
||||
if walreceiver_state.timeline.current_state() == TimelineState::Loading {
|
||||
warn!("wal connection manager should only be launched after timeline has become active");
|
||||
}
|
||||
match timeline_state_updates.changed().await {
|
||||
Ok(()) => {
|
||||
let new_state = walreceiver_state.timeline.current_state();
|
||||
match new_state {
|
||||
// we're already active as walreceiver, no need to reactivate
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
TimelineState::Broken | TimelineState::Stopping => {
|
||||
info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
TimelineState::Loading => {
|
||||
warn!("timeline transitioned back to Loading state, that should not happen");
|
||||
return ControlFlow::Continue(new_state);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||
@@ -197,7 +207,7 @@ async fn connection_manager_loop_step(
|
||||
}
|
||||
} => match new_event {
|
||||
ControlFlow::Continue(new_state) => {
|
||||
info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates");
|
||||
info!("observed timeline state change, new state is {new_state:?}");
|
||||
return ControlFlow::Continue(());
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
@@ -289,7 +299,9 @@ async fn subscribe_for_timeline_updates(
|
||||
return resp.into_inner();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
|
||||
// Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and
|
||||
// the entire WAL is streamed. Keep this noticeable with logging, but do not warn/error.
|
||||
info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -846,7 +858,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("no_connection_no_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -879,7 +891,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn connection_no_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("connection_no_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -942,7 +954,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn no_connection_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("no_connection_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1001,7 +1013,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("candidate_with_many_connection_failures")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
@@ -1041,7 +1053,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1105,7 +1117,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1166,7 +1178,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::new()?;
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let new_lsn = Lsn(100_100).align();
|
||||
@@ -1232,7 +1244,7 @@ mod tests {
|
||||
|
||||
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
||||
|
||||
async fn dummy_state(harness: &TenantHarness) -> WalreceiverState {
|
||||
async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
||||
WalreceiverState {
|
||||
id: TenantTimelineId {
|
||||
tenant_id: harness.tenant_id,
|
||||
|
||||
@@ -77,9 +77,13 @@ pub async fn handle_walreceiver_connection(
|
||||
info!("DB connection stream finished: {expected_error}");
|
||||
return Ok(());
|
||||
}
|
||||
Err(elapsed) => anyhow::bail!(
|
||||
"Timed out while waiting {elapsed} for walreceiver connection to open"
|
||||
),
|
||||
Err(_) => {
|
||||
// Timing out to connect to a safekeeper node could happen for a long time, due to
|
||||
// many reasons that pageserver cannot control.
|
||||
// Do not produce an error, but make it visible that timeouts happen by logging the event.
|
||||
info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1206,6 +1206,9 @@ class PageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def tenant_size(self, tenant_id: TenantId) -> int:
|
||||
return self.tenant_size_and_modelinputs(tenant_id)[0]
|
||||
|
||||
def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]:
|
||||
"""
|
||||
Returns the tenant size, together with the model inputs as the second tuple item.
|
||||
"""
|
||||
@@ -1216,9 +1219,9 @@ class PageserverHttpClient(requests.Session):
|
||||
assert TenantId(res["id"]) == tenant_id
|
||||
size = res["size"]
|
||||
assert type(size) == int
|
||||
# there are additional inputs, which are the collected raw information before being fed to the tenant_size_model
|
||||
# there are no tests for those right now.
|
||||
return size
|
||||
inputs = res["inputs"]
|
||||
assert type(inputs) is dict
|
||||
return (size, inputs)
|
||||
|
||||
def timeline_list(
|
||||
self,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from typing import List, Tuple
|
||||
from typing import Any, List, Tuple
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
|
||||
from fixtures.types import Lsn
|
||||
@@ -9,28 +10,247 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
(tenant_id, _) = env.neon_cli.create_tenant()
|
||||
http_client = env.pageserver.http_client()
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
initial_size = http_client.tenant_size(tenant_id)
|
||||
|
||||
# we should never have zero, because there should be the initdb however
|
||||
# this is questionable if we should have anything in this case, as the
|
||||
# gc_cutoff is negative
|
||||
assert (
|
||||
size == 0
|
||||
), "initial implementation returns zero tenant_size before last_record_lsn is past gc_horizon"
|
||||
# we should never have zero, because there should be the initdb "changes"
|
||||
assert initial_size > 0, "initial implementation returns ~initdb tenant_size"
|
||||
|
||||
with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
|
||||
main_branch_name = "main"
|
||||
|
||||
with env.postgres.create_start(
|
||||
main_branch_name,
|
||||
tenant_id=tenant_id,
|
||||
config_lines=["autovacuum=off", "checkpoint_timeout=10min"],
|
||||
) as pg:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("SELECT 1")
|
||||
row = cur.fetchone()
|
||||
assert row is not None
|
||||
assert row[0] == 1
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
assert size == 0, "starting idle compute should not change the tenant size"
|
||||
# we've disabled the autovacuum and checkpoint
|
||||
# so background processes should not change the size.
|
||||
# If this test turns out to be flaky, we should probably loosen the check
|
||||
assert size == initial_size, "starting idle compute should not change the tenant size"
|
||||
|
||||
# the size should be the same, until we increase the size over the
|
||||
# gc_horizon
|
||||
size = http_client.tenant_size(tenant_id)
|
||||
assert size == 0, "tenant_size should not be affected by shutdown of compute"
|
||||
size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
|
||||
assert size == initial_size, "tenant_size should not be affected by shutdown of compute"
|
||||
|
||||
expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"]
|
||||
actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # type: ignore
|
||||
assert actual_commands == expected_commands
|
||||
|
||||
|
||||
def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Issue found in production. Because the ancestor branch was under
|
||||
gc_horizon, the branchpoint was "dangling" and the computation could not be
|
||||
done.
|
||||
|
||||
Assuming gc_horizon = 50
|
||||
root: I 0---10------>20
|
||||
branch: |-------------------I---------->150
|
||||
gc_horizon
|
||||
"""
|
||||
env = neon_simple_env
|
||||
(tenant_id, _) = env.neon_cli.create_tenant()
|
||||
http_client = env.pageserver.http_client()
|
||||
|
||||
initial_size = http_client.tenant_size(tenant_id)
|
||||
|
||||
first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id)
|
||||
|
||||
with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id)
|
||||
|
||||
size_after_branching = http_client.tenant_size(tenant_id)
|
||||
log.info(f"size_after_branching: {size_after_branching}")
|
||||
|
||||
assert size_after_branching > initial_size
|
||||
|
||||
|
||||
def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
More general version of test_branched_empty_timeline_size
|
||||
|
||||
Assuming gc_horizon = 50
|
||||
|
||||
root: I 0------10
|
||||
first: I 10
|
||||
nth_0: I 10
|
||||
nth_1: I 10
|
||||
nth_n: 10------------I--------100
|
||||
"""
|
||||
env = neon_simple_env
|
||||
(tenant_id, _) = env.neon_cli.create_tenant()
|
||||
http_client = env.pageserver.http_client()
|
||||
|
||||
initial_size = http_client.tenant_size(tenant_id)
|
||||
|
||||
first_branch_name = "first"
|
||||
env.neon_cli.create_branch(first_branch_name, tenant_id=tenant_id)
|
||||
|
||||
size_after_branching = http_client.tenant_size(tenant_id)
|
||||
|
||||
# this might be flaky like test_get_tenant_size_with_multiple_branches
|
||||
# https://github.com/neondatabase/neon/issues/2962
|
||||
assert size_after_branching == initial_size
|
||||
|
||||
last_branch_name = first_branch_name
|
||||
last_branch = None
|
||||
|
||||
for i in range(0, 4):
|
||||
latest_branch_name = f"nth_{i}"
|
||||
last_branch = env.neon_cli.create_branch(
|
||||
latest_branch_name, ancestor_branch_name=last_branch_name, tenant_id=tenant_id
|
||||
)
|
||||
last_branch_name = latest_branch_name
|
||||
|
||||
size_after_branching = http_client.tenant_size(tenant_id)
|
||||
assert size_after_branching == initial_size
|
||||
|
||||
assert last_branch is not None
|
||||
|
||||
with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, last_branch)
|
||||
|
||||
size_after_writes = http_client.tenant_size(tenant_id)
|
||||
assert size_after_writes > initial_size
|
||||
|
||||
|
||||
@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
|
||||
def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
gc_horizon = 15
|
||||
|
||||
main: 0--I-10------>20
|
||||
branch: |-------------------I---------->150
|
||||
gc_horizon
|
||||
"""
|
||||
|
||||
env = neon_simple_env
|
||||
gc_horizon = 20_000
|
||||
(tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
|
||||
http_client = env.pageserver.http_client()
|
||||
|
||||
with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
|
||||
initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
|
||||
flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
|
||||
|
||||
size_before_branching = http_client.tenant_size(tenant_id)
|
||||
|
||||
assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int
|
||||
|
||||
branch_id = env.neon_cli.create_branch(
|
||||
"branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn
|
||||
)
|
||||
|
||||
with env.postgres.create_start("branch", tenant_id=tenant_id) as pg:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, branch_id)
|
||||
|
||||
size_after = http_client.tenant_size(tenant_id)
|
||||
|
||||
assert size_before_branching < size_after
|
||||
|
||||
|
||||
@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
|
||||
def test_parent_within_horizon(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
gc_horizon = 5
|
||||
|
||||
main: 0----10----I->20
|
||||
branch: |-------------------I---------->150
|
||||
gc_horizon
|
||||
"""
|
||||
|
||||
env = neon_simple_env
|
||||
gc_horizon = 200_000
|
||||
(tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
|
||||
http_client = env.pageserver.http_client()
|
||||
|
||||
with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
|
||||
initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)")
|
||||
|
||||
flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
|
||||
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)")
|
||||
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, main_id)
|
||||
|
||||
size_before_branching = http_client.tenant_size(tenant_id)
|
||||
|
||||
assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int
|
||||
|
||||
branch_id = env.neon_cli.create_branch(
|
||||
"branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn
|
||||
)
|
||||
|
||||
with env.postgres.create_start("branch", tenant_id=tenant_id) as pg:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, branch_id)
|
||||
|
||||
size_after = http_client.tenant_size(tenant_id)
|
||||
|
||||
assert size_before_branching < size_after
|
||||
|
||||
|
||||
@pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
|
||||
def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
gc_horizon = small
|
||||
|
||||
main: 0--------10-----I>20
|
||||
first: |-----------------------------I>150
|
||||
second: |---------I>30
|
||||
"""
|
||||
|
||||
env = neon_simple_env
|
||||
(tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": "1024"})
|
||||
http_client = env.pageserver.http_client()
|
||||
|
||||
initial_size = http_client.tenant_size(tenant_id)
|
||||
|
||||
first_id = env.neon_cli.create_branch("first", tenant_id=tenant_id)
|
||||
second_id = env.neon_cli.create_branch("second", tenant_id=tenant_id)
|
||||
|
||||
ids = {"main": main_id, "first": first_id, "second": second_id}
|
||||
|
||||
latest_size = None
|
||||
|
||||
# gc is not expected to change the results
|
||||
|
||||
for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]:
|
||||
with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
|
||||
with pg.cursor() as cur:
|
||||
cur.execute(
|
||||
f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)"
|
||||
)
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name])
|
||||
size_now = http_client.tenant_size(tenant_id)
|
||||
if latest_size is not None:
|
||||
assert size_now > latest_size
|
||||
else:
|
||||
assert size_now > initial_size
|
||||
|
||||
latest_size = size_now
|
||||
|
||||
|
||||
def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
1
test_runner/sql_regress/.gitignore
vendored
1
test_runner/sql_regress/.gitignore
vendored
@@ -2,6 +2,7 @@
|
||||
/pg_regress
|
||||
|
||||
# Generated subdirectories
|
||||
/tmp_check/
|
||||
/results/
|
||||
/log/
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ memchr = { version = "2" }
|
||||
nom = { version = "7" }
|
||||
num-bigint = { version = "0.4" }
|
||||
num-integer = { version = "0.1", features = ["i128"] }
|
||||
num-traits = { version = "0.2", features = ["i128", "libm"] }
|
||||
num-traits = { version = "0.2", features = ["i128"] }
|
||||
prost = { version = "0.11" }
|
||||
rand = { version = "0.8", features = ["small_rng"] }
|
||||
regex = { version = "1" }
|
||||
|
||||
Reference in New Issue
Block a user