Compare commits

..

1 Commits

Author SHA1 Message Date
Ivan Efremov
e27d674b8e Update anstream package and deps
Update because of cargo deny check error:
error[unsound]: Unsoundness in anstream
2024-12-04 18:22:31 +02:00
60 changed files with 716 additions and 2800 deletions

View File

@@ -1,29 +1,16 @@
# Autoscaling
/libs/vm_monitor/ @neondatabase/autoscaling
# DevProd
/.github/ @neondatabase/developer-productivity
# Compute
/pgxn/ @neondatabase/compute
/vendor/ @neondatabase/compute
/compute/ @neondatabase/compute
/compute_tools/ @neondatabase/compute
# Proxy
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
/libs/proxy/ @neondatabase/proxy
/proxy/ @neondatabase/proxy
# Storage
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/storage
/storage_controller @neondatabase/storage
/storage_scrubber @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/storage
# Shared
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
/vendor/ @neondatabase/compute

237
Cargo.lock generated
View File

@@ -84,58 +84,58 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstream"
version = "0.6.15"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"is-terminal",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.8"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.0"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.0.0"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys 0.48.0",
"windows-sys 0.59.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.4"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
"windows-sys 0.48.0",
]
[[package]]
name = "anyhow"
version = "1.0.94"
version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
dependencies = [
"backtrace",
]
@@ -1124,7 +1124,7 @@ dependencies = [
"num-traits",
"serde",
"wasm-bindgen",
"windows-targets 0.52.4",
"windows-targets 0.52.6",
]
[[package]]
@@ -1167,33 +1167,35 @@ dependencies = [
[[package]]
name = "clap"
version = "4.5.22"
version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b"
checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc"
dependencies = [
"clap_builder",
"clap_derive",
"once_cell",
]
[[package]]
name = "clap_builder"
version = "4.5.22"
version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1"
checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990"
dependencies = [
"anstream",
"anstyle",
"bitflags 1.3.2",
"clap_lex",
"strsim 0.11.1",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
version = "4.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b"
dependencies = [
"heck",
"heck 0.4.1",
"proc-macro2",
"quote",
"syn 2.0.90",
@@ -1201,15 +1203,15 @@ dependencies = [
[[package]]
name = "clap_lex"
version = "0.7.3"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
[[package]]
name = "colorchoice"
version = "1.0.0"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "combine"
@@ -1612,7 +1614,7 @@ dependencies = [
"ident_case",
"proc-macro2",
"quote",
"strsim 0.10.0",
"strsim",
"syn 2.0.90",
]
@@ -1810,7 +1812,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
dependencies = [
"darling",
"either",
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.90",
@@ -2463,6 +2465,12 @@ dependencies = [
"num-traits",
]
[[package]]
name = "heck"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heck"
version = "0.5.0"
@@ -2471,9 +2479,15 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.3"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "hermit-abi"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
[[package]]
name = "hex"
@@ -2871,21 +2885,15 @@ checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
[[package]]
name = "is-terminal"
version = "0.4.12"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
dependencies = [
"hermit-abi",
"hermit-abi 0.4.0",
"libc",
"windows-sys 0.52.0",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.10.5"
@@ -3167,7 +3175,7 @@ version = "0.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.90",
@@ -3484,7 +3492,7 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.9",
"libc",
]
@@ -3951,7 +3959,7 @@ dependencies = [
"libc",
"redox_syscall 0.3.5",
"smallvec",
"windows-targets 0.48.0",
"windows-targets 0.48.5",
]
[[package]]
@@ -4169,7 +4177,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4182,7 +4190,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -4195,6 +4203,7 @@ dependencies = [
"rand 0.8.5",
"sha2",
"stringprep",
"tokio",
]
[[package]]
@@ -4216,7 +4225,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4455,7 +4464,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
dependencies = [
"bytes",
"heck",
"heck 0.5.0",
"itertools 0.12.1",
"log",
"multimap",
@@ -6163,12 +6172,6 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "strum"
version = "0.26.3"
@@ -6181,7 +6184,7 @@ version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
dependencies = [
"heck",
"heck 0.5.0",
"proc-macro2",
"quote",
"rustversion",
@@ -6307,7 +6310,7 @@ dependencies = [
"fastrand 2.2.0",
"once_cell",
"rustix",
"windows-sys 0.52.0",
"windows-sys 0.59.0",
]
[[package]]
@@ -6546,7 +6549,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473"
source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796"
dependencies = [
"async-trait",
"byteorder",
@@ -7079,9 +7082,9 @@ checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8parse"
version = "0.2.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "utils"
@@ -7475,7 +7478,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be"
dependencies = [
"windows-core",
"windows-targets 0.52.4",
"windows-targets 0.52.6",
]
[[package]]
@@ -7484,7 +7487,7 @@ version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.4",
"windows-targets 0.52.6",
]
[[package]]
@@ -7493,7 +7496,7 @@ version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.0",
"windows-targets 0.48.5",
]
[[package]]
@@ -7502,122 +7505,138 @@ version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.4",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-targets"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm 0.48.0",
"windows_aarch64_msvc 0.48.0",
"windows_i686_gnu 0.48.0",
"windows_i686_msvc 0.48.0",
"windows_x86_64_gnu 0.48.0",
"windows_x86_64_gnullvm 0.48.0",
"windows_x86_64_msvc 0.48.0",
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
name = "windows-targets"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm 0.52.4",
"windows_aarch64_msvc 0.52.4",
"windows_i686_gnu 0.52.4",
"windows_i686_msvc 0.52.4",
"windows_x86_64_gnu 0.52.4",
"windows_x86_64_gnullvm 0.52.4",
"windows_x86_64_msvc 0.52.4",
"windows_aarch64_gnullvm 0.52.6",
"windows_aarch64_msvc 0.52.6",
"windows_i686_gnu 0.52.6",
"windows_i686_gnullvm",
"windows_i686_msvc 0.52.6",
"windows_x86_64_gnu 0.52.6",
"windows_x86_64_gnullvm 0.52.6",
"windows_x86_64_msvc 0.52.6",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.0"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.4"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"

View File

@@ -115,7 +115,7 @@ RUN set -e \
# Keep the version the same as in compute/compute-node.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
ENV SQL_EXPORTER_VERSION=0.16.0
ENV SQL_EXPORTER_VERSION=0.13.1
RUN curl -fsSL \
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
--output sql_exporter.tar.gz \

View File

@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
# Keep the version the same as in build-tools.Dockerfile and
# test_runner/regress/test_compute_metrics.py.
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
#########################################################################################
#

View File

@@ -6,7 +6,6 @@
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
import 'sql_exporter/compute_current_lsn.libsonnet',
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet',
import 'sql_exporter/compute_max_connections.libsonnet',
import 'sql_exporter/compute_receive_lsn.libsonnet',
import 'sql_exporter/compute_subscriptions_count.libsonnet',

View File

@@ -1,7 +0,0 @@
SELECT
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

View File

@@ -1,17 +0,0 @@
local neon = import 'neon.libsonnet';
local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql';
local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql';
{
metric_name: 'compute_logical_snapshots_bytes',
type: 'gauge',
help: 'Size of the pg_logical/snapshots directory, not including temporary files',
key_labels: [
'timeline_id',
],
values: [
'logical_snapshots_bytes',
],
query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir,
}

View File

@@ -1,9 +0,0 @@
SELECT
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

View File

@@ -1243,7 +1243,12 @@ impl ComputeNode {
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?;
let max_concurrent_connections = spec.reconfigure_concurrency;
// TODO(ololobus): We need a concurrency during reconfiguration as well,
// but DB is already running and used by user. We can easily get out of
// `max_connections` limit, and the current code won't handle that.
// let compute_state = self.state.lock().unwrap().clone();
// let max_concurrent_connections = self.max_service_connections(&compute_state, &spec);
let max_concurrent_connections = 1;
// Temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:

View File

@@ -53,7 +53,6 @@ use compute_api::spec::Role;
use nix::sys::signal::kill;
use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use serde::{Deserialize, Serialize};
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -619,7 +618,6 @@ impl Endpoint {
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: 1,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -819,7 +817,6 @@ impl Endpoint {
self.http_address.ip(),
self.http_address.port()
))
.header(CONTENT_TYPE.as_str(), "application/json")
.body(format!(
"{{\"spec\":{}}}",
serde_json::to_string_pretty(&spec)?

View File

@@ -19,10 +19,6 @@ pub type PgIdent = String;
/// String type alias representing Postgres extension version
pub type ExtVersion = String;
fn default_reconfigure_concurrency() -> usize {
1
}
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
@@ -71,7 +67,7 @@ pub struct ComputeSpec {
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
/// An optional hint that can be passed to speed up startup time if we know
/// An optinal hint that can be passed to speed up startup time if we know
/// that no pg catalog mutations (like role creation, database creation,
/// extension creation) need to be done on the actual database to start.
#[serde(default)] // Default false
@@ -90,7 +86,9 @@ pub struct ComputeSpec {
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
pub pageserver_connstring: Option<String>,
#[serde(default)]
@@ -115,20 +113,6 @@ pub struct ComputeSpec {
/// Local Proxy configuration used for JWT authentication
#[serde(default)]
pub local_proxy_config: Option<LocalProxySpec>,
/// Number of concurrent connections during the parallel RunInEachDatabase
/// phase of the apply config process.
///
/// We need a higher concurrency during reconfiguration in case of many DBs,
/// but instance is already running and used by client. We can easily get out of
/// `max_connections` limit, and the current code won't handle that.
///
/// Default is 1, but also allow control plane to override this value for specific
/// projects. It's also recommended to bump `superuser_reserved_connections` +=
/// `reconfigure_concurrency` for such projects to ensure that we always have
/// enough spare connections for reconfiguration process to succeed.
#[serde(default = "default_reconfigure_concurrency")]
pub reconfigure_concurrency: usize,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -331,9 +315,6 @@ mod tests {
// Features list defaults to empty vector.
assert!(spec.features.is_empty());
// Reconfigure concurrency defaults to 1.
assert_eq!(spec.reconfigure_concurrency, 1);
}
#[test]

View File

@@ -245,17 +245,6 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
}
}
/// Scheduling policy enables us to selectively disable some automatic actions that the
/// controller performs on a tenant shard. This is only set to a non-default value by
/// human intervention, and it is reset to the default value (Active) when the tenant's
/// placement policy is modified away from Attached.
///
/// The typical use of a non-Active scheduling policy is one of:
/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
///
/// If you're not sure which policy to use to pin a shard to its current location, you probably
/// want Pause.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including

View File

@@ -158,8 +158,7 @@ impl ShardIdentity {
key_to_shard_number(self.count, self.stripe_size, key)
}
/// Return true if the key is stored only on this shard. This does not include
/// global keys, see is_key_global().
/// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
@@ -172,7 +171,7 @@ impl ShardIdentity {
}
/// Return true if the key should be stored on all shards, not just one.
pub fn is_key_global(&self, key: &Key) -> bool {
fn is_key_global(&self, key: &Key) -> bool {
if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
// Special keys that are only stored on shard 0
false

View File

@@ -164,12 +164,6 @@ impl TenantShardId {
}
}
impl std::fmt::Display for ShardNumber {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}
impl std::fmt::Display for ShardSlug<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(

View File

@@ -1,6 +1,5 @@
pub mod heavier_once_cell;
pub mod duplex;
pub mod gate;
pub mod spsc_fold;

View File

@@ -1 +0,0 @@
pub mod mpsc;

View File

@@ -1,36 +0,0 @@
use tokio::sync::mpsc;
/// A bi-directional channel.
pub struct Duplex<S, R> {
pub tx: mpsc::Sender<S>,
pub rx: mpsc::Receiver<R>,
}
/// Creates a bi-directional channel.
///
/// The channel will buffer up to the provided number of messages. Once the buffer is full,
/// attempts to send new messages will wait until a message is received from the channel.
/// The provided buffer capacity must be at least 1.
pub fn channel<A: Send, B: Send>(buffer: usize) -> (Duplex<A, B>, Duplex<B, A>) {
let (tx_a, rx_a) = mpsc::channel::<A>(buffer);
let (tx_b, rx_b) = mpsc::channel::<B>(buffer);
(Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a })
}
impl<S: Send, R: Send> Duplex<S, R> {
/// Sends a value, waiting until there is capacity.
///
/// A successful send occurs when it is determined that the other end of the channel has not hung up already.
pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError<S>> {
self.tx.send(x).await
}
/// Receives the next value for this receiver.
///
/// This method returns `None` if the channel has been closed and there are
/// no remaining messages in the channel's buffer.
pub async fn recv(&mut self) -> Option<R> {
self.rx.recv().await
}
}

View File

@@ -62,8 +62,10 @@ async fn ingest(
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
let gate = utils::sync::gate::Gate::default();
let entered = gate.enter().unwrap();
let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?;
let layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?;
let data = Value::Image(Bytes::from(vec![0u8; put_size]));
let data_ser_size = data.serialized_size().unwrap() as usize;

View File

@@ -87,7 +87,7 @@ use crate::tenant::timeline::offload::offload_timeline;
use crate::tenant::timeline::offload::OffloadError;
use crate::tenant::timeline::CompactFlags;
use crate::tenant::timeline::CompactOptions;
use crate::tenant::timeline::CompactRequest;
use crate::tenant::timeline::CompactRange;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::Timeline;
use crate::tenant::GetTimelineError;
@@ -1978,26 +1978,6 @@ async fn timeline_gc_handler(
json_response(StatusCode::OK, gc_result)
}
// Cancel scheduled compaction tasks
async fn timeline_cancel_compact_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant.cancel_scheduled_compaction(timeline_id);
json_response(StatusCode::OK, ())
}
.instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.await
}
// Run compaction immediately on given timeline.
async fn timeline_compact_handler(
mut request: Request<Body>,
@@ -2007,7 +1987,7 @@ async fn timeline_compact_handler(
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let compact_request = json_request_maybe::<Option<CompactRequest>>(&mut request).await?;
let compact_range = json_request_maybe::<Option<CompactRange>>(&mut request).await?;
let state = get_state(&request);
@@ -2032,50 +2012,22 @@ async fn timeline_compact_handler(
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
let wait_until_scheduled_compaction_done =
parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
.unwrap_or(false);
let sub_compaction = compact_request
.as_ref()
.map(|r| r.sub_compaction)
.unwrap_or(false);
let options = CompactOptions {
compact_range: compact_request
.as_ref()
.and_then(|r| r.compact_range.clone()),
compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
compact_range,
flags,
sub_compaction,
};
let scheduled = compact_request
.as_ref()
.map(|r| r.scheduled)
.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
if scheduled {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let rx = tenant.schedule_compaction(timeline_id, options).await;
if wait_until_scheduled_compaction_done {
// It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
rx.await.ok();
}
} else {
timeline
.compact_with_options(&cancel, options, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
}
timeline
.compact_with_options(&cancel, options, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
if wait_until_uploaded {
timeline.remote_client.wait_completion().await
// XXX map to correct ApiError for the cases where it's due to shutdown
.context("wait completion").map_err(ApiError::InternalServerError)?;
}
json_response(StatusCode::OK, ())
}
@@ -2156,20 +2108,16 @@ async fn timeline_checkpoint_handler(
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
let wait_until_flushed: bool =
parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true);
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
if wait_until_flushed {
timeline.freeze_and_flush().await
} else {
timeline.freeze().await.and(Ok(()))
}.map_err(|e| {
timeline
.freeze_and_flush()
.await
.map_err(|e| {
match e {
tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown,
other => ApiError::InternalServerError(other.into()),
@@ -3353,10 +3301,6 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_compact_handler),
)
.delete(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| api_handler(r, timeline_cancel_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
|r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),

View File

@@ -464,24 +464,6 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static DISK_CONSISTENT_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_disk_consistent_lsn",
"Disk consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_projected_remote_consistent_lsn",
"Projected remote consistent LSN grouped by timeline",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_pitr_history_size",
@@ -1223,60 +1205,31 @@ pub(crate) mod virtual_file_io_engine {
});
}
pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
pub(crate) struct SmgrOpTimerInner {
pub(crate) struct SmgrOpTimer {
global_latency_histo: Histogram,
// Optional because not all op types are tracked per-timeline
per_timeline_latency_histo: Option<Histogram>,
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
start: Instant,
throttled: Duration,
op: SmgrQueryType,
}
pub(crate) struct SmgrOpFlushInProgress {
base: Instant,
global_micros: IntCounter,
per_timeline_micros: IntCounter,
}
impl SmgrOpTimer {
pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
let Some(throttle) = throttle else {
return;
};
let inner = self.0.as_mut().expect("other public methods consume self");
inner.throttled += *throttle;
self.throttled += *throttle;
}
}
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
let (flush_start, inner) = self
.smgr_op_end()
.expect("this method consume self, and the only other caller is drop handler");
let SmgrOpTimerInner {
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
..
} = inner;
SmgrOpFlushInProgress {
base: flush_start,
global_micros: global_flush_in_progress_micros,
per_timeline_micros: per_timeline_flush_in_progress_micros,
}
}
impl Drop for SmgrOpTimer {
fn drop(&mut self) {
let elapsed = self.start.elapsed();
/// Returns `None`` if this method has already been called, `Some` otherwise.
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
let inner = self.0.take()?;
let now = Instant::now();
let elapsed = now - inner.start;
let elapsed = match elapsed.checked_sub(inner.throttled) {
let elapsed = match elapsed.checked_sub(self.throttled) {
Some(elapsed) => elapsed,
None => {
use utils::rate_limit::RateLimit;
@@ -1287,9 +1240,9 @@ impl SmgrOpTimer {
})))
});
let mut guard = LOGGED.lock().unwrap();
let rate_limit = &mut guard[inner.op];
let rate_limit = &mut guard[self.op];
rate_limit.call(|| {
warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
});
elapsed // un-throttled time, more info than just saturating to 0
}
@@ -1297,54 +1250,10 @@ impl SmgrOpTimer {
let elapsed = elapsed.as_secs_f64();
inner.global_latency_histo.observe(elapsed);
if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
self.global_latency_histo.observe(elapsed);
if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
per_timeline_getpage_histo.observe(elapsed);
}
Some((now, inner))
}
}
impl Drop for SmgrOpTimer {
fn drop(&mut self) {
self.smgr_op_end();
}
}
impl SmgrOpFlushInProgress {
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
where
Fut: std::future::Future<Output = O>,
{
let mut fut = std::pin::pin!(fut);
let now = Instant::now();
// Whenever observe_guard gets called, or dropped,
// it adds the time elapsed since its last call to metrics.
// Last call is tracked in `now`.
let mut observe_guard = scopeguard::guard(
|| {
let elapsed = now - self.base;
self.global_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.per_timeline_micros
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
self.base = now;
},
|mut observe| {
observe();
},
);
loop {
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
Ok(v) => return v,
Err(_timeout) => {
(*observe_guard)();
}
}
}
}
}
@@ -1375,8 +1284,6 @@ pub(crate) struct SmgrQueryTimePerTimeline {
per_timeline_getpage_latency: Histogram,
global_batch_size: Histogram,
per_timeline_batch_size: Histogram,
global_flush_in_progress_micros: IntCounter,
per_timeline_flush_in_progress_micros: IntCounter,
}
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
@@ -1539,26 +1446,6 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
.set(value.try_into().unwrap());
}
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_page_service_pagestream_flush_in_progress_micros",
"Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
easily discoverable in monitoring. \
Hence, this is NOT a completion latency historgram.",
&["tenant_id", "shard_id", "timeline_id"],
)
.expect("failed to define a metric")
});
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_service_pagestream_flush_in_progress_micros_global",
"Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
)
.expect("failed to define a metric")
});
impl SmgrQueryTimePerTimeline {
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
@@ -1599,12 +1486,6 @@ impl SmgrQueryTimePerTimeline {
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
let global_flush_in_progress_micros =
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
.unwrap();
Self {
global_started,
global_latency,
@@ -1612,8 +1493,6 @@ impl SmgrQueryTimePerTimeline {
per_timeline_getpage_started,
global_batch_size,
per_timeline_batch_size,
global_flush_in_progress_micros,
per_timeline_flush_in_progress_micros,
}
}
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
@@ -1626,17 +1505,13 @@ impl SmgrQueryTimePerTimeline {
None
};
SmgrOpTimer(Some(SmgrOpTimerInner {
SmgrOpTimer {
global_latency_histo: self.global_latency[op as usize].clone(),
per_timeline_latency_histo,
start: started_at,
op,
throttled: Duration::ZERO,
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
per_timeline_flush_in_progress_micros: self
.per_timeline_flush_in_progress_micros
.clone(),
}))
}
}
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
@@ -2311,15 +2186,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
.expect("failed to define a metric"),
});
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_timeline_wal_records_received",
"Number of WAL records received per shard",
&["tenant_id", "shard_id", "timeline_id"]
)
.expect("failed to define a metric")
});
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_seconds",
@@ -2528,8 +2394,7 @@ pub(crate) struct TimelineMetrics {
pub load_layer_map_histo: StorageTimeMetrics,
pub garbage_collect_histo: StorageTimeMetrics,
pub find_gc_cutoffs_histo: StorageTimeMetrics,
pub last_record_lsn_gauge: IntGauge,
pub disk_consistent_lsn_gauge: IntGauge,
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
@@ -2547,7 +2412,6 @@ pub(crate) struct TimelineMetrics {
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
/// Number of valid LSN leases.
pub valid_lsn_lease_count_gauge: UIntGauge,
pub wal_records_received: IntCounter,
shutdown: std::sync::atomic::AtomicBool,
}
@@ -2611,11 +2475,7 @@ impl TimelineMetrics {
&shard_id,
&timeline_id,
);
let last_record_lsn_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2705,10 +2565,6 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
TimelineMetrics {
tenant_id,
shard_id,
@@ -2722,8 +2578,7 @@ impl TimelineMetrics {
garbage_collect_histo,
find_gc_cutoffs_histo,
load_layer_map_histo,
last_record_lsn_gauge,
disk_consistent_lsn_gauge,
last_record_gauge,
pitr_history_size,
archival_size,
layer_size_image,
@@ -2741,7 +2596,6 @@ impl TimelineMetrics {
evictions_with_low_residence_duration,
),
valid_lsn_lease_count_gauge,
wal_records_received,
shutdown: std::sync::atomic::AtomicBool::default(),
}
}
@@ -2788,7 +2642,6 @@ impl TimelineMetrics {
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
@@ -2879,16 +2732,6 @@ impl TimelineMetrics {
shard_id,
timeline_id,
]);
let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
@@ -2962,7 +2805,6 @@ pub(crate) struct RemoteTimelineClientMetrics {
calls: Mutex<HashMap<(&'static str, &'static str), IntCounterPair>>,
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge,
}
impl RemoteTimelineClientMetrics {
@@ -2977,10 +2819,6 @@ impl RemoteTimelineClientMetrics {
.unwrap(),
);
let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN
.get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str])
.unwrap();
RemoteTimelineClientMetrics {
tenant_id: tenant_id_str,
shard_id: shard_id_str,
@@ -2989,7 +2827,6 @@ impl RemoteTimelineClientMetrics {
bytes_started_counter: Mutex::new(HashMap::default()),
bytes_finished_counter: Mutex::new(HashMap::default()),
remote_physical_size_gauge,
projected_remote_consistent_lsn_gauge,
}
}
@@ -3203,7 +3040,6 @@ impl Drop for RemoteTimelineClientMetrics {
calls,
bytes_started_counter,
bytes_finished_counter,
projected_remote_consistent_lsn_gauge,
} = self;
for ((a, b), _) in calls.get_mut().unwrap().drain() {
let mut res = [Ok(()), Ok(())];
@@ -3233,14 +3069,6 @@ impl Drop for RemoteTimelineClientMetrics {
let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
{
let _ = projected_remote_consistent_lsn_gauge;
let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
]);
}
}
}

View File

@@ -1017,8 +1017,10 @@ impl PageServerHandler {
// Map handler result to protocol behavior.
// Some handler errors cause exit from pagestream protocol.
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
let mut timers: smallvec::SmallVec<[_; 1]> =
smallvec::SmallVec::with_capacity(handler_results.len());
for handler_result in handler_results {
let (response_msg, timer) = match handler_result {
let response_msg = match handler_result {
Err(e) => match &e {
PageStreamError::Shutdown => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
@@ -1042,66 +1044,34 @@ impl PageServerHandler {
span.in_scope(|| {
error!("error reading relation or page version: {full:#}")
});
(
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
}),
None, // TODO: measure errors
)
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
}
},
Ok((response_msg, timer)) => (response_msg, Some(timer)),
Ok((response_msg, timer)) => {
// Extending the lifetime of the timers so observations on drop
// include the flush time.
timers.push(timer);
response_msg
}
};
//
// marshal & transmit response message
//
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
// We purposefully don't count flush time into the timer.
//
// The reason is that current compute client will not perform protocol processing
// if the postgres backend process is doing things other than `->smgr_read()`.
// This is especially the case for prefetch.
//
// If the compute doesn't read from the connection, eventually TCP will backpressure
// all the way into our flush call below.
//
// The timer's underlying metric is used for a storage-internal latency SLO and
// we don't want to include latency in it that we can't control.
// And as pointed out above, in this case, we don't control the time that flush will take.
let flushing_timer =
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
// what we want to do
let flush_fut = pgb_writer.flush();
// metric for how long flushing takes
let flush_fut = match flushing_timer {
Some(flushing_timer) => {
futures::future::Either::Left(flushing_timer.measure(flush_fut))
}
None => futures::future::Either::Right(flush_fut),
};
// do it while respecting cancellation
let _: () = async move {
tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
}
res = flush_fut => {
res?;
}
}
Ok(())
}
// and log the info! line inside the request span
.instrument(span.clone())
.await?;
}
tokio::select! {
biased;
_ = cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
}
res = pgb_writer.flush() => {
res?;
}
}
drop(timers);
Ok(())
}

View File

@@ -37,19 +37,14 @@ use remote_timeline_client::manifest::{
};
use remote_timeline_client::UploadQueueNotReadyError;
use std::collections::BTreeMap;
use std::collections::VecDeque;
use std::fmt;
use std::future::Future;
use std::sync::atomic::AtomicBool;
use std::sync::Weak;
use std::time::SystemTime;
use storage_broker::BrokerClientChannel;
use timeline::compaction::ScheduledCompactionTask;
use timeline::import_pgdata;
use timeline::offload::offload_timeline;
use timeline::CompactFlags;
use timeline::CompactOptions;
use timeline::CompactionError;
use timeline::ShutdownMode;
use tokio::io::BufReader;
use tokio::sync::watch;
@@ -344,11 +339,6 @@ pub struct Tenant {
/// Overhead of mutex is acceptable because compaction is done with a multi-second period.
compaction_circuit_breaker: std::sync::Mutex<CircuitBreaker>,
/// Scheduled compaction tasks. Currently, this can only be populated by triggering
/// a manual gc-compaction from the manual compaction API.
scheduled_compaction_tasks:
std::sync::Mutex<HashMap<TimelineId, VecDeque<ScheduledCompactionTask>>>,
/// If the tenant is in Activating state, notify this to encourage it
/// to proceed to Active as soon as possible, rather than waiting for lazy
/// background warmup.
@@ -2963,100 +2953,27 @@ impl Tenant {
for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload
{
// pending_task_left == None: cannot compact, maybe still pending tasks
// pending_task_left == Some(true): compaction task left
// pending_task_left == Some(false): no compaction task left
let pending_task_left = if *can_compact {
let has_pending_l0_compaction_task = timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Offload(_) => {
// Failures to offload timelines do not trip the circuit breaker, because
// they do not do lots of writes the way compaction itself does: it is cheap
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
}
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
}
})?;
if has_pending_l0_compaction_task {
Some(true)
} else {
let mut has_pending_scheduled_compaction_task;
let next_scheduled_compaction_task = {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
if !tline_pending_tasks.is_empty() {
info!(
"{} tasks left in the compaction schedule queue",
tline_pending_tasks.len()
);
Some(
timeline
.compact(cancel, EnumSet::empty(), ctx)
.instrument(info_span!("compact_timeline", %timeline_id))
.await
.inspect_err(|e| match e {
timeline::CompactionError::ShuttingDown => (),
timeline::CompactionError::Offload(_) => {
// Failures to offload timelines do not trip the circuit breaker, because
// they do not do lots of writes the way compaction itself does: it is cheap
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
}
let next_task = tline_pending_tasks.pop_front();
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
next_task
} else {
has_pending_scheduled_compaction_task = false;
None
}
};
if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task
{
if !next_scheduled_compaction_task
.options
.flags
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
{
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
} else if next_scheduled_compaction_task.options.sub_compaction {
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs = timeline
.gc_compaction_split_jobs(next_scheduled_compaction_task.options)
.await
.map_err(CompactionError::Other)?;
if jobs.is_empty() {
info!("no jobs to run, skipping scheduled compaction task");
} else {
has_pending_scheduled_compaction_task = true;
let jobs_len = jobs.len();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
for (idx, job) in jobs.into_iter().enumerate() {
tline_pending_tasks.push_back(ScheduledCompactionTask {
options: job,
result_tx: if idx == jobs_len - 1 {
// The last compaction job sends the completion signal
next_scheduled_compaction_task.result_tx.take()
} else {
None
},
});
}
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
timeline::CompactionError::Other(e) => {
self.compaction_circuit_breaker
.lock()
.unwrap()
.fail(&CIRCUIT_BREAKERS_BROKEN, e);
}
} else {
let _ = timeline
.compact_with_options(
cancel,
next_scheduled_compaction_task.options,
ctx,
)
.instrument(info_span!("scheduled_compact_timeline", %timeline_id))
.await?;
if let Some(tx) = next_scheduled_compaction_task.result_tx.take() {
// TODO: we can send compaction statistics in the future
tx.send(()).ok();
}
}
}
Some(has_pending_scheduled_compaction_task)
}
})?,
)
} else {
None
};
@@ -3076,36 +2993,6 @@ impl Tenant {
Ok(has_pending_task)
}
/// Cancel scheduled compaction tasks
pub(crate) fn cancel_scheduled_compaction(
&self,
timeline_id: TimelineId,
) -> Vec<ScheduledCompactionTask> {
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) {
let current_tline_pending_tasks = std::mem::take(tline_pending_tasks);
current_tline_pending_tasks.into_iter().collect()
} else {
Vec::new()
}
}
/// Schedule a compaction task for a timeline.
pub(crate) async fn schedule_compaction(
&self,
timeline_id: TimelineId,
options: CompactOptions,
) -> tokio::sync::oneshot::Receiver<()> {
let (tx, rx) = tokio::sync::oneshot::channel();
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
let tline_pending_tasks = guard.entry(timeline_id).or_default();
tline_pending_tasks.push_back(ScheduledCompactionTask {
options,
result_tx: Some(tx),
});
rx
}
// Call through to all timelines to freeze ephemeral layers if needed. Usually
// this happens during ingest: this background housekeeping is for freezing layers
// that are open but haven't been written to for some time.
@@ -4118,7 +4005,6 @@ impl Tenant {
// use an extremely long backoff.
Some(Duration::from_secs(3600 * 24)),
)),
scheduled_compaction_tasks: Mutex::new(Default::default()),
activate_now_sem: tokio::sync::Semaphore::new(0),
attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()),
cancel: CancellationToken::default(),
@@ -9277,7 +9163,6 @@ mod tests {
CompactOptions {
flags: dryrun_flags,
compact_range: None,
..Default::default()
},
&ctx,
)
@@ -9514,7 +9399,6 @@ mod tests {
CompactOptions {
flags: dryrun_flags,
compact_range: None,
..Default::default()
},
&ctx,
)
@@ -10001,15 +9885,7 @@ mod tests {
// Do a partial compaction on key range 0..2
tline
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(0)..get_key(2)).into()),
..Default::default()
},
&ctx,
)
.partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10048,15 +9924,7 @@ mod tests {
// Do a partial compaction on key range 2..4
tline
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(2)..get_key(4)).into()),
..Default::default()
},
&ctx,
)
.partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10100,15 +9968,7 @@ mod tests {
// Do a partial compaction on key range 4..9
tline
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(4)..get_key(9)).into()),
..Default::default()
},
&ctx,
)
.partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10151,15 +10011,7 @@ mod tests {
// Do a partial compaction on key range 9..10
tline
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(9)..get_key(10)).into()),
..Default::default()
},
&ctx,
)
.partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
@@ -10207,15 +10059,7 @@ mod tests {
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
tline
.compact_with_gc(
&cancel,
CompactOptions {
flags: EnumSet::new(),
compact_range: Some((get_key(0)..get_key(10)).into()),
..Default::default()
},
&ctx,
)
.partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;

View File

@@ -8,8 +8,10 @@ use crate::page_cache;
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
use crate::virtual_file::owned_buffers_io::write::Buffer;
use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile};
use bytes::BytesMut;
use camino::Utf8PathBuf;
use num_traits::Num;
use pageserver_api::shard::TenantShardId;
@@ -18,7 +20,6 @@ use tracing::error;
use std::io;
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
use utils::id::TimelineId;
pub struct EphemeralFile {
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
_timeline_id: TimelineId,
page_cache_file_id: page_cache::FileId,
bytes_written: u64,
buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
buffered_writer: owned_buffers_io::write::BufferedWriter<
BytesMut,
size_tracking_writer::Writer<VirtualFile>,
>,
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
_gate_guard: utils::sync::gate::GateGuard,
}
@@ -38,9 +42,9 @@ impl EphemeralFile {
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
gate: &utils::sync::gate::Gate,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> anyhow::Result<EphemeralFile> {
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
let filename_disambiguator =
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
@@ -51,17 +55,15 @@ impl EphemeralFile {
"ephemeral-{filename_disambiguator}"
)));
let file = Arc::new(
VirtualFile::open_with_options_v2(
&filename,
virtual_file::OpenOptions::new()
.read(true)
.write(true)
.create(true),
ctx,
)
.await?,
);
let file = VirtualFile::open_with_options(
&filename,
virtual_file::OpenOptions::new()
.read(true)
.write(true)
.create(true),
ctx,
)
.await?;
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
@@ -71,12 +73,10 @@ impl EphemeralFile {
page_cache_file_id,
bytes_written: 0,
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
file,
|| IoBufferMut::with_capacity(TAIL_SZ),
gate.enter()?,
ctx,
size_tracking_writer::Writer::new(file),
BytesMut::with_capacity(TAIL_SZ),
),
_gate_guard: gate.enter()?,
_gate_guard: gate_guard,
})
}
}
@@ -85,7 +85,7 @@ impl Drop for EphemeralFile {
fn drop(&mut self) {
// unlink the file
// we are clear to do this, because we have entered a gate
let path = self.buffered_writer.as_inner().path();
let path = self.buffered_writer.as_inner().as_inner().path();
let res = std::fs::remove_file(path);
if let Err(e) = res {
if e.kind() != std::io::ErrorKind::NotFound {
@@ -132,18 +132,6 @@ impl EphemeralFile {
srcbuf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<u64> {
let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?;
if let Some(control) = control {
control.release().await;
}
Ok(pos)
}
async fn write_raw_controlled(
&mut self,
srcbuf: &[u8],
ctx: &RequestContext,
) -> std::io::Result<(u64, Option<owned_buffers_io::write::FlushControl>)> {
let pos = self.bytes_written;
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
@@ -157,9 +145,9 @@ impl EphemeralFile {
})?;
// Write the payload
let (nwritten, control) = self
let nwritten = self
.buffered_writer
.write_buffered_borrowed_controlled(srcbuf, ctx)
.write_buffered_borrowed(srcbuf, ctx)
.await?;
assert_eq!(
nwritten,
@@ -169,7 +157,7 @@ impl EphemeralFile {
self.bytes_written = new_bytes_written;
Ok((pos, control))
Ok(pos)
}
}
@@ -180,12 +168,11 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
dst: tokio_epoll_uring::Slice<B>,
ctx: &'a RequestContext,
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
let submitted_offset = self.buffered_writer.bytes_submitted();
let file_size_tracking_writer = self.buffered_writer.as_inner();
let flushed_offset = file_size_tracking_writer.bytes_written();
let mutable = self.buffered_writer.inspect_mutable();
let mutable = &mutable[0..mutable.pending()];
let maybe_flushed = self.buffered_writer.inspect_maybe_flushed();
let buffer = self.buffered_writer.inspect_buffer();
let buffered = &buffer[0..buffer.pending()];
let dst_cap = dst.bytes_total().into_u64();
let end = {
@@ -210,42 +197,11 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
}
}
}
let (written_range, maybe_flushed_range) = {
if maybe_flushed.is_some() {
// [ written ][ maybe_flushed ][ mutable ]
// <- TAIL_SZ -><- TAIL_SZ ->
// ^
// `submitted_offset`
// <++++++ on disk +++++++????????????????>
(
Range(
start,
std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)),
),
Range(
std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)),
std::cmp::min(end, submitted_offset),
),
)
} else {
// [ written ][ mutable ]
// <- TAIL_SZ ->
// ^
// `submitted_offset`
// <++++++ on disk +++++++++++++++++++++++>
(
Range(start, std::cmp::min(end, submitted_offset)),
// zero len
Range(submitted_offset, u64::MIN),
)
}
};
let mutable_range = Range(std::cmp::max(start, submitted_offset), end);
let written_range = Range(start, std::cmp::min(end, flushed_offset));
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
let dst = if written_range.len() > 0 {
let file: &VirtualFile = self.buffered_writer.as_inner();
let file: &VirtualFile = file_size_tracking_writer.as_inner();
let bounds = dst.bounds();
let slice = file
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
@@ -255,21 +211,19 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
dst
};
let dst = if maybe_flushed_range.len() > 0 {
let offset_in_buffer = maybe_flushed_range
let dst = if buffered_range.len() > 0 {
let offset_in_buffer = buffered_range
.0
.checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64))
.checked_sub(flushed_offset)
.unwrap()
.into_usize();
// Checked previously the buffer is Some.
let maybe_flushed = maybe_flushed.unwrap();
let to_copy = &maybe_flushed
[offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())];
let to_copy =
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
let bounds = dst.bounds();
let mut view = dst.slice({
let start = written_range.len().into_usize();
let end = start
.checked_add(maybe_flushed_range.len().into_usize())
.checked_add(buffered_range.len().into_usize())
.unwrap();
start..end
});
@@ -280,28 +234,6 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
dst
};
let dst = if mutable_range.len() > 0 {
let offset_in_buffer = mutable_range
.0
.checked_sub(submitted_offset)
.unwrap()
.into_usize();
let to_copy =
&mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())];
let bounds = dst.bounds();
let mut view = dst.slice({
let start =
written_range.len().into_usize() + maybe_flushed_range.len().into_usize();
let end = start.checked_add(mutable_range.len().into_usize()).unwrap();
start..end
});
view.as_mut_rust_slice_full_zeroed()
.copy_from_slice(to_copy);
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
} else {
dst
};
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
Ok((dst, (end - start).into_usize()))
@@ -363,7 +295,7 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
@@ -394,15 +326,14 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
.await
.unwrap();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let mutable = file.buffered_writer.inspect_mutable();
let cap = mutable.capacity();
let align = mutable.align();
let cap = file.buffered_writer.inspect_buffer().capacity();
let write_nbytes = cap * 2 + cap / 2;
let write_nbytes = cap + cap / 2;
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
@@ -410,39 +341,30 @@ mod tests {
.collect();
let mut value_offsets = Vec::new();
for range in (0..write_nbytes)
.step_by(align)
.map(|start| start..(start + align).min(write_nbytes))
{
let off = file.write_raw(&content[range], &ctx).await.unwrap();
for i in 0..write_nbytes {
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
value_offsets.push(off);
}
assert_eq!(file.len() as usize, write_nbytes);
for (i, range) in (0..write_nbytes)
.step_by(align)
.map(|start| start..(start + align).min(write_nbytes))
.enumerate()
{
assert_eq!(value_offsets[i], range.start.into_u64());
let buf = IoBufferMut::with_capacity(range.len());
assert!(file.len() as usize == write_nbytes);
for i in 0..write_nbytes {
assert_eq!(value_offsets[i], i.into_u64());
let buf = IoBufferMut::with_capacity(1);
let (buf_slice, nread) = file
.read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx)
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
.await
.unwrap();
let buf = buf_slice.into_inner();
assert_eq!(nread, range.len());
assert_eq!(&buf, &content[range]);
assert_eq!(nread, 1);
assert_eq!(&buf, &content[i..i + 1]);
}
let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
assert!(file_contents == content[0..cap * 2]);
let file_contents =
std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap();
assert_eq!(file_contents, &content[0..cap]);
let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]);
let mutable_buffer_contents = file.buffered_writer.inspect_mutable();
assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]);
let buffer_contents = file.buffered_writer.inspect_buffer();
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
}
#[tokio::test]
@@ -451,16 +373,16 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
.await
.unwrap();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
// mutable buffer and maybe_flushed buffer each has `cap` bytes.
let cap = file.buffered_writer.inspect_mutable().capacity();
let cap = file.buffered_writer.inspect_buffer().capacity();
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap * 2 + cap / 2)
.take(cap + cap / 2)
.collect();
file.write_raw(&content, &ctx).await.unwrap();
@@ -468,21 +390,23 @@ mod tests {
// assert the state is as this test expects it to be
assert_eq!(
&file.load_to_io_buf(&ctx).await.unwrap(),
&content[0..cap * 2 + cap / 2]
&content[0..cap + cap / 2]
);
let md = file.buffered_writer.as_inner().path().metadata().unwrap();
let md = file
.buffered_writer
.as_inner()
.as_inner()
.path()
.metadata()
.unwrap();
assert_eq!(
md.len(),
2 * cap.into_u64(),
"buffered writer requires one write to be flushed if we write 2.5x buffer capacity"
cap.into_u64(),
"buffered writer does one write if we write 1.5x buffer capacity"
);
assert_eq!(
&file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap],
&content[cap..cap * 2]
);
assert_eq!(
&file.buffered_writer.inspect_mutable()[0..cap / 2],
&content[cap * 2..cap * 2 + cap / 2]
&file.buffered_writer.inspect_buffer()[0..cap / 2],
&content[cap..cap + cap / 2]
);
}
@@ -498,19 +422,19 @@ mod tests {
let gate = utils::sync::gate::Gate::default();
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx)
.await
.unwrap();
let mut file =
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
.await
.unwrap();
let cap = file.buffered_writer.inspect_buffer().capacity();
let mutable = file.buffered_writer.inspect_mutable();
let cap = mutable.capacity();
let align = mutable.align();
let content: Vec<u8> = rand::thread_rng()
.sample_iter(rand::distributions::Standard)
.take(cap * 2 + cap / 2)
.take(cap + cap / 2)
.collect();
let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap();
file.write_raw(&content, &ctx).await.unwrap();
let test_read = |start: usize, len: usize| {
let file = &file;
@@ -530,38 +454,16 @@ mod tests {
}
};
let test_read_all_offset_combinations = || {
async move {
test_read(align, align).await;
// border onto edge of file
test_read(cap - align, align).await;
// read across file and buffer
test_read(cap - align, 2 * align).await;
// stay from start of maybe flushed buffer
test_read(cap, align).await;
// completely within maybe flushed buffer
test_read(cap + align, align).await;
// border onto edge of maybe flushed buffer.
test_read(cap * 2 - align, align).await;
// read across maybe flushed and mutable buffer
test_read(cap * 2 - align, 2 * align).await;
// read across three segments
test_read(cap - align, cap + 2 * align).await;
// completely within mutable buffer
test_read(cap * 2 + align, align).await;
}
};
// completely within the file range
assert!(align < cap, "test assumption");
assert!(cap % align == 0);
// test reads at different flush stages.
let not_started = control.unwrap().into_not_started();
test_read_all_offset_combinations().await;
let in_progress = not_started.ready_to_flush();
test_read_all_offset_combinations().await;
in_progress.wait_until_flush_is_done().await;
test_read_all_offset_combinations().await;
assert!(20 < cap, "test assumption");
test_read(10, 10).await;
// border onto edge of file
test_read(cap - 10, 10).await;
// read across file and buffer
test_read(cap - 10, 20).await;
// stay from start of buffer
test_read(cap, 10).await;
// completely within buffer
test_read(cap + 10, 10).await;
}
}

View File

@@ -681,7 +681,6 @@ impl RemoteTimelineClient {
layer_file_name: &LayerName,
layer_metadata: &LayerFileMetadata,
local_path: &Utf8Path,
gate: &utils::sync::gate::Gate,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
@@ -701,7 +700,6 @@ impl RemoteTimelineClient {
layer_file_name,
layer_metadata,
local_path,
gate,
cancel,
ctx,
)
@@ -2192,9 +2190,6 @@ impl RemoteTimelineClient {
upload_queue.clean.1 = Some(task.task_id);
let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
self.metrics
.projected_remote_consistent_lsn_gauge
.set(lsn.0);
if self.generation.is_none() {
// Legacy mode: skip validating generation

View File

@@ -26,6 +26,8 @@ use crate::span::{
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
use crate::tenant::storage_layer::LayerName;
use crate::tenant::Generation;
#[cfg_attr(target_os = "macos", allow(unused_imports))]
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile};
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{
@@ -58,7 +60,6 @@ pub async fn download_layer_file<'a>(
layer_file_name: &'a LayerName,
layer_metadata: &'a LayerFileMetadata,
local_path: &Utf8Path,
gate: &utils::sync::gate::Gate,
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<u64, DownloadError> {
@@ -87,9 +88,7 @@ pub async fn download_layer_file<'a>(
let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
let bytes_amount = download_retry(
|| async {
download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
},
|| async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await },
&format!("download {remote_path:?}"),
cancel,
)
@@ -149,7 +148,6 @@ async fn download_object<'a>(
storage: &'a GenericRemoteStorage,
src_path: &RemotePath,
dst_path: &Utf8PathBuf,
#[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
cancel: &CancellationToken,
#[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
) -> Result<u64, DownloadError> {
@@ -207,18 +205,13 @@ async fn download_object<'a>(
}
#[cfg(target_os = "linux")]
crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
use crate::virtual_file::owned_buffers_io;
use crate::virtual_file::IoBufferMut;
use std::sync::Arc;
use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer};
use bytes::BytesMut;
async {
let destination_file = Arc::new(
VirtualFile::create(dst_path, ctx)
.await
.with_context(|| {
format!("create a destination file for layer '{dst_path}'")
})
.map_err(DownloadError::Other)?,
);
let destination_file = VirtualFile::create(dst_path, ctx)
.await
.with_context(|| format!("create a destination file for layer '{dst_path}'"))
.map_err(DownloadError::Other)?;
let mut download = storage
.download(src_path, &DownloadOpts::default(), cancel)
@@ -226,16 +219,14 @@ async fn download_object<'a>(
pausable_failpoint!("before-downloading-layer-stream-pausable");
let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
destination_file,
|| IoBufferMut::with_capacity(super::BUFFER_SIZE),
gate.enter().map_err(|_| DownloadError::Cancelled)?,
ctx,
);
// TODO: use vectored write (writev) once supported by tokio-epoll-uring.
// There's chunks_vectored() on the stream.
let (bytes_amount, destination_file) = async {
let size_tracking = size_tracking_writer::Writer::new(destination_file);
let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
size_tracking,
BytesMut::with_capacity(super::BUFFER_SIZE),
);
while let Some(res) =
futures::StreamExt::next(&mut download.download_stream).await
{
@@ -243,10 +234,10 @@ async fn download_object<'a>(
Ok(chunk) => chunk,
Err(e) => return Err(e),
};
buffered.write_buffered_borrowed(&chunk, ctx).await?;
buffered.write_buffered(chunk.slice_len(), ctx).await?;
}
let inner = buffered.flush_and_into_inner(ctx).await?;
Ok(inner)
let size_tracking = buffered.flush_and_into_inner(ctx).await?;
Ok(size_tracking.into_inner())
}
.await?;

View File

@@ -1183,7 +1183,6 @@ impl<'a> TenantDownloader<'a> {
&layer.name,
&layer.metadata,
&local_path,
&self.secondary_state.gate,
&self.secondary_state.cancel,
ctx,
)

View File

@@ -555,12 +555,13 @@ impl InMemoryLayer {
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
start_lsn: Lsn,
gate: &utils::sync::gate::Gate,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?;
let file =
EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?;
let key = InMemoryLayerFileId(file.page_cache_file_id());
Ok(InMemoryLayer {

View File

@@ -1149,7 +1149,6 @@ impl LayerInner {
&self.desc.layer_name(),
&self.metadata(),
&self.path,
&timeline.gate,
&timeline.cancel,
ctx,
)

View File

@@ -53,7 +53,7 @@ use utils::{
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
use wal_decoder::serialized_batch::SerializedValueBatch;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority {
Background,
}
#[derive(Debug, enumset::EnumSetType)]
#[derive(enumset::EnumSetType)]
pub(crate) enum CompactFlags {
ForceRepartition,
ForceImageLayerCreation,
@@ -777,19 +777,6 @@ pub(crate) enum CompactFlags {
DryRun,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRequest {
pub compact_range: Option<CompactRange>,
pub compact_below_lsn: Option<Lsn>,
/// Whether the compaction job should be scheduled.
#[serde(default)]
pub scheduled: bool,
/// Whether the compaction job should be split across key ranges.
#[serde(default)]
pub sub_compaction: bool,
}
#[serde_with::serde_as]
#[derive(Debug, Clone, serde::Deserialize)]
pub(crate) struct CompactRange {
@@ -799,27 +786,10 @@ pub(crate) struct CompactRange {
pub end: Key,
}
impl From<Range<Key>> for CompactRange {
fn from(range: Range<Key>) -> Self {
CompactRange {
start: range.start,
end: range.end,
}
}
}
#[derive(Debug, Clone, Default)]
#[derive(Clone, Default)]
pub(crate) struct CompactOptions {
pub flags: EnumSet<CompactFlags>,
/// If set, the compaction will only compact the key range specified by this option.
/// This option is only used by GC compaction.
pub compact_range: Option<CompactRange>,
/// If set, the compaction will only compact the LSN below this value.
/// This option is only used by GC compaction.
pub compact_below_lsn: Option<Lsn>,
/// Enable sub-compaction (split compaction job across key ranges).
/// This option is only used by GC compaction.
pub sub_compaction: bool,
}
impl std::fmt::Debug for Timeline {
@@ -1463,31 +1433,23 @@ impl Timeline {
Ok(lease)
}
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
/// Returns the flush request ID which can be awaited with wait_flush_completion().
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze(&self) -> Result<u64, FlushLayerError> {
self.freeze0().await
}
/// Freeze and flush the open in-memory layer, waiting for it to be written to disk.
/// Flush to disk all data that was written with the put_* functions
#[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))]
pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> {
self.freeze_and_flush0().await
}
/// Freeze the current open in-memory layer. It will be written to disk on next iteration.
/// Returns the flush request ID which can be awaited with wait_flush_completion().
pub(crate) async fn freeze0(&self) -> Result<u64, FlushLayerError> {
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await
}
// This exists to provide a non-span creating version of `freeze_and_flush` we can call without
// polluting the span hierarchy.
pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> {
let token = self.freeze0().await?;
let token = {
// Freeze the current open in-memory layer. It will be written to disk on next
// iteration.
let mut g = self.write_lock.lock().await;
let to_lsn = self.get_last_record_lsn();
self.freeze_inmem_layer_at(to_lsn, &mut g).await?
};
self.wait_flush_completion(token).await
}
@@ -1642,8 +1604,6 @@ impl Timeline {
CompactOptions {
flags,
compact_range: None,
compact_below_lsn: None,
sub_compaction: false,
},
ctx,
)
@@ -2399,7 +2359,7 @@ impl Timeline {
result
.metrics
.last_record_lsn_gauge
.last_record_gauge
.set(disk_consistent_lsn.0 as i64);
result
})
@@ -3495,6 +3455,7 @@ impl Timeline {
ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> {
let mut guard = self.layers.write().await;
let gate_guard = self.gate.enter().context("enter gate for inmem layer")?;
let last_record_lsn = self.get_last_record_lsn();
ensure!(
@@ -3511,7 +3472,7 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
&self.gate,
gate_guard,
ctx,
)
.await?;
@@ -3521,7 +3482,7 @@ impl Timeline {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}
@@ -3889,10 +3850,6 @@ impl Timeline {
fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool {
let old_value = self.disk_consistent_lsn.fetch_max(new_value);
assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}");
self.metrics
.disk_consistent_lsn_gauge
.set(new_value.0 as i64);
new_value != old_value
}
@@ -5931,23 +5888,6 @@ impl<'a> TimelineWriter<'a> {
return Ok(());
}
// In debug builds, assert that we don't write any keys that don't belong to this shard.
// We don't assert this in release builds, since key ownership policies may change over
// time. Stray keys will be removed during compaction.
if cfg!(debug_assertions) {
for metadata in &batch.metadata {
if let ValueMeta::Serialized(metadata) = metadata {
let key = Key::from_compact(metadata.key);
assert!(
self.shard_identity.is_key_local(&key)
|| self.shard_identity.is_key_global(&key),
"key {key} does not belong on shard {}",
self.shard_identity.shard_index()
);
}
}
}
let batch_max_lsn = batch.max_lsn;
let buf_size: u64 = batch.buffer_size() as u64;

View File

@@ -10,12 +10,13 @@ use std::sync::Arc;
use super::layer_manager::LayerManager;
use super::{
CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
ImageLayerCreationMode, RecordedDuration, Timeline,
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
RecordedDuration, Timeline,
};
use anyhow::{anyhow, bail, Context};
use bytes::Bytes;
use enumset::EnumSet;
use fail::fail_point;
use itertools::Itertools;
use pageserver_api::key::KEY_SIZE;
@@ -29,6 +30,7 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::statvfs::Statvfs;
use crate::tenant::checks::check_valid_layermap;
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::batch_split_writer::{
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
@@ -62,12 +64,6 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
/// A scheduled compaction task.
pub struct ScheduledCompactionTask {
pub options: CompactOptions,
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
}
pub struct GcCompactionJobDescription {
/// All layers to read in the compaction job
selected_layers: Vec<Layer>,
@@ -1178,12 +1174,11 @@ impl Timeline {
.await
.map_err(CompactionError::Other)?;
} else {
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
if cfg!(debug_assertions) {
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
debug!("dropping key {key} during compaction (it belongs on shard {owner})");
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
}
if !new_layers.is_empty() {
@@ -1751,114 +1746,22 @@ impl Timeline {
Ok(())
}
/// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
/// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
/// ad-hoc information about gc compaction itself.
pub(crate) async fn gc_compaction_split_jobs(
pub(crate) async fn compact_with_gc(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
) -> anyhow::Result<Vec<CompactOptions>> {
if !options.sub_compaction {
return Ok(vec![options]);
}
let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
start: Key::MIN,
end: Key::MAX,
});
let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
compact_below_lsn
} else {
let gc_info = self.gc_info.read().unwrap();
gc_info.cutoffs.select_min() // use the real gc cutoff
};
let mut compact_jobs = Vec::new();
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
let Ok(partition) = self.partitioning.try_lock() else {
bail!("failed to acquire partition lock");
};
let ((dense_ks, sparse_ks), _) = &*partition;
// Truncate the key range to be within user specified compaction range.
fn truncate_to(
source_start: &Key,
source_end: &Key,
target_start: &Key,
target_end: &Key,
) -> Option<(Key, Key)> {
let start = source_start.max(target_start);
let end = source_end.min(target_end);
if start < end {
Some((*start, *end))
} else {
None
}
}
let mut split_key_ranges = Vec::new();
let ranges = dense_ks
.parts
.iter()
.map(|partition| partition.ranges.iter())
.chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
.flatten()
.cloned()
.collect_vec();
for range in ranges.iter() {
let Some((start, end)) = truncate_to(
&range.start,
&range.end,
&compact_range.start,
&compact_range.end,
) else {
continue;
};
split_key_ranges.push((start, end));
}
split_key_ranges.sort();
let guard = self.layers.read().await;
let layer_map = guard.layer_map()?;
let mut current_start = None;
// Split compaction job to about 2GB each
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
let ranges_num = split_key_ranges.len();
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
if current_start.is_none() {
current_start = Some(start);
}
let start = current_start.unwrap();
if start >= end {
// We have already processed this partition.
continue;
}
let res = layer_map.range_search(start..end, compact_below_lsn);
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
let mut compact_options = options.clone();
// Try to extend the compaction range so that we include at least one full layer file.
let extended_end = res
.found
.keys()
.map(|layer| layer.layer.key_range.end)
.min();
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
// In this case, we simply use the specified key range end.
let end = if let Some(extended_end) = extended_end {
extended_end.max(end)
} else {
end
};
info!(
"splitting compaction job: {}..{}, estimated_size={}",
start, end, total_size
);
compact_options.compact_range = Some(CompactRange { start, end });
compact_options.compact_below_lsn = Some(compact_below_lsn);
compact_options.sub_compaction = false;
compact_jobs.push(compact_options);
current_start = Some(end);
}
}
drop(guard);
Ok(compact_jobs)
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.partial_compact_with_gc(
options
.compact_range
.map(|range| range.start..range.end)
.unwrap_or_else(|| Key::MIN..Key::MAX),
cancel,
options.flags,
ctx,
)
.await
}
/// An experimental compaction building block that combines compaction with garbage collection.
@@ -1868,51 +1771,19 @@ impl Timeline {
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
/// and create delta layers with all deltas >= gc horizon.
///
/// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
/// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
/// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
/// part of the range.
///
/// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with
/// the LSN. Otherwise, it will use the gc cutoff by default.
pub(crate) async fn compact_with_gc(
pub(crate) async fn partial_compact_with_gc(
self: &Arc<Self>,
compaction_key_range: Range<Key>,
cancel: &CancellationToken,
options: CompactOptions,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
if options.sub_compaction {
info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
let jobs = self.gc_compaction_split_jobs(options).await?;
let jobs_len = jobs.len();
for (idx, job) in jobs.into_iter().enumerate() {
info!(
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
idx + 1,
jobs_len
);
self.compact_with_gc_inner(cancel, job, ctx).await?;
}
if jobs_len == 0 {
info!("no jobs to run, skipping gc bottom-most compaction");
}
return Ok(());
}
self.compact_with_gc_inner(cancel, options, ctx).await
}
async fn compact_with_gc_inner(
self: &Arc<Self>,
cancel: &CancellationToken,
options: CompactOptions,
ctx: &RequestContext,
) -> anyhow::Result<()> {
assert!(
!options.sub_compaction,
"sub-compaction should be handled by the outer function"
);
// Block other compaction/GC tasks from running for now. GC-compaction could run along
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1932,12 +1803,6 @@ impl Timeline {
)
.await?;
let flags = options.flags;
let compaction_key_range = options
.compact_range
.map(|range| range.start..range.end)
.unwrap_or_else(|| Key::MIN..Key::MAX);
let dry_run = flags.contains(CompactFlags::DryRun);
if compaction_key_range == (Key::MIN..Key::MAX) {
@@ -1961,18 +1826,7 @@ impl Timeline {
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = {
let real_gc_cutoff = gc_info.cutoffs.select_min();
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
// each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
// the real cutoff.
let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff);
if gc_cutoff > real_gc_cutoff {
warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff);
gc_cutoff = real_gc_cutoff;
}
gc_cutoff
};
let gc_cutoff = gc_info.cutoffs.select_min();
for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns {
if lsn < &gc_cutoff {
retain_lsns_below_horizon.push(*lsn);
@@ -1992,7 +1846,7 @@ impl Timeline {
.map(|desc| desc.get_lsn_range().end)
.max()
else {
info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff);
info!("no layers to compact with gc");
return Ok(());
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
@@ -2015,7 +1869,7 @@ impl Timeline {
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end);
info!("no layers to compact with gc");
return Ok(());
}
retain_lsns_below_horizon.sort();
@@ -2082,15 +1936,14 @@ impl Timeline {
// Step 1: construct a k-merge iterator over all layers.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
// disable the check for now because we need to adjust the check for partial compactions, will enable later.
// let layer_names = job_desc
// .selected_layers
// .iter()
// .map(|layer| layer.layer_desc().layer_name())
// .collect_vec();
// if let Some(err) = check_valid_layermap(&layer_names) {
// warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
// }
let layer_names = job_desc
.selected_layers
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = job_desc
.selected_layers
@@ -2195,11 +2048,6 @@ impl Timeline {
// This is not handled in the filter iterator because shard is determined by hash.
// Therefore, it does not give us any performance benefit to do things like skip
// a whole layer file as handling key spaces (ranges).
if cfg!(debug_assertions) {
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
continue;
}
if !job_desc.compaction_key_range.contains(&key) {

View File

@@ -182,7 +182,7 @@ impl OpenLayerManager {
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
gate: &utils::sync::gate::Gate,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> anyhow::Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned());
@@ -212,9 +212,15 @@ impl OpenLayerManager {
lsn
);
let new_layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx)
.await?;
let new_layer = InMemoryLayer::create(
conf,
timeline_id,
tenant_shard_id,
start_lsn,
gate_guard,
ctx,
)
.await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());

View File

@@ -369,13 +369,6 @@ pub(super) async fn handle_walreceiver_connection(
// advances it to its end LSN. 0 is just an initialization placeholder.
let mut modification = timeline.begin_modification(Lsn(0));
if !records.is_empty() {
timeline
.metrics
.wal_records_received
.inc_by(records.len() as u64);
}
for interpreted in records {
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
@@ -517,7 +510,6 @@ pub(super) async fn handle_walreceiver_connection(
}
// Ingest the records without immediately committing them.
timeline.metrics.wal_records_received.inc();
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await

View File

@@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf};
use once_cell::sync::OnceCell;
use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer;
use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign};
use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
use owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
use owned_buffers_io::io_buf_ext::FullSlice;
use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
use pageserver_api::shard::TenantShardId;
@@ -63,6 +63,9 @@ pub(crate) mod owned_buffers_io {
pub(crate) mod io_buf_ext;
pub(crate) mod slice;
pub(crate) mod write;
pub(crate) mod util {
pub(crate) mod size_tracking_writer;
}
}
#[derive(Debug)]
@@ -218,7 +221,7 @@ impl VirtualFile {
self.inner.read_exact_at_page(page, offset, ctx).await
}
pub async fn write_all_at<Buf: IoBufAligned + Send>(
pub async fn write_all_at<Buf: IoBuf + Send>(
&self,
buf: FullSlice<Buf>,
offset: u64,
@@ -1322,14 +1325,14 @@ impl Drop for VirtualFileInner {
}
impl OwnedAsyncWriter for VirtualFile {
async fn write_all_at<Buf: IoBufAligned + Send>(
&self,
#[inline(always)]
async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
offset: u64,
ctx: &RequestContext,
) -> std::io::Result<FullSlice<Buf>> {
let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await;
res.map(|_| buf)
) -> std::io::Result<(usize, FullSlice<Buf>)> {
let (buf, res) = VirtualFile::write_all(self, buf, ctx).await;
res.map(move |v| (v, buf))
}
}
@@ -1448,7 +1451,7 @@ mod tests {
}
}
}
async fn write_all_at<Buf: IoBufAligned + Send>(
async fn write_all_at<Buf: IoBuf + Send>(
&self,
buf: FullSlice<Buf>,
offset: u64,
@@ -1591,7 +1594,6 @@ mod tests {
&ctx,
)
.await?;
file_a
.write_all(b"foobar".to_vec().slice_len(), &ctx)
.await?;
@@ -1650,10 +1652,10 @@ mod tests {
)
.await?;
file_b
.write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx)
.write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx)
.await?;
file_b
.write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx)
.write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx)
.await?;
assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA");

View File

@@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static {
}
/// Alignment at compile time.
#[derive(Debug, Clone, Copy)]
#[derive(Debug)]
pub struct ConstAlign<const A: usize>;
impl<const A: usize> Alignment for ConstAlign<A> {
@@ -14,7 +14,7 @@ impl<const A: usize> Alignment for ConstAlign<A> {
}
/// Alignment at run time.
#[derive(Debug, Clone, Copy)]
#[derive(Debug)]
pub struct RuntimeAlign {
align: usize,
}

View File

@@ -3,10 +3,9 @@ use std::{
sync::Arc,
};
use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign};
use super::{alignment::Alignment, raw::RawAlignedBuffer};
/// An shared, immutable aligned buffer type.
#[derive(Clone, Debug)]
pub struct AlignedBuffer<A: Alignment> {
/// Shared raw buffer.
raw: Arc<RawAlignedBuffer<A>>,
@@ -87,13 +86,6 @@ impl<A: Alignment> AlignedBuffer<A> {
range: begin..end,
}
}
/// Returns the mutable aligned buffer, if the immutable aligned buffer
/// has exactly one strong reference. Otherwise returns `None`.
pub fn into_mut(self) -> Option<AlignedBufferMut<A>> {
let raw = Arc::into_inner(self.raw)?;
Some(AlignedBufferMut::from_raw(raw))
}
}
impl<A: Alignment> Deref for AlignedBuffer<A> {
@@ -116,14 +108,6 @@ impl<A: Alignment> PartialEq<[u8]> for AlignedBuffer<A> {
}
}
impl<const A: usize, const N: usize> From<&[u8; N]> for AlignedBuffer<ConstAlign<A>> {
fn from(value: &[u8; N]) -> Self {
let mut buf = AlignedBufferMut::with_capacity(N);
buf.extend_from_slice(value);
buf.freeze()
}
}
/// SAFETY: the underlying buffer references a stable memory region.
unsafe impl<A: Alignment> tokio_epoll_uring::IoBuf for AlignedBuffer<A> {
fn stable_ptr(&self) -> *const u8 {

View File

@@ -1,7 +1,4 @@
use std::{
mem::MaybeUninit,
ops::{Deref, DerefMut},
};
use std::ops::{Deref, DerefMut};
use super::{
alignment::{Alignment, ConstAlign},
@@ -49,11 +46,6 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
}
impl<A: Alignment> AlignedBufferMut<A> {
/// Constructs a mutable aligned buffer from raw.
pub(super) fn from_raw(raw: RawAlignedBuffer<A>) -> Self {
AlignedBufferMut { raw }
}
/// Returns the total number of bytes the buffer can hold.
#[inline]
pub fn capacity(&self) -> usize {
@@ -136,39 +128,6 @@ impl<A: Alignment> AlignedBufferMut<A> {
let len = self.len();
AlignedBuffer::from_raw(self.raw, 0..len)
}
/// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed.
#[inline]
pub fn extend_from_slice(&mut self, extend: &[u8]) {
let cnt = extend.len();
self.reserve(cnt);
// SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy.
unsafe {
let dst = self.spare_capacity_mut();
// Reserved above
debug_assert!(dst.len() >= cnt);
core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt);
}
// SAFETY: We do have at least `cnt` bytes remaining before advance.
unsafe {
bytes::BufMut::advance_mut(self, cnt);
}
}
/// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit<u8>`.
#[inline]
fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit<u8>] {
// SAFETY: we guarantees that the `Self::capacity()` bytes from
// `Self::as_mut_ptr()` are allocated.
unsafe {
let ptr = self.as_mut_ptr().add(self.len());
let len = self.capacity() - self.len();
core::slice::from_raw_parts_mut(ptr.cast(), len)
}
}
}
impl<A: Alignment> Deref for AlignedBufferMut<A> {

View File

@@ -1,15 +1,9 @@
use tokio_epoll_uring::{IoBuf, IoBufMut};
use tokio_epoll_uring::IoBufMut;
use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf};
use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf};
/// A marker trait for a mutable aligned buffer type.
pub trait IoBufAlignedMut: IoBufMut {}
/// A marker trait for an aligned buffer type.
pub trait IoBufAligned: IoBuf {}
impl IoBufAlignedMut for IoBufferMut {}
impl IoBufAligned for IoBuffer {}
impl IoBufAlignedMut for PageWriteGuardBuf {}

View File

@@ -5,8 +5,6 @@ use bytes::{Bytes, BytesMut};
use std::ops::{Deref, Range};
use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
use super::write::CheapCloneForRead;
/// The true owned equivalent for Rust [`slice`]. Use this for the write path.
///
/// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`,
@@ -45,18 +43,6 @@ where
}
}
impl<B> CheapCloneForRead for FullSlice<B>
where
B: IoBuf + CheapCloneForRead,
{
fn cheap_clone(&self) -> Self {
let bounds = self.slice.bounds();
let clone = self.slice.get_ref().cheap_clone();
let slice = clone.slice(bounds);
Self { slice }
}
}
pub(crate) trait IoBufExt {
/// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`.
fn slice_len(self) -> FullSlice<Self>

View File

@@ -0,0 +1,50 @@
use crate::{
context::RequestContext,
virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter},
};
use tokio_epoll_uring::IoBuf;
pub struct Writer<W> {
dst: W,
bytes_amount: u64,
}
impl<W> Writer<W> {
pub fn new(dst: W) -> Self {
Self {
dst,
bytes_amount: 0,
}
}
pub fn bytes_written(&self) -> u64 {
self.bytes_amount
}
pub fn as_inner(&self) -> &W {
&self.dst
}
/// Returns the wrapped `VirtualFile` object as well as the number
/// of bytes that were written to it through this object.
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub fn into_inner(self) -> (u64, W) {
(self.bytes_amount, self.dst)
}
}
impl<W> OwnedAsyncWriter for Writer<W>
where
W: OwnedAsyncWriter,
{
#[inline(always)]
async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
ctx: &RequestContext,
) -> std::io::Result<(usize, FullSlice<Buf>)> {
let (nwritten, buf) = self.dst.write_all(buf, ctx).await?;
self.bytes_amount += u64::try_from(nwritten).unwrap();
Ok((nwritten, buf))
}
}

View File

@@ -1,88 +1,55 @@
mod flush;
use std::sync::Arc;
use flush::FlushHandle;
use bytes::BytesMut;
use tokio_epoll_uring::IoBuf;
use crate::{
context::RequestContext,
virtual_file::{IoBuffer, IoBufferMut},
};
use crate::context::RequestContext;
use super::{
io_buf_aligned::IoBufAligned,
io_buf_ext::{FullSlice, IoBufExt},
};
pub(crate) use flush::FlushControl;
pub(crate) trait CheapCloneForRead {
/// Returns a cheap clone of the buffer.
fn cheap_clone(&self) -> Self;
}
impl CheapCloneForRead for IoBuffer {
fn cheap_clone(&self) -> Self {
// Cheap clone over an `Arc`.
self.clone()
}
}
use super::io_buf_ext::{FullSlice, IoBufExt};
/// A trait for doing owned-buffer write IO.
/// Think [`tokio::io::AsyncWrite`] but with owned buffers.
/// The owned buffers need to be aligned due to Direct IO requirements.
pub trait OwnedAsyncWriter {
fn write_all_at<Buf: IoBufAligned + Send>(
&self,
async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
offset: u64,
ctx: &RequestContext,
) -> impl std::future::Future<Output = std::io::Result<FullSlice<Buf>>> + Send;
) -> std::io::Result<(usize, FullSlice<Buf>)>;
}
/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
/// small writes into larger writes of size [`Buffer::cap`].
// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput,
// since we would avoid copying majority of the data into the internal buffer.
pub struct BufferedWriter<B: Buffer, W> {
writer: Arc<W>,
///
/// # Passthrough Of Large Writers
///
/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
/// cause the internal buffer to be flushed prematurely so that the large
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
///
/// This pass-through is generally beneficial for throughput, but if
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
/// unlimited large writes may cause latency or fairness issues.
///
/// In such cases, a different implementation that always buffers in memory
/// may be preferable.
pub struct BufferedWriter<B, W> {
writer: W,
/// invariant: always remains Some(buf) except
/// - while IO is ongoing => goes back to Some() once the IO completed successfully
/// - after an IO error => stays `None` forever
///
/// In these exceptional cases, it's `None`.
mutable: Option<B>,
/// A handle to the background flush task for writting data to disk.
flush_handle: FlushHandle<B::IoBuf, W>,
/// The number of bytes submitted to the background task.
bytes_submitted: u64,
buf: Option<B>,
}
impl<B, Buf, W> BufferedWriter<B, W>
where
B: Buffer<IoBuf = Buf> + Send + 'static,
Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
B: Buffer<IoBuf = Buf> + Send,
Buf: IoBuf + Send,
W: OwnedAsyncWriter,
{
/// Creates a new buffered writer.
///
/// The `buf_new` function provides a way to initialize the owned buffers used by this writer.
pub fn new(
writer: Arc<W>,
buf_new: impl Fn() -> B,
gate_guard: utils::sync::gate::GateGuard,
ctx: &RequestContext,
) -> Self {
pub fn new(writer: W, buf: B) -> Self {
Self {
writer: writer.clone(),
mutable: Some(buf_new()),
flush_handle: FlushHandle::spawn_new(
writer,
buf_new(),
gate_guard,
ctx.attached_child(),
),
bytes_submitted: 0,
writer,
buf: Some(buf),
}
}
@@ -90,71 +57,87 @@ where
&self.writer
}
/// Returns the number of bytes submitted to the background flush task.
pub fn bytes_submitted(&self) -> u64 {
self.bytes_submitted
}
/// Panics if used after any of the write paths returned an error
pub fn inspect_mutable(&self) -> &B {
self.mutable()
}
/// Gets a reference to the maybe flushed read-only buffer.
/// Returns `None` if the writer has not submitted any flush request.
pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice<Buf>> {
self.flush_handle.maybe_flushed.as_ref()
pub fn inspect_buffer(&self) -> &B {
self.buf()
}
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn flush_and_into_inner(
mut self,
ctx: &RequestContext,
) -> std::io::Result<(u64, Arc<W>)> {
pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result<W> {
self.flush(ctx).await?;
let Self {
mutable: buf,
writer,
mut flush_handle,
bytes_submitted: bytes_amount,
} = self;
flush_handle.shutdown().await?;
let Self { buf, writer } = self;
assert!(buf.is_some());
Ok((bytes_amount, writer))
Ok(writer)
}
/// Gets a reference to the mutable in-memory buffer.
#[inline(always)]
fn mutable(&self) -> &B {
self.mutable
fn buf(&self) -> &B {
self.buf
.as_ref()
.expect("must not use after we returned an error")
}
/// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub async fn write_buffered_borrowed(
pub async fn write_buffered<S: IoBuf + Send>(
&mut self,
chunk: &[u8],
chunk: FullSlice<S>,
ctx: &RequestContext,
) -> std::io::Result<usize> {
let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?;
if let Some(control) = control {
control.release().await;
) -> std::io::Result<(usize, FullSlice<S>)> {
let chunk = chunk.into_raw_slice();
let chunk_len = chunk.len();
// avoid memcpy for the middle of the chunk
if chunk.len() >= self.buf().cap() {
self.flush(ctx).await?;
// do a big write, bypassing `buf`
assert_eq!(
self.buf
.as_ref()
.expect("must not use after an error")
.pending(),
0
);
let (nwritten, chunk) = self
.writer
.write_all(FullSlice::must_new(chunk), ctx)
.await?;
assert_eq!(nwritten, chunk_len);
return Ok((nwritten, chunk));
}
Ok(len)
// in-memory copy the < BUFFER_SIZED tail of the chunk
assert!(chunk.len() < self.buf().cap());
let mut slice = &chunk[..];
while !slice.is_empty() {
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let have = slice.len();
let n = std::cmp::min(need, have);
buf.extend_from_slice(&slice[..n]);
slice = &slice[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
self.flush(ctx).await?;
}
}
assert!(slice.is_empty(), "by now we should have drained the chunk");
Ok((chunk_len, FullSlice::must_new(chunk)))
}
/// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior.
pub(crate) async fn write_buffered_borrowed_controlled(
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
///
/// It is less performant because we always have to copy the borrowed data into the internal buffer
/// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
/// for large writes.
pub async fn write_buffered_borrowed(
&mut self,
mut chunk: &[u8],
ctx: &RequestContext,
) -> std::io::Result<(usize, Option<FlushControl>)> {
) -> std::io::Result<usize> {
let chunk_len = chunk.len();
let mut control: Option<FlushControl> = None;
while !chunk.is_empty() {
let buf = self.mutable.as_mut().expect("must not use after an error");
let buf = self.buf.as_mut().expect("must not use after an error");
let need = buf.cap() - buf.pending();
let have = chunk.len();
let n = std::cmp::min(need, have);
@@ -162,27 +145,26 @@ where
chunk = &chunk[n..];
if buf.pending() >= buf.cap() {
assert_eq!(buf.pending(), buf.cap());
if let Some(control) = control.take() {
control.release().await;
}
control = self.flush(ctx).await?;
self.flush(ctx).await?;
}
}
Ok((chunk_len, control))
Ok(chunk_len)
}
#[must_use = "caller must explcitly check the flush control"]
async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result<Option<FlushControl>> {
let buf = self.mutable.take().expect("must not use after an error");
async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> {
let buf = self.buf.take().expect("must not use after an error");
let buf_len = buf.pending();
if buf_len == 0 {
self.mutable = Some(buf);
return Ok(None);
self.buf = Some(buf);
return Ok(());
}
let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?;
self.bytes_submitted += u64::try_from(buf_len).unwrap();
self.mutable = Some(recycled);
Ok(Some(flush_control))
let slice = buf.flush();
let (nwritten, slice) = self.writer.write_all(slice, ctx).await?;
assert_eq!(nwritten, buf_len);
self.buf = Some(Buffer::reuse_after_flush(
slice.into_raw_slice().into_inner(),
));
Ok(())
}
}
@@ -210,77 +192,64 @@ pub trait Buffer {
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
}
impl Buffer for IoBufferMut {
type IoBuf = IoBuffer;
impl Buffer for BytesMut {
type IoBuf = BytesMut;
#[inline(always)]
fn cap(&self) -> usize {
self.capacity()
}
fn extend_from_slice(&mut self, other: &[u8]) {
if self.len() + other.len() > self.cap() {
panic!("Buffer capacity exceeded");
}
IoBufferMut::extend_from_slice(self, other);
BytesMut::extend_from_slice(self, other)
}
#[inline(always)]
fn pending(&self) -> usize {
self.len()
}
fn flush(self) -> FullSlice<Self::IoBuf> {
self.freeze().slice_len()
fn flush(self) -> FullSlice<BytesMut> {
self.slice_len()
}
/// Caller should make sure that `iobuf` only have one strong reference before invoking this method.
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
let mut recycled = iobuf
.into_mut()
.expect("buffer should only have one strong reference");
recycled.clear();
recycled
fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
iobuf.clear();
iobuf
}
}
impl OwnedAsyncWriter for Vec<u8> {
async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
_: &RequestContext,
) -> std::io::Result<(usize, FullSlice<Buf>)> {
self.extend_from_slice(&buf[..]);
Ok((buf.len(), buf))
}
}
#[cfg(test)]
mod tests {
use std::sync::Mutex;
use bytes::BytesMut;
use super::*;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;
#[derive(Default, Debug)]
#[derive(Default)]
struct RecorderWriter {
/// record bytes and write offsets.
writes: Mutex<Vec<(Vec<u8>, u64)>>,
writes: Vec<Vec<u8>>,
}
impl RecorderWriter {
/// Gets recorded bytes and write offsets.
fn get_writes(&self) -> Vec<Vec<u8>> {
self.writes
.lock()
.unwrap()
.iter()
.map(|(buf, _)| buf.clone())
.collect()
}
}
impl OwnedAsyncWriter for RecorderWriter {
async fn write_all_at<Buf: IoBufAligned + Send>(
&self,
async fn write_all<Buf: IoBuf + Send>(
&mut self,
buf: FullSlice<Buf>,
offset: u64,
_: &RequestContext,
) -> std::io::Result<FullSlice<Buf>> {
self.writes
.lock()
.unwrap()
.push((Vec::from(&buf[..]), offset));
Ok(buf)
) -> std::io::Result<(usize, FullSlice<Buf>)> {
self.writes.push(Vec::from(&buf[..]));
Ok((buf.len(), buf))
}
}
@@ -288,21 +257,71 @@ mod tests {
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
macro_rules! write {
($writer:ident, $data:literal) => {{
$writer
.write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx())
.await?;
}};
}
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> {
async fn test_buffered_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
write!(writer, b"a");
write!(writer, b"b");
write!(writer, b"c");
write!(writer, b"d");
write!(writer, b"e");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")]
);
Ok(())
}
#[tokio::test]
async fn test_passthrough_writes_only() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
write!(writer, b"abc");
write!(writer, b"de");
write!(writer, b"");
write!(writer, b"fghijk");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")]
);
Ok(())
}
#[tokio::test]
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
write!(writer, b"a");
write!(writer, b"bc");
write!(writer, b"d");
write!(writer, b"e");
let recorder = writer.flush_and_into_inner(&test_ctx()).await?;
assert_eq!(
recorder.writes,
vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")]
);
Ok(())
}
#[tokio::test]
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
let ctx = test_ctx();
let ctx = &ctx;
let recorder = Arc::new(RecorderWriter::default());
let gate = utils::sync::gate::Gate::default();
let mut writer = BufferedWriter::<_, RecorderWriter>::new(
recorder,
|| IoBufferMut::with_capacity(2),
gate.enter()?,
ctx,
);
let recorder = RecorderWriter::default();
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
writer.write_buffered_borrowed(b"abc", ctx).await?;
writer.write_buffered_borrowed(b"", ctx).await?;
writer.write_buffered_borrowed(b"d", ctx).await?;
writer.write_buffered_borrowed(b"e", ctx).await?;
writer.write_buffered_borrowed(b"fg", ctx).await?;
@@ -310,9 +329,9 @@ mod tests {
writer.write_buffered_borrowed(b"j", ctx).await?;
writer.write_buffered_borrowed(b"klmno", ctx).await?;
let (_, recorder) = writer.flush_and_into_inner(ctx).await?;
let recorder = writer.flush_and_into_inner(ctx).await?;
assert_eq!(
recorder.get_writes(),
recorder.writes,
{
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
expect

View File

@@ -1,314 +0,0 @@
use std::sync::Arc;
use utils::sync::duplex;
use crate::{
context::RequestContext,
virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice},
};
use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter};
/// A handle to the flush task.
pub struct FlushHandle<Buf, W> {
inner: Option<FlushHandleInner<Buf, W>>,
/// Immutable buffer for serving tail reads.
/// `None` if no flush request has been submitted.
pub(super) maybe_flushed: Option<FullSlice<Buf>>,
}
pub struct FlushHandleInner<Buf, W> {
/// A bi-directional channel that sends (buffer, offset) for writes,
/// and receives recyled buffer.
channel: duplex::mpsc::Duplex<FlushRequest<Buf>, FullSlice<Buf>>,
/// Join handle for the background flush task.
join_handle: tokio::task::JoinHandle<std::io::Result<Arc<W>>>,
}
struct FlushRequest<Buf> {
slice: FullSlice<Buf>,
offset: u64,
#[cfg(test)]
ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>,
#[cfg(test)]
done_flush_tx: tokio::sync::oneshot::Sender<()>,
}
/// Constructs a request and a control object for a new flush operation.
#[cfg(not(test))]
fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
let request = FlushRequest { slice, offset };
let control = FlushControl::untracked();
(request, control)
}
/// Constructs a request and a control object for a new flush operation.
#[cfg(test)]
fn new_flush_op<Buf>(slice: FullSlice<Buf>, offset: u64) -> (FlushRequest<Buf>, FlushControl) {
let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel();
let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel();
let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx);
let request = FlushRequest {
slice,
offset,
ready_to_flush_rx,
done_flush_tx,
};
(request, control)
}
/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior.
#[cfg(test)]
pub(crate) struct FlushControl {
not_started: FlushNotStarted,
}
#[cfg(not(test))]
pub(crate) struct FlushControl;
impl FlushControl {
#[cfg(test)]
fn not_started(
ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
done_flush_rx: tokio::sync::oneshot::Receiver<()>,
) -> Self {
FlushControl {
not_started: FlushNotStarted {
ready_to_flush_tx,
done_flush_rx,
},
}
}
#[cfg(not(test))]
fn untracked() -> Self {
FlushControl
}
/// In tests, turn flush control into a not started state.
#[cfg(test)]
pub(crate) fn into_not_started(self) -> FlushNotStarted {
self.not_started
}
/// Release control to the submitted buffer.
///
/// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution.
pub async fn release(self) {
#[cfg(test)]
{
self.not_started
.ready_to_flush()
.wait_until_flush_is_done()
.await;
}
}
}
impl<Buf, W> FlushHandle<Buf, W>
where
Buf: IoBufAligned + Send + Sync + CheapCloneForRead,
W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug,
{
/// Spawns a new background flush task and obtains a handle.
///
/// Note: The background task so we do not need to explicitly maintain a queue of buffers.
pub fn spawn_new<B>(
file: Arc<W>,
buf: B,
gate_guard: utils::sync::gate::GateGuard,
ctx: RequestContext,
) -> Self
where
B: Buffer<IoBuf = Buf> + Send + 'static,
{
// It is fine to buffer up to only 1 message. We only 1 message in-flight at a time.
let (front, back) = duplex::mpsc::channel(1);
let join_handle = tokio::spawn(async move {
FlushBackgroundTask::new(back, file, gate_guard, ctx)
.run(buf.flush())
.await
});
FlushHandle {
inner: Some(FlushHandleInner {
channel: front,
join_handle,
}),
maybe_flushed: None,
}
}
/// Submits a buffer to be flushed in the background task.
/// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged.
/// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise
/// clear `maybe_flushed`.
pub async fn flush<B>(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)>
where
B: Buffer<IoBuf = Buf> + Send + 'static,
{
let slice = buf.flush();
// Saves a buffer for read while flushing. This also removes reference to the old buffer.
self.maybe_flushed = Some(slice.cheap_clone());
let (request, flush_control) = new_flush_op(slice, offset);
// Submits the buffer to the background task.
let submit = self.inner_mut().channel.send(request).await;
if submit.is_err() {
return self.handle_error().await;
}
// Wait for an available buffer from the background flush task.
// This is the BACKPRESSURE mechanism: if the flush task can't keep up,
// then the write path will eventually wait for it here.
let Some(recycled) = self.inner_mut().channel.recv().await else {
return self.handle_error().await;
};
// The only other place that could hold a reference to the recycled buffer
// is in `Self::maybe_flushed`, but we have already replace it with the new buffer.
let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner());
Ok((recycled, flush_control))
}
async fn handle_error<T>(&mut self) -> std::io::Result<T> {
Err(self
.shutdown()
.await
.expect_err("flush task only disconnects duplex if it exits with an error"))
}
/// Cleans up the channel, join the flush task.
pub async fn shutdown(&mut self) -> std::io::Result<Arc<W>> {
let handle = self
.inner
.take()
.expect("must not use after we returned an error");
drop(handle.channel.tx);
handle.join_handle.await.unwrap()
}
/// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`.
/// This only happens if the handle is used after an error.
fn inner_mut(&mut self) -> &mut FlushHandleInner<Buf, W> {
self.inner
.as_mut()
.expect("must not use after we returned an error")
}
}
/// A background task for flushing data to disk.
pub struct FlushBackgroundTask<Buf, W> {
/// A bi-directional channel that receives (buffer, offset) for writes,
/// and send back recycled buffer.
channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
/// A writter for persisting data to disk.
writer: Arc<W>,
ctx: RequestContext,
/// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk.
_gate_guard: utils::sync::gate::GateGuard,
}
impl<Buf, W> FlushBackgroundTask<Buf, W>
where
Buf: IoBufAligned + Send + Sync,
W: OwnedAsyncWriter + Sync + 'static,
{
/// Creates a new background flush task.
fn new(
channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
file: Arc<W>,
gate_guard: utils::sync::gate::GateGuard,
ctx: RequestContext,
) -> Self {
FlushBackgroundTask {
channel,
writer: file,
_gate_guard: gate_guard,
ctx,
}
}
/// Runs the background flush task.
/// The passed in slice is immediately sent back to the flush handle through the duplex channel.
async fn run(mut self, slice: FullSlice<Buf>) -> std::io::Result<Arc<W>> {
// Sends the extra buffer back to the handle.
self.channel.send(slice).await.map_err(|_| {
std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early")
})?;
// Exit condition: channel is closed and there is no remaining buffer to be flushed
while let Some(request) = self.channel.recv().await {
#[cfg(test)]
{
// In test, wait for control to signal that we are ready to flush.
if request.ready_to_flush_rx.await.is_err() {
tracing::debug!("control dropped");
}
}
// Write slice to disk at `offset`.
let slice = self
.writer
.write_all_at(request.slice, request.offset, &self.ctx)
.await?;
#[cfg(test)]
{
// In test, tell control we are done flushing buffer.
if request.done_flush_tx.send(()).is_err() {
tracing::debug!("control dropped");
}
}
// Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer.
if self.channel.send(slice).await.is_err() {
// Although channel is closed. Still need to finish flushing the remaining buffers.
continue;
}
}
Ok(self.writer)
}
}
#[cfg(test)]
pub(crate) struct FlushNotStarted {
ready_to_flush_tx: tokio::sync::oneshot::Sender<()>,
done_flush_rx: tokio::sync::oneshot::Receiver<()>,
}
#[cfg(test)]
pub(crate) struct FlushInProgress {
done_flush_rx: tokio::sync::oneshot::Receiver<()>,
}
#[cfg(test)]
pub(crate) struct FlushDone;
#[cfg(test)]
impl FlushNotStarted {
/// Signals the background task the buffer is ready to flush to disk.
pub fn ready_to_flush(self) -> FlushInProgress {
self.ready_to_flush_tx
.send(())
.map(|_| FlushInProgress {
done_flush_rx: self.done_flush_rx,
})
.unwrap()
}
}
#[cfg(test)]
impl FlushInProgress {
/// Waits until background flush is done.
pub async fn wait_until_flush_is_done(self) -> FlushDone {
self.done_flush_rx.await.unwrap();
FlushDone
}
}

View File

@@ -582,21 +582,18 @@ impl WalIngest {
forknum: FSM_FORKNUM,
};
// Zero out the last remaining FSM page, if this shard owns it. We are not precise here,
// and instead of digging in the FSM bitmap format we just clear the whole page.
let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE;
let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no);
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0
&& self
.shard
.is_key_local(&rel_block_to_key(rel, fsm_physical_page_no))
{
if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 {
// Tail of last remaining FSM page has to be zeroed.
// We are not precise here and instead of digging in FSM bitmap format just clear the whole page.
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
// Truncate this shard's view of the FSM relation size, if it even has one.
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
.await?;
}
@@ -620,7 +617,7 @@ impl WalIngest {
// tail bits in the last remaining map page, representing truncated heap
// blocks, need to be cleared. This is not only tidy, but also necessary
// because we don't get a chance to clear the bits if the heap is extended
// again. Only do this on the shard that owns the page.
// again.
if (trunc_byte != 0 || trunc_offs != 0)
&& self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no))
{
@@ -634,9 +631,10 @@ impl WalIngest {
)?;
vm_page_no += 1;
}
// Truncate this shard's view of the VM relation size, if it even has one.
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
.await?;
}

View File

@@ -610,9 +610,6 @@ prefetch_read(PrefetchRequest *slot)
{
NeonResponse *response;
MemoryContext old;
BufferTag buftag;
shardno_t shard_no;
uint64 my_ring_index;
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->response == NULL);
@@ -626,29 +623,11 @@ prefetch_read(PrefetchRequest *slot)
slot->status, slot->response,
(long)slot->my_ring_index, (long)MyPState->ring_receive);
/*
* Copy the request info so that if an error happens and the prefetch
* queue is flushed during the receive call, we can print the original
* values in the error message
*/
buftag = slot->buftag;
shard_no = slot->shard_no;
my_ring_index = slot->my_ring_index;
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive(shard_no);
response = (NeonResponse *) page_server->receive(slot->shard_no);
MemoryContextSwitchTo(old);
if (response)
{
/* The slot should still be valid */
if (slot->status != PRFS_REQUESTED ||
slot->response != NULL ||
slot->my_ring_index != MyPState->ring_receive)
neon_shard_log(shard_no, ERROR,
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
slot->status, slot->response,
(long) slot->my_ring_index, (long) MyPState->ring_receive);
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
@@ -663,15 +642,11 @@ prefetch_read(PrefetchRequest *slot)
}
else
{
/*
* Note: The slot might no longer be valid, if the connection was lost
* and the prefetch queue was flushed during the receive call
*/
neon_shard_log(shard_no, LOG,
neon_shard_log(slot->shard_no, LOG,
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
(long) my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
buftag.forkNum, buftag.blockNum);
(long)slot->my_ring_index,
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
slot->buftag.forkNum, slot->buftag.blockNum);
return false;
}
}

View File

@@ -70,10 +70,6 @@ impl std::fmt::Display for Backend<'_, ()> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::ControlPlane(api, ()) => match &**api {
ControlPlaneClient::ProxyV1(endpoint) => fmt
.debug_tuple("ControlPlane::ProxyV1")
.field(&endpoint.url())
.finish(),
ControlPlaneClient::Neon(endpoint) => fmt
.debug_tuple("ControlPlane::Neon")
.field(&endpoint.url())

View File

@@ -46,9 +46,6 @@ enum AuthBackendType {
#[value(name("console"), alias("cplane"))]
ControlPlane,
#[value(name("cplane-v1"), alias("control-plane"))]
ControlPlaneV1,
#[value(name("link"), alias("control-redirect"))]
ConsoleRedirect,
@@ -521,39 +518,6 @@ async fn main() -> anyhow::Result<()> {
.instrument(span),
);
}
} else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api {
match (redis_notifications_client, regional_redis_client.clone()) {
(None, None) => {}
(client1, client2) => {
let cache = api.caches.project_info.clone();
if let Some(client) = client1 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
if let Some(client) = client2 {
maintenance_tasks.spawn(notifications::task_main(
client,
cache.clone(),
cancel_map.clone(),
args.region.clone(),
));
}
maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
}
}
if let Some(regional_redis_client) = regional_redis_client {
let cache = api.caches.endpoints_cache.clone();
let con = regional_redis_client;
let span = tracing::info_span!("endpoints_cache");
maintenance_tasks.spawn(
async move { cache.do_read(con, cancellation_token.clone()).await }
.instrument(span),
);
}
}
}
@@ -698,65 +662,6 @@ fn build_auth_backend(
args: &ProxyCliArgs,
) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
match &args.auth_backend {
AuthBackendType::ControlPlaneV1 => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
args.project_info_cache.parse()?;
let endpoint_cache_config: config::EndpointCacheConfig =
args.endpoint_cache_config.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!(
"Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
);
info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
wake_compute_cache_config,
project_info_cache_config,
endpoint_cache_config,
)));
let config::ConcurrencyLockOptions {
shards,
limiter,
epoch,
timeout,
} = args.wake_compute_lock.parse()?;
info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)");
let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new(
"wake_compute_lock",
limiter,
shards,
timeout,
epoch,
&Metrics::get().wake_compute_lock,
)?));
tokio::spawn(locks.garbage_collect_worker());
let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new(
endpoint,
args.control_plane_token.clone(),
caches,
locks,
wake_compute_endpoint_rate_limiter,
);
let api = control_plane::client::ControlPlaneClient::ProxyV1(api);
let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ());
let config = Box::leak(Box::new(auth_backend));
Ok(Either::Left(config))
}
AuthBackendType::ControlPlane => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let project_info_cache_config: ProjectInfoCacheOptions =
@@ -792,15 +697,13 @@ fn build_auth_backend(
)?));
tokio::spawn(locks.garbage_collect_worker());
let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?;
let url = args.auth_endpoint.parse()?;
let endpoint = http::Endpoint::new(url, http::new_client());
let mut wake_compute_rps_limit = args.wake_compute_limit.clone();
RateBucketInfo::validate(&mut wake_compute_rps_limit)?;
let wake_compute_endpoint_rate_limiter =
Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit));
let api = control_plane::client::neon::NeonControlPlaneClient::new(
endpoint,
args.control_plane_token.clone(),

View File

@@ -1,514 +0,0 @@
//! Production console backend.
use std::sync::Arc;
use std::time::Duration;
use ::http::header::AUTHORIZATION;
use ::http::HeaderName;
use futures::TryFutureExt;
use postgres_client::config::SslMode;
use tokio::time::Instant;
use tracing::{debug, info, info_span, warn, Instrument};
use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute};
use crate::auth::backend::jwt::AuthRule;
use crate::auth::backend::ComputeUserInfo;
use crate::cache::Cached;
use crate::context::RequestContext;
use crate::control_plane::caches::ApiCaches;
use crate::control_plane::errors::{
ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
};
use crate::control_plane::locks::ApiLocks;
use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
use crate::control_plane::{
AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo,
};
use crate::metrics::{CacheOutcome, Metrics};
use crate::rate_limiter::WakeComputeRateLimiter;
use crate::types::{EndpointCacheKey, EndpointId};
use crate::{compute, http, scram};
const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id");
#[derive(Clone)]
pub struct NeonControlPlaneClient {
endpoint: http::Endpoint,
pub caches: &'static ApiCaches,
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
// put in a shared ref so we don't copy secrets all over in memory
jwt: Arc<str>,
}
impl NeonControlPlaneClient {
/// Construct an API object containing the auth parameters.
pub fn new(
endpoint: http::Endpoint,
jwt: Arc<str>,
caches: &'static ApiCaches,
locks: &'static ApiLocks<EndpointCacheKey>,
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
) -> Self {
Self {
endpoint,
caches,
locks,
wake_compute_endpoint_rate_limiter,
jwt,
}
}
pub(crate) fn url(&self) -> &str {
self.endpoint.url().as_str()
}
async fn do_get_auth_info(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
if !self
.caches
.endpoints_cache
.is_valid(ctx, &user_info.endpoint.normalize())
{
// TODO: refactor this because it's weird
// this is a failure to authenticate but we return Ok.
info!("endpoint is not valid, skipping the request");
return Ok(AuthInfo::default());
}
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
async {
let request = self
.endpoint
.get_path("get_endpoint_access_control")
.header(X_REQUEST_ID, &request_id)
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[
("application_name", application_name.as_str()),
("endpointish", user_info.endpoint.as_str()),
("role", user_info.user.as_str()),
])
.build()?;
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
let body = match parse_body::<GetEndpointAccessControl>(response).await {
Ok(body) => body,
// Error 404 is special: it's ok not to have a secret.
// TODO(anna): retry
Err(e) => {
return if e.get_reason().is_not_found() {
// TODO: refactor this because it's weird
// this is a failure to authenticate but we return Ok.
Ok(AuthInfo::default())
} else {
Err(e.into())
};
}
};
// Ivan: don't know where it will be used, so I leave it here
let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default();
let secret = if body.role_secret.is_empty() {
None
} else {
let secret = scram::ServerSecret::parse(&body.role_secret)
.map(AuthSecret::Scram)
.ok_or(GetAuthInfoError::BadSecret)?;
Some(secret)
};
let allowed_ips = body.allowed_ips.unwrap_or_default();
Metrics::get()
.proxy
.allowed_ips_number
.observe(allowed_ips.len() as f64);
Ok(AuthInfo {
secret,
allowed_ips,
project_id: body.project_id,
})
}
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_get_auth_info"))
.await
}
async fn do_get_endpoint_jwks(
&self,
ctx: &RequestContext,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
if !self
.caches
.endpoints_cache
.is_valid(ctx, &endpoint.normalize())
{
return Err(GetEndpointJwksError::EndpointNotFound);
}
let request_id = ctx.session_id().to_string();
async {
let request = self
.endpoint
.get_with_url(|url| {
url.path_segments_mut()
.push("endpoints")
.push(endpoint.as_str())
.push("jwks");
})
.header(X_REQUEST_ID, &request_id)
.header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.build()
.map_err(GetEndpointJwksError::RequestBuild)?;
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self
.endpoint
.execute(request)
.await
.map_err(GetEndpointJwksError::RequestExecute)?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
let body = parse_body::<EndpointJwksResponse>(response).await?;
let rules = body
.jwks
.into_iter()
.map(|jwks| AuthRule {
id: jwks.id,
jwks_url: jwks.jwks_url,
audience: jwks.jwt_audience,
role_names: jwks.role_names,
})
.collect();
Ok(rules)
}
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_get_endpoint_jwks"))
.await
}
async fn do_wake_compute(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> {
let request_id = ctx.session_id().to_string();
let application_name = ctx.console_application_name();
async {
let mut request_builder = self
.endpoint
.get_path("wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", ctx.session_id())])
.query(&[
("application_name", application_name.as_str()),
("endpointish", user_info.endpoint.as_str()),
]);
let options = user_info.options.to_deep_object();
if !options.is_empty() {
request_builder = request_builder.query(&options);
}
let request = request_builder.build()?;
debug!(url = request.url().as_str(), "sending http request");
let start = Instant::now();
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
let response = self.endpoint.execute(request).await?;
drop(pause);
info!(duration = ?start.elapsed(), "received http response");
let body = parse_body::<WakeCompute>(response).await?;
// Unfortunately, ownership won't let us use `Option::ok_or` here.
let (host, port) = match parse_host_port(&body.address) {
None => return Err(WakeComputeError::BadComputeAddress(body.address)),
Some(x) => x,
};
// Don't set anything but host and port! This config will be cached.
// We'll set username and such later using the startup message.
// TODO: add more type safety (in progress).
let mut config = compute::ConnCfg::new(host.to_owned(), port);
config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
let node = NodeInfo {
config,
aux: body.aux,
allow_self_signed_compute: false,
};
Ok(node)
}
.inspect_err(|e| tracing::debug!(error = ?e))
.instrument(info_span!("do_wake_compute"))
.await
}
}
impl super::ControlPlaneApi for NeonControlPlaneClient {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
let user = &user_info.user;
if let Some(role_secret) = self
.caches
.project_info
.get_role_secret(normalized_ep, user)
{
return Ok(role_secret);
}
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
if let Some(project_id) = auth_info.project_id {
let normalized_ep_int = normalized_ep.into();
self.caches.project_info.insert_role_secret(
project_id,
normalized_ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches.project_info.insert_allowed_ips(
project_id,
normalized_ep_int,
Arc::new(auth_info.allowed_ips),
);
ctx.set_project_id(project_id);
}
// When we just got a secret, we don't need to invalidate it.
Ok(Cached::new_uncached(auth_info.secret))
}
async fn get_allowed_ips_and_secret(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
let normalized_ep = &user_info.endpoint.normalize();
if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
Metrics::get()
.proxy
.allowed_ips_cache_misses
.inc(CacheOutcome::Hit);
return Ok((allowed_ips, None));
}
Metrics::get()
.proxy
.allowed_ips_cache_misses
.inc(CacheOutcome::Miss);
let auth_info = self.do_get_auth_info(ctx, user_info).await?;
let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user;
if let Some(project_id) = auth_info.project_id {
let normalized_ep_int = normalized_ep.into();
self.caches.project_info.insert_role_secret(
project_id,
normalized_ep_int,
user.into(),
auth_info.secret.clone(),
);
self.caches.project_info.insert_allowed_ips(
project_id,
normalized_ep_int,
allowed_ips.clone(),
);
ctx.set_project_id(project_id);
}
Ok((
Cached::new_uncached(allowed_ips),
Some(Cached::new_uncached(auth_info.secret)),
))
}
#[tracing::instrument(skip_all)]
async fn get_endpoint_jwks(
&self,
ctx: &RequestContext,
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
self.do_get_endpoint_jwks(ctx, endpoint).await
}
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
ctx: &RequestContext,
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
let key = user_info.endpoint_cache_key();
macro_rules! check_cache {
() => {
if let Some(cached) = self.caches.node_info.get(&key) {
let (cached, info) = cached.take_value();
let info = info.map_err(|c| {
info!(key = &*key, "found cached wake_compute error");
WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
})?;
debug!(key = &*key, "found cached compute node info");
ctx.set_project(info.aux.clone());
return Ok(cached.map(|()| info));
}
};
}
// Every time we do a wakeup http request, the compute node will stay up
// for some time (highly depends on the console's scale-to-zero policy);
// The connection info remains the same during that period of time,
// which means that we might cache it to reduce the load and latency.
check_cache!();
let permit = self.locks.get_permit(&key).await?;
// after getting back a permit - it's possible the cache was filled
// double check
if permit.should_check_cache() {
// TODO: if there is something in the cache, mark the permit as success.
check_cache!();
}
// check rate limit
if !self
.wake_compute_endpoint_rate_limiter
.check(user_info.endpoint.normalize_intern(), 1)
{
return Err(WakeComputeError::TooManyConnections);
}
let node = permit.release_result(self.do_wake_compute(ctx, user_info).await);
match node {
Ok(node) => {
ctx.set_project(node.aux.clone());
debug!(key = &*key, "created a cache entry for woken compute node");
let mut stored_node = node.clone();
// store the cached node as 'warm_cached'
stored_node.aux.cold_start_info = ColdStartInfo::WarmCached;
let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node));
Ok(cached.map(|()| node))
}
Err(err) => match err {
WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
let Some(status) = &err.status else {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
};
let reason = status
.details
.error_info
.map_or(Reason::Unknown, |x| x.reason);
// if we can retry this error, do not cache it.
if reason.can_retry() {
return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)));
}
// at this point, we should only have quota errors.
debug!(
key = &*key,
"created a cache entry for the wake compute error"
);
self.caches.node_info.insert_ttl(
key,
Err(err.clone()),
Duration::from_secs(30),
);
Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
err,
)))
}
err => return Err(err),
},
}
}
}
/// Parse http response body, taking status code into account.
async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
response: http::Response,
) -> Result<T, ControlPlaneError> {
let status = response.status();
if status.is_success() {
// We shouldn't log raw body because it may contain secrets.
info!("request succeeded, processing the body");
return Ok(response.json().await?);
}
let s = response.bytes().await?;
// Log plaintext to be able to detect, whether there are some cases not covered by the error struct.
info!("response_error plaintext: {:?}", s);
// Don't throw an error here because it's not as important
// as the fact that the request itself has failed.
let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| {
warn!("failed to parse error body: {e}");
ControlPlaneErrorMessage {
error: "reason unclear (malformed error message)".into(),
http_status_code: status,
status: None,
}
});
body.http_status_code = status;
warn!("console responded with an error ({status}): {body:?}");
Err(ControlPlaneError::Message(Box::new(body)))
}
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.rsplit_once(':')?;
let ipv6_brackets: &[_] = &['[', ']'];
Some((host.trim_matches(ipv6_brackets), port.parse().ok()?))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_host_port_v4() {
let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse");
assert_eq!(host, "127.0.0.1");
assert_eq!(port, 5432);
}
#[test]
fn test_parse_host_port_v6() {
let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse");
assert_eq!(host, "2001:db8::1");
assert_eq!(port, 5432);
}
#[test]
fn test_parse_host_port_url() {
let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432")
.expect("failed to parse");
assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local");
assert_eq!(port, 5432);
}
}

View File

@@ -1,4 +1,3 @@
pub mod cplane_proxy_v1;
#[cfg(any(test, feature = "testing"))]
pub mod mock;
pub mod neon;
@@ -28,8 +27,6 @@ use crate::types::EndpointId;
#[non_exhaustive]
#[derive(Clone)]
pub enum ControlPlaneClient {
/// New Proxy V1 control plane API
ProxyV1(cplane_proxy_v1::NeonControlPlaneClient),
/// Current Management API (V2).
Neon(neon::NeonControlPlaneClient),
/// Local mock control plane.
@@ -48,7 +45,6 @@ impl ControlPlaneApi for ControlPlaneClient {
user_info: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError> {
match self {
Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await,
Self::Neon(api) => api.get_role_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await,
@@ -65,7 +61,6 @@ impl ControlPlaneApi for ControlPlaneClient {
user_info: &ComputeUserInfo,
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), errors::GetAuthInfoError> {
match self {
Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await,
@@ -80,7 +75,6 @@ impl ControlPlaneApi for ControlPlaneClient {
endpoint: EndpointId,
) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
match self {
Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await,
Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await,
@@ -95,7 +89,6 @@ impl ControlPlaneApi for ControlPlaneClient {
user_info: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError> {
match self {
Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await,
Self::Neon(api) => api.wake_compute(ctx, user_info).await,
#[cfg(any(test, feature = "testing"))]
Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await,

View File

@@ -1,4 +1,4 @@
//! Stale console backend, remove after migrating to Proxy V1 API (#15245).
//! Production console backend.
use std::sync::Arc;
use std::time::Duration;

View File

@@ -230,16 +230,6 @@ pub(crate) struct GetRoleSecret {
pub(crate) project_id: Option<ProjectIdInt>,
}
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
/// Returned by the `/get_endpoint_access_control` API method.
#[derive(Deserialize)]
pub(crate) struct GetEndpointAccessControl {
pub(crate) role_secret: Box<str>,
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
pub(crate) project_id: Option<ProjectIdInt>,
pub(crate) allowed_vpc_endpoint_ids: Option<Vec<EndpointIdInt>>,
}
// Manually implement debug to omit sensitive info.
impl fmt::Debug for GetRoleSecret {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {

View File

@@ -636,13 +636,6 @@ impl Persistence {
.into_boxed(),
};
// Clear generation_pageserver if we are moving into a state where we won't have
// any attached pageservers.
let input_generation_pageserver = match input_placement_policy {
None | Some(PlacementPolicy::Attached(_)) => None,
Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
};
#[derive(AsChangeset)]
#[diesel(table_name = crate::schema::tenant_shards)]
struct ShardUpdate {
@@ -650,7 +643,6 @@ impl Persistence {
placement_policy: Option<String>,
config: Option<String>,
scheduling_policy: Option<String>,
generation_pageserver: Option<Option<i64>>,
}
let update = ShardUpdate {
@@ -663,7 +655,6 @@ impl Persistence {
.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
generation_pageserver: input_generation_pageserver,
};
query.set(update).execute(conn)?;

View File

@@ -513,9 +513,6 @@ struct ShardUpdate {
/// If this is None, generation is not updated.
generation: Option<Generation>,
/// If this is None, scheduling policy is not updated.
scheduling_policy: Option<ShardSchedulingPolicy>,
}
enum StopReconciliationsReason {
@@ -792,7 +789,7 @@ impl Service {
node_list_futs.push({
async move {
tracing::info!("Scanning shards on node {node}...");
let timeout = Duration::from_secs(5);
let timeout = Duration::from_secs(1);
let response = node
.with_client_retries(
|client| async move { client.list_location_config().await },
@@ -2379,23 +2376,6 @@ impl Service {
}
};
// Ordinarily we do not update scheduling policy, but when making major changes
// like detaching or demoting to secondary-only, we need to force the scheduling
// mode to Active, or the caller's expected outcome (detach it) will not happen.
let scheduling_policy = match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// Special case: when making major changes like detaching or demoting to secondary-only,
// we need to force the scheduling mode to Active, or nothing will happen.
Some(ShardSchedulingPolicy::Active)
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// While attached, continue to respect whatever the existing scheduling mode is.
None
}
};
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
@@ -2421,7 +2401,6 @@ impl Service {
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
scheduling_policy,
});
}
@@ -2518,7 +2497,6 @@ impl Service {
placement_policy,
tenant_config,
generation,
scheduling_policy,
} in &updates
{
self.persistence
@@ -2527,7 +2505,7 @@ impl Service {
Some(placement_policy.clone()),
Some(tenant_config.clone()),
*generation,
*scheduling_policy,
None,
)
.await?;
}
@@ -2543,7 +2521,6 @@ impl Service {
placement_policy,
tenant_config,
generation: update_generation,
scheduling_policy,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
@@ -2562,10 +2539,6 @@ impl Service {
shard.generation = Some(generation);
}
if let Some(scheduling_policy) = scheduling_policy {
shard.set_scheduling_policy(scheduling_policy);
}
shard.schedule(scheduler, &mut schedule_context)?;
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
@@ -3019,17 +2992,9 @@ impl Service {
let TenantPolicyRequest {
placement,
mut scheduling,
scheduling,
} = req;
if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement {
// When someone configures a tenant to detach, we force the scheduling policy to enable
// this to take effect.
if scheduling.is_none() {
scheduling = Some(ShardSchedulingPolicy::Active);
}
}
self.persistence
.update_tenant_shard(
TenantFilter::Tenant(tenant_id),

View File

@@ -268,7 +268,7 @@ impl BucketConfig {
config.bucket_name, config.bucket_region
),
RemoteStorageKind::AzureContainer(config) => format!(
"container {}, storage account {:?}, region {}",
"bucket {}, storage account {:?}, region {}",
config.container_name, config.storage_account, config.container_region
),
}

View File

@@ -152,8 +152,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
"pageserver_resident_physical_size",
"pageserver_io_operations_bytes_total",
"pageserver_last_record_lsn",
"pageserver_disk_consistent_lsn",
"pageserver_projected_remote_consistent_lsn",
"pageserver_standby_horizon",
"pageserver_smgr_query_seconds_bucket",
"pageserver_smgr_query_seconds_count",
@@ -175,8 +173,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),
counter("pageserver_tenant_throttling_count"),
counter("pageserver_timeline_wal_records_received"),
counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
*histogram("pageserver_page_service_batch_size"),
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
# "pageserver_directory_entries_count", -- only used if above a certain threshold

View File

@@ -850,7 +850,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
force_repartition=False,
force_image_layer_creation=False,
force_l0_compaction=False,
wait_until_flushed=True,
wait_until_uploaded=False,
compact: bool | None = None,
**kwargs,
@@ -863,8 +862,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["force_image_layer_creation"] = "true"
if force_l0_compaction:
query["force_l0_compaction"] = "true"
if not wait_until_flushed:
query["wait_until_flushed"] = "false"
if wait_until_uploaded:
query["wait_until_uploaded"] = "true"
@@ -872,7 +869,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
query["compact"] = "true" if compact else "false"
log.info(
f"Requesting checkpoint: tenant={tenant_id} timeline={timeline_id} wait_until_flushed={wait_until_flushed} wait_until_uploaded={wait_until_uploaded} compact={compact}"
f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}"
)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint",

View File

@@ -54,15 +54,23 @@ def wait_for_upload(
tenant: TenantId | TenantShardId,
timeline: TimelineId,
lsn: Lsn,
timeout=20,
):
"""Waits for local timeline upload up to specified LSN"""
"""waits for local timeline upload up to specified lsn"""
def is_uploaded():
remote_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
assert remote_lsn >= lsn, f"remote_consistent_lsn at {remote_lsn}"
wait_until(is_uploaded, name=f"upload to {lsn}", timeout=timeout)
current_lsn = Lsn(0)
for i in range(20):
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
if current_lsn >= lsn:
log.info("wait finished")
return
lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
log.info(
f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
)
time.sleep(1)
raise Exception(
f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}"
)
def _tenant_in_expected_state(tenant_info: dict[str, Any], expected_state: str):

View File

@@ -1,142 +0,0 @@
from __future__ import annotations
import random
from concurrent.futures import ThreadPoolExecutor
import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import s3_storage
@pytest.mark.timeout(900)
@pytest.mark.parametrize("size", [8, 1024, 8192])
@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"])
@pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"])
@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"])
def test_ingest_insert_bulk(
request: pytest.FixtureRequest,
neon_env_builder: NeonEnvBuilder,
zenbenchmark: NeonBenchmarker,
fsync: bool,
backpressure: bool,
s3: bool,
size: int,
):
"""
Benchmarks ingestion of 5 GB of sequential insert WAL. Measures ingestion and S3 upload
separately. Also does a Safekeeper→Pageserver re-ingestion to measure Pageserver ingestion in
isolation.
"""
CONCURRENCY = 1 # 1 is optimal without fsync or backpressure
VOLUME = 5 * 1024**3
rows = VOLUME // (size + 64) # +64 roughly accounts for per-row WAL overhead
neon_env_builder.safekeepers_enable_fsync = fsync
if s3:
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
# NB: don't use S3 for Safekeeper. It doesn't affect throughput (no backpressure), but it
# would compete with Pageserver for bandwidth.
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers
env = neon_env_builder.init_start()
endpoint = env.endpoints.create_start(
"main",
config_lines=[
f"fsync = {fsync}",
"max_replication_apply_lag = 0",
f"max_replication_flush_lag = {'10GB' if backpressure else '0'}",
# NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB.
f"max_replication_write_lag = {'500MB' if backpressure else '0'}",
],
)
endpoint.safe_psql("create extension neon")
# Wait for the timeline to be propagated to the pageserver.
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
# Ingest rows.
log.info("Ingesting data")
start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
def insert_rows(endpoint, table, count, value):
with endpoint.connect().cursor() as cur:
cur.execute("set statement_timeout = 0")
cur.execute(f"create table {table} (id int, data bytea)")
cur.execute(f"insert into {table} values (generate_series(1, {count}), %s)", (value,))
with zenbenchmark.record_duration("upload"):
with zenbenchmark.record_duration("ingest"):
with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
for i in range(CONCURRENCY):
# Write a random value for all rows. This is sufficient to prevent compression,
# e.g. in TOAST. Randomly generating every row is too slow.
value = random.randbytes(size)
worker_rows = rows / CONCURRENCY
pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
# Wait for pageserver to ingest the WAL.
client = env.pageserver.http_client()
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
# Wait for pageserver S3 upload. Checkpoint to flush the last in-memory layer.
client.timeline_checkpoint(
env.initial_tenant,
env.initial_timeline,
compact=False,
wait_until_flushed=False,
)
wait_for_upload(client, env.initial_tenant, env.initial_timeline, end_lsn, timeout=600)
# Empty out upload queue for next benchmark.
wait_for_upload_queue_empty(client, env.initial_tenant, env.initial_timeline)
backpressure_time = endpoint.safe_psql("select backpressure_throttling_time()")[0][0]
# Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
# reingest all the WAL directly from the safekeeper. This gives us a baseline of how fast the
# pageserver can ingest this WAL in isolation.
status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant)
assert status is not None
endpoint.stop() # avoid spurious getpage errors
client.tenant_delete(env.initial_tenant)
env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0])
with zenbenchmark.record_duration("recover"):
log.info("Recovering WAL into pageserver")
client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
# Emit metrics.
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
zenbenchmark.record("row_count", rows, "rows", MetricReport.TEST_PARAM)
zenbenchmark.record("concurrency", CONCURRENCY, "clients", MetricReport.TEST_PARAM)
zenbenchmark.record(
"backpressure_time", backpressure_time // 1000, "ms", MetricReport.LOWER_IS_BETTER
)
props = {p["name"]: p["value"] for _, p in request.node.user_properties}
for name in ("ingest", "upload", "recover"):
throughput = int(wal_written_mb / props[name])
zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER)
# Pageserver shutdown will likely get stuck on the upload queue, just shut it down immediately.
env.stop(immediate=True)

View File

@@ -72,7 +72,7 @@ def test_storage_controller_many_tenants(
we don't fall over for a thousand shards.
"""
neon_env_builder.num_pageservers = 6
neon_env_builder.num_pageservers = 5
neon_env_builder.storage_controller_config = {
# Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts.
# TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to
@@ -84,11 +84,6 @@ def test_storage_controller_many_tenants(
compute_reconfigure_listener.control_plane_compute_hook_api
)
AZS = ["alpha", "bravo", "charlie"]
neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update(
{"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"}
)
# A small sleep on each call into the notify hook, to simulate the latency of doing a database write
compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01))

View File

@@ -15,7 +15,7 @@ from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.workload import Workload
AGGRESSIVE_COMPACTION_TENANT_CONF = {
AGGRESIVE_COMPACTION_TENANT_CONF = {
# Disable gc and compaction. The test runs compaction manually.
"gc_period": "0s",
"compaction_period": "0s",
@@ -24,7 +24,6 @@ AGGRESSIVE_COMPACTION_TENANT_CONF = {
# Compact small layers
"compaction_target_size": 1024**2,
"image_creation_threshold": 2,
# "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later
}
@@ -52,7 +51,7 @@ def test_pageserver_compaction_smoke(
page_cache_size=10
"""
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF)
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -121,28 +120,14 @@ page_cache_size=10
assert vectored_average < 8
@pytest.mark.skip(
"This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114"
)
@skip_in_debug_build("only run with release build")
def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
SMOKE_CONF = {
# Run both gc and gc-compaction.
"gc_period": "5s",
"compaction_period": "5s",
# No PiTR interval and small GC horizon
"pitr_interval": "0s",
"gc_horizon": f"{1024 ** 2}",
"lsn_lease_length": "0s",
}
env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF)
env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF)
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
row_count = 10000
churn_rounds = 50
row_count = 1000
churn_rounds = 10
ps_http = env.pageserver.http_client()
@@ -156,28 +141,20 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
if i % 10 == 0:
log.info(f"Running churn round {i}/{churn_rounds} ...")
workload.churn_rows(row_count, env.pageserver.id)
# Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run.
ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True)
assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1
ps_http.timeline_compact(
tenant_id,
timeline_id,
enhanced_gc_bottom_most_compaction=True,
body={
"scheduled": True,
"sub_compaction": True,
"compact_range": {
"start": "000000000000000000000000000000000000",
# skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
"end": "010000000000000000000000000000000000",
},
"start": "000000000000000000000000000000000000",
"end": "030000000000000000000000000000000000",
},
)
workload.churn_rows(row_count, env.pageserver.id)
# ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked)
env.pageserver.assert_log_contains(
"scheduled_compact_timeline.*picked .* layers for compaction"
)
log.info("Validating at workload end ...")
workload.validate(env.pageserver.id)

View File

@@ -215,7 +215,7 @@ if SQL_EXPORTER is None:
#
# The "host" network mode allows sql_exporter to talk to the
# endpoint which is running on the host.
super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host")
super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host")
self.__logs_dir = logs_dir
self.__port = port

View File

@@ -3230,55 +3230,3 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr
# Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
raise
@run_only_on_default_postgres("Postgres version makes no difference here")
def test_storage_controller_detached_stopped(
neon_env_builder: NeonEnvBuilder,
):
"""
Test that detaching a tenant while it has scheduling policy set to Paused or Stop works
"""
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
neon_env_builder.num_pageservers = 1
env = neon_env_builder.init_configs()
env.start()
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
tenant_id = TenantId.generate()
env.storage_controller.tenant_create(
tenant_id,
shard_count=1,
)
assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
# Disable scheduling: ordinarily this would prevent the tenant's configuration being
# reconciled to pageservers, but this should be overridden when detaching.
env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*")
env.storage_controller.tenant_policy_update(
tenant_id,
{"scheduling": "Stop"},
)
env.storage_controller.consistency_check()
# Detach the tenant
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": None,
},
)
env.storage_controller.consistency_check()
# Confirm the detach happened
assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == []