From decd265c99a160d5d5c37332ef23c2c0898b40f9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 11 Mar 2025 17:18:09 -0500 Subject: [PATCH 01/71] Revert notify to 6.0.0 (#11162) The upgrade to 8.0.0 caused severe performance regressions in the start_postgres_ms metric, which measures the time it takes from execing Postgres to the time Postgres marks itself as ready in the postmaster.pid file. We use the notify crate to watch for changes in the pgdata directory and the postmaster.pid file. Signed-off-by: Tristan Partin --- Cargo.lock | 42 +++++++++++++++++++++------------------ Cargo.toml | 4 +++- workspace_hack/Cargo.toml | 6 ++---- 3 files changed, 28 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 778ff19fec..d023d340d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3254,11 +3254,11 @@ dependencies = [ [[package]] name = "inotify" -version = "0.11.0" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f37dccff2791ab604f9babef0ba14fbe0be30bd368dc541e2b08d07c8aa908f3" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" dependencies = [ - "bitflags 2.8.0", + "bitflags 1.3.2", "inotify-sys", "libc", ] @@ -3732,6 +3732,18 @@ dependencies = [ "adler2", ] +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "log", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.48.0", +] + [[package]] name = "mio" version = "1.0.3" @@ -3739,7 +3751,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] @@ -3817,29 +3828,23 @@ checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" [[package]] name = 
"notify" -version = "8.0.0" +version = "6.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fee8403b3d66ac7b26aee6e40a897d85dc5ce26f44da36b8b73e987cc52e943" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" dependencies = [ "bitflags 2.8.0", + "crossbeam-channel", "filetime", "fsevent-sys", "inotify", "kqueue", "libc", "log", - "mio", - "notify-types", + "mio 0.8.11", "walkdir", - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] -[[package]] -name = "notify-types" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e0826a989adedc2a244799e823aece04662b66609d96af8dff7ac6df9a8925d" - [[package]] name = "ntapi" version = "0.4.1" @@ -4980,7 +4985,7 @@ checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" dependencies = [ "bytes", "heck", - "itertools 0.10.5", + "itertools 0.12.1", "log", "multimap", "once_cell", @@ -5021,7 +5026,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.12.1", "proc-macro2", "quote", "syn 2.0.90", @@ -7123,7 +7128,7 @@ dependencies = [ "backtrace", "bytes", "libc", - "mio", + "mio 1.0.3", "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", @@ -8387,7 +8392,6 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.10.5", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index c59c4c5435..4a32b6d95d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,7 +126,9 @@ measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] } -notify = "8.0.0" +# Do not update to >= 7.0.0, at least. 
The update will have a significant impact +# on compute startup metrics (start_postgres_ms), >= 25% degradation. +notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 4a6ab6e745..183cc66ab9 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,8 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "http1", "http2", "server", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -115,8 +114,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" } -itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } From e8396034acca009716a8431b23aa924efaa057a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 11 Mar 
2025 23:59:30 +0100 Subject: [PATCH 02/71] fix(ci): fail meta using jq halt_error if data is unexpectedly missing (#11151) ## Problem When the GitHub API is having problems, we might not get data back, and are happily setting vars as empty. This causes problems down the line. See https://github.com/neondatabase/neon/actions/runs/13718859397/job/38381946590?pr=11132#step:5:1 for example. ## Summary of changes Fail the `meta` job if we don't get expected data back from GitHub. --- .github/scripts/previous-releases.jq | 6 ++++++ .github/workflows/_meta.yml | 10 +++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/scripts/previous-releases.jq b/.github/scripts/previous-releases.jq index b0b00bce18..51204da099 100644 --- a/.github/scripts/previous-releases.jq +++ b/.github/scripts/previous-releases.jq @@ -17,6 +17,12 @@ ({}; .[$entry.component] |= (if . == null or $entry.version > .version then $entry else . end)) +# Ensure that each component exists, or fail +| (["storage", "compute", "proxy"] - (keys)) as $missing +| if ($missing | length) > 0 then + "Error: Found no release for \($missing | join(", "))!\n" | halt_error(1) + else . 
end + # Convert the resulting object into an array of formatted strings | to_entries | map("\(.key)=\(.value.full)") diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index a3fc125648..cae7fae6a4 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -24,6 +24,10 @@ on: permissions: {} +defaults: + run: + shell: bash -euo pipefail {0} + jobs: tags: runs-on: ubuntu-22.04 @@ -83,7 +87,11 @@ jobs: echo "tag=release-compute-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT ;; pr|storage-rc-pr|compute-rc-pr|proxy-rc-pr) - BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') + BUILD_AND_TEST_RUN_ID=$(gh api --paginate \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=${CURRENT_SHA}&branch=${CURRENT_BRANCH}" \ + | jq '[.workflow_runs[] | select(.name == "Build and Test")][0].id // ("Error: No matching workflow run found." | halt_error(1))') echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT ;; workflow-dispatch) From da2431f11fcf71fbe905f6bc129b6f7902f3d44e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 12 Mar 2025 03:30:56 +0100 Subject: [PATCH 03/71] storcon: add --control-plane-url config option (#11173) Adds the `--control-plane-url` config option to the storcon, which we want to migrate to instead of using `notify-attach`. 
Part of #11163 --- docs/storage_controller.md | 2 +- storage_controller/src/compute_hook.rs | 11 ++++++++++- storage_controller/src/main.rs | 13 ++++++++++--- storage_controller/src/service.rs | 9 +++++++++ 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/docs/storage_controller.md b/docs/storage_controller.md index 6d2ef929a4..cf00cd8e33 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -101,7 +101,7 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver location changes. -The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires +The hook is configured using the storage controller's `--control-plane-url` CLI option. If the hook requires JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. 
In `neon_local` systems diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index b602af362d..5ce4d63d77 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -624,7 +624,16 @@ impl ComputeHook { MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let result = if let Some(notify_url) = &self.config.compute_hook_url { + let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { + Some(if control_plane_url.ends_with('/') { + format!("{control_plane_url}notify-attach") + } else { + format!("{control_plane_url}/notify-attach") + }) + } else { + self.config.compute_hook_url.clone() + }; + let result = if let Some(notify_url) = &compute_hook_url { self.do_notify(notify_url, &request, cancel).await } else { self.do_notify_local(&request).await.map_err(|e| { diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 46ac1cd7ca..6e3c70c42b 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -71,6 +71,10 @@ struct Cli { #[arg(long)] compute_hook_url: Option, + /// URL to control plane storage API prefix + #[arg(long)] + control_plane_url: Option, + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -313,11 +317,13 @@ async fn async_main() -> anyhow::Result<()> { "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" ); } - StrictMode::Strict if args.compute_hook_url.is_none() => { - // Production systems should always have a compute hook set, to prevent falling + StrictMode::Strict + if args.compute_hook_url.is_none() && args.control_plane_url.is_none() => + { + // Production systems should always have a control plane URL set, to prevent falling // back to trying to use neon_local. 
anyhow::bail!( - "`--compute-hook-url` is not set: this is only permitted in `--dev` mode" + "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode" ); } StrictMode::Strict => { @@ -343,6 +349,7 @@ async fn async_main() -> anyhow::Result<()> { control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, compute_hook_url: args.compute_hook_url, + control_plane_url: args.control_plane_url, max_offline_interval: args .max_offline_interval .map(humantime::Duration::into) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a06748abc6..96b67fa81e 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -363,6 +363,15 @@ pub struct Config { /// assume it is running in a test environment and try to update neon_local. pub compute_hook_url: Option, + /// Prefix for storage API endpoints of the control plane. We use this prefix to compute + /// URLs that we use to send pageserver and safekeeper attachment locations. + /// If this is None, the compute hook will assume it is running in a test environment + /// and try to invoke neon_local instead. + /// + /// For now, there is also `compute_hook_url` which allows configuration of the pageserver + /// specific endpoint, but it is in the process of being phased out. + pub control_plane_url: Option, + /// Grace period within which a pageserver does not respond to heartbeats, but is still /// considered active. Once the grace period elapses, the next heartbeat failure will /// mark the pagseserver offline. 
From f60ffe30214ddb7042b5bd2febab663d4d36f6e3 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 12 Mar 2025 07:52:18 +0200 Subject: [PATCH 04/71] Rebase compare local debug mode (#11174) ## Problem DEBUG_COMPARE_LOCAL mode is broken See https://neondb.slack.com/archives/C03QLRH7PPD/p1732862608323269?thread_ts=1732711054.862919&cid=C03QLRH7PPD ## Summary of changes Fix compile errors and unlogged build issues. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 105 +++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 0414661a5f..1135212e22 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -76,6 +76,10 @@ #include "access/xlogrecovery.h" #endif +#if PG_VERSION_NUM < 160000 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + /* * If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API * calls to md.c, and *also* do the calls to the Page Server. 
On every @@ -1803,7 +1807,7 @@ static XLogRecPtr log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, Page page, bool page_std) { - PGAlignedBlock copied_buffer; + PGIOAlignedBlock copied_buffer; memcpy(copied_buffer.data, page, BLCKSZ); return log_newpage(rinfo, forkNum, blkno, copied_buffer.data, page_std); @@ -1820,7 +1824,7 @@ static XLogRecPtr log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, BlockNumber nblocks, Page *pages, bool page_std) { - PGAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; + PGIOAlignedBlock copied_buffer[XLR_MAX_BLOCK_ID]; BlockNumber blknos[XLR_MAX_BLOCK_ID]; Page pageptrs[XLR_MAX_BLOCK_ID]; int nregistered = 0; @@ -1858,7 +1862,7 @@ log_newpages_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno, static bool PageIsEmptyHeapPage(char *buffer) { - PGAlignedBlock empty_page; + PGIOAlignedBlock empty_page; PageInit((Page) empty_page.data, BLCKSZ, 0); @@ -2847,7 +2851,7 @@ static void neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, int nblocks, bool skipFsync) { - const PGAlignedBlock buffer = {0}; + const PGIOAlignedBlock buffer = {0}; int remblocks = nblocks; XLogRecPtr lsn = 0; @@ -3389,15 +3393,16 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + PGIOAlignedBlock mdbuf; + PGIOAlignedBlock mdbuf_masked; + XLogRecPtr request_lsn = request_lsns.request_lsn; - mdread(reln, forkNum, blkno, mdbuf); + mdread(reln, forkNum, blkno, mdbuf.data); memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); + memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - if (PageIsNew((Page) mdbuf)) + if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { @@ -3416,41 +3421,41 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void 
*buffer RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + hexdump_page(mdbuf.data)); } - else if (PageGetSpecialSize(mdbuf) == 0) + else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), forkNum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } @@ -3542,77 +3547,85 @@ 
neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL - if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; - char mdbuf[BLCKSZ]; - char mdbuf_masked[BLCKSZ]; + PGIOAlignedBlock mdbuf; + PGIOAlignedBlock mdbuf_masked; + XLogRecPtr request_lsn = request_lsns->request_lsn; for (int i = 0; i < nblocks; i++) { + BlockNumber blkno = blocknum + i; + if (!BITMAP_ISSET(read, i)) + continue; + #if PG_MAJORVERSION_NUM >= 17 - mdreadv(reln, forkNum, blkno + i, &mdbuf, 1); + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forknum, blkno, mdbuffers, 1); + } #else - mdread(reln, forkNum, blkno + i, mdbuf); + mdread(reln, forknum, blkno, mdbuf.data); #endif - memcpy(pageserver_masked, buffer, BLCKSZ); - memcpy(mdbuf_masked, mdbuf, BLCKSZ); + memcpy(pageserver_masked, buffers[i], BLCKSZ); + memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - if (PageIsNew((Page) mdbuf)) + if (PageIsNew((Page) mdbuf.data)) { if (!PageIsNew((Page) pageserver_masked)) { neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffer)); + hexdump_page(buffers[i])); } } - else if (PageIsNew((Page) buffer)) + else if (PageIsNew((Page) buffers[i])) { neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf)); + hexdump_page(mdbuf.data)); } - else if (PageGetSpecialSize(mdbuf) == 0) + else if (PageGetSpecialSize(mdbuf.data) == 0) { /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked, blkno); + 
RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } - else if (PageGetSpecialSize(mdbuf) == MAXALIGN(sizeof(BTPageOpaqueData))) + else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf))->btpo_cycleid < MAX_BT_CYCLE_ID) + if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) { /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked, blkno); + RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - if (memcmp(mdbuf_masked, pageserver_masked, BLCKSZ) != 0) + if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) { neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", blkno, RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, + forknum, (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked), + hexdump_page(mdbuf_masked.data), hexdump_page(pageserver_masked)); } } @@ -3664,6 +3677,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo switch (reln->smgr_relpersistence) { case 0: +#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. 
Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -3682,6 +3696,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo */ return; } +#endif break; case RELPERSISTENCE_PERMANENT: @@ -3732,6 +3747,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, switch (reln->smgr_relpersistence) { case 0: +#ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) { @@ -3747,6 +3763,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ return; } +#endif break; case RELPERSISTENCE_PERMANENT: @@ -3768,7 +3785,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) - mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); #endif } From 7bf639733417e122b68e1d9dbfe23e71d23ae1b0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 12 Mar 2025 10:23:41 +0000 Subject: [PATCH 05/71] pageserver: remove legacy `TimelineInfo::latest_gc_cutoff` field (1/2) (#11149) ## Problem This field was retained for backward compat only in https://github.com/neondatabase/neon/pull/10707. Once https://github.com/neondatabase/cloud/pull/25233 is released, nothing external will be reading this field. Internally, this was a mandatory field so storage controller is still trying to decode it, so we must do this removal in two steps: this PR makes the field optional, and after one release we can fully remove it. 
Related: https://github.com/neondatabase/cloud/issues/24250 ## Summary of changes - Rename field to `_unused` - Remove field from swagger - Make field optional --- libs/pageserver_api/src/models.rs | 7 ++++--- pageserver/src/http/openapi_spec.yml | 4 ---- pageserver/src/http/routes.rs | 5 +---- 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b1ebad83b1..5e5bcf5338 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1225,9 +1225,10 @@ pub struct TimelineInfo { pub last_record_lsn: Lsn, pub prev_record_lsn: Option, - /// Legacy field for compat with control plane. Synonym of `min_readable_lsn`. - /// TODO: remove once control plane no longer reads it. - pub latest_gc_cutoff_lsn: Lsn, + /// Legacy field, retained for one version to enable old storage controller to + /// decode (it was a mandatory field). + #[serde(default, rename = "latest_gc_cutoff_lsn")] + pub _unused: Lsn, /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients. 
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead, diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 0fb9a240d5..e799efcce3 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1079,7 +1079,6 @@ components: - last_record_lsn - disk_consistent_lsn - state - - latest_gc_cutoff_lsn properties: timeline_id: type: string @@ -1123,9 +1122,6 @@ components: min_readable_lsn: type: string format: hex - latest_gc_cutoff_lsn: - type: string - format: hex applied_gc_cutoff_lsn: type: string format: hex diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e5848bfd25..ba5fb521ff 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -460,10 +460,7 @@ async fn build_timeline_info_common( initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), - // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally - // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we - // actually trimmed data to), which can pass each other when PITR is changed. - latest_gc_cutoff_lsn: min_readable_lsn, + _unused: Default::default(), // Unused, for legacy decode only min_readable_lsn, applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(), current_logical_size: current_logical_size.size_dont_care_about_accuracy(), From 1c0ff3c04d5432ba5f0295042fcca77118ebf49e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 12 Mar 2025 11:07:49 +0000 Subject: [PATCH 06/71] utils: explicit OTEL export config and OTEL enablement via common entry point (#11139) We want to export performance traces from the pageserver in OTEL format. End goal is to see them in Grafana. To this end, there are two changes here: 1. 
Update the `tracing-utils` crate to allow for explicitly specifying the export configuration. Pageserver configuration is loaded from a file on start-up. This allows us to use the same flow for export configs there. 2. Update the `utils::logging::init` common entry point to set up OTEL tracing infrastructure if requested. Note that an entirely different tracing subscriber is used. This is to avoid interference with the existing tracing set-up. For now, no service uses this functionality. PR to plug this into the pageserver is [here](https://github.com/neondatabase/neon/pull/11140). Related https://github.com/neondatabase/neon/issues/9873 --- Cargo.lock | 2 + compute_tools/src/logger.rs | 3 +- libs/tracing-utils/src/lib.rs | 83 +++++++++++++++++++++++++--- libs/utils/Cargo.toml | 1 + libs/utils/src/logging.rs | 1 + pageserver/Cargo.toml | 1 + pageserver/compaction/tests/tests.rs | 2 +- pageserver/src/bin/pageserver.rs | 1 + pageserver/src/tenant.rs | 2 +- proxy/src/logging.rs | 3 +- 10 files changed, 87 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d023d340d4..dd13e5a833 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4313,6 +4313,7 @@ dependencies = [ "tokio-util", "toml_edit", "tracing", + "tracing-utils", "url", "utils", "uuid", @@ -7850,6 +7851,7 @@ dependencies = [ "tracing", "tracing-error", "tracing-subscriber", + "tracing-utils", "walkdir", ] diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index a65614e94e..c36f302f99 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -24,7 +24,8 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result .with_writer(std::io::stderr); // Initialize OpenTelemetry - let otlp_layer = tracing_utils::init_tracing("compute_ctl").await; + let otlp_layer = + tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await; // Put it all together tracing_subscriber::registry() diff --git 
a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs index 72f94d61e4..74992a7d03 100644 --- a/libs/tracing-utils/src/lib.rs +++ b/libs/tracing-utils/src/lib.rs @@ -21,7 +21,7 @@ //! .with_writer(std::io::stderr); //! //! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces -//! let otlp_layer = tracing_utils::init_tracing("my_application").await; +//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await; //! //! // Put it all together //! tracing_subscriber::registry() @@ -38,8 +38,12 @@ pub mod http; use opentelemetry::KeyValue; use opentelemetry::trace::TracerProvider; -use tracing::Subscriber; +use opentelemetry_otlp::WithExportConfig; +pub use opentelemetry_otlp::{ExportConfig, Protocol}; +use tracing::level_filters::LevelFilter; +use tracing::{Dispatch, Subscriber}; use tracing_subscriber::Layer; +use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::registry::LookupSpan; /// Set up OpenTelemetry exporter, using configuration from environment variables. @@ -69,19 +73,28 @@ use tracing_subscriber::registry::LookupSpan; /// /// This doesn't block, but is marked as 'async' to hint that this must be called in /// asynchronous execution context. -pub async fn init_tracing(service_name: &str) -> Option> +pub async fn init_tracing( + service_name: &str, + export_config: ExportConfig, +) -> Option> where S: Subscriber + for<'span> LookupSpan<'span>, { if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { return None; }; - Some(init_tracing_internal(service_name.to_string())) + Some(init_tracing_internal( + service_name.to_string(), + export_config, + )) } /// Like `init_tracing`, but creates a separate tokio Runtime for the tracing /// tasks. 
-pub fn init_tracing_without_runtime(service_name: &str) -> Option> +pub fn init_tracing_without_runtime( + service_name: &str, + export_config: ExportConfig, +) -> Option> where S: Subscriber + for<'span> LookupSpan<'span>, { @@ -112,16 +125,22 @@ where )); let _guard = runtime.enter(); - Some(init_tracing_internal(service_name.to_string())) + Some(init_tracing_internal( + service_name.to_string(), + export_config, + )) } -fn init_tracing_internal(service_name: String) -> impl Layer +fn init_tracing_internal(service_name: String, export_config: ExportConfig) -> impl Layer where S: Subscriber + for<'span> LookupSpan<'span>, { - // Sets up exporter from the OTEL_EXPORTER_* environment variables. + // Sets up exporter from the provided [`ExportConfig`] parameter. + // If the endpoint is not specified, it is loaded from the + // OTEL_EXPORTER_OTLP_ENDPOINT environment variable. let exporter = opentelemetry_otlp::SpanExporter::builder() .with_http() + .with_export_config(export_config) .build() .expect("could not initialize opentelemetry exporter"); @@ -151,3 +170,51 @@ where pub fn shutdown_tracing() { opentelemetry::global::shutdown_tracer_provider(); } + +pub enum OtelEnablement { + Disabled, + Enabled { + service_name: String, + export_config: ExportConfig, + runtime: &'static tokio::runtime::Runtime, + }, +} + +pub struct OtelGuard { + pub dispatch: Dispatch, +} + +impl Drop for OtelGuard { + fn drop(&mut self) { + shutdown_tracing(); + } +} + +/// Initializes OTEL infrastructure for performance tracing according to the provided configuration +/// +/// Performance tracing is handled by a different [`tracing::Subscriber`]. This functions returns +/// an [`OtelGuard`] containing a [`tracing::Dispatch`] associated with a newly created subscriber. +/// Applications should use this dispatch for their performance traces. +/// +/// The lifetime of the guard should match taht of the application. On drop, it tears down the +/// OTEL infra. 
+pub fn init_performance_tracing(otel_enablement: OtelEnablement) -> Option { + let otel_subscriber = match otel_enablement { + OtelEnablement::Disabled => None, + OtelEnablement::Enabled { + service_name, + export_config, + runtime, + } => { + let otel_layer = runtime + .block_on(init_tracing(&service_name, export_config)) + .with_filter(LevelFilter::INFO); + let otel_subscriber = tracing_subscriber::registry().with(otel_layer); + let otel_dispatch = Dispatch::new(otel_subscriber); + + Some(otel_dispatch) + } + }; + + otel_subscriber.map(|dispatch| OtelGuard { dispatch }) +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index ac44300a51..4180602ac7 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -42,6 +42,7 @@ toml_edit = { workspace = true, features = ["serde"] } tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } +tracing-utils.workspace = true rand.workspace = true scopeguard.workspace = true strum.workspace = true diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 881f1e765d..f37f05692a 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -165,6 +165,7 @@ pub fn init( }; log_layer.with_filter(rust_log_env_filter()) }); + let r = r.with( TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()), ); diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index a372be5044..d17a19ce65 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -70,6 +70,7 @@ tokio-stream.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = [ "serde" ] } tracing.workspace = true +tracing-utils.workspace = true url.workspace = true walkdir.workspace = true metrics.workspace = true diff --git a/pageserver/compaction/tests/tests.rs b/pageserver/compaction/tests/tests.rs index bd8b54a286..565f66ce1a 100644 --- a/pageserver/compaction/tests/tests.rs +++ 
b/pageserver/compaction/tests/tests.rs @@ -12,7 +12,7 @@ pub(crate) fn setup_logging() { logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c4af0d5d41..4d30a6358b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -111,6 +111,7 @@ fn main() -> anyhow::Result<()> { } else { TracingErrorLayerEnablement::Disabled }; + logging::init( conf.log_format, tracing_error_layer_enablement, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 62e1cdac0c..2bce56345a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5754,7 +5754,7 @@ pub(crate) mod harness { logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, logging::Output::Stdout, ) - .expect("Failed to init test logging") + .expect("Failed to init test logging"); }); } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 6f9845fd6e..454fe81357 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -46,7 +46,8 @@ pub async fn init() -> anyhow::Result { .expect("this should be a valid filter directive"), ); - let otlp_layer = tracing_utils::init_tracing("proxy").await; + let otlp_layer = + tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await; let json_log_layer = if logfmt == LogFormat::Json { Some(JsonLoggingLayer::new( From 73e37ae388f4c5b6473b89baa3ba5a919c06e548 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Wed, 12 Mar 2025 17:23:31 +0400 Subject: [PATCH 07/71] Suppress "request was dropped" errors in test_timeline_archive (#11190) ## Problem Test `test_timeline_archive` is flaky because it makes requests that are intended to fail. It sometimes leads to warning in pageserver's logs. More details are in the issue. 
- Closes: https://github.com/neondatabase/neon/issues/11177 ## Summary of changes - Suppress such errors. --- test_runner/regress/test_timeline_archive.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index c17840d31c..11567cafd0 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -42,6 +42,14 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): # If we run the unsharded version, talk to the storage controller ps_http = env.storage_controller.pageserver_api() + for ps in env.pageservers: + # We make /archival_config requests that are intended to fail. + # It's expected that storcon drops requests to other pageservers after + # it gets the first error (https://github.com/neondatabase/neon/issues/11177) + ps.allowed_errors.append( + ".*WARN.* path=/v1/tenant/.*/archival_config .*request was dropped before completing", + ) + # first try to archive a non existing timeline for an existing tenant: invalid_timeline_id = TimelineId.generate() with pytest.raises(PageserverApiException, match="timeline not found") as exc: From 7015dbbdf001bff6670a2027fb609083dcd06e10 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 12 Mar 2025 14:02:11 +0000 Subject: [PATCH 08/71] storcon_cli: remove pre-warm helper (#11183) ## Problem This command was used when onboarding tenants to the storage controller. We no longer do that, so the command can go. 
## Summary of changes - Remove `storcon_cli tenant-warmup` command --- control_plane/storcon_cli/src/main.rs | 98 +-------------------------- 1 file changed, 2 insertions(+), 96 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b5c4f21e97..ae4bf9a519 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -14,8 +14,8 @@ use pageserver_api::controller_api::{ TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ - EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters, - TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, + EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig, + TenantConfigPatchRequest, TenantConfigRequest, TenantShardSplitRequest, TenantShardSplitResponse, }; use pageserver_api::shard::{ShardStripeSize, TenantShardId}; @@ -158,12 +158,6 @@ enum Command { #[arg(long)] tenant_id: TenantId, }, - /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary - /// mode so that it can warm up content on a pageserver. - TenantWarmup { - #[arg(long)] - tenant_id: TenantId, - }, TenantSetPreferredAz { #[arg(long)] tenant_id: TenantId, @@ -871,94 +865,6 @@ async fn main() -> anyhow::Result<()> { ) .await?; } - Command::TenantWarmup { tenant_id } => { - let describe_response = storcon_client - .dispatch::<(), TenantDescribeResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}"), - None, - ) - .await; - match describe_response { - Ok(describe) => { - if matches!(describe.policy, PlacementPolicy::Secondary) { - // Fine: it's already known to controller in secondary mode: calling - // again to put it into secondary mode won't cause problems. 
- } else { - anyhow::bail!("Tenant already present with policy {:?}", describe.policy); - } - } - Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => { - // Fine: this tenant isn't know to the storage controller yet. - } - Err(e) => { - // Unexpected API error - return Err(e.into()); - } - } - - vps_client - .location_config( - TenantShardId::unsharded(tenant_id), - pageserver_api::models::LocationConfig { - mode: pageserver_api::models::LocationConfigMode::Secondary, - generation: None, - secondary_conf: Some(LocationConfigSecondary { warm: true }), - shard_number: 0, - shard_count: 0, - shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0, - tenant_conf: TenantConfig::default(), - }, - None, - true, - ) - .await?; - - let describe_response = storcon_client - .dispatch::<(), TenantDescribeResponse>( - Method::GET, - format!("control/v1/tenant/{tenant_id}"), - None, - ) - .await?; - - let secondary_ps_id = describe_response - .shards - .first() - .unwrap() - .node_secondary - .first() - .unwrap(); - - println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}"); - loop { - let (status, progress) = vps_client - .tenant_secondary_download( - TenantShardId::unsharded(tenant_id), - Some(Duration::from_secs(10)), - ) - .await?; - println!( - "Progress: {}/{} layers, {}/{} bytes", - progress.layers_downloaded, - progress.layers_total, - progress.bytes_downloaded, - progress.bytes_total - ); - match status { - StatusCode::OK => { - println!("Download complete"); - break; - } - StatusCode::ACCEPTED => { - // Loop - } - _ => { - anyhow::bail!("Unexpected download status: {status}"); - } - } - } - } Command::TenantDrop { tenant_id, unclean } => { if !unclean { anyhow::bail!( From fc515e7be2d7efb5627b0119ec293dd31c48007a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 12 Mar 2025 15:26:52 +0100 Subject: [PATCH 09/71] chore(deps): bump env_logger to 0.11.7 (#11188) ## Problem `humantime` is unmaintained, we want to migrate to 
`jiff`, see https://github.com/neondatabase/neon/issues/11179. `env_logger` in older versions depend on `humantime`, and newer versions depend on `jiff`, so we need to update it. ## Summary of changes Update `env_logger` to the most recent release, which does not depend on `humantime` anymore. --- Cargo.lock | 180 +++++++++++++++++++++----------------- Cargo.toml | 2 +- workspace_hack/Cargo.toml | 3 + 3 files changed, 103 insertions(+), 82 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dd13e5a833..1721c185f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -191,7 +191,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] @@ -203,7 +203,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -272,7 +272,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -283,7 +283,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1021,7 +1021,7 @@ dependencies = [ "regex", "rustc-hash 2.1.1", "shlex", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1248,7 +1248,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1703,7 +1703,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1727,7 +1727,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.10.0", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1738,7 +1738,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ 
"darling_core", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1888,7 +1888,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1908,7 +1908,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1937,7 +1937,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -1960,7 +1960,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2105,7 +2105,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2115,28 +2115,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", -] - -[[package]] -name = "env_logger" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" -dependencies = [ - "humantime", - "is-terminal", - "log", "regex", - "termcolor", ] [[package]] name = "env_logger" -version = "0.11.2" +version = "0.11.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +checksum = "c3716d7a920fb4fac5d84e9d4bce8ceb321e9414b4409da61b07b75c1e3d0697" dependencies = [ + "anstream", + "anstyle", "env_filter", + "jiff", "log", ] @@ -2157,7 +2148,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2417,7 +2408,7 @@ checksum = 
"162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -2530,7 +2521,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -3148,7 +3139,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -3241,7 +3232,7 @@ dependencies = [ "crossbeam-channel", "crossbeam-utils", "dashmap 6.1.0", - "env_logger 0.11.2", + "env_logger", "indexmap 2.0.1", "itoa", "log", @@ -3364,6 +3355,30 @@ dependencies = [ "tracing", ] +[[package]] +name = "jiff" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d699bc6dfc879fb1bf9bdff0d4c56f0884fc6f0d0eb0fba397a6d00cd9a6b85e" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", +] + +[[package]] +name = "jiff-static" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d16e75759ee0aa64c57a56acbf43916987b20c77373cb7e808979e02b93c9f9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3535,9 +3550,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.20" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" [[package]] name = "lru" @@ -3618,7 +3633,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4491,7 +4506,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4593,7 +4608,7 @@ checksum = 
"f6e859e6e5bd50440ab63c47e3ebabc90f26251f7c73c3d3e837b74a1cc3fa67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4689,6 +4704,15 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "postgres" version = "0.19.7" @@ -4796,7 +4820,7 @@ dependencies = [ "bytes", "crc32c", "criterion", - "env_logger 0.10.2", + "env_logger", "log", "memoffset 0.9.0", "once_cell", @@ -4895,7 +4919,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -4909,9 +4933,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -4995,7 +5019,7 @@ dependencies = [ "prost 0.12.6", "prost-types 0.12.6", "regex", - "syn 2.0.90", + "syn 2.0.100", "tempfile", ] @@ -5016,7 +5040,7 @@ dependencies = [ "prost 0.13.3", "prost-types 0.13.3", "regex", - "syn 2.0.90", + "syn 2.0.100", "tempfile", ] @@ -5030,7 +5054,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -5043,7 +5067,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -5090,7 +5114,7 @@ dependencies = [ "consumption_metrics", "ecdsa 
0.16.9", "ed25519-dalek", - "env_logger 0.10.2", + "env_logger", "fallible-iterator", "flate2", "framed-websockets", @@ -5227,9 +5251,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.37" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" dependencies = [ "proc-macro2", ] @@ -5758,7 +5782,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.90", + "syn 2.0.100", "unicode-ident", ] @@ -5973,7 +5997,7 @@ dependencies = [ "crc32c", "criterion", "desim", - "env_logger 0.10.2", + "env_logger", "fail", "futures", "hex", @@ -6304,7 +6328,7 @@ checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6386,7 +6410,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6788,7 +6812,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6839,9 +6863,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", @@ -6871,7 +6895,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6922,15 +6946,6 @@ dependencies = [ "serde_json", ] -[[package]] -name = "termcolor" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" 
-dependencies = [ - "winapi-util", -] - [[package]] name = "test-context" version = "0.3.0" @@ -6949,7 +6964,7 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6978,7 +6993,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -6989,7 +7004,7 @@ checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7172,7 +7187,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7405,7 +7420,7 @@ dependencies = [ "prost-build 0.13.3", "prost-types 0.13.3", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7520,7 +7535,7 @@ checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -7915,7 +7930,7 @@ dependencies = [ "anyhow", "camino-tempfile", "clap", - "env_logger 0.10.2", + "env_logger", "log", "postgres", "postgres_ffi", @@ -8020,7 +8035,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "wasm-bindgen-shared", ] @@ -8054,7 +8069,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -8361,6 +8376,7 @@ name = "workspace_hack" version = "0.1.0" dependencies = [ "ahash", + "anstream", "anyhow", "base64 0.13.1", "base64 0.21.7", @@ -8377,6 +8393,8 @@ dependencies = [ "digest", "displaydoc", "either", + "env_filter", + "env_logger", "fail", "form_urlencoded", "futures-channel", @@ -8429,7 +8447,7 @@ 
dependencies = [ "spki 0.7.3", "stable_deref_trait", "subtle", - "syn 2.0.90", + "syn 2.0.100", "sync_wrapper 0.1.2", "tikv-jemalloc-ctl", "tikv-jemalloc-sys", @@ -8546,7 +8564,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] @@ -8568,7 +8586,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -8588,7 +8606,7 @@ checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", "synstructure", ] @@ -8610,7 +8628,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] @@ -8632,7 +8650,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.100", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 4a32b6d95d..7b86a64e9a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -221,7 +221,7 @@ zerocopy = { version = "0.7", features = ["derive"] } json-structural-diff = { version = "0.2.0" } ## TODO replace this with tracing -env_logger = "0.10" +env_logger = "0.11" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 183cc66ab9..f1696c5ff9 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -16,6 +16,7 @@ license.workspace = true ### BEGIN HAKARI SECTION [dependencies] ahash = { version = "0.8" } +anstream = { version = "0.6" } anyhow = { version = "1", features = ["backtrace"] } base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] } base64-647d43efb71741da = { package = 
"base64", version = "0.21" } @@ -30,6 +31,8 @@ der = { version = "0.7", default-features = false, features = ["oid", "pem", "st deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } +env_filter = { version = "0.1", default-features = false, features = ["regex"] } +env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } From 1436b8469c6fe52f37fb9ba41a3f854313cacd79 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 12 Mar 2025 15:34:29 +0100 Subject: [PATCH 10/71] pageserver: appease unused lint on macOS (#11192) ## Problem `info_span!` is only used in a `linux` branch, causing the unused lint to fire on macOS. ## Summary of changes Fully qualify the `info_span!` use. --- pageserver/src/tenant/remote_timeline_client/download.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 954ff0c1d6..0001f67c99 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -18,7 +18,7 @@ use tokio::fs::{self, File, OpenOptions}; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; -use tracing::{info_span, warn}; +use tracing::warn; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::{backoff, pausable_failpoint}; @@ -229,7 +229,7 @@ async fn download_object( || IoBufferMut::with_capacity(super::BUFFER_SIZE), gate.enter().map_err(|_| DownloadError::Cancelled)?, ctx, - info_span!(parent: None, "download_object_buffered_writer", %dst_path), + 
tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path), ); // TODO: use vectored write (writev) once supported by tokio-epoll-uring. From c7717c85c7a45144b016e5d799125197ba14c001 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 12 Mar 2025 16:16:54 +0100 Subject: [PATCH 11/71] storcon,pageserver: use persisted stripe size when loading unsharded tenants (#11193) ## Problem When the storage controller and Pageserver load tenants from persisted storage, they use `ShardIdentity::unsharded()` for unsharded tenants. However, this replaces the persisted stripe size of unsharded tenants with the default stripe size. This doesn't really matter for practical purposes, since the stripe size is meaningless for unsharded tenants anyway, but can cause consistency check failures if the persisted stripe size differs from the default. This was seen in #11168, where we change the default stripe size. Touches #11168. ## Summary of changes Carry over the persisted stripe size from `TenantShardPersistence` for unsharded tenants, and from `LocationConf` on Pageservers. Also add bounds checks for type casts when loading persisted shard metadata. --- libs/pageserver_api/src/shard.rs | 10 +++++++ pageserver/src/tenant/config.rs | 6 +++- storage_controller/src/persistence.rs | 40 ++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index eca04b1f3d..8386d6e586 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -112,6 +112,16 @@ impl ShardIdentity { } } + /// An unsharded identity with the given stripe size (if non-zero). This is typically used to + /// carry over a stripe size for an unsharded tenant from persistent storage.
+ pub fn unsharded_with_stripe_size(stripe_size: ShardStripeSize) -> Self { + let mut shard_identity = Self::unsharded(); + if stripe_size.0 > 0 { + shard_identity.stripe_size = stripe_size; + } + shard_identity + } + /// A broken instance of this type is only used for `TenantState::Broken` tenants, /// which are constructed in code paths that don't have access to proper configuration. /// diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 334fb04604..4308db84e5 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -219,7 +219,11 @@ impl LocationConf { }; let shard = if conf.shard_count == 0 { - ShardIdentity::unsharded() + // NB: carry over the persisted stripe size instead of using the default. This doesn't + // matter for most practical purposes, since unsharded tenants don't use the stripe + // size, but can cause inconsistencies between storcon and Pageserver and cause manual + // splits without `new_stripe_size` to use an unintended stripe size. 
+ ShardIdentity::unsharded_with_stripe_size(ShardStripeSize(conf.shard_stripe_size)) } else { ShardIdentity::new( ShardNumber(conf.shard_number), diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 5146fe472e..4a97aac125 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1613,23 +1613,49 @@ pub(crate) struct TenantShardPersistence { } impl TenantShardPersistence { + fn get_shard_count(&self) -> Result { + self.shard_count + .try_into() + .map(ShardCount) + .map_err(|_| ShardConfigError::InvalidCount) + } + + fn get_shard_number(&self) -> Result { + self.shard_number + .try_into() + .map(ShardNumber) + .map_err(|_| ShardConfigError::InvalidNumber) + } + + fn get_stripe_size(&self) -> Result { + self.shard_stripe_size + .try_into() + .map(ShardStripeSize) + .map_err(|_| ShardConfigError::InvalidStripeSize) + } + pub(crate) fn get_shard_identity(&self) -> Result { if self.shard_count == 0 { - Ok(ShardIdentity::unsharded()) + // NB: carry over the stripe size from the persisted record, to avoid consistency check + // failures if the persisted value differs from the default stripe size. The stripe size + // doesn't really matter for unsharded tenants anyway. + Ok(ShardIdentity::unsharded_with_stripe_size( + self.get_stripe_size()?, + )) } else { Ok(ShardIdentity::new( - ShardNumber(self.shard_number as u8), - ShardCount::new(self.shard_count as u8), - ShardStripeSize(self.shard_stripe_size as u32), + self.get_shard_number()?, + self.get_shard_count()?, + self.get_stripe_size()?, )?) 
} } - pub(crate) fn get_tenant_shard_id(&self) -> Result { + pub(crate) fn get_tenant_shard_id(&self) -> anyhow::Result { Ok(TenantShardId { tenant_id: TenantId::from_str(self.tenant_id.as_str())?, - shard_number: ShardNumber(self.shard_number as u8), - shard_count: ShardCount::new(self.shard_count as u8), + shard_number: self.get_shard_number()?, + shard_count: self.get_shard_count()?, }) } } From 02a83913eccd6488216139ddc151a06f354b4f1a Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 12 Mar 2025 15:31:28 +0000 Subject: [PATCH 12/71] storcon: do not update observed state on node activation (#11155) ## Problem When a node becomes active, we query its locations and update the observed state in-place. This can race with the observed state updates done when processing reconcile results. ## Summary of changes The argument for this reconciliation step is that it reduces the need for background reconciliations. I don't think this is actually true anymore. There are two cases. 1. Restart of node after drain. Usually the node does not go through the offline state here, so observed locations were not marked as none. In any case, there should be a handful of shards max on the node since we've just drained it. 2. Node comes back online after failure or network partition. When the node is marked offline, we reschedule everything away from it. When it later becomes active, the previous observed location is extraneous and requires a reconciliation anyway.
Closes https://github.com/neondatabase/neon/issues/11148 --- control_plane/src/bin/neon_local.rs | 14 +++++++--- control_plane/src/storage_controller.rs | 7 ++++- storage_controller/src/service.rs | 26 ++++++++++++++++--- .../regress/test_storage_controller.py | 11 +++++--- test_runner/regress/test_tenant_size.py | 2 +- 5 files changed, 48 insertions(+), 12 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index ba1411b615..72ebbafd3b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -36,7 +36,9 @@ use pageserver_api::config::{ use pageserver_api::controller_api::{ NodeAvailabilityWrapper, PlacementPolicy, TenantCreateRequest, }; -use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; @@ -1129,12 +1131,16 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any let tenant_id = get_tenant_id(args.tenant_id, env)?; let tenant_conf: HashMap<_, _> = args.config.iter().flat_map(|c| c.split_once(':')).collect(); + let config = PageServerNode::parse_config(tenant_conf)?; - pageserver - .tenant_config(tenant_id, tenant_conf) + let req = TenantConfigRequest { tenant_id, config }; + + let storage_controller = StorageController::from_env(env); + storage_controller + .set_tenant_config(&req) .await .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?; - println!("tenant {tenant_id} successfully configured on the pageserver"); + println!("tenant {tenant_id} successfully configured via storcon"); } } Ok(()) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 439d7936a7..bbd7f67720 100644 --- 
a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -14,7 +14,7 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -878,4 +878,9 @@ impl StorageController { ) .await } + + pub async fn set_tenant_config(&self, req: &TenantConfigRequest) -> anyhow::Result<()> { + self.dispatch(Method::PUT, "v1/tenant/config".to_string(), Some(req)) + .await + } } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 96b67fa81e..667b53b725 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2004,21 +2004,41 @@ impl Service { tracing::info!("Loaded {} LocationConfigs", configs.tenant_shards.len()); let mut cleanup = Vec::new(); + let mut mismatched_locations = 0; { let mut locked = self.inner.write().unwrap(); - for (tenant_shard_id, observed_loc) in configs.tenant_shards { + for (tenant_shard_id, reported) in configs.tenant_shards { let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_shard + + let on_record = &mut tenant_shard .observed .locations - .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); + .entry(node.get_id()) + .or_insert_with(|| ObservedStateLocation { conf: None }) + .conf; + + // If the location reported by the node does not match our observed state, + // then we mark it as uncertain and let the background reconciliation loop + // deal with it. + // + // Note that this also covers net new locations reported by the node. 
+ if *on_record != reported { + mismatched_locations += 1; + *on_record = None; + } } } + if mismatched_locations > 0 { + tracing::info!( + "Set observed state to None for {mismatched_locations} mismatched locations" + ); + } + for tenant_shard_id in cleanup { tracing::info!("Detaching {tenant_shard_id}"); match node diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 29919f2fe7..5eaf69cfa1 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1749,18 +1749,23 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # Restart the failed pageserver victim_ps.start() + env.storage_controller.reconcile_until_idle() + # We expect that the re-attach call correctly tipped off the pageserver that its locations # are all secondaries now. locations = victim_ps.http_client().tenant_list_locations()["tenant_shards"] assert len(locations) == 2 assert all(loc[1]["mode"] == "Secondary" for loc in locations) - # We expect that this situation resulted from the re_attach call, and not any explicit - # Reconciler runs: assert that the reconciliation count has not gone up since we restarted. 
+ # We expect that this situation resulted from background reconciliations + # Reconciler runs: assert that the reconciliation count has gone up by exactly + # one for each shard reconciles_after_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} ) - assert reconciles_after_restart == reconciles_before_restart + + assert reconciles_before_restart is not None + assert reconciles_after_restart == reconciles_before_restart + 2 def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 713f89c60f..81e727a3aa 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -436,7 +436,7 @@ def test_single_branch_get_tenant_size_grows( # when our tenant is configured with a tiny pitr interval, dropping a table should # cause synthetic size to go down immediately tenant_config["pitr_interval"] = "0s" - env.pageserver.http_client().set_tenant_config(tenant_id, tenant_config) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, tenant_config) (current_lsn, size) = get_current_consistent_size( env, endpoint, size_debug_file, http_client, tenant_id, timeline_id ) From 40672b739eb26b154d973be218b817727e0e9a91 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 12 Mar 2025 10:34:46 -0500 Subject: [PATCH 13/71] Move maybe_add_request_id_header middleware into middleware module (#11187) This matches the authorization middleware. 
--------- Signed-off-by: Tristan Partin Co-authored-by: Mikhail Kot --- compute_tools/src/http/middleware/mod.rs | 1 + .../src/http/middleware/request_id.rs | 16 ++++++++++++++++ compute_tools/src/http/server.rs | 19 +++---------------- 3 files changed, 20 insertions(+), 16 deletions(-) create mode 100644 compute_tools/src/http/middleware/request_id.rs diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs index caeeeedfe5..147d6d2c7d 100644 --- a/compute_tools/src/http/middleware/mod.rs +++ b/compute_tools/src/http/middleware/mod.rs @@ -1 +1,2 @@ pub(in crate::http) mod authorize; +pub(in crate::http) mod request_id; diff --git a/compute_tools/src/http/middleware/request_id.rs b/compute_tools/src/http/middleware/request_id.rs new file mode 100644 index 0000000000..e685b27d91 --- /dev/null +++ b/compute_tools/src/http/middleware/request_id.rs @@ -0,0 +1,16 @@ +use axum::{extract::Request, middleware::Next, response::Response}; +use uuid::Uuid; + +use crate::http::headers::X_REQUEST_ID; + +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. 
+pub async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); + if !headers.contains_key(X_REQUEST_ID) { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); + } + + next.run(request).await +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 126fa86d1c..b70b6c619c 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -5,9 +5,8 @@ use std::time::Duration; use anyhow::Result; use axum::Router; -use axum::extract::Request; -use axum::middleware::{self, Next}; -use axum::response::{IntoResponse, Response}; +use axum::middleware::{self}; +use axum::response::IntoResponse; use axum::routing::{get, post}; use http::StatusCode; use jsonwebtoken::jwk::JwkSet; @@ -17,8 +16,8 @@ use tower_http::{ auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer, }; use tracing::{Span, error, info}; -use uuid::Uuid; +use super::middleware::request_id::maybe_add_request_id_header; use super::{ headers::X_REQUEST_ID, middleware::authorize::Authorize, @@ -219,15 +218,3 @@ impl Server { tokio::spawn(self.serve(state)); } } - -/// This middleware function allows compute_ctl to generate its own request ID -/// if one isn't supplied. The control plane will always send one as a UUID. The -/// neon Postgres extension on the other hand does not send one. -async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { - let headers = request.headers_mut(); - if headers.get(X_REQUEST_ID).is_none() { - headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); - } - - next.run(request).await -} From 7aec1364dd4cfa26571abcf0cac79e663260ca6a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 12 Mar 2025 15:47:17 +0000 Subject: [PATCH 14/71] chore(proxy): remove enum and composite type queries (#11178) In our json encoding, we only need to know about array types. 
Information about composites or enums are not actually used. Enums are quite popular, needing to type query them when not needed can add some latency cost for no gain. --- libs/proxy/postgres-types2/src/lib.rs | 10 +- libs/proxy/tokio-postgres2/src/client.rs | 119 ++-------------- .../tokio-postgres2/src/generic_client.rs | 10 +- libs/proxy/tokio-postgres2/src/prepare.rs | 133 ++++-------------- libs/proxy/tokio-postgres2/src/transaction.rs | 5 + proxy/src/serverless/sql_over_http.rs | 16 ++- 6 files changed, 69 insertions(+), 224 deletions(-) diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs index 0ccd8c295f..b6bcabc922 100644 --- a/libs/proxy/postgres-types2/src/lib.rs +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -135,8 +135,8 @@ impl Type { pub enum Kind { /// A simple type like `VARCHAR` or `INTEGER`. Simple, - /// An enumerated type along with its variants. - Enum(Vec), + /// An enumerated type. + Enum, /// A pseudo-type. Pseudo, /// An array type along with the type of its elements. @@ -146,9 +146,9 @@ pub enum Kind { /// A multirange type along with the type of its elements. Multirange(Type), /// A domain type along with its underlying type. - Domain(Type), - /// A composite type along with information about its fields. - Composite(Vec), + Domain(Oid), + /// A composite type. + Composite(Oid), } /// Information about a field of a composite type. 
diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 08a06163e1..186eb07000 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -19,10 +19,10 @@ use crate::config::{Host, SslMode}; use crate::connection::{Request, RequestMessages}; use crate::query::RowStream; use crate::simple_query::SimpleQueryStream; -use crate::types::{Oid, ToSql, Type}; +use crate::types::{Oid, Type}; use crate::{ - CancelToken, Error, ReadyForQueryStatus, Row, SimpleQueryMessage, Statement, Transaction, - TransactionBuilder, query, simple_query, slice_iter, + CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction, + TransactionBuilder, query, simple_query, }; pub struct Responses { @@ -54,26 +54,18 @@ impl Responses { /// A cache of type info and prepared statements for fetching type info /// (corresponding to the queries in the [crate::prepare] module). #[derive(Default)] -struct CachedTypeInfo { +pub(crate) struct CachedTypeInfo { /// A statement for basic information for a type from its /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its /// fallback). - typeinfo: Option, - /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY). - typeinfo_composite: Option, - /// A statement for getting information for a composite type from its OID. - /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or - /// its fallback). - typeinfo_enum: Option, + pub(crate) typeinfo: Option, /// Cache of types already looked up. - types: HashMap, + pub(crate) types: HashMap, } pub struct InnerClient { sender: mpsc::UnboundedSender, - cached_typeinfo: Mutex, /// A buffer to use when writing out postgres commands. 
buffer: Mutex, @@ -91,38 +83,6 @@ impl InnerClient { }) } - pub fn typeinfo(&self) -> Option { - self.cached_typeinfo.lock().typeinfo.clone() - } - - pub fn set_typeinfo(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo = Some(statement.clone()); - } - - pub fn typeinfo_composite(&self) -> Option { - self.cached_typeinfo.lock().typeinfo_composite.clone() - } - - pub fn set_typeinfo_composite(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone()); - } - - pub fn typeinfo_enum(&self) -> Option { - self.cached_typeinfo.lock().typeinfo_enum.clone() - } - - pub fn set_typeinfo_enum(&self, statement: &Statement) { - self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone()); - } - - pub fn type_(&self, oid: Oid) -> Option { - self.cached_typeinfo.lock().types.get(&oid).cloned() - } - - pub fn set_type(&self, oid: Oid, type_: &Type) { - self.cached_typeinfo.lock().types.insert(oid, type_.clone()); - } - /// Call the given function with a buffer to be used when writing out /// postgres commands. pub fn with_buf(&self, f: F) -> R @@ -142,7 +102,6 @@ pub struct SocketConfig { pub host: Host, pub port: u16, pub connect_timeout: Option, - // pub keepalive: Option, } /// An asynchronous PostgreSQL client. @@ -151,6 +110,7 @@ pub struct SocketConfig { /// through this client object. pub struct Client { inner: Arc, + cached_typeinfo: CachedTypeInfo, socket_config: SocketConfig, ssl_mode: SslMode, @@ -169,9 +129,9 @@ impl Client { Client { inner: Arc::new(InnerClient { sender, - cached_typeinfo: Default::default(), buffer: Default::default(), }), + cached_typeinfo: Default::default(), socket_config, ssl_mode, @@ -189,55 +149,6 @@ impl Client { &self.inner } - /// Executes a statement, returning a vector of the resulting rows. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. 
- /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. - pub async fn query( - &self, - statement: Statement, - params: &[&(dyn ToSql + Sync)], - ) -> Result, Error> { - self.query_raw(statement, slice_iter(params)) - .await? - .try_collect() - .await - } - - /// The maximally flexible version of [`query`]. - /// - /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list - /// provided, 1-indexed. - /// - /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be - /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front - /// with the `prepare` method. - /// - /// # Panics - /// - /// Panics if the number of parameters provided does not match the number expected. 
- /// - /// [`query`]: #method.query - pub async fn query_raw<'a, I>( - &self, - statement: Statement, - params: I, - ) -> Result - where - I: IntoIterator, - I::IntoIter: ExactSizeIterator, - { - query::query(&self.inner, statement, params).await - } - /// Pass text directly to the Postgres backend to allow it to sort out typing itself and /// to save a roundtrip pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result @@ -284,14 +195,10 @@ impl Client { simple_query::batch_execute(self.inner(), query).await } - pub async fn discard_all(&self) -> Result { + pub async fn discard_all(&mut self) -> Result { // clear the prepared statements that are about to be nuked from the postgres session - { - let mut typeinfo = self.inner.cached_typeinfo.lock(); - typeinfo.typeinfo = None; - typeinfo.typeinfo_composite = None; - typeinfo.typeinfo_enum = None; - } + + self.cached_typeinfo.typeinfo = None; self.batch_execute("discard all").await } @@ -359,8 +266,8 @@ impl Client { } /// Query for type information - pub async fn get_type(&self, oid: Oid) -> Result { - crate::prepare::get_type(&self.inner, oid).await + pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result { + crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await } /// Determines if the connection to the server has already closed. 
diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs index 31c3d8fa3e..8e28843347 100644 --- a/libs/proxy/tokio-postgres2/src/generic_client.rs +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -22,7 +22,7 @@ pub trait GenericClient: private::Sealed { I::IntoIter: ExactSizeIterator + Sync + Send; /// Query for type information - async fn get_type(&self, oid: Oid) -> Result; + async fn get_type(&mut self, oid: Oid) -> Result; } impl private::Sealed for Client {} @@ -38,8 +38,8 @@ impl GenericClient for Client { } /// Query for type information - async fn get_type(&self, oid: Oid) -> Result { - crate::prepare::get_type(self.inner(), oid).await + async fn get_type(&mut self, oid: Oid) -> Result { + self.get_type_inner(oid).await } } @@ -56,7 +56,7 @@ impl GenericClient for Transaction<'_> { } /// Query for type information - async fn get_type(&self, oid: Oid) -> Result { - self.client().get_type(oid).await + async fn get_type(&mut self, oid: Oid) -> Result { + self.client_mut().get_type(oid).await } } diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs index b36d2e5f74..ba13a528f6 100644 --- a/libs/proxy/tokio-postgres2/src/prepare.rs +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -9,10 +9,10 @@ use log::debug; use postgres_protocol2::message::backend::Message; use postgres_protocol2::message::frontend; -use crate::client::InnerClient; +use crate::client::{CachedTypeInfo, InnerClient}; use crate::codec::FrontendMessage; use crate::connection::RequestMessages; -use crate::types::{Field, Kind, Oid, Type}; +use crate::types::{Kind, Oid, Type}; use crate::{Column, Error, Statement, query, slice_iter}; pub(crate) const TYPEINFO_QUERY: &str = "\ @@ -23,23 +23,7 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid WHERE t.oid = $1 "; -const TYPEINFO_ENUM_QUERY: &str = "\ -SELECT enumlabel -FROM pg_catalog.pg_enum -WHERE enumtypid = $1 -ORDER BY 
enumsortorder -"; - -pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ -SELECT attname, atttypid -FROM pg_catalog.pg_attribute -WHERE attrelid = $1 -AND NOT attisdropped -AND attnum > 0 -ORDER BY attnum -"; - -pub async fn prepare( +async fn prepare_typecheck( client: &Arc, name: &'static str, query: &str, @@ -67,7 +51,7 @@ pub async fn prepare( let mut parameters = vec![]; let mut it = parameter_description.parameters(); while let Some(oid) = it.next().map_err(Error::parse)? { - let type_ = get_type(client, oid).await?; + let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?; parameters.push(type_); } @@ -75,7 +59,7 @@ pub async fn prepare( if let Some(row_description) = row_description { let mut it = row_description.fields(); while let Some(field) = it.next().map_err(Error::parse)? { - let type_ = get_type(client, field.type_oid()).await?; + let type_ = Type::from_oid(field.type_oid()).ok_or_else(Error::unexpected_message)?; let column = Column::new(field.name().to_string(), type_, field); columns.push(column); } @@ -84,15 +68,6 @@ pub async fn prepare( Ok(Statement::new(client, name, parameters, columns)) } -fn prepare_rec<'a>( - client: &'a Arc, - name: &'static str, - query: &'a str, - types: &'a [Type], -) -> Pin> + 'a + Send>> { - Box::pin(prepare(client, name, query, types)) -} - fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { if types.is_empty() { debug!("preparing query {}: {}", name, query); @@ -108,16 +83,20 @@ fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Resu }) } -pub async fn get_type(client: &Arc, oid: Oid) -> Result { +pub async fn get_type( + client: &Arc, + typecache: &mut CachedTypeInfo, + oid: Oid, +) -> Result { if let Some(type_) = Type::from_oid(oid) { return Ok(type_); } - if let Some(type_) = client.type_(oid) { - return Ok(type_); - } + if let Some(type_) = typecache.types.get(&oid) { + return Ok(type_.clone()); + }; - let stmt = 
typeinfo_statement(client).await?; + let stmt = typeinfo_statement(client, typecache).await?; let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; pin_mut!(rows); @@ -136,100 +115,48 @@ pub async fn get_type(client: &Arc, oid: Oid) -> Result( client: &'a Arc, + typecache: &'a mut CachedTypeInfo, oid: Oid, ) -> Pin> + Send + 'a>> { - Box::pin(get_type(client, oid)) + Box::pin(get_type(client, typecache, oid)) } -async fn typeinfo_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo() { - return Ok(stmt); +async fn typeinfo_statement( + client: &Arc, + typecache: &mut CachedTypeInfo, +) -> Result { + if let Some(stmt) = &typecache.typeinfo { + return Ok(stmt.clone()); } let typeinfo = "neon_proxy_typeinfo"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?; + let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?; - client.set_typeinfo(&stmt); - Ok(stmt) -} - -async fn get_enum_variants(client: &Arc, oid: Oid) -> Result, Error> { - let stmt = typeinfo_enum_statement(client).await?; - - query::query(client, stmt, slice_iter(&[&oid])) - .await? - .and_then(|row| async move { row.try_get(0) }) - .try_collect() - .await -} - -async fn typeinfo_enum_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo_enum() { - return Ok(stmt); - } - - let typeinfo = "neon_proxy_typeinfo_enum"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?; - - client.set_typeinfo_enum(&stmt); - Ok(stmt) -} - -async fn get_composite_fields(client: &Arc, oid: Oid) -> Result, Error> { - let stmt = typeinfo_composite_statement(client).await?; - - let rows = query::query(client, stmt, slice_iter(&[&oid])) - .await? 
- .try_collect::>() - .await?; - - let mut fields = vec![]; - for row in rows { - let name = row.try_get(0)?; - let oid = row.try_get(1)?; - let type_ = get_type_rec(client, oid).await?; - fields.push(Field::new(name, type_)); - } - - Ok(fields) -} - -async fn typeinfo_composite_statement(client: &Arc) -> Result { - if let Some(stmt) = client.typeinfo_composite() { - return Ok(stmt); - } - - let typeinfo = "neon_proxy_typeinfo_composite"; - let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?; - - client.set_typeinfo_composite(&stmt); + typecache.typeinfo = Some(stmt.clone()); Ok(stmt) } diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs index eecbfc5873..f32603470f 100644 --- a/libs/proxy/tokio-postgres2/src/transaction.rs +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -72,4 +72,9 @@ impl<'a> Transaction<'a> { pub fn client(&self) -> &Client { self.client } + + /// Returns a reference to the underlying `Client`. 
+ pub fn client_mut(&mut self) -> &mut Client { + self.client + } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 612702231f..47009086c3 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -860,7 +860,13 @@ impl QueryData { let cancel_token = inner.cancel_token(); let res = match select( - pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)), + pin!(query_to_json( + config, + &mut *inner, + self, + &mut 0, + parsed_headers + )), pin!(cancel.cancelled()), ) .await @@ -944,7 +950,7 @@ impl BatchQueryData { builder = builder.deferrable(true); } - let transaction = builder + let mut transaction = builder .start() .await .inspect_err(|_| { @@ -957,7 +963,7 @@ impl BatchQueryData { let json_output = match query_batch( config, cancel.child_token(), - &transaction, + &mut transaction, self, parsed_headers, ) @@ -1009,7 +1015,7 @@ impl BatchQueryData { async fn query_batch( config: &'static HttpConfig, cancel: CancellationToken, - transaction: &Transaction<'_>, + transaction: &mut Transaction<'_>, queries: BatchQueryData, parsed_headers: HttpHeaders, ) -> Result { @@ -1047,7 +1053,7 @@ async fn query_batch( async fn query_to_json( config: &'static HttpConfig, - client: &T, + client: &mut T, data: QueryData, current_size: &mut usize, parsed_headers: HttpHeaders, From bb3c0ff251f0925594ef40d16df26bb56770e68b Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 12 Mar 2025 11:09:02 -0500 Subject: [PATCH 15/71] Make collecting the installed extensions metric async (#11071) If the goal is to make compute_ctl completely asynchronous, then this is one step to getting there. 
Signed-off-by: Tristan Partin --- compute_tools/src/compute.rs | 6 ++-- compute_tools/src/installed_extensions.rs | 35 +++++++++++++++++------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 354528e2cd..c2a3e38ed6 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -645,9 +645,9 @@ impl ComputeNode { if pspec.spec.mode == ComputeMode::Primary { self.configure_as_primary(&compute_state)?; - let conf = self.get_conn_conf(None); - tokio::task::spawn_blocking(|| { - let res = get_installed_extensions(conf); + let conf = self.get_tokio_conn_conf(None); + tokio::task::spawn(async { + let res = get_installed_extensions(conf).await; match res { Ok(extensions) => { info!( diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 6921505466..d95c168a99 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use anyhow::Result; use compute_api::responses::{InstalledExtension, InstalledExtensions}; -use postgres::{Client, NoTls}; +use tokio_postgres::{Client, Config, NoTls}; use crate::metrics::INSTALLED_EXTENSIONS; @@ -10,7 +10,7 @@ use crate::metrics::INSTALLED_EXTENSIONS; /// and to make database listing query here more explicit. /// /// Limit the number of databases to 500 to avoid excessive load. -fn list_dbs(client: &mut Client) -> Result> { +async fn list_dbs(client: &mut Client) -> Result> { // `pg_database.datconnlimit = -2` means that the database is in the // invalid state let databases = client @@ -20,7 +20,8 @@ fn list_dbs(client: &mut Client) -> Result> { AND datconnlimit <> - 2 LIMIT 500", &[], - )? + ) + .await? 
.iter() .map(|row| { let db: String = row.get("datname"); @@ -36,20 +37,36 @@ fn list_dbs(client: &mut Client) -> Result> { /// Same extension can be installed in multiple databases with different versions, /// so we report a separate metric (number of databases where it is installed) /// for each extension version. -pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { +pub async fn get_installed_extensions(mut conf: Config) -> Result { conf.application_name("compute_ctl:get_installed_extensions"); - let mut client = conf.connect(NoTls)?; - let databases: Vec = list_dbs(&mut client)?; + let databases: Vec = { + let (mut client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + list_dbs(&mut client).await? + }; let mut extensions_map: HashMap<(String, String, String), InstalledExtension> = HashMap::new(); for db in databases.iter() { conf.dbname(db); - let mut db_client = conf.connect(NoTls)?; - let extensions: Vec<(String, String, i32)> = db_client + + let (client, connection) = conf.connect(NoTls).await?; + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let extensions: Vec<(String, String, i32)> = client .query( "SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension", &[], - )? + ) + .await? .iter() .map(|row| { ( From 5eed0e4b94ffcfd686d071aa278f96deebddaad9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 12 Mar 2025 12:31:09 -0500 Subject: [PATCH 16/71] Add docs to performance/test_logical_replication.py on how to run the suite (#10175) These docs are in tandem with what was recently published on the internal docs site. 
Signed-off-by: Tristan Partin --- .../performance/test_logical_replication.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index fdc56cc496..807ed522e1 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -23,6 +23,25 @@ if TYPE_CHECKING: from psycopg2.extensions import connection, cursor +""" +These benchmarks stress test logical replication within Neon. In order to run +them locally, they require setting up some infrastructure. See +https://docs.neon.build/compute/logical_replication_benchmarks.html for how to +do that. After setting that up, run the following shell commands. + +# These are the project IDs setup for the purposes of running these benchmarks +export BENCHMARK_PROJECT_ID_PUB= +export BENCHMARK_PROJECT_ID_SUB= + +# See https://neon.tech/docs/manage/api-keys +export NEON_API_KEY= + +# Fiddling with the --timeout parameter may be required depending on the +# performance of the benchmark +pytest -m remote_cluster 'test_runner/performance/test_logical_replication.py' +""" + + @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg: VanillaPostgres): env = neon_simple_env From 8a5a739af0c66acc061a5a0f1192e1759ce47738 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 12 Mar 2025 16:34:19 -0400 Subject: [PATCH 17/71] test(pageserver): add small tenant compaction (#11049) ## Problem close https://github.com/neondatabase/neon/issues/10881 ## Summary of changes Mock a tenant with very small amount of data. 
--------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 0df88e14c2..c8cce7a4e7 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -524,6 +524,42 @@ def test_pageserver_gc_compaction_trigger(neon_env_builder: NeonEnvBuilder): workload.validate(env.pageserver.id) +def test_pageserver_small_tenant_compaction(neon_env_builder: NeonEnvBuilder): + """ + Create a small tenant that rarely needs compaction and ensure that everything works. + """ + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": 1024, + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(10000, env.pageserver.id) + + for _ in range(100): + workload.churn_rows(10, env.pageserver.id, upload=False, ingest=False) + ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True) + ps_http.timeline_compact(tenant_id, timeline_id) + ps_http.timeline_gc(tenant_id, timeline_id, None) + + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + + # Stripe sizes in number of pages. 
TINY_STRIPES = 16 LARGE_STRIPES = 32768 From ef0d4a48a8546625ba9824c86839e71851b9bbdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 12 Mar 2025 22:00:59 +0100 Subject: [PATCH 18/71] Reuse artifacts from release PRs (#11061) ## Problem When we release our components, we perform builds in the release PR, then test the components, then merge the PR, and then build everything *again*, run tests *again*, and only then start deployments. To speed things up, we want to perform builds and run tests in the PR, and start deployments using the existing artifacts from the release PR. To make that possible, we need to have both CI pipelines running on the same commit hash, which requires fast-forwarding the release branch. That only works if we have a commit in the PR that has the current release branch state as an ancestor. ## Summary of changes - Changes to release PR creation: - Remove templates and automatic bodies for release PRs. The previous template wasn't used anymore, and the automatic body we created in the pipeline didn't contain any useful content anymore after the changes here. - Make it possible to select the source branch. For releases that aren't cut from `main`, like https://github.com/neondatabase/neon/pull/11051, we need a way to trigger the new flow from a different branch. - Determine `release-branch` automatically from the component name instead of passing that as well. - Changes to the merge queue job: - Rename `get-changed-files` to `meta` in preparation for additional data being fetched as part of that job - Fail the merge queue if we're trying to merge into a branch other than main - this is to prevent non-fast-forward merges. - Label PRs to branches other than main as `fast-forward`, to trigger the fast-forward job - Add a fast-forward job that can be triggered with the `fast-forward` label that performs a fast-forward merge. This only happens if the PR has `mergeable_state == clean`, i.e. CI has passed.
- Build and Test on releases now skips building images, skips testing images and skips triggering e2e tests. We add new tags to the images from the release PR to tag them as release images, and we push them to the prod registries. --- .github/PULL_REQUEST_TEMPLATE/release-pr.md | 21 ---- .github/scripts/generate_image_maps.py | 43 ++++---- .github/scripts/lint-release-pr.sh | 110 ++++++++++++++++++++ .github/workflows/_create-release-pr.yml | 45 +++++--- .github/workflows/_meta.yml | 14 +++ .github/workflows/build_and_test.yml | 80 ++++++-------- .github/workflows/fast-forward.yml | 36 +++++++ .github/workflows/lint-release-pr.yml | 23 ++++ .github/workflows/pre-merge-checks.yml | 47 ++++++--- .github/workflows/release.yml | 6 +- 10 files changed, 300 insertions(+), 125 deletions(-) delete mode 100644 .github/PULL_REQUEST_TEMPLATE/release-pr.md create mode 100755 .github/scripts/lint-release-pr.sh create mode 100644 .github/workflows/fast-forward.yml create mode 100644 .github/workflows/lint-release-pr.yml diff --git a/.github/PULL_REQUEST_TEMPLATE/release-pr.md b/.github/PULL_REQUEST_TEMPLATE/release-pr.md deleted file mode 100644 index 44b3094c24..0000000000 --- a/.github/PULL_REQUEST_TEMPLATE/release-pr.md +++ /dev/null @@ -1,21 +0,0 @@ -## Release 202Y-MM-DD - -**NB: this PR must be merged only by 'Create a merge commit'!** - -### Checklist when preparing for release -- [ ] Read or refresh [the release flow guide](https://www.notion.so/neondatabase/Release-general-flow-61f2e39fd45d4d14a70c7749604bd70b) -- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers? -- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan? - - - -### Checklist after release -- [ ] Make sure instructions from PRs included in this release and labeled `manual_release_instructions` are executed (either by you or by people who wrote them). 
-- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/219/files)) -- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel -- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true) -- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some) -- [ ] Check [cloud SLO dashboard](https://neonprod.grafana.net/d/_oWcBMJ7k/cloud-slos?orgId=1) -- [ ] Check [compute startup metrics dashboard](https://neonprod.grafana.net/d/5OkYJEmVz/compute-startup-time) - - diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py index 39ece5b38f..f67e07024c 100644 --- a/.github/scripts/generate_image_maps.py +++ b/.github/scripts/generate_image_maps.py @@ -1,14 +1,16 @@ import itertools import json import os +import sys -build_tag = os.environ["BUILD_TAG"] -branch = os.environ["BRANCH"] -dev_acr = os.environ["DEV_ACR"] -prod_acr = os.environ["PROD_ACR"] -dev_aws = os.environ["DEV_AWS"] -prod_aws = os.environ["PROD_AWS"] -aws_region = os.environ["AWS_REGION"] +source_tag = os.getenv("SOURCE_TAG") +target_tag = os.getenv("TARGET_TAG") +branch = os.getenv("BRANCH") +dev_acr = os.getenv("DEV_ACR") +prod_acr = os.getenv("PROD_ACR") +dev_aws = os.getenv("DEV_AWS") +prod_aws = os.getenv("PROD_AWS") +aws_region = os.getenv("AWS_REGION") components = { "neon": ["neon"], @@ -39,24 +41,23 @@ registries = { outputs: dict[str, dict[str, list[str]]] = {} -target_tags = [build_tag, "latest"] if branch == "main" else [build_tag] -target_stages = ["dev", "prod"] if branch.startswith("release") else ["dev"] +target_tags = [target_tag, "latest"] if branch == 
"main" else [target_tag] +target_stages = ( + ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"] +) for component_name, component_images in components.items(): for stage in target_stages: - outputs[f"{component_name}-{stage}"] = dict( - [ - ( - f"docker.io/neondatabase/{component_image}:{build_tag}", - [ - f"{combo[0]}/{component_image}:{combo[1]}" - for combo in itertools.product(registries[stage], target_tags) - ], - ) - for component_image in component_images + outputs[f"{component_name}-{stage}"] = { + f"docker.io/neondatabase/{component_image}:{source_tag}": [ + f"{registry}/{component_image}:{tag}" + for registry, tag in itertools.product(registries[stage], target_tags) + if not (registry == "docker.io/neondatabase" and tag == source_tag) ] - ) + for component_image in component_images + } -with open(os.environ["GITHUB_OUTPUT"], "a") as f: +with open(os.getenv("GITHUB_OUTPUT", "/dev/null"), "a") as f: for key, value in outputs.items(): f.write(f"{key}={json.dumps(value)}\n") + print(f"Image map for {key}:\n{json.dumps(value, indent=2)}\n\n", file=sys.stderr) diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh new file mode 100755 index 0000000000..8e081000f9 --- /dev/null +++ b/.github/scripts/lint-release-pr.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash + +set -euo pipefail + +DOCS_URL="https://docs.neon.build/overview/repositories/neon.html" + +message() { + if [[ -n "${GITHUB_PR_NUMBER:-}" ]]; then + gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --edit-last --body "$1" \ + || gh pr comment --repo "${GITHUB_REPOSITORY}" "${GITHUB_PR_NUMBER}" --body "$1" + fi + echo "$1" +} + +report_error() { + message "❌ $1 + For more details, see the documentation: ${DOCS_URL}" + + exit 1 +} + +case "$RELEASE_BRANCH" in + "release") COMPONENT="Storage" ;; + "release-proxy") COMPONENT="Proxy" ;; + "release-compute") COMPONENT="Compute" ;; + *) + report_error "Unknown release branch: 
${RELEASE_BRANCH}" + ;; +esac + + +# Identify main and release branches +MAIN_BRANCH="origin/main" +REMOTE_RELEASE_BRANCH="origin/${RELEASE_BRANCH}" + +# Find merge base +MERGE_BASE=$(git merge-base "${MAIN_BRANCH}" "${REMOTE_RELEASE_BRANCH}") +echo "Merge base of ${MAIN_BRANCH} and ${RELEASE_BRANCH}: ${MERGE_BASE}" + +# Get the HEAD commit (last commit in PR, expected to be the merge commit) +LAST_COMMIT=$(git rev-parse HEAD) + +MERGE_COMMIT_MESSAGE=$(git log -1 --format=%s "${LAST_COMMIT}") +EXPECTED_MESSAGE_REGEX="^$COMPONENT release [0-9]{4}-[0-9]{2}-[0-9]{2}$" + +if ! [[ "${MERGE_COMMIT_MESSAGE}" =~ ${EXPECTED_MESSAGE_REGEX} ]]; then + report_error "Merge commit message does not match expected pattern: '<component> release YYYY-MM-DD' + Expected component: ${COMPONENT} + Found: '${MERGE_COMMIT_MESSAGE}'" +fi +echo "✅ Merge commit message is correctly formatted: '${MERGE_COMMIT_MESSAGE}'" + +LAST_COMMIT_PARENTS=$(git cat-file -p "${LAST_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]') + +if [[ "$(echo "${LAST_COMMIT_PARENTS}" | jq 'length')" -ne 2 ]]; then + report_error "Last commit must be a merge commit with exactly two parents" +fi + +EXPECTED_RELEASE_HEAD=$(git rev-parse "${REMOTE_RELEASE_BRANCH}") +if echo "${LAST_COMMIT_PARENTS}" | jq -e --arg rel "${EXPECTED_RELEASE_HEAD}" 'index($rel) != null' > /dev/null; then + LINEAR_HEAD=$(echo "${LAST_COMMIT_PARENTS}" | jq -r '[.[] | select(. 
!= $rel)][0]' --arg rel "${EXPECTED_RELEASE_HEAD}") +else + report_error "Last commit must merge the release branch (${RELEASE_BRANCH})" +fi +echo "✅ Last commit correctly merges the previous commit and the release branch" +echo "Top commit of linear history: ${LINEAR_HEAD}" + +MERGE_COMMIT_TREE=$(git rev-parse "${LAST_COMMIT}^{tree}") +LINEAR_HEAD_TREE=$(git rev-parse "${LINEAR_HEAD}^{tree}") + +if [[ "${MERGE_COMMIT_TREE}" != "${LINEAR_HEAD_TREE}" ]]; then + report_error "Tree of merge commit (${MERGE_COMMIT_TREE}) does not match tree of linear history head (${LINEAR_HEAD_TREE}) + This indicates that the merge of ${RELEASE_BRANCH} into this branch was not performed using the merge strategy 'ours'" +fi +echo "✅ Merge commit tree matches the linear history head" + +EXPECTED_PREVIOUS_COMMIT="${LINEAR_HEAD}" + +# Now traverse down the history, ensuring each commit has exactly one parent +CURRENT_COMMIT="${EXPECTED_PREVIOUS_COMMIT}" +while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXPECTED_RELEASE_HEAD}" ]]; do + CURRENT_COMMIT_PARENTS=$(git cat-file -p "${CURRENT_COMMIT}" | jq -sR '[capture("parent (?<parent>[0-9a-f]{40})"; "g") | .parent]') + + if [[ "$(echo "${CURRENT_COMMIT_PARENTS}" | jq 'length')" -ne 1 ]]; then + report_error "Commit ${CURRENT_COMMIT} must have exactly one parent" + fi + + NEXT_COMMIT=$(echo "${CURRENT_COMMIT_PARENTS}" | jq -r '.[0]') + + if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then + echo "✅ Reached merge base (${MERGE_BASE})" + PR_BASE="${MERGE_BASE}" + elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then + echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})" + PR_BASE="${EXPECTED_RELEASE_HEAD}" + elif [[ -z "${NEXT_COMMIT}" ]]; then + report_error "Unexpected end of commit history before reaching merge base" + fi + + # Move to the next commit in the chain + CURRENT_COMMIT="${NEXT_COMMIT}" +done + +echo "✅ All commits are properly ordered and linear" +echo "✅ Release PR structure is valid" + +echo + 
+message "Commits that are part of this release: +$(git log --oneline "${PR_BASE}..${LINEAR_HEAD}")" diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index 3c130c8229..82acbc0f84 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -7,8 +7,8 @@ on: description: 'Component name' required: true type: string - release-branch: - description: 'Release branch' + source-branch: + description: 'Source branch' required: true type: string secrets: @@ -30,17 +30,24 @@ jobs: steps: - uses: actions/checkout@v4 with: - ref: main + ref: ${{ inputs.source-branch }} - name: Set variables id: vars env: COMPONENT_NAME: ${{ inputs.component-name }} - RELEASE_BRANCH: ${{ inputs.release-branch }} + RELEASE_BRANCH: >- + ${{ + false + || inputs.component-name == 'Storage' && 'release' + || inputs.component-name == 'Proxy' && 'release-proxy' + || inputs.component-name == 'Compute' && 'release-compute' + }} run: | today=$(date +'%Y-%m-%d') echo "title=${COMPONENT_NAME} release ${today}" | tee -a ${GITHUB_OUTPUT} echo "rc-branch=rc/${RELEASE_BRANCH}/${today}" | tee -a ${GITHUB_OUTPUT} + echo "release-branch=${RELEASE_BRANCH}" | tee -a ${GITHUB_OUTPUT} - name: Configure git run: | @@ -49,31 +56,35 @@ jobs: - name: Create RC branch env: + RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} TITLE: ${{ steps.vars.outputs.title }} run: | - git checkout -b "${RC_BRANCH}" + git switch -c "${RC_BRANCH}" - # create an empty commit to distinguish workflow runs - # from other possible releases from the same commit - git commit --allow-empty -m "${TITLE}" + # Manually create a merge commit on the current branch, keeping the + # tree and setting the parents to the current HEAD and the HEAD of the + # release branch. This commit is what we'll fast-forward the release + # branch to when merging the release branch. 
+ # For details on why, look at + # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs + current_tree=$(git rev-parse 'HEAD^{tree}') + release_head=$(git rev-parse "${RELEASE_BRANCH}") + current_head=$(git rev-parse HEAD) + merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") + + # Fast-forward the current branch to the newly created merge_commit + git merge --ff-only ${merge_commit} git push origin "${RC_BRANCH}" - - name: Create a PR into ${{ inputs.release-branch }} + - name: Create a PR into ${{ steps.vars.outputs.release-branch }} env: GH_TOKEN: ${{ secrets.ci-access-token }} RC_BRANCH: ${{ steps.vars.outputs.rc-branch }} - RELEASE_BRANCH: ${{ inputs.release-branch }} + RELEASE_BRANCH: ${{ steps.vars.outputs.release-branch }} TITLE: ${{ steps.vars.outputs.title }} run: | - cat << EOF > body.md - ## ${TITLE} - - **Please merge this Pull Request using 'Create a merge commit' button** - EOF - gh pr create --title "${TITLE}" \ - --body-file "body.md" \ --head "${RC_BRANCH}" \ --base "${RELEASE_BRANCH}" diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index cae7fae6a4..c9e7b66efa 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -21,6 +21,9 @@ on: run-kind: description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`" value: ${{ jobs.tags.outputs.run-kind }} + release-pr-run-id: + description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found." 
+ value: ${{ jobs.tags.outputs.release-pr-run-id }} permissions: {} @@ -37,6 +40,7 @@ jobs: proxy: ${{ steps.previous-releases.outputs.proxy }} storage: ${{ steps.previous-releases.outputs.storage }} run-kind: ${{ steps.run-kind.outputs.run-kind }} + release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }} permissions: contents: read steps: @@ -113,3 +117,13 @@ jobs: "/repos/${GITHUB_REPOSITORY}/releases" \ | jq -f .github/scripts/previous-releases.jq -r \ | tee -a "${GITHUB_OUTPUT}" + + - name: Get the release PR run ID + id: release-pr-run-id + if: ${{ contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" 
| halt_error(1))') + echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1c0971a49d..e1ad972a61 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -476,7 +476,7 @@ jobs: ( !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') - || contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) + || needs.meta.outputs.run-kind == 'push-main' ) && !failure() && !cancelled() }} needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] @@ -487,7 +487,7 @@ jobs: neon-image-arch: needs: [ check-permissions, build-build-tools-image, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} strategy: matrix: arch: [ x64, arm64 ] @@ -537,7 +537,7 @@ jobs: neon-image: needs: [ neon-image-arch, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 permissions: id-token: write # aws-actions/configure-aws-credentials @@ -559,7 +559,7 @@ jobs: compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -651,7 
+651,7 @@ jobs: compute-node-image: needs: [ compute-node-image-arch, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -694,7 +694,7 @@ jobs: vm-compute-node-image-arch: needs: [ check-permissions, meta, compute-node-image ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} strategy: fail-fast: false @@ -747,7 +747,7 @@ jobs: vm-compute-node-image: needs: [ vm-compute-node-image-arch, meta ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ubuntu-22.04 strategy: matrix: @@ -773,7 +773,12 @@ jobs: test-images: needs: [ check-permissions, meta, neon-image, compute-node-image ] # Depends on jobs that can get skipped - if: "!failure() && !cancelled()" + if: >- + ${{ + !failure() + && !cancelled() + && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + }} strategy: fail-fast: false matrix: @@ -800,7 +805,7 @@ jobs: # Ensure that we don't have bad versions. 
- name: Verify image versions shell: bash # ensure no set -e for better error messages - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") @@ -821,19 +826,19 @@ jobs: env: TAG: >- ${{ - contains(fromJSON('["compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release || needs.meta.outputs.build-tag }} COMPUTE_TAG: >- ${{ - contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.previous-compute-release || needs.meta.outputs.build-tag }} TEST_EXTENSIONS_TAG: >- ${{ - contains(fromJSON('["storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) + contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) && 'latest' || needs.meta.outputs.build-tag }} @@ -885,7 +890,13 @@ jobs: id: generate run: python3 .github/scripts/generate_image_maps.py env: - BUILD_TAG: "${{ needs.meta.outputs.build-tag }}" + SOURCE_TAG: >- + ${{ + contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) + && needs.meta.outputs.release-pr-run-id + || needs.meta.outputs.build-tag + }} + TARGET_TAG: ${{ needs.meta.outputs.build-tag }} BRANCH: "${{ github.ref_name }}" DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}" PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}" @@ -895,7 +906,7 @@ jobs: push-neon-image-dev: needs: [ meta, generate-image-maps, neon-image 
] - if: ${{ contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -913,7 +924,7 @@ jobs: push-compute-image-dev: needs: [ meta, generate-image-maps, vm-compute-node-image ] - if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + if: ${{ !failure() && !cancelled() && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} uses: ./.github/workflows/_push-to-container-registry.yml permissions: id-token: write # Required for aws/azure login @@ -1235,7 +1246,7 @@ jobs: # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: - needs: [ deploy ] + needs: [ meta, deploy ] permissions: id-token: write # aws-actions/configure-aws-credentials statuses: write @@ -1245,37 +1256,6 @@ jobs: runs-on: ubuntu-22.04 steps: - - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR - id: fetch-last-release-pr-info - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - branch_name_and_pr_number=$(gh pr list \ - --repo "${GITHUB_REPOSITORY}" \ - --base release \ - --state merged \ - --limit 10 \ - --json mergeCommit,headRefName,number \ - --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }") - branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name') - pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number') - - run_id=$(gh run list \ - --repo "${GITHUB_REPOSITORY}" \ - 
--workflow build_and_test.yml \ - --branch "${branch_name}" \ - --json databaseId \ - --limit 1 \ - --jq '.[].databaseId') - - last_commit_sha=$(gh pr view "${pr_number}" \ - --repo "${GITHUB_REPOSITORY}" \ - --json commits \ - --jq '.commits[-1].oid') - - echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} - echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} - - uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -1286,8 +1266,8 @@ jobs: env: BUCKET: neon-github-public-dev AWS_REGION: eu-central-1 - COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }} - RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }} + COMMIT_SHA: ${{ github.sha }} + RUN_ID: ${{ needs.meta.outputs.release-pr-run-id }} run: | old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" new_prefix="artifacts/latest" @@ -1376,5 +1356,5 @@ jobs: || needs.files-changed.result == 'skipped' || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) - || needs.test-images.result == 'skipped' + || (needs.test-images.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.trigger-custom-extensions-build-and-wait.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/fast-forward.yml b/.github/workflows/fast-forward.yml new file mode 100644 index 0000000000..bc63ff120d --- /dev/null +++ b/.github/workflows/fast-forward.yml @@ -0,0 +1,36 @@ +name: Fast forward merge +on: + pull_request: + types: [labeled] + branches: + - release + - release-proxy + - 
release-compute + +jobs: + fast-forward: + if: ${{ github.event.label.name == 'fast-forward' }} + runs-on: ubuntu-22.04 + + steps: + - name: Remove fast-forward label to PR + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: | + gh pr edit ${{ github.event.pull_request.number }} --repo "${GITHUB_REPOSITORY}" --remove-label "fast-forward" + + - name: Fast forwarding + uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979 + # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus + if: ${{ github.event.pull_request.mergeable_state == 'clean' }} + with: + merge: true + comment: on-error + github_token: ${{ secrets.CI_ACCESS_TOKEN }} + + - name: Comment if mergeable_state is not clean + if: ${{ github.event.pull_request.mergeable_state != 'clean' }} + run: | + gh pr comment ${{ github.event.pull_request.number }} \ + --repo "${GITHUB_REPOSITORY}" \ + --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`." 
diff --git a/.github/workflows/lint-release-pr.yml b/.github/workflows/lint-release-pr.yml new file mode 100644 index 0000000000..f12ddfe377 --- /dev/null +++ b/.github/workflows/lint-release-pr.yml @@ -0,0 +1,23 @@ +name: Lint Release PR + +on: + pull_request: + branches: + - release + - release-proxy + - release-compute + +jobs: + lint-release-pr: + runs-on: ubuntu-22.04 + steps: + - name: Checkout PR branch + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Fetch full history for git operations + + - name: Run lint script + env: + RELEASE_BRANCH: ${{ github.base_ref }} + run: | + ./.github/scripts/lint-release-pr.sh diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index c47b3fe0de..1e81550314 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -8,8 +8,6 @@ on: - .github/workflows/build-build-tools-image.yml - .github/workflows/pre-merge-checks.yml merge_group: - branches: - - main defaults: run: @@ -19,11 +17,13 @@ defaults: permissions: {} jobs: - get-changed-files: + meta: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} rust-changed: ${{ steps.rust-src.outputs.any_changed }} + branch: ${{ steps.group-metadata.outputs.branch }} + pr-number: ${{ steps.group-metadata.outputs.pr-number }} steps: - uses: actions/checkout@v4 @@ -58,12 +58,20 @@ jobs: echo "${PYTHON_CHANGED_FILES}" echo "${RUST_CHANGED_FILES}" + - name: Merge group metadata + if: ${{ github.event_name == 'merge_group' }} + id: group-metadata + env: + MERGE_QUEUE_REF: ${{ github.event.merge_group.head_ref }} + run: | + echo $MERGE_QUEUE_REF | jq -Rr 'capture("refs/heads/gh-readonly-queue/(?<branch>.*)/pr-(?<pr_number>[0-9]+)-[0-9a-f]{40}") | ["branch=" + .branch, "pr-number=" + .pr_number] | .[]' | tee -a "${GITHUB_OUTPUT}" + build-build-tools-image: if: | false - || needs.get-changed-files.outputs.python-changed == 'true' - || needs.get-changed-files.outputs.rust-changed == 'true' - 
needs: [ get-changed-files ] + || needs.meta.outputs.python-changed == 'true' + || needs.meta.outputs.rust-changed == 'true' + needs: [ meta ] uses: ./.github/workflows/build-build-tools-image.yml with: # Build only one combination to save time @@ -72,8 +80,8 @@ jobs: secrets: inherit check-codestyle-python: - if: needs.get-changed-files.outputs.python-changed == 'true' - needs: [ get-changed-files, build-build-tools-image ] + if: needs.meta.outputs.python-changed == 'true' + needs: [ meta, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-python.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -81,8 +89,8 @@ jobs: secrets: inherit check-codestyle-rust: - if: needs.get-changed-files.outputs.rust-changed == 'true' - needs: [ get-changed-files, build-build-tools-image ] + if: needs.meta.outputs.rust-changed == 'true' + needs: [ meta, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-rust.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -101,7 +109,7 @@ jobs: statuses: write # for `github.repos.createCommitStatus(...)` contents: write needs: - - get-changed-files + - meta - check-codestyle-python - check-codestyle-rust runs-on: ubuntu-22.04 @@ -129,7 +137,20 @@ jobs: run: exit 1 if: | false - || (needs.check-codestyle-python.result == 'skipped' && needs.get-changed-files.outputs.python-changed == 'true') - || (needs.check-codestyle-rust.result == 'skipped' && needs.get-changed-files.outputs.rust-changed == 'true') + || (github.event_name == 'merge_group' && needs.meta.outputs.branch != 'main') + || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.python-changed == 'true') + || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.rust-changed == 'true') || contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + + - name: Add fast-forward label to PR to trigger fast-forward 
merge + if: >- + ${{ + always() + && github.event_name == 'merge_group' + && contains(fromJson('["release", "release-proxy", "release-compute"]'), github.base_ref) + }} + env: + GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} + run: >- + gh pr edit ${{ needs.meta.outputs.pr-number }} --repo "${GITHUB_REPOSITORY}" --add-label "fast-forward" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 919846ce44..a88ddecd0a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -38,7 +38,7 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Storage' - release-branch: 'release' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -51,7 +51,7 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Proxy' - release-branch: 'release-proxy' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} @@ -64,6 +64,6 @@ jobs: uses: ./.github/workflows/_create-release-pr.yml with: component-name: 'Compute' - release-branch: 'release-compute' + source-branch: ${{ github.ref_name }} secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} From c3b3b507f76b841f061104ede0f14c837e879bd7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 12 Mar 2025 18:27:23 -0400 Subject: [PATCH 19/71] feat(pageserver): support detaching behavior v2 (#11158) ## Problem close https://github.com/neondatabase/neon/issues/10310 ## Summary of changes This patch adds a new behavior for the detach_ancestor API: detach with multi-level ancestor and no reparenting. Though we can potentially support multi-level + do reparenting / single-level + no-reparenting in the future, as it's not required for the recovery/snapshot epic, I'd prefer keeping things simple now that we only handle the old one and the new one instead of supporting the full feature matrix. 
I only added a test case of successful detaching instead of testing failures. I'd like to make this into staging and add more tests in the future. --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 6 +- pageserver/src/tenant/metadata.rs | 5 +- pageserver/src/tenant/mgr.rs | 10 +- pageserver/src/tenant/timeline.rs | 17 ++- .../src/tenant/timeline/detach_ancestor.rs | 138 ++++++++++++++++-- test_runner/fixtures/pageserver/http.py | 5 +- .../regress/test_timeline_detach_ancestor.py | 137 ++++++++++++++++- 7 files changed, 294 insertions(+), 24 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ba5fb521ff..44159aee0a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -72,6 +72,7 @@ use crate::tenant::remote_timeline_client::{ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; +use crate::tenant::timeline::detach_ancestor::DetachBehavior; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, @@ -2505,6 +2506,8 @@ async fn timeline_detach_ancestor_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let behavior: Option = parse_query_param(&request, "detach_behavior")?; + let behavior = behavior.unwrap_or_default(); let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); @@ -2554,7 +2557,7 @@ async fn timeline_detach_ancestor_handler( let ctx = &ctx.with_scope_timeline(&timeline); let progress = timeline - .prepare_to_detach_from_ancestor(&tenant, 
options, ctx) + .prepare_to_detach_from_ancestor(&tenant, options, behavior, ctx) .await?; // uncomment to allow early as possible Tenant::drop @@ -2569,6 +2572,7 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + behavior, attempt, ctx, ) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index 77f9a3579d..dceae89d1c 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -300,9 +300,8 @@ impl TimelineMetadata { /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { - if let Some(ancestor) = self.body.ancestor_timeline { - assert_eq!(ancestor, branchpoint.0); - } + // Detaching from ancestor now doesn't always detach directly to the direct ancestor, but we + // ensure the LSN is the same. So we don't check the timeline ID. if self.body.ancestor_lsn != Lsn(0) { assert_eq!(self.body.ancestor_lsn, branchpoint.1); } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 003f84e640..092bfdf6c1 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1914,6 +1914,7 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + behavior: detach_ancestor::DetachBehavior, mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, detach_ancestor::Error> { @@ -1957,7 +1958,14 @@ impl TenantManager { .map_err(Error::NotFound)?; let resp = timeline - .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) + .detach_from_ancestor_and_reparent( + &tenant, + prepared, + attempt.ancestor_timeline_id, + attempt.ancestor_lsn, + behavior, + ctx, + ) .await?; let mut slot_guard = slot_guard; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e01c3dbd4d..61542409f7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -5388,9 +5388,10 
@@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, + behavior: detach_ancestor::DetachBehavior, ctx: &RequestContext, ) -> Result { - detach_ancestor::prepare(self, tenant, options, ctx).await + detach_ancestor::prepare(self, tenant, behavior, options, ctx).await } /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and @@ -5406,9 +5407,21 @@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, + behavior: detach_ancestor::DetachBehavior, ctx: &RequestContext, ) -> Result { - detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + detach_ancestor::detach_and_reparent( + self, + tenant, + prepared, + ancestor_timeline_id, + ancestor_lsn, + behavior, + ctx, + ) + .await } /// Final step which unblocks the GC. diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index b08003d04a..c3e4bedc50 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -32,6 +32,9 @@ pub(crate) enum Error { #[error("too many ancestors")] TooManyAncestors, + #[error("ancestor is not empty")] + AncestorNotEmpty, + #[error("shutting down, please retry later")] ShuttingDown, @@ -89,7 +92,9 @@ impl From for ApiError { fn from(value: Error) -> Self { match value { Error::NoAncestor => ApiError::Conflict(value.to_string()), - Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")), + Error::TooManyAncestors | Error::AncestorNotEmpty => { + ApiError::BadRequest(anyhow::anyhow!("{value}")) + } Error::ShuttingDown => ApiError::ShuttingDown, Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { @@ -127,13 +132,37 @@ pub(crate) struct PreparedTimelineDetach { 
layers: Vec, } -/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. +// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments. #[derive(Debug)] pub(crate) struct Options { pub(crate) rewrite_concurrency: std::num::NonZeroUsize, pub(crate) copy_concurrency: std::num::NonZeroUsize, } +/// Controls the detach ancestor behavior. +/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point. +/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all. +#[derive(Debug, Clone, Copy, Default)] +pub enum DetachBehavior { + #[default] + NoAncestorAndReparent, + MultiLevelAndNoReparent, +} + +impl std::str::FromStr for DetachBehavior { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + match s { + "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent), + "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent), + "v1" => Ok(DetachBehavior::NoAncestorAndReparent), + "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent), + _ => Err("cannot parse detach behavior"), + } + } +} + impl Default for Options { fn default() -> Self { Self { @@ -147,7 +176,8 @@ impl Default for Options { #[derive(Debug)] pub(crate) struct Attempt { pub(crate) timeline_id: TimelineId, - + pub(crate) ancestor_timeline_id: TimelineId, + pub(crate) ancestor_lsn: Lsn, _guard: completion::Completion, gate_entered: Option, } @@ -167,25 +197,30 @@ impl Attempt { pub(super) async fn prepare( detached: &Arc, tenant: &Tenant, + behavior: DetachBehavior, options: Options, ctx: &RequestContext, ) -> Result { use Error::*; - let Some((ancestor, ancestor_lsn)) = detached + let Some((mut ancestor, mut ancestor_lsn)) = detached .ancestor_timeline .as_ref() .map(|tl| 
(tl.clone(), detached.ancestor_lsn)) else { + let ancestor_id; + let ancestor_lsn; let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if latest.lineage.detached_previous_ancestor().is_none() { + let Some((id, lsn)) = latest.lineage.detached_previous_ancestor() else { return Err(NoAncestor); }; + ancestor_id = id; + ancestor_lsn = lsn; latest .gc_blocking @@ -196,7 +231,8 @@ pub(super) async fn prepare( if still_in_progress { // gc is still blocked, we can still reparent and complete. // we are safe to reparent remaining, because they were locked in in the beginning. - let attempt = continue_with_blocked_gc(detached, tenant).await?; + let attempt = + continue_with_blocked_gc(detached, tenant, ancestor_id, ancestor_lsn).await?; // because the ancestor of detached is already set to none, we have published all // of the layers, so we are still "prepared." @@ -224,13 +260,34 @@ pub(super) async fn prepare( check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; - if ancestor.ancestor_timeline.is_some() { + if let DetachBehavior::MultiLevelAndNoReparent = behavior { + // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline. + while let Some(ancestor_of_ancestor) = ancestor.ancestor_timeline.clone() { + if ancestor_lsn != ancestor.ancestor_lsn { + // non-technical requirement; we could flatten still if ancestor LSN does not match but that needs + // us to copy and cut more layers. 
+ return Err(AncestorNotEmpty); + } + // Use the ancestor of the ancestor as the new ancestor (only when the ancestor LSNs are the same) + ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable + ancestor = ancestor_of_ancestor; + // TODO: do we still need to check if we don't want to reparent? + check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + } + } else if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose // not to, at least initially return Err(TooManyAncestors); } - let attempt = start_new_attempt(detached, tenant).await?; + tracing::info!( + "attempt to detach the timeline from the ancestor: {}@{}, behavior={:?}", + ancestor.timeline_id, + ancestor_lsn, + behavior + ); + + let attempt = start_new_attempt(detached, tenant, ancestor.timeline_id, ancestor_lsn).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable"); @@ -450,8 +507,13 @@ pub(super) async fn prepare( Ok(Progress::Prepared(attempt, prepared)) } -async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { - let attempt = obtain_exclusive_attempt(detached, tenant)?; +async fn start_new_attempt( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)?; // insert the block in the index_part.json, if not already there. let _dont_care = tenant @@ -466,13 +528,23 @@ async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result Result { +async fn continue_with_blocked_gc( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { // FIXME: it would be nice to confirm that there is an in-memory version, since we've just // verified there is a persistent one? 
- obtain_exclusive_attempt(detached, tenant) + obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn) } -fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { +fn obtain_exclusive_attempt( + detached: &Timeline, + tenant: &Tenant, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, +) -> Result { use Error::{OtherTimelineDetachOngoing, ShuttingDown}; // ensure we are the only active attempt for this tenant @@ -493,6 +565,8 @@ fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result, tenant: &Tenant, prepared: PreparedTimelineDetach, + ancestor_timeline_id: TimelineId, + ancestor_lsn: Lsn, + behavior: DetachBehavior, _ctx: &RequestContext, ) -> Result { let PreparedTimelineDetach { layers } = prepared; @@ -822,7 +899,30 @@ pub(super) async fn detach_and_reparent( "cannot (detach? reparent)? complete if the operation is not still ongoing" ); - let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + let ancestor_to_detach = match detached.ancestor_timeline.as_ref() { + Some(mut ancestor) => { + while ancestor.timeline_id != ancestor_timeline_id { + match ancestor.ancestor_timeline.as_ref() { + Some(found) => { + if ancestor_lsn != ancestor.ancestor_lsn { + return Err(Error::DetachReparent(anyhow::anyhow!( + "cannot find the ancestor timeline to detach from: wrong ancestor lsn" + ))); + } + ancestor = found; + } + None => { + return Err(Error::DetachReparent(anyhow::anyhow!( + "cannot find the ancestor timeline to detach from" + ))); + } + } + } + Some(ancestor) + } + None => None, + }; + let ancestor = match (ancestor_to_detach, recorded_branchpoint) { (Some(ancestor), None) => { assert!( !layers.is_empty(), @@ -895,6 +995,11 @@ pub(super) async fn detach_and_reparent( Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), }; + if let DetachBehavior::MultiLevelAndNoReparent = behavior { + // Do not reparent if the user requests to behave so. 
+ return Ok(DetachingAndReparenting::Reparented(HashSet::new())); + } + let mut tasks = tokio::task::JoinSet::new(); // Returns a single permit semaphore which will be used to make one reparenting succeed, @@ -1032,6 +1137,11 @@ pub(super) async fn complete( } /// Query against a locked `Tenant::timelines`. +/// +/// A timeline is reparentable if: +/// +/// - It is not the timeline being detached. +/// - It has the same ancestor as the timeline being detached. Note that the ancestor might not be the direct ancestor. fn reparentable_timelines<'a, I>( timelines: I, detached: &'a Arc, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0efe0b9575..61aab2213d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1070,11 +1070,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, batch_size: int | None = None, + behavior_v2: bool = False, **kwargs, ) -> set[TimelineId]: - params = {} + params: dict[str, Any] = {} if batch_size is not None: params["batch_size"] = batch_size + if behavior_v2: + params["detach_behavior"] = "v2" res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 612a767480..79537ba83a 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -319,8 +319,9 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # this does not contain Z in the end, so fromisoformat accepts it # it is to be in line with the deletion timestamp.. well, almost. 
when = original_ancestor[2][:26] - when_ts = datetime.datetime.fromisoformat(when) - assert when_ts < datetime.datetime.now() + when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) + now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + assert when_ts < now assert len(lineage.get("reparenting_history", [])) == 0 elif expected_ancestor == timeline_id: assert len(lineage.get("original_ancestor", [])) == 0 @@ -342,6 +343,138 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) +def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): + """ + Test the v2 behavior of ancestor detach. + + old main -------|---------X---------> + | | | + | | +-> after + | +--X empty snapshot branch + | | + | +-> branch-to-detach + | + +-> earlier + + Ends up as: + + old main -------|---------X---------> + | | | + | | +-> after + | +--X empty snapshot branch + | + +-> earlier + + + new main -------|---------|----> branch-to-detach + """ + + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + client = env.pageserver.http_client() + + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep: + ep.safe_psql("CREATE TABLE foo (i BIGINT);") + ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;") + + branchpoint_pipe = wait_for_last_flush_lsn( + env, ep, env.initial_tenant, env.initial_timeline + ) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);") + + branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + client.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);") + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + earlier = env.create_branch( + "earlier", 
ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe + ) + + snapshot_branchpoint = env.create_branch( + "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x + ) + + branch_to_detach = env.create_branch( + "branch_to_detach", + ancestor_branch_name="snapshot_branchpoint", + ancestor_start_lsn=branchpoint_x, + ) + + after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) + + all_reparented = client.detach_ancestor(env.initial_tenant, branch_to_detach, behavior_v2=True) + assert set(all_reparented) == set() + + env.pageserver.quiesce_tenants() + + # checking the ancestor after is much faster than waiting for the endpoint not start + expected_result = [ + ("main", env.initial_timeline, None, 16384, 1), + ("after", after, env.initial_timeline, 16384, 1), + ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1), + ("branch_to_detach", branch_to_detach, None, 8192, 1), + ("earlier", earlier, env.initial_timeline, 0, 1), + ] + + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + + for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result: + details = client.timeline_detail(env.initial_tenant, queried_timeline) + ancestor_timeline_id = details["ancestor_timeline_id"] + if expected_ancestor is None: + assert ancestor_timeline_id is None + else: + assert ( + TimelineId(ancestor_timeline_id) == expected_ancestor + ), f"when checking branch {branch_name}, mapping={expected_result}" + + index_part = env.pageserver_remote_storage.index_content( + env.initial_tenant, queried_timeline + ) + lineage = index_part["lineage"] + assert lineage is not None + + assert lineage.get("reparenting_history_overflown", "false") == "false" + + if queried_timeline == branch_to_detach: + original_ancestor = lineage["original_ancestor"] + assert original_ancestor is not None + assert original_ancestor[0] == str(env.initial_timeline) + assert original_ancestor[1] 
== str(branchpoint_x) + + # this does not contain Z in the end, so fromisoformat accepts it + # it is to be in line with the deletion timestamp.. well, almost. + when = original_ancestor[2][:26] + when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC) + now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC) + assert when_ts < now + assert len(lineage.get("reparenting_history", [])) == 0 + elif expected_ancestor == branch_to_detach: + assert len(lineage.get("original_ancestor", [])) == 0 + assert lineage["reparenting_history"] == [str(env.initial_timeline)] + else: + assert len(lineage.get("original_ancestor", [])) == 0 + assert len(lineage.get("reparenting_history", [])) == 0 + + for name, _, _, rows, starts in expected_result: + with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep: + assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows + assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1 + + # delete the new timeline to confirm it doesn't carry over the anything from the old timeline + client.timeline_delete(env.initial_tenant, branch_to_detach) + wait_timeline_detail_404(client, env.initial_tenant, branch_to_detach) + + # delete the after timeline + client.timeline_delete(env.initial_tenant, after) + wait_timeline_detail_404(client, env.initial_tenant, after) + + def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): """ Makes sure that the timeline is able to receive writes through-out the detach process. From 48be4df3f3a12c4d6ba049718e35d8a472a8de05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Wed, 12 Mar 2025 23:32:38 +0100 Subject: [PATCH 20/71] fix(ci): fetch all refs in release PR creation (#11201) ## Problem #11061 changed release PR creation, and I missed that we need to explicitly fetch the whole history so that the relevant git refs and objects are available. 
## Summary of changes - Fetch all git refs including history by setting fetch-depth to 0 - Reference release branch as a remote branch, because we haven't checked it out locally --- .github/workflows/_create-release-pr.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index 82acbc0f84..62266d876e 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -31,6 +31,7 @@ jobs: - uses: actions/checkout@v4 with: ref: ${{ inputs.source-branch }} + fetch-depth: 0 - name: Set variables id: vars @@ -69,7 +70,7 @@ jobs: # For details on why, look at # https://docs.neon.build/overview/repositories/neon.html#background-on-commit-history-of-release-prs current_tree=$(git rev-parse 'HEAD^{tree}') - release_head=$(git rev-parse "${RELEASE_BRANCH}") + release_head=$(git rev-parse "origin/${RELEASE_BRANCH}") current_head=$(git rev-parse HEAD) merge_commit=$(git commit-tree -p "${current_head}" -p "${release_head}" -m "${TITLE}" "${current_tree}") From 507353404c19f14c867db7341c94b06a954d6845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Mar 2025 00:54:43 +0100 Subject: [PATCH 21/71] fix(ci): pass empty body when creating release PRs (#11203) ## Problem #11061 changed release pr creation, and I missed that creating PRs using `gh` in non-interactive environments *requires* `--body` instead of defaulting to an empty body. ## Summary of changes Explicitly set an empty body when creating release PRs.
--- .github/workflows/_create-release-pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index 62266d876e..9b1d1aa454 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -87,5 +87,6 @@ jobs: TITLE: ${{ steps.vars.outputs.title }} run: | gh pr create --title "${TITLE}" \ + --body "" \ --head "${RC_BRANCH}" \ --base "${RELEASE_BRANCH}" From afc9524bc7b6c6edbe22d98f780260ce90ef0b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Mar 2025 09:17:33 +0100 Subject: [PATCH 22/71] fix(ci): run lint-release-pr on head-ref (#11206) ## Problem #11061 changed release pr creation, and I missed that the workflow will checkout a would-be-merge of the rc branch and the release branch instead of the head ref, unless explicitly instructed otherwise. ## Summary of changes Check out head ref for linting the release PRs. --- .github/workflows/lint-release-pr.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/lint-release-pr.yml b/.github/workflows/lint-release-pr.yml index f12ddfe377..b7d010f66d 100644 --- a/.github/workflows/lint-release-pr.yml +++ b/.github/workflows/lint-release-pr.yml @@ -15,6 +15,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch full history for git operations + ref: ${{ github.event.pull_request.head.ref }} - name: Run lint script env: From 803e6f908a31343cd72f33b3019fbcdd986ad1d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Mar 2025 10:42:38 +0100 Subject: [PATCH 23/71] fix(ci): fix syntax of lint-release-pr (#11208) ## Problem A small adjustment in #11061 broke the lint-release-pr.sh script, and the new version was neither tested nor linted. This has been done now, the script is once again tested and passing `shellcheck`. ## Summary of changes Add missing `el` of `elif` condition chain. 
--- .github/scripts/lint-release-pr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/lint-release-pr.sh b/.github/scripts/lint-release-pr.sh index 8e081000f9..6dc5b99f0e 100755 --- a/.github/scripts/lint-release-pr.sh +++ b/.github/scripts/lint-release-pr.sh @@ -90,7 +90,7 @@ while [[ "${CURRENT_COMMIT}" != "${MERGE_BASE}" && "${CURRENT_COMMIT}" != "${EXP if [[ "${NEXT_COMMIT}" == "${MERGE_BASE}" ]]; then echo "✅ Reached merge base (${MERGE_BASE})" PR_BASE="${MERGE_BASE}" - if [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then + elif [[ "${NEXT_COMMIT}" == "${EXPECTED_RELEASE_HEAD}" ]]; then echo "✅ Reached release branch (${EXPECTED_RELEASE_HEAD})" PR_BASE="${EXPECTED_RELEASE_HEAD}" elif [[ -z "${NEXT_COMMIT}" ]]; then From efb1df4362e313baa2d5969762c69a74d14d4e98 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Mar 2025 10:17:01 +0000 Subject: [PATCH 24/71] =?UTF-8?q?fix:=20Change=20metric=5Funit=20from=20'm?= =?UTF-8?q?icroseconds'=20to=20'=CE=BCs'=20in=20test=5Fcompute=5Fctl=5Fapi?= =?UTF-8?q?.py=20(#11209)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Fix metric_unit length in test_compute_ctl_api.py ## Description This PR changes the metric_unit from "microseconds" to "μs" in test_compute_ctl_api.py to fix the issue where perf test results were not being stored in the database due to the string exceeding the 10 character limit of the metric_unit column in the perf_test_results table. ## Problem As reported in Slack, the perf test results were not being uploaded to the database because the "microseconds" string (12 characters) exceeds the 10 character limit of the metric_unit column in the perf_test_results table. ## Solution Replace "microseconds" with "μs" in all metric_unit parameters in the test_compute_ctl_api.py file. ## Testing The changes have been committed and pushed. 
The PR is ready for review. Link to Devin run: https://app.devin.ai/sessions/e29edd672bd34114b059915820e8a853 Requested by: Peter Bendel Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: peterbendel@neon.tech --- test_runner/performance/test_compute_ctl_api.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/performance/test_compute_ctl_api.py b/test_runner/performance/test_compute_ctl_api.py index 87eb1f2c35..d6d0a84e8e 100644 --- a/test_runner/performance/test_compute_ctl_api.py +++ b/test_runner/performance/test_compute_ctl_api.py @@ -41,24 +41,24 @@ def test_compute_ctl_api_latencies( zenbenchmark.record( "status_response_latency_p50_us", status_response_latency_us[len(status_response_latency_us) // 2], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) zenbenchmark.record( "metrics_response_latency_p50_us", metrics_response_latency_us[len(metrics_response_latency_us) // 2], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) zenbenchmark.record( "status_response_latency_p99_us", status_response_latency_us[len(status_response_latency_us) * 99 // 100], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) zenbenchmark.record( "metrics_response_latency_p99_us", metrics_response_latency_us[len(metrics_response_latency_us) * 99 // 100], - "microseconds", + "μs", MetricReport.LOWER_IS_BETTER, ) From 5a245a837dcee0c4e8d50ae20f167877a5fa3f08 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Mar 2025 14:28:10 +0100 Subject: [PATCH 25/71] storcon: retain stripe size when autosplitting sharded tenants (#11194) ## Problem Autosplits always request `DEFAULT_STRIPE_SIZE` for splits. However, splits do not allow changing the stripe size of already-sharded tenants, and will error out if it differs. In #11168, we are changing the stripe size, which could hit this when attempting to autosplit already sharded tenants. Touches #11168. 
## Summary of changes Pass `new_stripe_size: None` when autosplitting already sharded tenants. Otherwise, pass `DEFAULT_STRIPE_SIZE` instead of the shard identity's stripe size, since we want to use the current default rather than an old, persisted default. --- storage_controller/src/service.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 667b53b725..445b174b96 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -7894,6 +7894,9 @@ impl Service { /// At most one tenant will be split per call: the one with the largest max logical size. It /// will split 1 → 8 shards. /// + /// An unsharded tenant will get DEFAULT_STRIPE_SIZE, regardless of what its ShardIdentity says. + /// A sharded tenant will retain its stripe size, as splits do not allow changing it. + /// /// TODO: consider splitting based on total logical size rather than max logical size. /// /// TODO: consider spawning multiple splits in parallel: this is only called once every 20 @@ -7939,6 +7942,16 @@ impl Service { "Auto-splitting tenant for size threshold {split_threshold}: current size {split_candidate:?}" ); + // Retain the stripe size of sharded tenants, as splits don't allow changing it. Otherwise, + // use DEFAULT_STRIPE_SIZE for unsharded tenants -- their stripe size doesn't really matter, + // and if we change the default stripe size we want to use the new default rather than an + // old, persisted stripe size. + let new_stripe_size = match split_candidate.id.shard_count.count() { + 0 => panic!("invalid shard count 0"), + 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), + 2.. => None, + }; + let this = self.clone(); tokio::spawn( async move { @@ -7952,7 +7965,7 @@ impl Service { // because our max shard count is relatively low anyway. This policy // will be adjusted in future once we support higher shard count. 
new_shard_count: MAX_SHARDS.literal(), - new_stripe_size: Some(ShardParameters::DEFAULT_STRIPE_SIZE), + new_stripe_size, }, ) .await From 89c7e4e9171cba712dadddd5ca0203f5176b4109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Mar 2025 14:40:43 +0100 Subject: [PATCH 26/71] fix(ci): use parentheses for error handling in jq when fetching release PRs (#11217) ## Problem #11061 introduced code fetching previous releases. #11151 introduced jq error handling, which has also been applied in #11061, but parentheses were missed. ## Summary of changes Add parentheses around error handling code. --- .github/workflows/_meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index c9e7b66efa..f029385980 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -125,5 +125,5 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | - RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy)|(compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // "Faied to find Build and Test run from RC PR!" | halt_error(1)') + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy)|(compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Faied to find Build and Test run from RC PR!"
| halt_error(1))') echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT From c036fec06576d73a2a1dfc2579cee1dc33afee90 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Mar 2025 15:28:42 +0100 Subject: [PATCH 27/71] pageserver: enable `compaction_l0_first` by default (#11212) ## Problem `compaction_l0_first` has already been enabled in production for a couple of weeks. ## Summary of changes Enable `compaction_l0_first` by default. Also set `CompactFlags::NoYield` in `timeline_checkpoint_handler`, to ensure explicitly requested compaction runs to completion. This endpoint is mainly used in tests, and caused some flakiness where tests expected compaction to complete. --- libs/pageserver_api/src/config.rs | 9 ++++++--- pageserver/src/http/routes.rs | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index ce7de1e0c7..6e457823dd 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -272,10 +272,11 @@ pub struct TenantConfigToml { /// size exceeds `compaction_upper_limit * checkpoint_distance`. pub compaction_upper_limit: usize, pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, - /// If true, compact down L0 across all tenant timelines before doing regular compaction. + /// If true, compact down L0 across all tenant timelines before doing regular compaction. L0 + /// compaction must be responsive to avoid read amp during heavy ingestion. Defaults to true. pub compaction_l0_first: bool, /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only - /// has an effect if `compaction_l0_first` is `true`. + /// has an effect if `compaction_l0_first` is true. Defaults to true. 
pub compaction_l0_semaphore: bool, /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer @@ -567,7 +568,9 @@ pub mod tenant_conf_defaults { // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So // with this config, we can get a maximum peak compaction usage of 9 GB. pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20; - pub const DEFAULT_COMPACTION_L0_FIRST: bool = false; + // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid + // read amp. + pub const DEFAULT_COMPACTION_L0_FIRST: bool = true; pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true; pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 44159aee0a..70c3cc8522 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2392,6 +2392,7 @@ async fn timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + flags |= CompactFlags::NoYield; // run compaction to completion if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { flags |= CompactFlags::ForceL0Compaction; } From b2286f5bcb1491227665d4fb6f35b229b8052e81 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 13 Mar 2025 10:38:45 -0400 Subject: [PATCH 28/71] fix(pageserver): don't panic if gc-compaction find no keys (#11200) ## Problem There was a panic on staging that compaction didn't find any keys. This is possible if all layers selected for compaction does not contain any keys within the current shard. ## Summary of changes Make panic an error. In the future, we can try creating an empty image layer so that GC can clean up those layers. 
Otherwise, for now, we can only rely on shard ancestor compaction to remove these data. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline/compaction.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index e6f2104e90..300daec9bf 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3189,7 +3189,11 @@ impl Timeline { } // TODO: move the below part to the loop body - let last_key = last_key.expect("no keys produced during compaction"); + let Some(last_key) = last_key else { + return Err(CompactionError::Other(anyhow!( + "no keys produced during compaction" + ))); + }; stat.on_unique_key_visited(); let retention = self From 3dec1175728210af3b9a5c90d0d4938fafedeb6b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 13 Mar 2025 15:03:22 +0000 Subject: [PATCH 29/71] feat(compute_ctl): use TLS if configured (#10972) Closes: https://github.com/neondatabase/cloud/issues/22998 If control-plane reports that TLS should be used, load the certificates (and watch for updates), make sure postgres uses them, and detect updates. Procedure: 1. Load certificates 2. Reconfigure postgres/pgbouncer 3. Loop on a timer until certificates have loaded 4. Go to 1 Notes: 1. We only run this procedure if requested on startup by control plane. 2. We needed to compile pgbouncer with openssl enabled 3. Postgres doesn't allow tls keys to be globally accessible - must be read only to the postgres user. I couldn't convince the autoscaling team to let me put this logic into the VM settings, so instead compute_ctl will copy the keys to be read-only by postgres. 4. To mitigate a race condition, we also verify that the key matches the cert.
--- Cargo.lock | 71 +++++++++++++++- Cargo.toml | 2 +- compute/compute-node.Dockerfile | 4 +- compute_tools/Cargo.toml | 7 +- compute_tools/src/compute.rs | 95 ++++++++++++++++++--- compute_tools/src/config.rs | 20 ++++- compute_tools/src/http/server.rs | 8 +- compute_tools/src/lib.rs | 1 + compute_tools/src/pg_helpers.rs | 42 ++++++--- compute_tools/src/tls.rs | 118 ++++++++++++++++++++++++++ libs/compute_api/Cargo.toml | 1 + libs/compute_api/src/responses.rs | 8 ++ libs/compute_api/src/spec.rs | 8 +- proxy/src/binary/local_proxy.rs | 39 +++++++-- proxy/src/binary/proxy.rs | 2 + proxy/src/config.rs | 3 +- proxy/src/console_redirect_proxy.rs | 3 +- proxy/src/proxy/handshake.rs | 2 +- proxy/src/proxy/mod.rs | 3 +- proxy/src/proxy/tests/mod.rs | 8 +- proxy/src/serverless/mod.rs | 38 +++------ proxy/src/serverless/sql_over_http.rs | 4 +- proxy/src/tls/server_config.rs | 20 +++-- workspace_hack/Cargo.toml | 7 +- 24 files changed, 427 insertions(+), 87 deletions(-) create mode 100644 compute_tools/src/tls.rs diff --git a/Cargo.lock b/Cargo.lock index 1721c185f0..898ff1eabb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1309,6 +1309,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", + "indexmap 2.0.1", "jsonwebtoken", "regex", "remote_storage", @@ -1339,6 +1340,7 @@ dependencies = [ "flate2", "futures", "http 1.1.0", + "indexmap 2.0.1", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -1347,17 +1349,20 @@ dependencies = [ "once_cell", "opentelemetry", "opentelemetry_sdk", + "p256 0.13.2", "postgres", "postgres_initdb", "regex", "remote_storage", "reqwest", + "ring", "rlimit", "rust-ini", "serde", "serde_json", "serde_with", "signal-hook", + "spki 0.7.3", "tar", "thiserror 1.0.69", "tokio", @@ -1377,6 +1382,7 @@ dependencies = [ "vm_monitor", "walkdir", "workspace_hack", + "x509-cert", "zstd", ] @@ -1801,6 +1807,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" dependencies 
= [ "const-oid", + "der_derive", + "flagset", "pem-rfc7468", "zeroize", ] @@ -1819,6 +1827,17 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "der_derive" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "deranged" version = "0.3.11" @@ -2282,6 +2301,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "flagset" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ea1ec5f8307826a5b71094dd91fc04d4ae75d5709b20ad351c7fb4815c86ec" + [[package]] name = "flate2" version = "1.0.26" @@ -6425,9 +6450,9 @@ dependencies = [ [[package]] name = "sha1" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", "cpufeatures", @@ -7135,6 +7160,27 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tls_codec" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de2e01245e2bb89d6f05801c564fa27624dbd7b1846859876c7dad82e90bf6b" +dependencies = [ + "tls_codec_derive", + "zeroize", +] + +[[package]] +name = "tls_codec_derive" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2e76690929402faae40aebdda620a2c0e25dd6d3b9afe48867dfd95991f4bd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "tokio" version = 
"1.43.0" @@ -8387,12 +8433,15 @@ dependencies = [ "chrono", "clap", "clap_builder", + "const-oid", "crypto-bigint 0.5.5", "der 0.7.8", "deranged", "digest", "displaydoc", + "ecdsa 0.16.9", "either", + "elliptic-curve 0.13.8", "env_filter", "env_logger", "fail", @@ -8427,6 +8476,7 @@ dependencies = [ "num-rational", "num-traits", "once_cell", + "p256 0.13.2", "parquet", "prettyplease", "proc-macro2", @@ -8439,6 +8489,7 @@ dependencies = [ "reqwest", "rustls 0.23.18", "scopeguard", + "sec1 0.7.3", "serde", "serde_json", "sha2", @@ -8484,6 +8535,18 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" +[[package]] +name = "x509-cert" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1301e935010a701ae5f8655edc0ad17c44bad3ac5ce8c39185f75453b720ae94" +dependencies = [ + "const-oid", + "der 0.7.8", + "spki 0.7.3", + "tls_codec", +] + [[package]] name = "x509-certificate" version = "0.23.1" @@ -8612,9 +8675,9 @@ dependencies = [ [[package]] name = "zeroize" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" dependencies = [ "serde", "zeroize_derive", diff --git a/Cargo.toml b/Cargo.toml index 7b86a64e9a..82fb463182 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -112,7 +112,7 @@ hyper0 = { package = "hyper", version = "0.14" } hyper = "1.4" hyper-util = "0.1" tokio-tungstenite = "0.21.0" -indexmap = "2" +indexmap = { version = "2", features = ["serde"] } indoc = "2" ipnet = "2.10.0" itertools = "0.10" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6e46185e36..d5483018b4 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1735,6 +1735,8 @@ RUN 
set -e \ libevent-dev \ libtool \ pkg-config \ + libcurl4-openssl-dev \ + libssl-dev \ && apt clean && rm -rf /var/lib/apt/lists/* # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) @@ -1743,7 +1745,7 @@ RUN set -e \ && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \ && cd pgbouncer \ && ./autogen.sh \ - && ./configure --prefix=/usr/local/pgbouncer --without-openssl \ + && ./configure --prefix=/usr/local/pgbouncer \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index dd2896714d..90951e7ddb 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -26,6 +26,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +indexmap.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true @@ -34,16 +35,19 @@ num_cpus.workspace = true once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true +p256 = { version = "0.13", features = ["pem"] } postgres.workspace = true regex.workspace = true +reqwest = { workspace = true, features = ["json"] } +ring = "0.17" serde.workspace = true serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true +spki = { version = "0.7.3", features = ["std"] } tar.workspace = true tower.workspace = true tower-http.workspace = true -reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tokio-util.workspace = true @@ -57,6 +61,7 @@ thiserror.workspace = true url.workspace = true uuid.workspace = true walkdir.workspace = true +x509-cert = { version = "0.2.5" } postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c2a3e38ed6..a0654ea0e4 
100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -41,6 +41,7 @@ use crate::rsyslog::configure_audit_rsyslog; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; +use crate::tls::watch_cert_for_changes; use crate::{config, extension_server, local_proxy}; pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); @@ -112,6 +113,7 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, + pub compute_ctl_config: ComputeCtlConfig, } // store some metrics about download size that might impact startup time @@ -135,8 +137,6 @@ pub struct ComputeState { /// passed by the control plane with a /configure HTTP request. pub pspec: Option, - pub compute_ctl_config: ComputeCtlConfig, - /// If the spec is passed by a /configure request, 'startup_span' is the /// /configure request's tracing span. The main thread enters it when it /// processes the compute startup, so that the compute startup is considered @@ -160,7 +160,6 @@ impl ComputeState { last_active: None, error: None, pspec: None, - compute_ctl_config: ComputeCtlConfig::default(), startup_span: None, metrics: ComputeMetrics::default(), } @@ -314,7 +313,6 @@ impl ComputeNode { let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; new_state.pspec = Some(pspec); } - new_state.compute_ctl_config = compute_ctl_config; Ok(ComputeNode { params, @@ -323,6 +321,7 @@ impl ComputeNode { state: Mutex::new(new_state), state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), + compute_ctl_config, }) } @@ -345,7 +344,7 @@ impl ComputeNode { // requests while configuration is still in progress. 
crate::http::server::Server::External { port: this.params.external_http_port, - jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(), + config: this.compute_ctl_config.clone(), compute_id: this.params.compute_id.clone(), } .launch(&this); @@ -524,6 +523,16 @@ impl ComputeNode { // Collect all the tasks that must finish here let mut pre_tasks = tokio::task::JoinSet::new(); + // Make sure TLS certificates are properly loaded and in the right place. + if self.compute_ctl_config.tls.is_some() { + let this = self.clone(); + pre_tasks.spawn(async move { + this.watch_cert_for_changes().await; + + Ok::<(), anyhow::Error>(()) + }); + } + // If there are any remote extensions in shared_preload_libraries, start downloading them if pspec.spec.remote_extensions.is_some() { let (this, spec) = (self.clone(), pspec.spec.clone()); @@ -579,11 +588,13 @@ impl ComputeNode { if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings { info!("tuning pgbouncer"); + let pgbouncer_settings = pgbouncer_settings.clone(); + let tls_config = self.compute_ctl_config.tls.clone(); + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); let _handle = tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; + let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); // Continue with the startup anyway @@ -1105,9 +1116,10 @@ impl ComputeNode { // Remove/create an empty pgdata directory and put configuration there. 
self.create_pgdata()?; config::write_postgres_conf( - &pgdata_path.join("postgresql.conf"), + pgdata_path, &pspec.spec, self.params.internal_http_port, + &self.compute_ctl_config.tls, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1489,11 +1501,13 @@ impl ComputeNode { if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); + let pgbouncer_settings = pgbouncer_settings.clone(); + let tls_config = self.compute_ctl_config.tls.clone(); + // Spawn a background task to do the tuning, // so that we don't block the main thread that starts Postgres. - let pgbouncer_settings = pgbouncer_settings.clone(); tokio::spawn(async move { - let res = tune_pgbouncer(pgbouncer_settings).await; + let res = tune_pgbouncer(pgbouncer_settings, tls_config).await; if let Err(err) = res { error!("error while tuning pgbouncer: {err:?}"); } @@ -1505,7 +1519,8 @@ impl ComputeNode { // Spawn a background task to do the configuration, // so that we don't block the main thread that starts Postgres. 
- let local_proxy = local_proxy.clone(); + let mut local_proxy = local_proxy.clone(); + local_proxy.tls = self.compute_ctl_config.tls.clone(); tokio::spawn(async move { if let Err(err) = local_proxy::configure(&local_proxy) { error!("error while configuring local_proxy: {err:?}"); @@ -1515,8 +1530,12 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.params.pgdata); - let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?; + config::write_postgres_conf( + pgdata_path, + &spec, + self.params.internal_http_port, + &self.compute_ctl_config.tls, + )?; if !spec.skip_pg_catalog_updates { let max_concurrent_connections = spec.reconfigure_concurrency; @@ -1587,6 +1606,56 @@ impl ComputeNode { Ok(()) } + pub async fn watch_cert_for_changes(self: Arc) { + // update status on cert renewal + if let Some(tls_config) = &self.compute_ctl_config.tls { + let tls_config = tls_config.clone(); + + // wait until the cert exists. + let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await; + + tokio::task::spawn_blocking(move || { + let handle = tokio::runtime::Handle::current(); + 'cert_update: loop { + // let postgres/pgbouncer/local_proxy know the new cert/key exists. + // we need to wait until it's configurable first. 
+ + let mut state = self.state.lock().unwrap(); + 'status_update: loop { + match state.status { + // let's update the state to config pending + ComputeStatus::ConfigurationPending | ComputeStatus::Running => { + state.set_status( + ComputeStatus::ConfigurationPending, + &self.state_changed, + ); + break 'status_update; + } + + // exit loop + ComputeStatus::Failed + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => break 'cert_update, + + // wait + ComputeStatus::Init + | ComputeStatus::Configuration + | ComputeStatus::Empty => { + state = self.state_changed.wait(state).unwrap(); + } + } + } + drop(state); + + // wait for a new certificate update + if handle.block_on(cert_watch.changed()).is_err() { + break; + } + } + }); + } + } + /// Update the `last_active` in the shared state, but ensure that it's a more recent one. pub fn update_last_active(&self, last_active: Option>) { let mut state = self.state.lock().unwrap(); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 0760568ff8..7aa7360f9d 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,11 +6,13 @@ use std::io::Write; use std::io::prelude::*; use std::path::Path; +use compute_api::responses::TlsConfig; use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; use crate::pg_helpers::{ GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, }; +use crate::tls::{self, SERVER_CRT, SERVER_KEY}; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. 
@@ -38,10 +40,12 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { /// Create or completely rewrite configuration file specified by `path` pub fn write_postgres_conf( - path: &Path, + pgdata_path: &Path, spec: &ComputeSpec, extension_server_port: u16, + tls_config: &Option, ) -> Result<()> { + let path = pgdata_path.join("postgresql.conf"); // File::create() destroys the file content if it exists. let mut file = File::create(path)?; @@ -86,6 +90,20 @@ pub fn write_postgres_conf( )?; } + // tls + if let Some(tls_config) = tls_config { + writeln!(file, "ssl = on")?; + + // postgres requires the keyfile to be in a secure file, + // currently too complicated to ensure that at the VM level, + // so we just copy them to another file instead. :shrug: + tls::update_key_path_blocking(pgdata_path, tls_config); + + // these are the default, but good to be explicit. + writeln!(file, "ssl_cert_file = '{}'", SERVER_CRT)?; + writeln!(file, "ssl_key_file = '{}'", SERVER_KEY)?; + } + // Locales if cfg!(target_os = "macos") { writeln!(file, "lc_messages='C'")?; diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index b70b6c619c..10f767e97c 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -8,8 +8,8 @@ use axum::Router; use axum::middleware::{self}; use axum::response::IntoResponse; use axum::routing::{get, post}; +use compute_api::responses::ComputeCtlConfig; use http::StatusCode; -use jsonwebtoken::jwk::JwkSet; use tokio::net::TcpListener; use tower::ServiceBuilder; use tower_http::{ @@ -41,7 +41,7 @@ pub enum Server { }, External { port: u16, - jwks: JwkSet, + config: ComputeCtlConfig, compute_id: String, }, } @@ -79,7 +79,7 @@ impl From<&Server> for Router> { router } Server::External { - jwks, compute_id, .. + config, compute_id, .. 
} => { let unauthenticated_router = Router::>::new().route("/metrics", get(metrics::get_metrics)); @@ -95,7 +95,7 @@ impl From<&Server> for Router> { .route("/terminate", post(terminate::terminate)) .layer(AsyncRequireAuthorizationLayer::new(Authorize::new( compute_id.clone(), - jwks.clone(), + config.jwks.clone(), ))); router diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 5c78bbcd02..a681fad0b0 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -26,3 +26,4 @@ pub mod spec; mod spec_apply; pub mod swap; pub mod sync_sk; +pub mod tls; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index dd8d8e9b8b..802e3e93d9 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,8 +10,10 @@ use std::str::FromStr; use std::time::{Duration, Instant}; use anyhow::{Result, bail}; +use compute_api::responses::TlsConfig; use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; use futures::StreamExt; +use indexmap::IndexMap; use ini::Ini; use notify::{RecursiveMode, Watcher}; use postgres::config::Config; @@ -406,7 +408,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> { /// Update pgbouncer.ini with provided options fn update_pgbouncer_ini( - pgbouncer_config: HashMap, + pgbouncer_config: IndexMap, pgbouncer_ini_path: &str, ) -> Result<()> { let mut conf = Ini::load_from_file(pgbouncer_ini_path)?; @@ -427,7 +429,10 @@ fn update_pgbouncer_ini( /// Tune pgbouncer. /// 1. Apply new config using pgbouncer admin console /// 2. 
Add new values to pgbouncer.ini to preserve them after restart -pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result<()> { +pub async fn tune_pgbouncer( + mut pgbouncer_config: IndexMap, + tls_config: Option, +) -> Result<()> { let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() { // for VMs use pgbouncer specific way to connect to // pgbouncer admin console without password @@ -473,19 +478,21 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result } }; - // Apply new config - for (option_name, value) in pgbouncer_config.iter() { - let query = format!("SET {}={}", option_name, value); - // keep this log line for debugging purposes - info!("Applying pgbouncer setting change: {}", query); + if let Some(tls_config) = tls_config { + // pgbouncer starts in a half-ok state if it cannot find these files. + // It will default to client_tls_sslmode=deny, which causes proxy to error. + // There is a small window at startup where these files don't yet exist in the VM. + // Best to wait until it exists. 
+ loop { + if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await { + break; + } + tokio::time::sleep(Duration::from_millis(500)).await + } - if let Err(err) = client.simple_query(&query).await { - // Don't fail on error, just print it into log - error!( - "Failed to apply pgbouncer setting change: {}, {}", - query, err - ); - }; + pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path); + pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path); + pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string()); } // save values to pgbouncer.ini @@ -501,6 +508,13 @@ pub async fn tune_pgbouncer(pgbouncer_config: HashMap) -> Result }; update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?; + info!("Applying pgbouncer setting change"); + + if let Err(err) = client.simple_query("RELOAD").await { + // Don't fail on error, just print it into log + error!("Failed to apply pgbouncer setting change, {err}",); + }; + Ok(()) } diff --git a/compute_tools/src/tls.rs b/compute_tools/src/tls.rs new file mode 100644 index 0000000000..5a310d8ac4 --- /dev/null +++ b/compute_tools/src/tls.rs @@ -0,0 +1,118 @@ +use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration}; + +use anyhow::{Context, Result, bail}; +use compute_api::responses::TlsConfig; +use ring::digest; +use spki::ObjectIdentifier; +use spki::der::{Decode, PemReader}; +use x509_cert::Certificate; + +#[derive(Clone, Copy)] +pub struct CertDigest(digest::Digest); + +pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver { + let mut digest = compute_digest(&cert_path).await; + let (tx, rx) = tokio::sync::watch::channel(digest); + tokio::spawn(async move { + while !tx.is_closed() { + let new_digest = compute_digest(&cert_path).await; + if digest.0.as_ref() != new_digest.0.as_ref() { + digest = new_digest; + _ = tx.send(digest); + } + + tokio::time::sleep(Duration::from_secs(60)).await + } + 
}); + rx +} + +async fn compute_digest(cert_path: &str) -> CertDigest { + loop { + match try_compute_digest(cert_path).await { + Ok(d) => break d, + Err(e) => { + tracing::error!("could not read cert file {e:?}"); + tokio::time::sleep(Duration::from_secs(1)).await + } + } + } +} + +async fn try_compute_digest(cert_path: &str) -> Result { + let data = tokio::fs::read(cert_path).await?; + // sha256 is extremely collision resistant. can safely assume the digest to be unique + Ok(CertDigest(digest::digest(&digest::SHA256, &data))) +} + +pub const SERVER_CRT: &str = "server.crt"; +pub const SERVER_KEY: &str = "server.key"; + +pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) { + loop { + match try_update_key_path_blocking(pg_data, tls_config) { + Ok(()) => break, + Err(e) => { + tracing::error!("could not create key file {e:?}"); + std::thread::sleep(Duration::from_secs(1)) + } + } + } +} + +// Postgres requires the keypath be "secure". This means +// 1. Owned by the postgres user. +// 2. Have permission 600. +fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> { + let key = std::fs::read_to_string(&tls_config.key_path)?; + let crt = std::fs::read_to_string(&tls_config.cert_path)?; + + // to mitigate a race condition during renewal. + verify_key_cert(&key, &crt)?; + + let mut key_file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .mode(0o600) + .open(pg_data.join(SERVER_KEY))?; + + let mut crt_file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .mode(0o600) + .open(pg_data.join(SERVER_CRT))?; + + key_file.write_all(key.as_bytes())?; + crt_file.write_all(crt.as_bytes())?; + + Ok(()) +} + +fn verify_key_cert(key: &str, cert: &str) -> Result<()> { + const ECDSA_WITH_SHA256: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.10045.4.3.2"); + + let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?)
+ .context("decode cert")?; + + match cert.signature_algorithm.oid { + ECDSA_WITH_SHA256 => { + let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?; + + let a = key.public_key().to_sec1_bytes(); + let b = cert + .tbs_certificate + .subject_public_key_info + .subject_public_key + .raw_bytes(); + + if *a != *b { + bail!("private key file does not match certificate") + } + } + _ => bail!("unknown TLS key type"), + } + + Ok(()) +} diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 0d1618c1b2..81b0cd19a1 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true chrono.workspace = true +indexmap.workspace = true jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 3300fbf7dd..c8f6019c5c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -139,6 +139,7 @@ pub struct ComputeCtlConfig { /// Set of JSON web keys that the compute can use to authenticate /// communication from the control plane. pub jwks: JwkSet, + pub tls: Option, } impl Default for ComputeCtlConfig { @@ -147,10 +148,17 @@ impl Default for ComputeCtlConfig { jwks: JwkSet { keys: Vec::default(), }, + tls: None, } } } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct TlsConfig { + pub key_path: String, + pub cert_path: String, +} + /// Response of the `/computes/{compute_id}/spec` control-plane API. #[derive(Deserialize, Debug)] pub struct ControlPlaneSpecResponse { diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 77f2e1e631..af4264f8d2 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,12 +5,15 @@ //! and connect it to the storage nodes. 
use std::collections::HashMap; +use indexmap::IndexMap; use regex::Regex; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use crate::responses::TlsConfig; + /// String type alias representing Postgres identifier and /// intended to be used for DB / role names. pub type PgIdent = String; @@ -125,7 +128,7 @@ pub struct ComputeSpec { // information about available remote extensions pub remote_extensions: Option, - pub pgbouncer_settings: Option>, + pub pgbouncer_settings: Option>, // Stripe size for pageserver sharding, in pages #[serde(default)] @@ -357,6 +360,9 @@ pub struct LocalProxySpec { #[serde(default)] #[serde(skip_serializing_if = "Option::is_none")] pub jwks: Option>, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub tls: Option, } #[derive(Clone, Debug, Deserialize, Serialize)] diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index dedd225cba..ee7f6ffcd7 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, bail, ensure}; +use arc_swap::ArcSwapOption; use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_api::spec::LocalProxySpec; @@ -27,6 +28,7 @@ use crate::config::{ }; use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use crate::ext::TaskExt; use crate::http::health_server::AppMetrics; use crate::intern::RoleNameInt; use crate::metrics::{Metrics, ThreadPoolMetrics}; @@ -190,7 +192,11 @@ pub async fn run() -> anyhow::Result<()> { // 2. The config file is written but the signal hook is not yet received // 3. local_proxy completes startup but has no config loaded, despite there being a registerd config. 
refresh_config_notify.notify_one(); - tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); + tokio::spawn(refresh_config_loop( + config, + args.config_path, + refresh_config_notify, + )); maintenance_tasks.spawn(crate::http::health_server::task_main( metrics_listener, @@ -269,7 +275,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }; Ok(Box::leak(Box::new(ProxyConfig { - tls_config: None, + tls_config: ArcSwapOption::from(None), metric_collection: None, http_config, authentication_config: AuthenticationConfig { @@ -311,14 +317,16 @@ enum RefreshConfigError { Parse(#[from] serde_json::Error), #[error(transparent)] Validate(anyhow::Error), + #[error(transparent)] + Tls(anyhow::Error), } -async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { +async fn refresh_config_loop(config: &ProxyConfig, path: Utf8PathBuf, rx: Arc) { let mut init = true; loop { rx.notified().await; - match refresh_config_inner(&path).await { + match refresh_config_inner(config, &path).await { Ok(()) => {} // don't log for file not found errors if this is the first time we are checking // for computes that don't use local_proxy, this is not an error. 
@@ -327,6 +335,9 @@ async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { { debug!(error=?e, ?path, "could not read config file"); } + Err(RefreshConfigError::Tls(e)) => { + error!(error=?e, ?path, "could not read TLS certificates"); + } Err(e) => { error!(error=?e, ?path, "could not read config file"); } @@ -336,7 +347,10 @@ async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { } } -async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> { +async fn refresh_config_inner( + config: &ProxyConfig, + path: &Utf8Path, +) -> Result<(), RefreshConfigError> { let bytes = tokio::fs::read(&path).await?; let data: LocalProxySpec = serde_json::from_slice(&bytes)?; @@ -406,5 +420,20 @@ async fn refresh_config_inner(path: &Utf8Path) -> Result<(), RefreshConfigError> info!("successfully loaded new config"); JWKS_ROLE_MAP.store(Some(Arc::new(EndpointJwksResponse { jwks: jwks_set }))); + if let Some(tls_config) = data.tls { + let tls_config = tokio::task::spawn_blocking(move || { + crate::tls::server_config::configure_tls( + &tls_config.key_path, + &tls_config.cert_path, + None, + false, + ) + }) + .await + .propagate_task_panic() + .map_err(RefreshConfigError::Tls)?; + config.tls_config.store(Some(Arc::new(tls_config))); + } + Ok(()) } diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index eec0bf8f99..feca5ccf88 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::bail; +use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; use tokio::net::TcpListener; @@ -563,6 +564,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; + let tls_config = ArcSwapOption::from(tls_config.map(Arc::new)); let backup_metric_collection_config = config::MetricBackupCollectionConfig { 
remote_storage_config: args.metric_backup_collection_remote_storage.clone(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 1bcd22e98f..ad398c122c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Ok, bail, ensure}; +use arc_swap::ArcSwapOption; use clap::ValueEnum; use remote_storage::RemoteStorageConfig; @@ -17,7 +18,7 @@ pub use crate::tls::server_config::{TlsConfig, configure_tls}; use crate::types::Host; pub struct ProxyConfig { - pub tls_config: Option, + pub tls_config: ArcSwapOption, pub metric_collection: Option, pub http_config: HttpConfig, pub authentication_config: AuthenticationConfig, diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 4662860b3f..1156545f34 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -177,7 +177,8 @@ pub(crate) async fn handle_client( let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); - let tls = config.tls_config.as_ref(); + let tls = config.tls_config.load(); + let tls = tls.as_deref(); let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 955f754497..2582e4c069 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -114,7 +114,7 @@ pub(crate) async fn handshake( let mut read_buf = read_buf.reader(); let mut res = Ok(()); - let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + let accept = tokio_rustls::TlsAcceptor::from(tls.pg_config.clone()) .accept_with(raw, |session| { // push the early data to the tls session while !read_buf.get_ref().is_empty() { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 0c6d352600..2e7d332a8b 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ 
-278,7 +278,8 @@ pub(crate) async fn handle_client( let proto = ctx.protocol(); let request_gauge = metrics.connection_requests.guard(proto); - let tls = config.tls_config.as_ref(); + let tls = config.tls_config.load(); + let tls = tls.as_deref(); let record_handshake_error = !ctx.has_private_peer_addr(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index e0b7539538..2c3e70138d 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -96,16 +96,18 @@ fn generate_tls_config<'a>( .with_safe_default_protocol_versions() .context("ring should support the default protocol versions")? .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? - .into(); + .with_single_cert(vec![cert.clone()], key.clone_key())?; let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; let common_names = cert_resolver.get_common_names(); + let config = Arc::new(config); + TlsConfig { - config, + http_config: config.clone(), + pg_config: config, common_names, cert_resolver: Arc::new(cert_resolver), } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index a7f46cbe58..00164d631a 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -19,6 +19,7 @@ use std::pin::{Pin, pin}; use std::sync::Arc; use anyhow::Context; +use arc_swap::ArcSwapOption; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; @@ -117,18 +118,7 @@ pub async fn task_main( auth_backend, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); - let tls_acceptor: Arc = match config.tls_config.as_ref() { - Some(config) => { - let mut tls_server_config = rustls::ServerConfig::clone(&config.to_server_config()); - // prefer http2, but support http/1.1 - tls_server_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; - Arc::new(tls_server_config) - } - None => { - 
warn!("TLS config is missing"); - Arc::new(NoTls) - } - }; + let tls_acceptor: Arc = Arc::new(&config.tls_config); let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` @@ -216,22 +206,20 @@ pub(crate) type AsyncRW = Pin>; #[async_trait] trait MaybeTlsAcceptor: Send + Sync + 'static { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result; + async fn accept(&self, conn: ChainRW) -> std::io::Result; } #[async_trait] -impl MaybeTlsAcceptor for rustls::ServerConfig { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { - Ok(Box::pin(TlsAcceptor::from(self).accept(conn).await?)) - } -} - -struct NoTls; - -#[async_trait] -impl MaybeTlsAcceptor for NoTls { - async fn accept(self: Arc, conn: ChainRW) -> std::io::Result { - Ok(Box::pin(conn)) +impl MaybeTlsAcceptor for &'static ArcSwapOption { + async fn accept(&self, conn: ChainRW) -> std::io::Result { + match &*self.load() { + Some(config) => Ok(Box::pin( + TlsAcceptor::from(config.http_config.clone()) + .accept(conn) + .await?, + )), + None => Ok(Box::pin(conn)), + } } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 47009086c3..a79a478126 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -614,7 +614,9 @@ async fn handle_inner( &config.authentication_config, ctx, request.headers(), - config.tls_config.as_ref(), + // todo: race condition? + // we're unlikely to change the common names. 
+ config.tls_config.load().as_deref(), )?; info!( user = conn_info.conn_info.user_info.user.as_str(), diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 903c0b712b..4cbd0474c2 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -9,17 +9,14 @@ use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; pub struct TlsConfig { - pub config: Arc, + // unfortunate split since we cannot change the ALPN on demand. + // + pub http_config: Arc, + pub pg_config: Arc, pub common_names: HashSet, pub cert_resolver: Arc, } -impl TlsConfig { - pub fn to_server_config(&self) -> Arc { - self.config.clone() - } -} - /// Configure TLS for the main endpoint. pub fn configure_tls( key_path: &str, @@ -71,8 +68,15 @@ pub fn configure_tls( config.key_log = Arc::new(rustls::KeyLogFile::new()); } + let mut http_config = config.clone(); + let mut pg_config = config; + + http_config.alpn_protocols = vec![b"h2".to_vec(), b"http/1.1".to_vec()]; + pg_config.alpn_protocols = vec![b"postgresql".to_vec()]; + Ok(TlsConfig { - config: Arc::new(config), + http_config: Arc::new(http_config), + pg_config: Arc::new(pg_config), common_names, cert_resolver, }) diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f1696c5ff9..6a726f0585 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -26,11 +26,14 @@ camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } clap = { version = "4", features = ["derive", "env", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } +const-oid = { version = "0.9", default-features = false, features = ["db", "std"] } crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } -der = { version = "0.7", 
default-features = false, features = ["oid", "pem", "std"] } +der = { version = "0.7", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } digest = { version = "0.10", features = ["mac", "oid", "std"] } +ecdsa = { version = "0.16", features = ["pem", "signing", "std", "verifying"] } either = { version = "1" } +elliptic-curve = { version = "0.13", default-features = false, features = ["digest", "hazmat", "jwk", "pem", "std"] } env_filter = { version = "0.1", default-features = false, features = ["regex"] } env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } @@ -65,6 +68,7 @@ num-iter = { version = "0.1", default-features = false, features = ["i128", "std num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } +p256 = { version = "0.13", features = ["jwk"] } parquet = { version = "53", default-features = false, features = ["zstd"] } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } @@ -74,6 +78,7 @@ regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } +sec1 = { version = "0.7", features = ["pem", "serde", "std", "subtle"] } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } sha2 = { version = "0.10", features = ["asm", "oid"] } From ed31dd2a3c9cdb6dce6ea26e50a42be477e2a3a2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 13 Mar 
2025 16:03:53 +0100 Subject: [PATCH 30/71] pageserver: better observability for slow wait_lsn (#11176) # Problem We leave too few observability breadcrumbs in the case where wait_lsn is exceptionally slow. # Changes - refactor: extract the monitoring logic out of `log_slow` into `monitor_slow_future` - add global + per-timeline counter for time spent waiting for wait_lsn - It is updated while we're still waiting, similar to what we do for page_service response flush. - add per-timeline counterpair for started & finished wait_lsn count - add slow-logging to leave breadcrumbs in logs, not just metrics For the slow-logging, we need to consider not flooding the logs during a broker or network outage/blip. The solution is a "log-streak-level" concurrency limit per timeline. At any given time, there is at most one slow wait_lsn that is logging the "still running" and "completed" sequence of logs. Other concurrent slow wait_lsn's don't log at all. This leaves at least one breadcrumb in each timeline's logs if some wait_lsn was exceptionally slow during a given period. The full degree of slowness can then be determined by looking at the per-timeline metric. # Performance Reran the `bench_log_slow` benchmark, no difference, so, existing call sites are fine. We do use a Semaphore, but only try_acquire it _after_ things have already been determined to be slow. So, no baseline overhead anticipated. 
# Refs - https://github.com/neondatabase/cloud/issues/23486#issuecomment-2711587222 --- libs/utils/benches/benchmarks.rs | 8 ++- libs/utils/src/logging.rs | 91 +++++++++++++++++++++++------- pageserver/src/metrics.rs | 92 ++++++++++++++++++++++++------- pageserver/src/page_service.rs | 19 +++++-- pageserver/src/tenant/timeline.rs | 63 +++++++++++++++++++-- test_runner/fixtures/metrics.py | 3 + 6 files changed, 227 insertions(+), 49 deletions(-) diff --git a/libs/utils/benches/benchmarks.rs b/libs/utils/benches/benchmarks.rs index 12c620ec87..35f3baaed1 100644 --- a/libs/utils/benches/benchmarks.rs +++ b/libs/utils/benches/benchmarks.rs @@ -49,7 +49,13 @@ pub fn bench_log_slow(c: &mut Criterion) { // performance too. Use a simple noop future that yields once, to avoid any scheduler fast // paths for a ready future. if enabled { - b.iter(|| runtime.block_on(log_slow("ready", THRESHOLD, tokio::task::yield_now()))); + b.iter(|| { + runtime.block_on(log_slow( + "ready", + THRESHOLD, + std::pin::pin!(tokio::task::yield_now()), + )) + }); } else { b.iter(|| runtime.block_on(tokio::task::yield_now())); } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f37f05692a..0ac8201795 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -331,37 +331,90 @@ impl std::fmt::Debug for SecretString { /// /// TODO: consider upgrading this to a warning, but currently it fires too often. #[inline] -pub async fn log_slow(name: &str, threshold: Duration, f: impl Future) -> O { - // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and - // won't fit on the stack. 
- let mut f = Box::pin(f); +pub async fn log_slow(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O +where + F: Future, +{ + monitor_slow_future( + threshold, + threshold, // period = threshold + f, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback: _, + }| { + if !is_slow { + return; + } + if ready { + info!( + "slow {name} completed after {:.3}s", + elapsed_total.as_secs_f64() + ); + } else { + info!( + "slow {name} still running after {:.3}s", + elapsed_total.as_secs_f64() + ); + } + }, + ) + .await +} +/// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every +/// `period` afterwards, and also unconditionally when the future completes. +#[inline] +pub async fn monitor_slow_future( + threshold: Duration, + period: Duration, + mut fut: std::pin::Pin<&mut F>, + mut cb: impl FnMut(MonitorSlowFutureCallback), +) -> O +where + F: Future, +{ let started = Instant::now(); let mut attempt = 1; - + let mut last_cb = started; loop { // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common // case where the timeout doesn't fire. - let deadline = started + attempt * threshold; - if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await { - // NB: we check if we exceeded the threshold even if the timeout never fired, because - // scheduling or execution delays may cause the future to succeed even if it exceeds the - // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid - // false negatives. - let elapsed = started.elapsed(); - if elapsed >= threshold { - info!("slow {name} completed after {:.3}s", elapsed.as_secs_f64()); - } + let deadline = started + threshold + (attempt - 1) * period; + // TODO: still call the callback if the future panics? Copy how we do it for the page_service flush_in_progress counter. 
+ let res = tokio::time::timeout_at(deadline, &mut fut).await; + let now = Instant::now(); + let elapsed_total = now - started; + cb(MonitorSlowFutureCallback { + ready: res.is_ok(), + is_slow: elapsed_total >= threshold, + elapsed_total, + elapsed_since_last_callback: now - last_cb, + }); + last_cb = now; + if let Ok(output) = res { return output; } - - let elapsed = started.elapsed().as_secs_f64(); - info!("slow {name} still running after {elapsed:.3}s",); - attempt += 1; } } +/// See [`monitor_slow_future`]. +pub struct MonitorSlowFutureCallback { + /// Whether the future completed. If true, there will be no more callbacks. + pub ready: bool, + /// Whether the future is taking `>=` the specified threshold duration to complete. + /// Monotonic: if true in one callback invocation, true in all subsequent ones. + pub is_slow: bool, + /// The time elapsed since the [`monitor_slow_future`] was first polled. + pub elapsed_total: Duration, + /// The time elapsed since the last callback invocation. + /// For the initial callback invocation, the time elapsed since the [`monitor_slow_future`] was first polled. + pub elapsed_since_last_callback: Duration, +} + #[cfg(test)] mod tests { use metrics::IntCounterVec; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index fd90ef8cd7..f7afaae068 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -465,12 +465,40 @@ pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) { pub(crate) static WAIT_LSN_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wait_lsn_seconds", - "Time spent waiting for WAL to arrive", + "Time spent waiting for WAL to arrive.
Updated on completion of the wait_lsn operation.", CRITICAL_OP_BUCKETS.into(), ) .expect("failed to define a metric") }); +pub(crate) static WAIT_LSN_START_FINISH_COUNTERPAIR: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_wait_lsn_started_count", + "Number of wait_lsn operations started.", + "pageserver_wait_lsn_finished_count", + "Number of wait_lsn operations finished.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static WAIT_LSN_IN_PROGRESS_MICROS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_wait_lsn_in_progress_micros", + "Time spent waiting for WAL to arrive, by timeline_id. Updated periodically while waiting.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_wait_lsn_in_progress_micros_global", + "Time spent waiting for WAL to arrive, globally. Updated periodically while waiting." 
+ ) + .expect("failed to define a metric") +}); + static FLUSH_WAIT_UPLOAD_TIME: Lazy = Lazy::new(|| { register_gauge_vec!( "pageserver_flush_wait_upload_seconds", @@ -2830,7 +2858,6 @@ impl StorageTimeMetrics { } } -#[derive(Debug)] pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, @@ -2863,6 +2890,8 @@ pub(crate) struct TimelineMetrics { pub valid_lsn_lease_count_gauge: UIntGauge, pub wal_records_received: IntCounter, pub storage_io_size: StorageIoSizeMetrics, + pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter, + pub wait_lsn_start_finish_counterpair: IntCounterPair, shutdown: std::sync::atomic::AtomicBool, } @@ -3000,6 +3029,17 @@ impl TimelineMetrics { let storage_io_size = StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id); + let wait_lsn_in_progress_micros = GlobalAndPerTenantIntCounter { + global: WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS.clone(), + per_tenant: WAIT_LSN_IN_PROGRESS_MICROS + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(), + }; + + let wait_lsn_start_finish_counterpair = WAIT_LSN_START_FINISH_COUNTERPAIR + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -3032,6 +3072,8 @@ impl TimelineMetrics { storage_io_size, valid_lsn_lease_count_gauge, wal_records_received, + wait_lsn_in_progress_micros, + wait_lsn_start_finish_counterpair, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -3224,6 +3266,15 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } + let _ = + WAIT_LSN_IN_PROGRESS_MICROS.remove_label_values(&[tenant_id, shard_id, timeline_id]); + + { + let mut res = [Ok(()), Ok(())]; + WAIT_LSN_START_FINISH_COUNTERPAIR + .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]); + } + let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[ SmgrQueryType::GetPageAtLsn.into(), tenant_id, @@ -3836,27 
+3887,29 @@ pub mod tokio_epoll_uring { }); } +pub(crate) struct GlobalAndPerTenantIntCounter { + global: IntCounter, + per_tenant: IntCounter, +} + +impl GlobalAndPerTenantIntCounter { + #[inline(always)] + pub(crate) fn inc(&self) { + self.inc_by(1) + } + #[inline(always)] + pub(crate) fn inc_by(&self, n: u64) { + self.global.inc_by(n); + self.per_tenant.inc_by(n); + } +} + pub(crate) mod tenant_throttling { - use metrics::{IntCounter, register_int_counter_vec}; + use metrics::register_int_counter_vec; use once_cell::sync::Lazy; use utils::shard::TenantShardId; - pub(crate) struct GlobalAndPerTenantIntCounter { - global: IntCounter, - per_tenant: IntCounter, - } - - impl GlobalAndPerTenantIntCounter { - #[inline(always)] - pub(crate) fn inc(&self) { - self.inc_by(1) - } - #[inline(always)] - pub(crate) fn inc_by(&self, n: u64) { - self.global.inc_by(n); - self.per_tenant.inc_by(n); - } - } + use super::GlobalAndPerTenantIntCounter; pub(crate) struct Metrics { pub(super) count_accounted_start: GlobalAndPerTenantIntCounter, @@ -4102,6 +4155,7 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) { &CIRCUIT_BREAKERS_BROKEN, &CIRCUIT_BREAKERS_UNBROKEN, &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL, + &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index f2d2ab05ad..94571cbaaa 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1106,12 +1106,19 @@ impl PageServerHandler { }; // Dispatch the batch to the appropriate request handler. - let (mut handler_results, span) = log_slow( - batch.as_static_str(), - LOG_SLOW_GETPAGE_THRESHOLD, - self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx), - ) - .await?; + let log_slow_name = batch.as_static_str(); + let (mut handler_results, span) = { + // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and + // won't fit on the stack. 
+ let mut boxpinned = + Box::pin(self.pagestream_dispatch_batched_message(batch, io_concurrency, ctx)); + log_slow( + log_slow_name, + LOG_SLOW_GETPAGE_THRESHOLD, + boxpinned.as_mut(), + ) + .await? + }; // We purposefully don't count flush time into the smgr operation timer. // diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 61542409f7..6cca8cc407 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -67,6 +67,7 @@ use tracing::*; use utils::generation::Generation; use utils::guard_arc_swap::GuardArcSwap; use utils::id::TimelineId; +use utils::logging::{MonitorSlowFutureCallback, monitor_slow_future}; use utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use utils::postgres_client::PostgresClientProtocol; use utils::rate_limit::RateLimit; @@ -439,6 +440,8 @@ pub struct Timeline { heatmap_layers_downloader: Mutex>, pub(crate) rel_size_v2_status: ArcSwapOption, + + wait_lsn_log_slow: tokio::sync::Semaphore, } pub(crate) enum PreviousHeatmap { @@ -1479,17 +1482,67 @@ impl Timeline { WaitLsnTimeout::Default => self.conf.wait_lsn_timeout, }; - let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); + let timer = crate::metrics::WAIT_LSN_TIME.start_timer(); + let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard(); - match self.last_record_lsn.wait_for_timeout(lsn, timeout).await { + let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout); + let wait_for_timeout = std::pin::pin!(wait_for_timeout); + // Use threshold of 1 because even 1 second of wait for ingest is very much abnormal. + let log_slow_threshold = Duration::from_secs(1); + // Use period of 10 to avoid flooding logs during an outage that affects all timelines. 
+ let log_slow_period = Duration::from_secs(10); + let mut logging_permit = None; + let wait_for_timeout = monitor_slow_future( + log_slow_threshold, + log_slow_period, + wait_for_timeout, + |MonitorSlowFutureCallback { + ready, + is_slow, + elapsed_total, + elapsed_since_last_callback, + }| { + self.metrics + .wait_lsn_in_progress_micros + .inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap()); + if !is_slow { + return; + } + // It's slow, see if we should log it. + // (We limit the logging to one per invocation per timeline to avoid excessive + // logging during an extended broker / networking outage that affects all timelines.) + if logging_permit.is_none() { + logging_permit = self.wait_lsn_log_slow.try_acquire().ok(); + } + if logging_permit.is_none() { + return; + } + // We log it. + if ready { + info!( + "slow wait_lsn completed after {:.3}s", + elapsed_total.as_secs_f64() + ); + } else { + info!( + "slow wait_lsn still running for {:.3}s", + elapsed_total.as_secs_f64() + ); + } + }, + ); + let res = wait_for_timeout.await; + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(logging_permit); + drop(start_finish_counterpair_guard); + drop(timer); + match res { Ok(()) => Ok(()), Err(e) => { use utils::seqwait::SeqWaitError::*; match e { Shutdown => Err(WaitLsnError::Shutdown), Timeout => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); let walreceiver_status = self.walreceiver_status(); Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", @@ -2821,6 +2874,8 @@ impl Timeline { heatmap_layers_downloader: Mutex::new(None), rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status), + + wait_lsn_log_slow: tokio::sync::Semaphore::new(1), }; 
result.repartition_threshold = diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 83a1a87611..54e6458ac6 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -175,6 +175,9 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_count"), counter("pageserver_timeline_wal_records_received"), counter("pageserver_page_service_pagestream_flush_in_progress_micros"), + counter("pageserver_wait_lsn_in_progress_micros"), + counter("pageserver_wait_lsn_started_count"), + counter("pageserver_wait_lsn_finished_count"), *histogram("pageserver_page_service_batch_size"), *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, From 3c3b9dc9197dc43cce8bd24dfe4feab179ac118c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Mar 2025 17:28:21 +0100 Subject: [PATCH 31/71] pageserver: enable `image_creation_preempt_threshold` by default (#11216) ## Problem This is already set in production, we should harmonize the default. ## Summary of changes Default `image_creation_preempt_threshold` to 3. --- libs/pageserver_api/src/config.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 6e457823dd..c7d33d8a04 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -587,9 +587,8 @@ pub mod tenant_conf_defaults { pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image - // layer creation will end immediately. Set to 0 to disable. The target default will be 3 once we - // want to enable this feature. - pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 0; + // layer creation will end immediately. Set to 0 to disable. 
+ pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; From 398d2794ebb63bb80ff3c896a1cd84878ef4c821 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 13 Mar 2025 18:30:32 +0200 Subject: [PATCH 32/71] Handle DEBUG_COMPARE_LOCAL mode in neon_zeroextend (#11220) ## Problem DEBUG_COMPARE_LOCAL is not supported in neon_zeroextend added in PG16 ## Summary of changes Add support of DEBUG_COMPARE_LOCAL in neon_zeroextend Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 1135212e22..6fe95df3dd 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -2898,6 +2898,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, relpath(reln->smgr_rlocator, forkNum), InvalidBlockNumber))); +#ifdef DEBUG_COMPARE_LOCAL + if (IS_LOCAL_REL(reln)) + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); +#endif + /* Don't log any pages if we're not allowed to do so. */ if (!XLogInsertAllowed()) return; From 066b0a1be91bbf21984d20de9931ca0f650f441f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Thu, 13 Mar 2025 19:18:55 +0100 Subject: [PATCH 33/71] fix(ci): correctly push neon-test-extensions in releases and to ghcr (#11225) ## Problem ef0d4a48a adjusted how we build container images and how we push them, and the neon-test-extensions image was overlooked. Additionally, it was also missed in 1f0dea9a1, which pushed our container images to GHCR. ## Summary of changes Push neon-test-extensions to GHCR and also push release tags for it.
--- .github/workflows/_meta.yml | 2 +- .github/workflows/build_and_test.yml | 49 +++++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index f029385980..9e49c1ebc8 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -125,5 +125,5 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | - RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy)|(compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Faied to find Build and Test run from RC PR!" | halt_error(1))') + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy)|(compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Falied to find Build and Test run from RC PR!" | halt_error(1))') echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e1ad972a61..409ad6be3d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -978,16 +978,55 @@ jobs: acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} secrets: inherit - # This is a bit of a special case so we're not using a generated image map. 
- add-latest-tag-to-neon-extensions-test-image: - if: github.ref_name == 'main' + push-neon-test-extensions-image-ghcr: + if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v16:latest"], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": ["docker.io/neondatabase/neon-test-extensions-v17:latest"] + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + ], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + ] + } + secrets: inherit + + add-latest-tag-to-neon-test-extensions-image: + if: ${{ needs.meta.outputs.run-kind == 'push-main' }} + needs: [ meta, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:latest", + "ghcr.io/neondatabase/neon-test-extensions-v16:latest" + ], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:latest", + "ghcr.io/neondatabase/neon-test-extensions-v17:latest" + ] + } + secrets: inherit + + add-release-tag-to-neon-test-extensions-image: + if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} + needs: [ meta, compute-node-image ] + uses: ./.github/workflows/_push-to-container-registry.yml + with: + image-map: | + { + "docker.io/neondatabase/neon-test-extensions-v16:${{ 
needs.meta.outputs.release-pr-run-id }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}", + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + ], + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}", + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + ] } secrets: inherit From 8afae9d03c13141d2b0c6ebcc2985949e838f5f7 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Mar 2025 20:15:22 +0100 Subject: [PATCH 34/71] pageserver: enable `l0_flush_delay_threshold` by default (#11214) ## Problem `l0_flush_delay_threshold` has already been set to 30 in production for a couple of weeks. Let's harmonize the default. ## Summary of changes Update `DEFAULT_L0_FLUSH_DELAY_FACTOR` to 3 such that the default `l0_flush_delay_threshold` is `3 * compaction_threshold`. This differs from the production setting, which is hardcoded to 30 (with `compaction_threshold` at 10), and is more appropriate for any tenants that have custom `compaction_threshold` overrides. --- libs/pageserver_api/src/config.rs | 8 ++++---- pageserver/src/tenant/timeline.rs | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index c7d33d8a04..e112a57c9d 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -278,10 +278,10 @@ pub struct TenantConfigToml { /// If true, use a separate semaphore (i.e. concurrency limit) for the L0 compaction pass. Only /// has an effect if `compaction_l0_first` is true. Defaults to true. 
pub compaction_l0_semaphore: bool, - /// Level0 delta layer threshold at which to delay layer flushes for compaction backpressure, - /// such that they take 2x as long, and start waiting for layer flushes during ephemeral layer - /// rolls. This helps compaction keep up with WAL ingestion, and avoids read amplification - /// blowing up. Should be >compaction_threshold. 0 to disable. Disabled by default. + /// Level0 delta layer threshold at which to delay layer flushes such that they take 2x as long, + /// and block on layer flushes during ephemeral layer rolls, for compaction backpressure. This + /// helps compaction keep up with WAL ingestion, and avoids read amplification blowing up. + /// Should be >compaction_threshold. 0 to disable. Defaults to 3x compaction_threshold. pub l0_flush_delay_threshold: Option, /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold /// to avoid deadlock. 0 to disable. Disabled by default. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6cca8cc407..be861a0c89 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2476,8 +2476,9 @@ impl Timeline { } fn get_l0_flush_delay_threshold(&self) -> Option { - // Disable L0 flushes by default. This and compaction needs further tuning. - const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 0; // TODO: default to e.g. 3 + // By default, delay L0 flushes at 3x the compaction threshold. The compaction threshold + // defaults to 10, and L0 compaction is generally able to keep L0 counts below 30. + const DEFAULT_L0_FLUSH_DELAY_FACTOR: usize = 3; // If compaction is disabled, don't delay. if self.get_compaction_period() == Duration::ZERO { @@ -2505,8 +2506,9 @@ impl Timeline { } fn get_l0_flush_stall_threshold(&self) -> Option { - // Disable L0 stalls by default. 
In ingest benchmarks, we see image compaction take >10 - // minutes, blocking L0 compaction, and we can't stall L0 flushes for that long. + // Disable L0 stalls by default. Stalling can cause unavailability if L0 compaction isn't + // responsive, and it can e.g. block on other compaction via the compaction semaphore or + // sibling timelines. We need more confidence before enabling this. const DEFAULT_L0_FLUSH_STALL_FACTOR: usize = 0; // TODO: default to e.g. 5 // If compaction is disabled, don't stall. From b1a1be6a4cb91d6a5af9f643a16f786eeb128c17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 13 Mar 2025 20:50:52 +0100 Subject: [PATCH 35/71] switch pytests and neon_local to control_plane_hooks_api (#11195) We want to switch away from and deprecate the `--compute-hook-url` param for the storcon in favour of `--control-plane-url` because it allows us to construct urls with `notify-safekeepers`. This PR switches the pytests and neon_local from a `control_plane_compute_hook_api` to a new param named `control_plane_hooks_api` which is supposed to point to the parent of the `notify-attach` URL. We still support reading the old url from disk to not be too disruptive with existing deployments, but we just ignore it. Also add docs for the `notify-safekeepers` upcall API. 
Follow-up of #11173 Part of https://github.com/neondatabase/neon/issues/11163 --- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/local_env.rs | 19 ++++--- control_plane/src/storage_controller.rs | 6 +- docs/storage_controller.md | 57 ++++++++++++++++--- test_runner/fixtures/compute_reconfigure.py | 2 +- test_runner/fixtures/neon_fixtures.py | 8 +-- .../test_storage_controller_scale.py | 4 +- test_runner/regress/test_change_pageserver.py | 4 +- .../regress/test_pageserver_secondary.py | 4 +- test_runner/regress/test_sharding.py | 8 +-- .../regress/test_storage_controller.py | 20 +++---- 11 files changed, 85 insertions(+), 49 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 72ebbafd3b..747268f80b 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -979,7 +979,7 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), storage_controller: None, - control_plane_compute_hook_api: None, + control_plane_hooks_api: None, generate_local_ssl_certs: false, } }; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index ec9eb74e6f..2e57236ddb 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -72,9 +72,9 @@ pub struct LocalEnv { // be propagated into each pageserver's configuration. pub control_plane_api: Url, - // Control plane upcall API for storage controller. If set, this will be propagated into the + // Control plane upcall APIs for storage controller. If set, this will be propagated into the // storage controller's configuration. - pub control_plane_compute_hook_api: Option, + pub control_plane_hooks_api: Option, /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. 
// A `HashMap>` would be more appropriate here, @@ -104,6 +104,7 @@ pub struct OnDiskConfig { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, + pub control_plane_hooks_api: Option, pub control_plane_compute_hook_api: Option, branch_name_mappings: HashMap>, // Note: skip serializing because in compat tests old storage controller fails @@ -136,7 +137,7 @@ pub struct NeonLocalInitConf { pub pageservers: Vec, pub safekeepers: Vec, pub control_plane_api: Option, - pub control_plane_compute_hook_api: Option>, + pub control_plane_hooks_api: Option, pub generate_local_ssl_certs: bool, } @@ -573,7 +574,8 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, + control_plane_hooks_api, + control_plane_compute_hook_api: _, branch_name_mappings, generate_local_ssl_certs, } = on_disk_config; @@ -588,7 +590,7 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api, + control_plane_hooks_api, branch_name_mappings, generate_local_ssl_certs, } @@ -695,7 +697,8 @@ impl LocalEnv { pageservers: vec![], // it's skip_serializing anyway safekeepers: self.safekeepers.clone(), control_plane_api: Some(self.control_plane_api.clone()), - control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(), + control_plane_hooks_api: self.control_plane_hooks_api.clone(), + control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), generate_local_ssl_certs: self.generate_local_ssl_certs, }, @@ -779,8 +782,8 @@ impl LocalEnv { pageservers, safekeepers, control_plane_api, - control_plane_compute_hook_api, generate_local_ssl_certs, + control_plane_hooks_api, } = conf; // Find postgres binaries. 
@@ -827,7 +830,7 @@ impl LocalEnv { pageservers: pageservers.iter().map(Into::into).collect(), safekeepers, control_plane_api: control_plane_api.unwrap(), - control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(), + control_plane_hooks_api, branch_name_mappings: Default::default(), generate_local_ssl_certs, }; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index bbd7f67720..e28fd70fdf 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -558,10 +558,8 @@ impl StorageController { args.push(format!("--public-key=\"{public_key}\"")); } - if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api { - args.push(format!( - "--compute-hook-url={control_plane_compute_hook_api}" - )); + if let Some(control_plane_hooks_api) = &self.env.control_plane_hooks_api { + args.push(format!("--control-plane-url={control_plane_hooks_api}")); } if let Some(split_threshold) = self.config.split_threshold.as_ref() { diff --git a/docs/storage_controller.md b/docs/storage_controller.md index cf00cd8e33..ac4aca4219 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -101,15 +101,25 @@ changes such as a pageserver node becoming unavailable, or the tenant's shard co postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver location changes. -The hook is configured using the storage controller's `--control-plane-url` CLI option. If the hook requires -JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request. +The hook is configured using the storage controller's `--control-plane-url` CLI option, from which the hook URL is computed. -In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. 
In `neon_local` systems +Currently, there are two hooks, each computed by appending the name to the provided control plane URL prefix: + +- `notify-attach`, called whenever attachment for pageservers changes +- `notify-safekeepers`, called whenever attachment for safekeepers changes + +If the hooks require JWT auth, the token may be provided with `--control-plane-jwt-token`. +The hooks will be invoked with a `PUT` request. + +In the Neon cloud service, these hooks are implemented by Neon's internal cloud control plane. In `neon_local` systems, the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling the compute hook. -When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated: -the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience. +When implementing an on-premise Neon deployment, you must implement a service that handles the compute hooks. This is not complicated. + +### `notify-attach` body + +The `notify-attach` request body follows the format of the `ComputeHookNotifyRequest` structure, provided below for convenience. ``` struct ComputeHookNotifyRequestShard { @@ -128,15 +138,15 @@ When a notification is received: 1. Modify postgres configuration for this tenant: - - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The + - set `neon.pageserver_connstring` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The shards identified by `NodeId` must be converted to the address+port of the node. - - if stripe_size is not None, set `neon.stripe_size` to this value + - if stripe_size is not None, set `neon.shard_stripe_size` to this value 2. Send SIGHUP to postgres to reload configuration 3. Respond with 200 to the notification request.
Do not return success if postgres was not updated: if an error is returned, the controller will retry the notification until it succeeds.. -### Example notification body +Example body: ``` { @@ -148,3 +158,34 @@ When a notification is received: ], } ``` + +### `notify-safekeepers` body + +The `notify-safekeepers` request body follows the format of the `SafekeepersNotifyRequest` structure, provided below for convenience. + +``` +pub struct SafekeeperInfo { + pub id: NodeId, + pub hostname: String, +} + +pub struct SafekeepersNotifyRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub generation: u32, + pub safekeepers: Vec<SafekeeperInfo>, +} +``` + +When a notification is received: + +1. Modify postgres configuration for this tenant: + + - set `neon.safekeeper_connstrings` to an array of postgres connection strings to safekeepers according to the `safekeepers` list. The + safekeepers identified by `NodeId` must be converted to the address+port of the respective safekeeper. + The hostname is provided for debugging purposes, so we reserve changes to how we pass it. + - set `neon.safekeepers_generation` to the provided `generation` value. + +2. Send SIGHUP to postgres to reload configuration +3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller + will retry the notification until it succeeds.
\ No newline at end of file diff --git a/test_runner/fixtures/compute_reconfigure.py b/test_runner/fixtures/compute_reconfigure.py index 425abef935..205b9141e0 100644 --- a/test_runner/fixtures/compute_reconfigure.py +++ b/test_runner/fixtures/compute_reconfigure.py @@ -19,7 +19,7 @@ if TYPE_CHECKING: class ComputeReconfigure: def __init__(self, server: HTTPServer): self.server = server - self.control_plane_compute_hook_api = f"http://{server.host}:{server.port}/notify-attach" + self.control_plane_hooks_api = f"http://{server.host}:{server.port}/" self.workloads: dict[TenantId, Any] = {} self.on_notify: Callable[[Any], None] | None = None diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7bc746d668..11ca1d7913 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -460,7 +460,7 @@ class NeonEnvBuilder: self.overlay_mounts_created_by_us: list[tuple[str, Path]] = [] self.config_init_force: str | None = None self.top_output_dir = top_output_dir - self.control_plane_compute_hook_api: str | None = None + self.control_plane_hooks_api: str | None = None self.storage_controller_config: dict[Any, Any] | None = None # Flag to enable https listener in pageserver, generate local ssl certs, @@ -1116,7 +1116,7 @@ class NeonEnv: self.control_plane_api: str = self.storage_controller.upcall_api_endpoint() # For testing this with a fake HTTP server, enable passing through a URL from config - self.control_plane_compute_hook_api = config.control_plane_compute_hook_api + self.control_plane_hooks_api = config.control_plane_hooks_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode @@ -1137,8 +1137,8 @@ class NeonEnv: if self.control_plane_api is not None: cfg["control_plane_api"] = self.control_plane_api - if self.control_plane_compute_hook_api is not None: - 
cfg["control_plane_compute_hook_api"] = self.control_plane_compute_hook_api + if self.control_plane_hooks_api is not None: + cfg["control_plane_hooks_api"] = self.control_plane_hooks_api storage_controller_config = self.storage_controller_config diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 777b9e2870..e897d53cc8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -83,9 +83,7 @@ def test_storage_controller_many_tenants( "max_offline": "30s", "max_warming_up": "300s", } - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api AZS = ["alpha", "bravo", "charlie"] diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 41aa5b47ca..5526b783d5 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -23,8 +23,8 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder, make_httpserver): ) env = neon_env_builder.init_start() - neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 130db009c9..9f2aa5df8c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -87,8 +87,8 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) - 
neon_env_builder.control_plane_compute_hook_api = ( - f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + neon_env_builder.control_plane_hooks_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/" ) def ignore_notify(request: Request): diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index cb28f5b12d..b98ac8e50a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -794,7 +794,7 @@ def test_sharding_split_stripe_size( Check that modifying stripe size inline with a shard split works as expected """ (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" neon_env_builder.num_pageservers = 1 # Set up fake HTTP notify endpoint: we will use this to validate that we receive @@ -806,7 +806,7 @@ def test_sharding_split_stripe_size( notifications.append(request.json) return Response(status=200) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) env = neon_env_builder.init_start( initial_tenant_shard_count=1, initial_tenant_shard_stripe_size=initial_stripe_size @@ -1312,9 +1312,7 @@ def test_sharding_split_failures( failure: Failure, ): neon_env_builder.num_pageservers = 4 - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api initial_shard_count = 2 split_shard_count = 4 diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 5eaf69cfa1..05eb4301b0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -605,7 +605,7 @@ def 
test_storage_controller_compute_hook( # when migrating. neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -618,7 +618,7 @@ def test_storage_controller_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -724,7 +724,7 @@ def test_storage_controller_stuck_compute_hook( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -871,7 +871,7 @@ def test_storage_controller_compute_hook_retry( neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" handle_params = {"status": 200} @@ -883,7 +883,7 @@ def test_storage_controller_compute_hook_retry( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", 
method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_configs() @@ -993,7 +993,7 @@ def test_storage_controller_compute_hook_revert( # when migrating. neon_env_builder.num_pageservers = 2 (host, port) = httpserver_listen_address - neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + neon_env_builder.control_plane_hooks_api = f"http://{host}:{port}" # Set up fake HTTP notify endpoint notifications = [] @@ -1006,7 +1006,7 @@ def test_storage_controller_compute_hook_revert( notifications.append(request.json) return Response(status=status) - httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(handler) # Start running env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) @@ -1395,9 +1395,7 @@ def test_storage_controller_tenant_deletion( """ neon_env_builder.num_pageservers = 4 neon_env_builder.enable_pageserver_remote_storage(s3_storage()) - neon_env_builder.control_plane_compute_hook_api = ( - compute_reconfigure_listener.control_plane_compute_hook_api - ) + neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api env = neon_env_builder.init_configs() env.start() From 23b713900ef97251967691757011e4a793814188 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:21:23 -0400 Subject: [PATCH 36/71] feat(storcon): passthrough ancestor detach behavior (#11199) ## Problem https://github.com/neondatabase/neon/issues/10310 https://github.com/neondatabase/neon/pull/11158 ## Summary of changes We need to passthrough the new detach behavior through the storcon API. 
Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 33 +++++++++++++++++++ pageserver/client/src/mgmt_api.rs | 12 +++++-- pageserver/src/http/routes.rs | 8 ++--- pageserver/src/tenant/mgr.rs | 4 +-- pageserver/src/tenant/timeline.rs | 9 ++--- .../src/tenant/timeline/detach_ancestor.rs | 25 +------------- storage_controller/src/http.rs | 5 +-- storage_controller/src/pageserver_client.rs | 11 ++++--- storage_controller/src/service.rs | 11 ++++--- test_runner/fixtures/pageserver/http.py | 6 ++-- .../regress/test_timeline_detach_ancestor.py | 12 +++++-- 11 files changed, 84 insertions(+), 52 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5e5bcf5338..4a8f75413c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -176,6 +176,39 @@ impl LsnLease { } } +/// Controls the detach ancestor behavior. +/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point. +/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all. 
+#[derive(Debug, Clone, Copy, Default)] +pub enum DetachBehavior { + #[default] + NoAncestorAndReparent, + MultiLevelAndNoReparent, +} + +impl std::str::FromStr for DetachBehavior { + type Err = &'static str; + + fn from_str(s: &str) -> Result { + match s { + "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent), + "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent), + "v1" => Ok(DetachBehavior::NoAncestorAndReparent), + "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent), + _ => Err("cannot parse detach behavior"), + } + } +} + +impl std::fmt::Display for DetachBehavior { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DetachBehavior::NoAncestorAndReparent => write!(f, "no_ancestor_and_reparent"), + DetachBehavior::MultiLevelAndNoReparent => write!(f, "multi_level_and_no_reparent"), + } + } +} + /// The only [`TenantState`] variants we could be `TenantState::Activating` from. /// /// XXX: We used to have more variants here, but now it's just one, which makes this rather diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 830fd8a531..508dac231e 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody; use pageserver_api::models::*; use pageserver_api::shard::TenantShardId; pub use reqwest::Body as ReqwestBody; -use reqwest::{Certificate, IntoUrl, Method, StatusCode}; +use reqwest::{Certificate, IntoUrl, Method, StatusCode, Url}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -458,13 +458,21 @@ impl Client { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + behavior: Option, ) -> Result { let uri = format!( "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor", self.mgmt_api_endpoint ); + let mut uri = Url::parse(&uri) + .map_err(|e| Error::ApiError(StatusCode::INTERNAL_SERVER_ERROR, format!("{e}")))?; - 
self.request(Method::PUT, &uri, ()) + if let Some(behavior) = behavior { + uri.query_pairs_mut() + .append_pair("detach_behavior", &behavior.to_string()); + } + + self.request(Method::PUT, uri, ()) .await? .json() .await diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 70c3cc8522..e8a32ca1ef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -28,9 +28,9 @@ use hyper::{Body, Request, Response, StatusCode, Uri, header}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::models::{ - DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, ListAuxFilesRequest, - LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, LsnLeaseRequest, - OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, + DetachBehavior, DownloadRemoteLayersTaskSpawnRequest, IngestAuxFilesRequest, + ListAuxFilesRequest, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, + LsnLeaseRequest, OffloadedTimelineInfo, PageTraceEvent, ShardParameters, StatusResponse, TenantConfigPatchRequest, TenantConfigRequest, TenantDetails, TenantInfo, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantScanRemoteStorageResponse, TenantScanRemoteStorageShard, TenantShardLocation, TenantShardSplitRequest, @@ -72,7 +72,6 @@ use crate::tenant::remote_timeline_client::{ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName}; -use crate::tenant::timeline::detach_ancestor::DetachBehavior; use crate::tenant::timeline::offload::{OffloadError, offload_timeline}; use crate::tenant::timeline::{ CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout, @@ -2508,6 +2507,7 @@ async fn timeline_detach_ancestor_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let 
timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let behavior: Option = parse_query_param(&request, "detach_behavior")?; + let behavior = behavior.unwrap_or_default(); let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 092bfdf6c1..f02247950f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -14,7 +14,7 @@ use futures::StreamExt; use itertools::Itertools; use once_cell::sync::Lazy; use pageserver_api::key::Key; -use pageserver_api::models::LocationConfigMode; +use pageserver_api::models::{DetachBehavior, LocationConfigMode}; use pageserver_api::shard::{ ShardCount, ShardIdentity, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId, }; @@ -1914,7 +1914,7 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, - behavior: detach_ancestor::DetachBehavior, + behavior: DetachBehavior, mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, detach_ancestor::Error> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index be861a0c89..face2dfdc1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -45,8 +45,9 @@ use pageserver_api::key::{ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}; use pageserver_api::models::{ CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState, + DetachBehavior, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, + EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, + 
TimelineState, }; use pageserver_api::reltag::{BlockNumber, RelTag}; use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId}; @@ -5445,7 +5446,7 @@ impl Timeline { self: &Arc, tenant: &crate::tenant::Tenant, options: detach_ancestor::Options, - behavior: detach_ancestor::DetachBehavior, + behavior: DetachBehavior, ctx: &RequestContext, ) -> Result { detach_ancestor::prepare(self, tenant, behavior, options, ctx).await @@ -5466,7 +5467,7 @@ impl Timeline { prepared: detach_ancestor::PreparedTimelineDetach, ancestor_timeline_id: TimelineId, ancestor_lsn: Lsn, - behavior: detach_ancestor::DetachBehavior, + behavior: DetachBehavior, ctx: &RequestContext, ) -> Result { detach_ancestor::detach_and_reparent( diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index c3e4bedc50..ac9d9a4579 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::Context; use http_utils::error::ApiError; +use pageserver_api::models::DetachBehavior; use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::shard::ShardIdentity; use tokio::sync::Semaphore; @@ -139,30 +140,6 @@ pub(crate) struct Options { pub(crate) copy_concurrency: std::num::NonZeroUsize, } -/// Controls the detach ancestor behavior. -/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point. -/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all. 
-#[derive(Debug, Clone, Copy, Default)] -pub enum DetachBehavior { - #[default] - NoAncestorAndReparent, - MultiLevelAndNoReparent, -} - -impl std::str::FromStr for DetachBehavior { - type Err = &'static str; - - fn from_str(s: &str) -> Result { - match s { - "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent), - "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent), - "v1" => Ok(DetachBehavior::NoAncestorAndReparent), - "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent), - _ => Err("cannot parse detach behavior"), - } - } -} - impl Default for Options { fn default() -> Self { Self { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index b27804d820..52e3ef5b0a 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -24,7 +24,7 @@ use pageserver_api::controller_api::{ ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, }; use pageserver_api::models::{ - TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; @@ -525,6 +525,7 @@ async fn handle_tenant_timeline_detach_ancestor( ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + let behavior: Option = parse_query_param(&req, "detach_behavior")?; check_permissions(&req, Scope::PageServerApi)?; maybe_rate_limit(&req, tenant_id).await; @@ -537,7 +538,7 @@ async fn handle_tenant_timeline_detach_ancestor( }; let res = service - .tenant_timeline_detach_ancestor(tenant_id, timeline_id) + .tenant_timeline_detach_ancestor(tenant_id, timeline_id, behavior) .await?; json_response(StatusCode::OK, res) diff --git a/storage_controller/src/pageserver_client.rs 
b/storage_controller/src/pageserver_client.rs index 7fd4f37e7e..05e7aa88c6 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,9 +1,9 @@ use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ - LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, - TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse, - TenantWaitLsnRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineInfo, - TopTenantShardsRequest, TopTenantShardsResponse, + DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization, + SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest, + TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }; use pageserver_api::shard::TenantShardId; use pageserver_client::BlockUnblock; @@ -252,13 +252,14 @@ impl PageserverClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + behavior: Option, ) -> Result { measured_request!( "timeline_detach_ancestor", crate::metrics::Method::Put, &self.node_id_label, self.inner - .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) .await ) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 445b174b96..789f4da255 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -34,9 +34,9 @@ use pageserver_api::controller_api::{ TenantShardMigrateResponse, }; use pageserver_api::models::{ - self, LocationConfig, LocationConfigListResponse, LocationConfigMode, PageserverUtilization, - SafekeeperInfo, SafekeepersInfo, SecondaryProgress, ShardParameters, TenantConfig, - TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + self, 
DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, + PageserverUtilization, SafekeeperInfo, SafekeepersInfo, SecondaryProgress, ShardParameters, + TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, @@ -4041,6 +4041,7 @@ impl Service { &self, tenant_id: TenantId, timeline_id: TimelineId, + behavior: Option, ) -> Result { tracing::info!("Detaching timeline {tenant_id}/{timeline_id}",); @@ -4064,6 +4065,7 @@ impl Service { node: Node, jwt: Option, ssl_ca_cert: Option, + behavior: Option, ) -> Result<(ShardNumber, models::detach_ancestor::AncestorDetached), ApiError> { tracing::info!( "Detaching timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", @@ -4073,7 +4075,7 @@ impl Service { .map_err(|e| passthrough_api_error(&node, e))?; client - .timeline_detach_ancestor(tenant_shard_id, timeline_id) + .timeline_detach_ancestor(tenant_shard_id, timeline_id, behavior) .await .map_err(|e| { use mgmt_api::Error; @@ -4111,6 +4113,7 @@ impl Service { node, self.config.pageserver_jwt_token.clone(), self.config.ssl_ca_cert.clone(), + behavior, )) }) .await?; diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 61aab2213d..13cab448f3 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -1070,14 +1070,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, batch_size: int | None = None, - behavior_v2: bool = False, + detach_behavior: str | None = None, **kwargs, ) -> set[TimelineId]: params: dict[str, Any] = {} if batch_size is not None: params["batch_size"] = batch_size - if behavior_v2: - 
params["detach_behavior"] = "v2" + if detach_behavior: + params["detach_behavior"] = detach_behavior res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor", params=params, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 79537ba83a..685a32af90 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -407,7 +407,9 @@ def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder): after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) - all_reparented = client.detach_ancestor(env.initial_tenant, branch_to_detach, behavior_v2=True) + all_reparented = client.detach_ancestor( + env.initial_tenant, branch_to_detach, detach_behavior="v2" + ) assert set(all_reparented) == set() env.pageserver.quiesce_tenants() @@ -1350,8 +1352,10 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv ) +@pytest.mark.parametrize("detach_behavior", ["default", "v1", "v2"]) def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( neon_env_builder: NeonEnvBuilder, + detach_behavior: str, ): shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1390,7 +1394,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( victim_http.configure_failpoints([(pausepoint, "pause"), (failpoint, "return")]) def detach_timeline(): - http.detach_ancestor(env.initial_tenant, detached_branch) + http.detach_ancestor( + env.initial_tenant, + detached_branch, + detach_behavior=detach_behavior if detach_behavior != "default" else None, + ) def paused_at_failpoint(): stuck.assert_log_contains(f"at failpoint {pausepoint}") From 9a3020d2ce038d61ae2b180a6ab698cf2232cb87 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 13 Mar 2025 20:23:53 +0000 Subject: [PATCH 37/71] chore(proxy): 
pre-initialise metricvecs (#11226) ## Problem We noticed that error metrics didn't show for some services with light load. This is not great and can cause problems for dashboards/alerts. ## Summary of changes Pre-initialise some metricvecs. --- proxy/src/metrics.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 29834760c0..e5fc0b724b 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -30,7 +30,16 @@ pub struct Metrics { static SELF: OnceLock = OnceLock::new(); impl Metrics { pub fn install(thread_pool: Arc) { - SELF.set(Metrics::new(thread_pool)) + let mut metrics = Metrics::new(thread_pool); + + metrics.proxy.errors_total.init_all_dense(); + metrics.proxy.redis_errors_total.init_all_dense(); + metrics.proxy.redis_events_count.init_all_dense(); + metrics.proxy.retries_metric.init_all_dense(); + metrics.proxy.invalid_endpoints_total.init_all_dense(); + metrics.proxy.connection_failures_total.init_all_dense(); + + SELF.set(metrics) .ok() .expect("proxy metrics must not be installed more than once"); } From 4ff000c042b5ad63cd8d9ca209ebb877f61ec6fe Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Mar 2025 21:46:21 +0100 Subject: [PATCH 38/71] pageserver: deflake `test_metadata_image_creation` (#11230) ## Problem `test_metadata_image_creation` became flaky with #11212, since image compaction may yield to L0 compaction. ## Summary of changes Set `NoYield` when compacting in tenant tests.
--- pageserver/src/tenant.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2bce56345a..7a06d60268 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7020,6 +7020,7 @@ mod tests { child_timeline.freeze_and_flush().await?; let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); child_timeline .compact(&CancellationToken::new(), flags, &ctx) .await?; @@ -7727,6 +7728,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags } else { EnumSet::empty() @@ -7893,7 +7895,6 @@ mod tests { Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) } - #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = (blknum * STEP) as u32; @@ -7943,6 +7944,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8405,6 +8407,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, @@ -8472,6 +8475,7 @@ mod tests { let mut flags = EnumSet::new(); flags.insert(CompactFlags::ForceImageLayerCreation); flags.insert(CompactFlags::ForceRepartition); + flags.insert(CompactFlags::NoYield); flags }, &ctx, From d6d78a050f4d3b807718ba2a1f2fbc0d779b5cc8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Mar 2025 22:08:28 +0100 Subject: [PATCH 39/71] pageserver: disable `l0_flush_wait_upload` by default (#11215) ## Problem This is already disabled in production, as it is replaced by L0 flush delays. 
It will be removed in a later PR, once the config option is no longer specified in production. ## Summary of changes Disable `l0_flush_wait_upload` by default. --- libs/pageserver_api/src/config.rs | 4 +- .../regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_branching.py | 7 +-- test_runner/regress/test_remote_storage.py | 55 +------------------ 4 files changed, 8 insertions(+), 60 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e112a57c9d..b12ef65780 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -289,6 +289,8 @@ pub struct TenantConfigToml { /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next /// layer. This is a temporary backpressure mechanism which should be removed once /// l0_flush_{delay,stall}_threshold is fully enabled. + /// + /// TODO: this is no longer enabled, remove it when the config option is no longer set. pub l0_flush_wait_upload: bool, // Determines how much history is retained, to allow // branching and read replicas at an older point in time. 
@@ -576,7 +578,7 @@ pub mod tenant_conf_defaults { pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = crate::models::CompactionAlgorithm::Legacy; - pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = true; + pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 07600dd911..b56fcd3500 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -144,7 +144,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "compaction_l0_semaphore": False, "l0_flush_delay_threshold": 25, "l0_flush_stall_threshold": 42, - "l0_flush_wait_upload": False, + "l0_flush_wait_upload": True, "compaction_target_size": 1048576, "checkpoint_distance": 10000, "checkpoint_timeout": "13m", diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 34e4e994cb..85d0cfbf1d 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -19,6 +19,7 @@ from fixtures.pageserver.utils import wait_until_tenant_active from fixtures.utils import query_scalar from performance.test_perf_pgbench import get_scales_matrix from requests import RequestException +from requests.exceptions import RetryError # Test branch creation @@ -180,7 +181,6 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env.endpoints.create_start( initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2 ) - ps_http.configure_failpoints(("before-upload-index-pausable", "off")) finally: env.pageserver.stop(immediate=True) @@ -221,10 +221,7 @@ def test_cannot_branch_from_non_uploaded_branch(neon_env_builder: NeonEnvBuilder branch_id = TimelineId.generate() - with pytest.raises( - PageserverApiException, - match="Cannot branch off the timeline that's not present in 
pageserver", - ): + with pytest.raises(RetryError, match="too many 503 error responses"): ps_http.timeline_create( env.pg_version, env.initial_tenant, diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index c39c74fa2a..e8721f1ea0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -29,7 +29,6 @@ from fixtures.remote_storage import ( from fixtures.utils import ( assert_eq, assert_ge, - assert_gt, print_gc_result, query_scalar, wait_until, @@ -334,14 +333,12 @@ def test_remote_storage_upload_queue_retries( # Exponential back-off in upload queue, so, gracious timeouts. wait_until( - lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + lambda: assert_ge(get_queued_count(file_kind="layer", op_kind="upload"), 1), timeout=30 ) wait_until( lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 ) - wait_until( - lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 - ) + # There may or may not be deletes queued up behind conflicting uploads; don't check. # unblock churn operations configure_storage_sync_failpoints("off") @@ -786,54 +783,6 @@ def test_empty_branch_remote_storage_upload_on_restart(neon_env_builder: NeonEnv create_thread.join() -def test_paused_upload_stalls_checkpoint( - neon_env_builder: NeonEnvBuilder, -): - """ - This test checks that checkpoints block on uploads to remote storage. 
- """ - neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - - env = neon_env_builder.init_start( - initial_tenant_conf={ - # Set a small compaction threshold - "compaction_threshold": "3", - # Disable GC - "gc_period": "0s", - # disable PITR - "pitr_interval": "0s", - } - ) - - env.pageserver.allowed_errors.append( - f".*PUT.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" - ) - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - client = env.pageserver.http_client() - layers_at_creation = client.layer_map_info(tenant_id, timeline_id) - deltas_at_creation = len(layers_at_creation.delta_layers()) - assert ( - deltas_at_creation == 1 - ), "are you fixing #5863? make sure we end up with 2 deltas at the end of endpoint lifecycle" - - # Make new layer uploads get stuck. - # Note that timeline creation waits for the initial layers to reach remote storage. - # So at this point, the `layers_at_creation` are in remote storage. - client.configure_failpoints(("before-upload-layer-pausable", "pause")) - - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: - # Build two tables with some data inside - endpoint.safe_psql("CREATE TABLE foo AS SELECT x FROM generate_series(1, 10000) g(x)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) - - with pytest.raises(ReadTimeout): - client.timeline_checkpoint(tenant_id, timeline_id, timeout=5) - client.configure_failpoints(("before-upload-layer-pausable", "off")) - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 5359cf717c557061652efaa06e14ecf111fe5071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 14 Mar 2025 01:00:37 +0100 Subject: [PATCH 40/71] storcon: add API definitions for exclude_timeline and term_bump (#11197) Adds API definitions for the safekeeper API endpoints `exclude_timeline` and `term_bump`. 
Also does a bugfix to return the correct type from `delete_timeline`. Part of #8614 --- libs/safekeeper_api/src/models.rs | 5 +++ safekeeper/client/src/mgmt_api.rs | 40 +++++++++++++++++++-- safekeeper/src/http/routes.rs | 5 +-- safekeeper/src/timelines_global_map.rs | 8 +---- storage_controller/src/safekeeper_client.rs | 38 ++++++++++++++++++-- 5 files changed, 83 insertions(+), 13 deletions(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 10c703395f..6bdc651668 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -221,6 +221,11 @@ pub struct TimelineMembershipSwitchResponse { pub current_conf: Configuration, } +#[derive(Clone, Copy, Serialize, Deserialize)] +pub struct TimelineDeleteResult { + pub dir_existed: bool, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 3966aa811f..7ae39ef95e 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -8,7 +8,7 @@ use std::error::Error as _; use http_utils::error::HttpErrorBody; use reqwest::{IntoUrl, Method, StatusCode}; use safekeeper_api::models::{ - PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use utils::id::{NodeId, TenantId, TimelineId}; @@ -96,11 +96,25 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn exclude_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/exclude", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, - ) 
-> Result { + ) -> Result { let uri = format!( "{}/v1/tenant/{}/timeline/{}", self.mgmt_api_endpoint, tenant_id, timeline_id @@ -109,6 +123,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn bump_timeline_term( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineTermBumpRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/term_bump", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.post(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn timeline_status( &self, tenant_id: TenantId, @@ -149,6 +177,14 @@ impl Client { self.request(Method::POST, uri, body).await } + async fn put( + &self, + uri: U, + body: B, + ) -> Result { + self.request(Method::PUT, uri, body).await + } + async fn get(&self, uri: U) -> Result { self.request(Method::GET, uri, ()).await } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 4f47331c85..21293671e1 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -17,7 +17,8 @@ use hyper::{Body, Request, Response, StatusCode}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::{ AcceptorStateStatus, PullTimelineRequest, SafekeeperStatus, SkTimelineInfo, TermSwitchApiEntry, - TimelineCopyRequest, TimelineCreateRequest, TimelineStatus, TimelineTermBumpRequest, + TimelineCopyRequest, TimelineCreateRequest, TimelineDeleteResult, TimelineStatus, + TimelineTermBumpRequest, }; use safekeeper_api::{ServerInfo, membership, models}; use storage_broker::proto::{SafekeeperTimelineInfo, TenantTimelineId as ProtoTenantTimelineId}; @@ -32,7 +33,7 @@ use utils::lsn::Lsn; use crate::debug_dump::TimelineDigestRequest; use crate::safekeeper::TermLsn; -use crate::timelines_global_map::{DeleteOrExclude, TimelineDeleteResult}; +use crate::timelines_global_map::DeleteOrExclude; use crate::{ GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, 
patch_control_file, pull_timeline, }; diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 858dfce807..41abee369e 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -11,9 +11,8 @@ use anyhow::{Context, Result, bail}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; use safekeeper_api::membership::Configuration; -use safekeeper_api::models::SafekeeperUtilization; +use safekeeper_api::models::{SafekeeperUtilization, TimelineDeleteResult}; use safekeeper_api::{ServerInfo, membership}; -use serde::Serialize; use tokio::fs; use tracing::*; use utils::crashsafe::{durable_rename, fsync_async_opt}; @@ -579,11 +578,6 @@ impl GlobalTimelines { } } -#[derive(Clone, Copy, Serialize)] -pub struct TimelineDeleteResult { - pub dir_existed: bool, -} - /// Action for delete_or_exclude. #[derive(Clone, Debug)] pub enum DeleteOrExclude { diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 1533b6c086..a44fcc27d2 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,5 +1,5 @@ use safekeeper_api::models::{ - PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, + self, PullTimelineRequest, PullTimelineResponse, SafekeeperUtilization, TimelineCreateRequest, TimelineStatus, }; use safekeeper_client::mgmt_api::{Client, Result}; @@ -69,11 +69,28 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn exclude_timeline( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "exclude_timeline", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .exclude_timeline(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn delete_timeline( &self, tenant_id: TenantId, timeline_id: TimelineId, - ) -> Result { + ) 
-> Result { measured_request!( "delete_timeline", crate::metrics::Method::Delete, @@ -94,6 +111,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn bump_timeline_term( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineTermBumpRequest, + ) -> Result { + measured_request!( + "term_bump", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .bump_timeline_term(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn get_utilization(&self) -> Result { measured_request!( "utilization", From 04370b48b30a2ba63a2e17fbc0405ec9e403ff50 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 14 Mar 2025 12:21:16 +0100 Subject: [PATCH 41/71] fix(storcon): optimization validation makes decisions based on wrong SecondaryProgress (#11229) # Refs - fixes https://github.com/neondatabase/neon/issues/11228 # Problem High-Level When storcon validates whether a `ScheduleOptimizationAction` should be applied, it retrieves the `tenant_secondary_status` to determine whether a secondary is ready for the optimization. When collecting results, it associates secondary statuses with the wrong optimization actions in the batch of optimizations that we're validating. The result is that we make the decision for shard/location X based on the SecondaryStatus of a random secondary location Y in the current batch of optimizations. A possible symptom is an early cutover, as seen in this engineering investigation here: - https://github.com/neondatabase/cloud/issues/25734 # Problem Code-Level This code here in `optimize_all_validate` https://github.com/neondatabase/neon/blob/97e2e27f682003bcc8ac1c9e625bc3675f394264/storage_controller/src/service.rs#L7012-L7029 zips the `want_secondary_status` with the Vec returned from `tenant_for_shards_api`. However, the Vec returned from `tenant_for_shards_api` is not ordered (it uses FuturesUnordered internally). # Solution Sort the Vec in input order before returning it.
`optimize_all_validate` was the only caller affected by this problem While at it, also future-proof similar-looking function `tenant_for_shards`. None of its callers care about the order, but this type of function signature is easy to use incorrectly. # Future Work Avoid the additional iteration, map, and allocation. Change API to leverage AsyncFn (async closure). And/or invert `tenant_for_shards_api` into a Future ext trait / iterator adaptor thing. --- storage_controller/src/service.rs | 52 ++++++++++++++++++------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 789f4da255..f33408a89b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4268,7 +4268,8 @@ impl Service { /// Helper for concurrently calling a pageserver API on a number of shards, such as timeline creation. /// - /// On success, the returned vector contains exactly the same number of elements as the input `locations`. + /// On success, the returned vector contains exactly the same number of elements as the input `locations` + /// and returned element at index `i` is the result for `req_fn(op(locations[i])`. 
async fn tenant_for_shards( &self, locations: Vec<(TenantShardId, Node)>, @@ -4284,18 +4285,23 @@ impl Service { let mut futs = FuturesUnordered::new(); let mut results = Vec::with_capacity(locations.len()); - for (tenant_shard_id, node) in locations { - futs.push(req_fn(tenant_shard_id, node)); + for (idx, (tenant_shard_id, node)) in locations.into_iter().enumerate() { + let fut = req_fn(tenant_shard_id, node); + futs.push(async move { (idx, fut.await) }); } - while let Some(r) = futs.next().await { - results.push(r?); + while let Some((idx, r)) = futs.next().await { + results.push((idx, r?)); } - Ok(results) + results.sort_by_key(|(idx, _)| *idx); + Ok(results.into_iter().map(|(_, r)| r).collect()) } - /// Concurrently invoke a pageserver API call on many shards at once + /// Concurrently invoke a pageserver API call on many shards at once. + /// + /// The returned Vec has the same length as the `locations` Vec, + /// and returned element at index `i` is the result for `op(locations[i])`. 
pub(crate) async fn tenant_for_shards_api( &self, locations: Vec<(TenantShardId, Node)>, @@ -4312,27 +4318,29 @@ impl Service { let mut futs = FuturesUnordered::new(); let mut results = Vec::with_capacity(locations.len()); - for (tenant_shard_id, node) in locations { + for (idx, (tenant_shard_id, node)) in locations.into_iter().enumerate() { futs.push(async move { - node.with_client_retries( - |client| op(tenant_shard_id, client), - &self.config.pageserver_jwt_token, - &self.config.ssl_ca_cert, - warn_threshold, - max_retries, - timeout, - cancel, - ) - .await + let r = node + .with_client_retries( + |client| op(tenant_shard_id, client), + &self.config.pageserver_jwt_token, + &self.config.ssl_ca_cert, + warn_threshold, + max_retries, + timeout, + cancel, + ) + .await; + (idx, r) }); } - while let Some(r) = futs.next().await { - let r = r.unwrap_or(Err(mgmt_api::Error::Cancelled)); - results.push(r); + while let Some((idx, r)) = futs.next().await { + results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled)))); } - results + results.sort_by_key(|(idx, _)| *idx); + results.into_iter().map(|(_, r)| r).collect() } /// Helper for safely working with the shards in a tenant remotely on pageservers, for example From f68be2b5e26321a506cc5afd14b826606031a89c Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 14 Mar 2025 15:41:22 +0400 Subject: [PATCH 42/71] safekeeper: https for management API (#11171) ## Problem Storage controller uses unencrypted HTTP requests for safekeeper management API. - Closes: https://github.com/neondatabase/cloud/issues/24836 ## Summary of changes - Replace `hyper0::server::Server` with `http_utils::server::Server` in safekeeper. - Add HTTPS handler for safekeeper management API. 
--- Cargo.lock | 6 +- control_plane/src/local_env.rs | 11 +++- control_plane/src/safekeeper.rs | 22 ++++++++ control_plane/src/storage_controller.rs | 4 ++ libs/http-utils/Cargo.toml | 2 + libs/http-utils/src/lib.rs | 1 + libs/http-utils/src/tls_certs.rs | 21 +++++++ pageserver/Cargo.toml | 2 - pageserver/src/bin/pageserver.rs | 24 +------- safekeeper/Cargo.toml | 6 +- safekeeper/src/bin/safekeeper.rs | 55 ++++++++++++++++++- safekeeper/src/http/mod.rs | 38 +++++++++++-- safekeeper/src/http/routes.rs | 11 +++- safekeeper/src/lib.rs | 12 ++++ safekeeper/src/pull_timeline.rs | 23 +++++--- .../tests/walproposer_sim/safekeeper.rs | 4 ++ test_runner/fixtures/neon_fixtures.py | 14 ++++- test_runner/regress/test_ssl.py | 55 ++++++++++++++++++- test_runner/regress/test_wal_acceptor.py | 1 + 19 files changed, 264 insertions(+), 48 deletions(-) create mode 100644 libs/http-utils/src/tls_certs.rs diff --git a/Cargo.lock b/Cargo.lock index 898ff1eabb..12fa3589f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2863,6 +2863,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", + "camino", "fail", "futures", "hyper 0.14.30", @@ -2873,6 +2874,7 @@ dependencies = [ "pprof", "regex", "routerify", + "rustls-pemfile 2.1.1", "serde", "serde_json", "serde_path_to_error", @@ -4327,8 +4329,6 @@ dependencies = [ "reqwest", "rpds", "rustls 0.23.18", - "rustls-pemfile 2.1.1", - "rustls-pki-types", "scopeguard", "send-future", "serde", @@ -6044,6 +6044,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", + "rustls 0.23.18", "safekeeper_api", "safekeeper_client", "scopeguard", @@ -6060,6 +6061,7 @@ dependencies = [ "tokio", "tokio-io-timeout", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-tar", "tokio-util", diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2e57236ddb..f0a11106bd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -149,7 +149,7 @@ pub struct NeonBroker { pub listen_addr: SocketAddr, } 
-/// Broker config for cluster internal communication. +/// A part of storage controller's config the neon_local knows about. #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct NeonStorageControllerConf { @@ -176,10 +176,11 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub long_reconcile_threshold: Option, - #[serde(default)] pub use_https_pageserver_api: bool, pub timelines_onto_safekeepers: bool, + + pub use_https_safekeeper_api: bool, } impl NeonStorageControllerConf { @@ -205,6 +206,7 @@ impl Default for NeonStorageControllerConf { long_reconcile_threshold: None, use_https_pageserver_api: false, timelines_onto_safekeepers: false, + use_https_safekeeper_api: false, } } } @@ -302,6 +304,7 @@ pub struct SafekeeperConf { pub pg_port: u16, pub pg_tenant_only_port: Option, pub http_port: u16, + pub https_port: Option, pub sync: bool, pub remote_storage: Option, pub backup_threads: Option, @@ -316,6 +319,7 @@ impl Default for SafekeeperConf { pg_port: 0, pg_tenant_only_port: None, http_port: 0, + https_port: None, sync: true, remote_storage: None, backup_threads: None, @@ -845,6 +849,9 @@ impl LocalEnv { // create safekeeper dirs for safekeeper in &env.safekeepers { fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?; + SafekeeperNode::from_env(&env, safekeeper) + .initialize() + .context("safekeeper init failed")?; } // initialize pageserver state diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 70915d5aaf..231871852e 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -111,6 +111,18 @@ impl SafekeeperNode { .expect("non-Unicode path") } + /// Initializes a safekeeper node by creating all necessary files, + /// e.g. SSL certificates. 
+ pub fn initialize(&self) -> anyhow::Result<()> { + if self.env.generate_local_ssl_certs { + self.env.generate_ssl_cert( + &self.datadir_path().join("server.crt"), + &self.datadir_path().join("server.key"), + )?; + } + Ok(()) + } + pub async fn start( &self, extra_opts: &[String], @@ -196,6 +208,16 @@ impl SafekeeperNode { ]); } + if let Some(https_port) = self.conf.https_port { + args.extend([ + "--listen-https".to_owned(), + format!("{}:{}", self.listen_addr, https_port), + ]); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { + args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); + } + args.extend_from_slice(extra_opts); background_process::start_process( diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index e28fd70fdf..0c78f2e18e 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -538,6 +538,10 @@ impl StorageController { args.push("--use-https-pageserver-api".to_string()); } + if self.config.use_https_safekeeper_api { + args.push("--use-https-safekeeper-api".to_string()); + } + if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() { args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap())); } diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 00b3777a63..331ae4a9b8 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -7,6 +7,7 @@ license.workspace = true [dependencies] anyhow.workspace = true bytes.workspace = true +camino.workspace = true fail.workspace = true futures.workspace = true hyper0.workspace = true @@ -16,6 +17,7 @@ once_cell.workspace = true pprof.workspace = true regex.workspace = true routerify.workspace = true +rustls-pemfile.workspace = true serde.workspace = true serde_json.workspace = true serde_path_to_error.workspace = true diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs index dd520ef69b..2bd0fe582f 100644 --- 
a/libs/http-utils/src/lib.rs +++ b/libs/http-utils/src/lib.rs @@ -4,6 +4,7 @@ pub mod failpoints; pub mod json; pub mod request; pub mod server; +pub mod tls_certs; extern crate hyper0 as hyper; diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs new file mode 100644 index 0000000000..db9ec825ed --- /dev/null +++ b/libs/http-utils/src/tls_certs.rs @@ -0,0 +1,21 @@ +use camino::Utf8Path; +use tokio_rustls::rustls::pki_types::{CertificateDer, PrivateKeyDer}; + +pub fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + Ok(rustls_pemfile::certs(&mut reader).collect::, _>>()?) +} + +pub fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { + let file = std::fs::File::open(filename)?; + let mut reader = std::io::BufReader::new(file); + + let key = rustls_pemfile::private_key(&mut reader)?; + + key.ok_or(anyhow::anyhow!( + "no private key found in {}", + filename.as_str(), + )) +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index d17a19ce65..56d97bf8a9 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,8 +48,6 @@ pprof.workspace = true rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true -rustls-pemfile.workspace = true -rustls-pki-types.workspace = true rustls.workspace = true scopeguard.workspace = true send-future.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 4d30a6358b..3ab6d79546 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -30,7 +30,6 @@ use pageserver::{ }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; -use rustls_pki_types::{CertificateDer, PrivateKeyDer}; use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; @@ -622,8 +621,8 @@ fn start_pageserver( let 
https_task = match https_listener { Some(https_listener) => { - let certs = load_certs(&conf.ssl_cert_file)?; - let key = load_private_key(&conf.ssl_key_file)?; + let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; + let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; let server_config = rustls::ServerConfig::builder() .with_no_client_auth() @@ -735,25 +734,6 @@ fn start_pageserver( }) } -fn load_certs(filename: &Utf8Path) -> std::io::Result>> { - let file = std::fs::File::open(filename)?; - let mut reader = std::io::BufReader::new(file); - - rustls_pemfile::certs(&mut reader).collect() -} - -fn load_private_key(filename: &Utf8Path) -> anyhow::Result> { - let file = std::fs::File::open(filename)?; - let mut reader = std::io::BufReader::new(file); - - let key = rustls_pemfile::private_key(&mut reader)?; - - key.ok_or(anyhow::anyhow!( - "no private key found in {}", - filename.as_str(), - )) -} - async fn create_remote_storage_client( conf: &'static PageServerConf, ) -> anyhow::Result { diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index bb937ad56a..965aa7504b 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -35,8 +35,9 @@ postgres-protocol.workspace = true pprof.workspace = true rand.workspace = true regex.workspace = true -scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } +rustls.workspace = true +scopeguard.workspace = true serde.workspace = true serde_json.workspace = true smallvec.workspace = true @@ -45,10 +46,11 @@ strum_macros.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } -tokio-util = { workspace = true } tokio-io-timeout.workspace = true tokio-postgres.workspace = true +tokio-rustls.workspace = true tokio-tar.workspace = true +tokio-util = { workspace = true } tracing.workspace = true url.workspace = true metrics.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs 
b/safekeeper/src/bin/safekeeper.rs index 10fc4a4b59..9ca79de179 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -16,10 +16,12 @@ use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use metrics::set_build_info_metric; use remote_storage::RemoteStorageConfig; +use reqwest::Certificate; use safekeeper::defaults::{ DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, - DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, + DEFAULT_SSL_KEY_FILE, }; use safekeeper::{ BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, WAL_SERVICE_RUNTIME, broker, @@ -94,6 +96,9 @@ struct Args { /// Listen http endpoint for management and metrics in the form host:port. #[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)] listen_http: String, + /// Listen https endpoint for management and metrics in the form host:port. + #[arg(long, default_value = None)] + listen_https: Option, /// Advertised endpoint for receiving/sending WAL in the form host:port. If not /// specified, listen_pg is used to advertise instead. #[arg(long, default_value = None)] @@ -203,6 +208,15 @@ struct Args { /// and the current position of the reader is smaller than this value. #[arg(long)] max_delta_for_fanout: Option, + /// Path to a file with certificate's private key for https API. + #[arg(long, default_value = DEFAULT_SSL_KEY_FILE)] + ssl_key_file: Utf8PathBuf, + /// Path to a file with a X509 certificate for https API. + #[arg(long, default_value = DEFAULT_SSL_CERT_FILE)] + ssl_cert_file: Utf8PathBuf, + /// Trusted root CA certificate to use in https APIs. + #[arg(long)] + ssl_ca_file: Option, } // Like PathBufValueParser, but allows empty string. 
@@ -336,12 +350,22 @@ async fn main() -> anyhow::Result<()> { } }; + let ssl_ca_cert = match args.ssl_ca_file.as_ref() { + Some(ssl_ca_file) => { + tracing::info!("Using ssl root CA file: {ssl_ca_file:?}"); + let buf = tokio::fs::read(ssl_ca_file).await?; + Some(Certificate::from_pem(&buf)?) + } + None => None, + }; + let conf = Arc::new(SafeKeeperConf { workdir, my_id: id, listen_pg_addr: args.listen_pg, listen_pg_addr_tenant_only: args.listen_pg_tenant_only, listen_http_addr: args.listen_http, + listen_https_addr: args.listen_https, advertise_pg_addr: args.advertise_pg, availability_zone: args.availability_zone, no_sync: args.no_sync, @@ -368,6 +392,9 @@ async fn main() -> anyhow::Result<()> { eviction_min_resident: args.eviction_min_resident, wal_reader_fanout: args.wal_reader_fanout, max_delta_for_fanout: args.max_delta_for_fanout, + ssl_key_file: args.ssl_key_file, + ssl_cert_file: args.ssl_cert_file, + ssl_ca_cert, }); // initialize sentry if SENTRY_DSN is provided @@ -428,6 +455,17 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { e })?; + let https_listener = match conf.listen_https_addr.as_ref() { + Some(listen_https_addr) => { + info!("starting safekeeper HTTPS service on {}", listen_https_addr); + Some(tcp_listener::bind(listen_https_addr).map_err(|e| { + error!("failed to bind to address {}: {}", listen_https_addr, e); + e + })?) + } + None => None, + }; + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); // Register metrics collector for active timelines. 
It's important to do this @@ -501,7 +539,7 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { let http_handle = current_thread_rt .as_ref() .unwrap_or_else(|| HTTP_RUNTIME.handle()) - .spawn(http::task_main( + .spawn(http::task_main_http( conf.clone(), http_listener, global_timelines.clone(), @@ -509,6 +547,19 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { .map(|res| ("HTTP service main".to_owned(), res)); tasks_handles.push(Box::pin(http_handle)); + if let Some(https_listener) = https_listener { + let https_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| HTTP_RUNTIME.handle()) + .spawn(http::task_main_https( + conf.clone(), + https_listener, + global_timelines.clone(), + )) + .map(|res| ("HTTPS service main".to_owned(), res)); + tasks_handles.push(Box::pin(https_handle)); + } + let broker_task_handle = current_thread_rt .as_ref() .unwrap_or_else(|| BROKER_RUNTIME.handle()) diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index f162985ef7..4908863a4b 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -3,10 +3,11 @@ use std::sync::Arc; pub use routes::make_router; pub use safekeeper_api::models; +use tokio_util::sync::CancellationToken; use crate::{GlobalTimelines, SafeKeeperConf}; -pub async fn task_main( +pub async fn task_main_http( conf: Arc, http_listener: std::net::TcpListener, global_timelines: Arc, @@ -14,8 +15,37 @@ pub async fn task_main( let router = make_router(conf, global_timelines) .build() .map_err(|err| anyhow::anyhow!(err))?; - let service = http_utils::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)?; - server.serve(service).await?; + + let service = Arc::new( + http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow::anyhow!(err))?, + ); + let server = http_utils::server::Server::new(service, http_listener, None)?; + server.serve(CancellationToken::new()).await?; + Ok(()) // unreachable +} + +pub async fn task_main_https( 
+ conf: Arc, + https_listener: std::net::TcpListener, + global_timelines: Arc, +) -> anyhow::Result<()> { + let certs = http_utils::tls_certs::load_cert_chain(&conf.ssl_cert_file)?; + let key = http_utils::tls_certs::load_private_key(&conf.ssl_key_file)?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_single_cert(certs, key)?; + + let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + + let router = make_router(conf, global_timelines) + .build() + .map_err(|err| anyhow::anyhow!(err))?; + + let service = Arc::new( + http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow::anyhow!(err))?, + ); + let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; + server.serve(CancellationToken::new()).await?; Ok(()) // unreachable } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 21293671e1..3299d77545 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -232,9 +232,14 @@ async fn timeline_pull_handler(mut request: Request) -> Result, pub listen_http_addr: String, + pub listen_https_addr: Option, pub advertise_pg_addr: Option, pub availability_zone: Option, pub no_sync: bool, @@ -111,6 +116,9 @@ pub struct SafeKeeperConf { pub eviction_min_resident: Duration, pub wal_reader_fanout: bool, pub max_delta_for_fanout: Option, + pub ssl_key_file: Utf8PathBuf, + pub ssl_cert_file: Utf8PathBuf, + pub ssl_ca_cert: Option, } impl SafeKeeperConf { @@ -127,6 +135,7 @@ impl SafeKeeperConf { listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_pg_addr_tenant_only: None, listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), + listen_https_addr: None, advertise_pg_addr: None, availability_zone: None, remote_storage: None, @@ -155,6 +164,9 @@ impl SafeKeeperConf { eviction_min_resident: Duration::ZERO, wal_reader_fanout: false, max_delta_for_fanout: None, + ssl_key_file: 
Utf8PathBuf::from(defaults::DEFAULT_SSL_KEY_FILE), + ssl_cert_file: Utf8PathBuf::from(defaults::DEFAULT_SSL_CERT_FILE), + ssl_ca_cert: None, } } } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 7d6ce1269c..dab8142dfb 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; use safekeeper_client::mgmt_api; @@ -392,6 +393,7 @@ pub struct DebugDumpResponse { pub async fn handle_request( request: PullTimelineRequest, sk_auth_token: Option, + ssl_ca_cert: Option, global_timelines: Arc, ) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( @@ -402,9 +404,11 @@ pub async fn handle_request( bail!("Timeline {} already exists", request.timeline_id); } - // TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's - // part of https support (#24836). 
- let http_client = reqwest::Client::new(); + let mut http_client = reqwest::Client::builder(); + if let Some(ssl_ca_cert) = ssl_ca_cert { + http_client = http_client.add_root_certificate(ssl_ca_cert); + } + let http_client = http_client.build()?; let http_hosts = request.http_hosts.clone(); @@ -441,13 +445,21 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await + pull_timeline( + status, + safekeeper_host, + sk_auth_token, + http_client, + global_timelines, + ) + .await } async fn pull_timeline( status: TimelineStatus, host: String, sk_auth_token: Option, + http_client: reqwest::Client, global_timelines: Arc, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); @@ -464,9 +476,6 @@ async fn pull_timeline( let conf = &global_timelines.get_global_config(); let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; - // TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's - // part of https support (#24836). - let http_client = reqwest::Client::new(); let client = Client::new(http_client, host.clone(), sk_auth_token.clone()); // Request stream with basebackup archive. 
let bb_resp = client diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 6ce1a9940e..0dfdafcc51 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -152,6 +152,7 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { my_id: NodeId(os.id() as u64), listen_pg_addr: String::new(), listen_http_addr: String::new(), + listen_https_addr: None, no_sync: false, broker_endpoint: "/".parse::().unwrap(), broker_keepalive_interval: Duration::from_secs(0), @@ -179,6 +180,9 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { eviction_min_resident: Duration::ZERO, wal_reader_fanout: false, max_delta_for_fanout: None, + ssl_key_file: Utf8PathBuf::from(""), + ssl_cert_file: Utf8PathBuf::from(""), + ssl_ca_cert: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 11ca1d7913..deff02f0f9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -466,6 +466,9 @@ class NeonEnvBuilder: # Flag to enable https listener in pageserver, generate local ssl certs, # and force storage controller to use https for pageserver api. self.use_https_pageserver_api: bool = False + # Flag to enable https listener in safekeeper, generate local ssl certs, + # and force storage controller to use https for safekeeper api. 
+ self.use_https_safekeeper_api: bool = False self.pageserver_virtual_file_io_engine: str | None = pageserver_virtual_file_io_engine self.pageserver_get_vectored_concurrent_io: str | None = ( @@ -1063,7 +1066,9 @@ class NeonEnv: self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline - self.generate_local_ssl_certs = config.use_https_pageserver_api + self.generate_local_ssl_certs = ( + config.use_https_pageserver_api or config.use_https_safekeeper_api + ) self.ssl_ca_file = ( self.repo_dir.joinpath("rootCA.crt") if self.generate_local_ssl_certs else None ) @@ -1146,6 +1151,10 @@ class NeonEnv: storage_controller_config = storage_controller_config or {} storage_controller_config["use_https_pageserver_api"] = True + if config.use_https_safekeeper_api: + storage_controller_config = storage_controller_config or {} + storage_controller_config["use_https_safekeeper_api"] = True + if storage_controller_config is not None: cfg["storage_controller"] = storage_controller_config @@ -1248,6 +1257,7 @@ class NeonEnv: pg=self.port_distributor.get_port(), pg_tenant_only=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=self.port_distributor.get_port() if config.use_https_safekeeper_api else None, ) id = config.safekeepers_id_start + i # assign ids sequentially sk_cfg: dict[str, Any] = { @@ -1255,6 +1265,7 @@ class NeonEnv: "pg_port": port.pg, "pg_tenant_only_port": port.pg_tenant_only, "http_port": port.http, + "https_port": port.https, "sync": config.safekeepers_enable_fsync, } if config.auth_enabled: @@ -4475,6 +4486,7 @@ class SafekeeperPort: pg: int pg_tenant_only: int http: int + https: int | None @dataclass diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 25d839aa42..7db4a16f49 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,5 +1,7 @@ +import pytest import requests -from fixtures.neon_fixtures import NeonEnvBuilder +from 
fixtures.neon_fixtures import NeonEnvBuilder, StorageControllerApiException +from fixtures.utils import wait_until def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder): @@ -13,3 +15,54 @@ def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder): addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + +def test_safekeeper_https_api(neon_env_builder: NeonEnvBuilder): + """ + Test HTTPS safekeeper management API. + 1. Make /v1/status request to HTTPS API to ensure it's appropriately configured. + 2. Try to register safekeeper in storcon with https port missing. + 3. Register safekeeper with https port. + 4. Wait for a heartbeat round to complete. + """ + neon_env_builder.use_https_safekeeper_api = True + env = neon_env_builder.init_start() + + sk = env.safekeepers[0] + + # 1. Make simple https request. + addr = f"https://localhost:{sk.port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + # Note: http_port is intentionally wrong. + # Storcon should not use it if use_https is on. + http_port = 0 + + body = { + "active": True, + "id": sk.id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "localhost", + "port": sk.port.pg, + "http_port": http_port, + "https_port": None, + "version": 5957, + "availability_zone_id": "us-east-2b", + } + # 2. Try register with https port missing. + with pytest.raises(StorageControllerApiException, match="https port is not specified"): + env.storage_controller.on_safekeeper_deploy(sk.id, body) + + # 3. Register with https port. + body["https_port"] = sk.port.https + env.storage_controller.on_safekeeper_deploy(sk.id, body) + + # 4. Wait for heartbeat round to complete.
+ def storcon_heartbeat(): + assert env.storage_controller.log_contains( + "Heartbeat round complete for 1 safekeepers, 0 offline" + ) + + wait_until(storcon_heartbeat) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 0366e88389..55e38b29a2 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1427,6 +1427,7 @@ class SafekeeperEnv: pg=self.port_distributor.get_port(), pg_tenant_only=self.port_distributor.get_port(), http=self.port_distributor.get_port(), + https=None, ) safekeeper_dir = self.repo_dir / f"sk{i}" From b0922967e03732bf8e22a8d2273da43dbae94a01 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 14 Mar 2025 15:51:11 +0400 Subject: [PATCH 43/71] Bump humantime version and remove advisories.ignore (#11242) ## Problem - Closes: https://github.com/neondatabase/neon/issues/11179#issuecomment-2724222041 ## Summary of changes - Bump humantime version to `2.2` - Remove `RUSTSEC-2025-0014` from `advisories.ignore` --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- deny.toml | 4 ---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 12fa3589f6..39ce785a4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2904,9 +2904,9 @@ checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" [[package]] name = "humantime" -version = "2.1.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "9b112acc8b3adf4b107a8ec20977da0273a8c386765a3ec0229bd500a1443f9f" [[package]] name = "humantime-serde" diff --git a/Cargo.toml b/Cargo.toml index 82fb463182..f2a94d2371 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -106,7 +106,7 @@ hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } http-body-util 
= "0.1.2" -humantime = "2.1" +humantime = "2.2" humantime-serde = "1.1.1" hyper0 = { package = "hyper", version = "0.14" } hyper = "1.4" diff --git a/deny.toml b/deny.toml index 1023b1833a..ed7aa9ef9f 100644 --- a/deny.toml +++ b/deny.toml @@ -31,10 +31,6 @@ reason = "the marvin attack only affects private key decryption, not public key id = "RUSTSEC-2024-0436" reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact." -[[advisories.ignore]] -id = "RUSTSEC-2025-0014" -reason = "The humantime is widely used and is not easy to replace right now. It is unmaintained, but it has no known vulnerabilities to care about. #11179" - # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html From 7fe5a689b4dd501a084181ccad03e0bbc3c0f6f2 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 14 Mar 2025 13:54:57 +0000 Subject: [PATCH 44/71] feat(proxy): export ingress metrics (#11244) ## Problem We exposed the direction tag in #10925 but didn't actually include the ingress tag in the export to allow for an adaption period. 
## Summary of changes We now export the ingress direction --- proxy/src/proxy/passthrough.rs | 4 +- proxy/src/serverless/conn_pool_lib.rs | 9 +- proxy/src/serverless/http_conn_pool.rs | 9 +- proxy/src/serverless/sql_over_http.rs | 17 ++- proxy/src/usage_metrics.rs | 148 ++++++++++++++++++------- 5 files changed, 126 insertions(+), 61 deletions(-) diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 23b9897155..c100b8d716 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -10,7 +10,7 @@ use crate::config::ComputeConfig; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; use crate::stream::Stream; -use crate::usage_metrics::{Ids, MetricCounterRecorder, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] @@ -24,7 +24,6 @@ pub(crate) async fn proxy_pass( let usage_tx = USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction: TrafficDirection::Egress, private_link_id, }); @@ -47,6 +46,7 @@ pub(crate) async fn proxy_pass( |cnt| { // Number of bytes the client sent to the compute node (inbound). 
metrics.get_metric(m_recv).inc_by(cnt as u64); + usage_tx.record_ingress(cnt as u64); }, ); diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 933204994b..77b548cc43 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -22,7 +22,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::{DbName, EndpointCacheKey, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -639,11 +639,7 @@ impl Client { (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn metrics( - &self, - direction: TrafficDirection, - ctx: &RequestContext, - ) -> Arc { + pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self .inner .as_ref() @@ -659,7 +655,6 @@ impl Client { USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction, private_link_id, }) } diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index bca2d4c165..1c6574e57e 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -19,7 +19,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::protocol2::ConnectionInfoExtra; use crate::types::EndpointCacheKey; -use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; @@ -265,11 +265,7 @@ impl Client { Self { inner } } - pub(crate) fn metrics( - &self, - direction: 
TrafficDirection, - ctx: &RequestContext, - ) -> Arc { + pub(crate) fn metrics(&self, ctx: &RequestContext) -> Arc { let aux = &self.inner.aux; let private_link_id = match ctx.extra() { @@ -281,7 +277,6 @@ impl Client { USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, - direction, private_link_id, }) } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index a79a478126..10e378a18d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -42,7 +42,7 @@ use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{MetricCounter, MetricCounterRecorder, TrafficDirection}; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] @@ -663,6 +663,7 @@ async fn handle_db_inner( let parsed_headers = HttpHeaders::try_parse(headers)?; + let mut request_len = 0; let fetch_and_process_request = Box::pin( async { let body = read_body_with_limit( @@ -671,6 +672,8 @@ async fn handle_db_inner( ) .await?; + request_len = body.len(); + Metrics::get() .proxy .http_conn_content_length_bytes @@ -765,7 +768,7 @@ async fn handle_db_inner( } }; - let metrics = client.metrics(TrafficDirection::Egress, ctx); + let metrics = client.metrics(ctx); let len = json_output.len(); let response = response @@ -781,6 +784,8 @@ async fn handle_db_inner( // count the egress bytes - we miss the TLS and header overhead but oh well... 
// moving this later in the stack is going to be a lot of effort and ehhhh metrics.record_egress(len as u64); + metrics.record_ingress(request_len as u64); + Metrics::get() .proxy .http_conn_content_length_bytes @@ -838,7 +843,7 @@ async fn handle_auth_broker_inner( .expect("all headers and params received via hyper should be valid for request"); // todo: map body to count egress - let _metrics = client.metrics(TrafficDirection::Egress, ctx); + let _metrics = client.metrics(ctx); Ok(client .inner @@ -1168,10 +1173,10 @@ enum Discard<'a> { } impl Client { - fn metrics(&self, direction: TrafficDirection, ctx: &RequestContext) -> Arc { + fn metrics(&self, ctx: &RequestContext) -> Arc { match self { - Client::Remote(client) => client.metrics(direction, ctx), - Client::Local(local_client) => local_client.metrics(direction, ctx), + Client::Remote(client) => client.metrics(ctx), + Client::Local(local_client) => local_client.metrics(ctx), } } diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index 004d268fa1..2b27dc5c76 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -44,11 +44,17 @@ const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); pub(crate) struct Ids { pub(crate) endpoint_id: EndpointIdInt, pub(crate) branch_id: BranchIdInt, - pub(crate) direction: TrafficDirection, #[serde(with = "none_as_empty_string")] pub(crate) private_link_id: Option, } +#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] +struct Extra { + #[serde(flatten)] + ids: Ids, + direction: TrafficDirection, +} + mod none_as_empty_string { use serde::Deserialize; use smol_str::SmolStr; @@ -76,18 +82,23 @@ pub(crate) enum TrafficDirection { pub(crate) trait MetricCounterRecorder { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64); + + /// Record that some bytes were sent from the client to the proxy + fn record_ingress(&self, bytes: u64); + /// Record that some 
connections were opened fn record_connection(&self, count: usize); } trait MetricCounterReporter { - fn get_metrics(&mut self) -> (u64, usize); - fn move_metrics(&self) -> (u64, usize); + fn get_metrics(&mut self) -> MetricsData; + fn move_metrics(&self) -> MetricsData; } #[derive(Debug)] pub(crate) struct MetricCounter { transmitted: AtomicU64, + received: AtomicU64, opened_connections: AtomicUsize, } @@ -97,6 +108,11 @@ impl MetricCounterRecorder for MetricCounter { self.transmitted.fetch_add(bytes, Ordering::Relaxed); } + /// Record that some bytes were sent from the client to the proxy + fn record_ingress(&self, bytes: u64) { + self.received.fetch_add(bytes, Ordering::Relaxed); + } + /// Record that some connections were opened fn record_connection(&self, count: usize) { self.opened_connections.fetch_add(count, Ordering::Relaxed); @@ -104,29 +120,43 @@ impl MetricCounterRecorder for MetricCounter { } impl MetricCounterReporter for MetricCounter { - fn get_metrics(&mut self) -> (u64, usize) { - ( - *self.transmitted.get_mut(), - *self.opened_connections.get_mut(), - ) + fn get_metrics(&mut self) -> MetricsData { + MetricsData { + received: *self.received.get_mut(), + transmitted: *self.transmitted.get_mut(), + connections: *self.opened_connections.get_mut(), + } } - fn move_metrics(&self) -> (u64, usize) { - ( - self.transmitted.swap(0, Ordering::Relaxed), - self.opened_connections.swap(0, Ordering::Relaxed), - ) + + fn move_metrics(&self) -> MetricsData { + MetricsData { + received: self.received.swap(0, Ordering::Relaxed), + transmitted: self.transmitted.swap(0, Ordering::Relaxed), + connections: self.opened_connections.swap(0, Ordering::Relaxed), + } } } +struct MetricsData { + transmitted: u64, + received: u64, + connections: usize, +} + +struct BytesSent { + transmitted: u64, + received: u64, +} + trait Clearable { /// extract the value that should be reported - fn should_report(self: &Arc) -> Option; + fn should_report(self: &Arc) -> Option; /// Determine
whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool; } impl Clearable for C { - fn should_report(self: &Arc) -> Option { + fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. // @@ -139,14 +169,21 @@ impl Clearable for C { // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let (value, opened) = self.move_metrics(); + let MetricsData { + transmitted, + received, + connections, + } = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report - if value == 0 && !is_open && opened == 0 { + if transmitted == 0 && received == 0 && !is_open && connections == 0 { None } else { - Some(value) + Some(BytesSent { + transmitted, + received, + }) } } fn should_clear(self: &mut Arc) -> bool { @@ -154,9 +191,13 @@ impl Clearable for C { let Some(counter) = Arc::get_mut(self) else { return false; }; - let (opened, value) = counter.get_metrics(); + let MetricsData { + transmitted, + received, + connections, + } = counter.get_metrics(); // clear if there's no data to report - value == 0 && opened == 0 + transmitted == 0 && received == 0 && connections == 0 } } @@ -178,6 +219,7 @@ impl Metrics { .entry(ids) .or_insert_with(|| { Arc::new(MetricCounter { + received: AtomicU64::new(0), transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), }) @@ -242,10 +284,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result( endpoints: &ClashMap, FastHasher>, -) -> Vec<(Ids, u64)> { +) -> Vec<(Ids, BytesSent)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = endpoints + let metrics_to_send: 
Vec<(Ids, BytesSent)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -271,26 +313,46 @@ fn collect_and_clear_metrics( } fn create_event_chunks<'a>( - metrics_to_send: &'a [(Ids, u64)], + metrics_to_send: &'a [(Ids, BytesSent)], hostname: &'a str, prev: DateTime, now: DateTime, chunk_size: usize, -) -> impl Iterator>> + 'a { +) -> impl Iterator>> + 'a { metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { events: chunk .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: ids.clone(), + .flat_map(|(ids, bytes)| { + [ + Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: bytes.transmitted, + extra: Extra { + ids: ids.clone(), + direction: TrafficDirection::Egress, + }, + }, + Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: bytes.received, + extra: Extra { + ids: ids.clone(), + direction: TrafficDirection::Ingress, + }, + }, + ] }) .collect(), }) @@ -350,7 +412,7 @@ fn create_remote_path_prefix(now: DateTime) -> String { async fn upload_main_events_chunked( client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, - chunk: &EventChunk<'_, Event>, + chunk: &EventChunk<'_, Event>, subchunk_size: usize, ) { // Split into smaller chunks to avoid exceeding the max request size @@ -384,7 +446,7 @@ async fn upload_main_events_chunked( async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, - chunk: &EventChunk<'_, Event>, + chunk: &EventChunk<'_, Event>, path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { @@ -461,7 +523,7 @@ mod tests { #[tokio::test] async 
fn metrics() { - type Report = EventChunk<'static, Event>; + type Report = EventChunk<'static, Event>; let reports: Arc>> = Arc::default(); let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); @@ -533,7 +595,6 @@ mod tests { let counter = metrics.register(Ids { endpoint_id: (&EndpointId::from("e1")).into(), branch_id: (&BranchId::from("b1")).into(), - direction: TrafficDirection::Egress, private_link_id: None, }); @@ -551,13 +612,19 @@ mod tests { .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); - assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 0); + assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); + assert_eq!(r[0].events[1].value, 0); + assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // record egress counter.record_egress(1); + // record ingress + counter.record_ingress(2); + // egress should be observered collect_metrics_iteration( &metrics.endpoints, @@ -572,8 +639,11 @@ mod tests { .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); - assert_eq!(r[0].events.len(), 1); + assert_eq!(r[0].events.len(), 2); assert_eq!(r[0].events[0].value, 1); + assert_eq!(r[0].events[0].extra.direction, TrafficDirection::Egress); + assert_eq!(r[0].events[1].value, 2); + assert_eq!(r[0].events[1].extra.direction, TrafficDirection::Ingress); pushed_chunks.extend(r); // release counter From b7c6738524208a892839a90c8afef953104f5d28 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 14 Mar 2025 14:08:16 +0000 Subject: [PATCH 45/71] feat(compute_ctl): add pgaudit log gc to compute_ctl (#11169) - add pgaudit_gc thread to compute_ctl to clean up old pgaudit logs if they exist. pgaudit can rotate files, but it doesn't delete the old files - Add AUDIT_LOG_DIR_SIZE metric to compute_ctl to track the size of the audit log directory in bytes.
- Fix permissions for rsyslog state files directory --- compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- compute_tools/src/compute.rs | 14 ++-- compute_tools/src/config.rs | 3 +- .../compute_audit_rsyslog_template.conf | 3 +- compute_tools/src/metrics.rs | 16 ++++- compute_tools/src/rsyslog.rs | 65 ++++++++++++++++++- 7 files changed, 89 insertions(+), 16 deletions(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index e6707381ac..0cf72b6f74 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -145,7 +145,7 @@ merge: | COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf - RUN chmod 0666 /var/log/ + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index c89ee112dc..9deaf3ea55 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -140,7 +140,7 @@ merge: | COPY compute_rsyslog.conf /etc/compute_rsyslog.conf RUN chmod 0666 /etc/compute_rsyslog.conf - RUN chmod 0666 /var/log/ + RUN mkdir /var/log/rsyslog && chown -R postgres /var/log/rsyslog COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a0654ea0e4..58b99dde53 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -37,7 +37,7 @@ use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::monitor::launch_monitor; use crate::pg_helpers::*; -use crate::rsyslog::configure_audit_rsyslog; +use crate::rsyslog::{configure_audit_rsyslog, launch_pgaudit_gc}; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -625,13 +625,11 @@ impl 
ComputeNode { } let log_directory_path = Path::new(&self.params.pgdata).join("log"); - // TODO: make this more robust - // now rsyslog starts once and there is no monitoring or restart if it fails - configure_audit_rsyslog( - log_directory_path.to_str().unwrap(), - "hipaa", - &remote_endpoint, - )?; + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); } // Launch remaining service threads diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 7aa7360f9d..e4acc5471c 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -167,7 +167,8 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl audit settings: begin")?; // This log level is very verbose // but this is necessary for HIPAA compliance. - writeln!(file, "pgaudit.log='all'")?; + // Exclude 'misc' category, because it doesn't contain anything relevant. + writeln!(file, "pgaudit.log='all, -misc'")?; writeln!(file, "pgaudit.log_parameter=on")?; // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it.
diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index bef3c36446..1937cdc292 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -4,7 +4,8 @@ module(load="imfile") # Input configuration for log files in the specified directory # Replace {log_directory} with the directory containing the log files input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0") -global(workDirectory="/var/log") +# the directory to store rsyslog state files +global(workDirectory="/var/log/rsyslog") # Forward logs to remote syslog server *.* @@{remote_endpoint} \ No newline at end of file diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs index dab32d5dc1..4caa48307e 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -1,6 +1,8 @@ -use metrics::core::Collector; +use metrics::core::{AtomicF64, Collector, GenericGauge}; use metrics::proto::MetricFamily; -use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use metrics::{ + IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec, +}; use once_cell::sync::Lazy; pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { @@ -59,10 +61,20 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy = Lazy::new(|| .expect("failed to define a metric") }); +// Size of audit log directory in bytes +pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy> = Lazy::new(|| { + register_gauge!( + "compute_audit_log_dir_size", + "Size of audit log directory in bytes", + ) + .expect("failed to define a metric") +}); + pub fn collect() -> Vec { let mut metrics = INSTALLED_EXTENSIONS.collect(); metrics.extend(CPLANE_REQUESTS_TOTAL.collect()); metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect()); 
metrics.extend(DB_MIGRATION_FAILED.collect()); + metrics.extend(AUDIT_LOG_DIR_SIZE.collect()); metrics } diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index c8fba4fdcd..7537fafaa5 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -1,8 +1,11 @@ +use std::fs; +use std::path::Path; use std::process::Command; +use std::time::Duration; use std::{fs::OpenOptions, io::Write}; use anyhow::{Context, Result}; -use tracing::info; +use tracing::{error, info, instrument, warn}; fn get_rsyslog_pid() -> Option { let output = Command::new("pgrep") @@ -43,7 +46,7 @@ fn restart_rsyslog() -> Result<()> { } pub fn configure_audit_rsyslog( - log_directory: &str, + log_directory: String, tag: &str, remote_endpoint: &str, ) -> Result<()> { @@ -75,3 +78,61 @@ pub fn configure_audit_rsyslog( Ok(()) } + +#[instrument(skip_all)] +async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> { + info!("running pgaudit GC main loop"); + loop { + // Check log_directory for old pgaudit logs and delete them. + // New log files are checked every 5 minutes, as set in pgaudit.log_rotation_age + // Find files that were not modified in the last 15 minutes and delete them. + // This should be enough time for rsyslog to process the logs and for us to catch the alerts. + // + // In case of a very high load, we might need to adjust this value and pgaudit.log_rotation_age. + // + // TODO: add some smarter logic to delete the files that are fully streamed according to rsyslog + // imfile-state files, but for now just do a simple GC to avoid filling up the disk. + let _ = Command::new("find") + .arg(&log_directory) + .arg("-name") + .arg("audit*.log") + .arg("-mmin") + .arg("+15") + .arg("-delete") + .output()?; + + // also collect the metric for the size of the log directory + async fn get_log_files_size(path: &Path) -> Result { + let mut total_size = 0; + + for entry in fs::read_dir(path)? 
{ + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_file() && entry_path.to_string_lossy().ends_with("log") { + total_size += entry.metadata()?.len(); + } + } + + Ok(total_size) + } + + let log_directory_size = get_log_files_size(Path::new(&log_directory)) + .await + .unwrap_or_else(|e| { + warn!("Failed to get log directory size: {}", e); + 0 + }); + crate::metrics::AUDIT_LOG_DIR_SIZE.set(log_directory_size as f64); + tokio::time::sleep(Duration::from_secs(60)).await; + } +} + +// launch pgaudit GC thread to clean up the old pgaudit logs stored in the log_directory +pub fn launch_pgaudit_gc(log_directory: String) { + tokio::spawn(async move { + if let Err(e) = pgaudit_gc_main_loop(log_directory).await { + error!("pgaudit GC main loop failed: {}", e); + } + }); +} From 4a97cd0b7e95d88ef23ea235700326db39d6e0cb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 14 Mar 2025 17:02:55 +0000 Subject: [PATCH 46/71] test_runner: fix tests with jsonnet for Python 3.13 (#11240) ## Problem Python's `jsonnet` 0.20.0 doesn't support Python 3.13, so we have a couple of tests xfailed because of that. 
## Summary of changes - Bump `jsonnet` to `0.21.0rc2` which supports Python 3.13 - Unxfail `test_sql_exporter_metrics_e2e` and `test_sql_exporter_metrics_smoke` on Python 3.13 --- poetry.lock | 34 ++++++++++++++++++--- pyproject.toml | 4 +-- test_runner/regress/test_compute_metrics.py | 9 ++---- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/poetry.lock b/poetry.lock index 03aa543b06..7c84b2969b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1491,14 +1491,38 @@ files = [ [[package]] name = "jsonnet" -version = "0.20.0" -description = "Python bindings for Jsonnet - The data templating language" +version = "0.21.0rc2" +description = "Python bindings for Jsonnet - The data templating language " optional = false python-versions = "*" groups = ["main"] -markers = "python_version < \"3.13\"" files = [ - {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8779ac6820fee44ef736df2baedc3ae93e8cd5d672ee105015c2a47fe627a727"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:99affe8c71e2551465064a8039bb3d1cba27a0b73b2b9ff1b652e06f17d4ea8b"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a9dffb9aa01013d100ddfb7230d1eeb80f2a8eef712b1825a60cad57106d8bd"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cca6c95f2879dcab52650b7aa09a4e82a139b084931b1f6f8c840f834fecc08a"}, + {file = "jsonnet-0.21.0rc2-cp310-cp310-win_amd64.whl", hash = "sha256:016d6afdb302a6d00bf3bce6a0c3d9c093b992e33f9bc67c64a868035892258e"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e893ab2c9bf10d8ec9e9b0cee8961879c88d0619cc6d8f75ea284a78e06ae32b"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06b353cd3daa2781e6cd308e05f2f116396376994bcb5f59aaadbc6a752c7f2"}, 
+ {file = "jsonnet-0.21.0rc2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eb2bc8e62b73101329072da322f7e2a1bdb3ac530b94669128d1b480e311e55"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:113766fd0c25620807bcf04d4c739f461c971a4f0e4aece9ba62b4e762de9598"}, + {file = "jsonnet-0.21.0rc2-cp311-cp311-win_amd64.whl", hash = "sha256:8dab208c2c2760be60f87d1ceb8b28c86b51ed0e31129a7d90cd5fe890b41225"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:95f5b9dd26a41d6f258d1baa8d22e557051beeed8c52a6202584f1becca9dcb5"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cecc6d76e2b377260fae0a060097c113e6ac361b8f739903ea7f3f5f64cdebdf"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aaa2d18224af7e63872ef4a101e93962505456cf5f5439c3cfc25dad6845f8b1"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2a9063f811554487ed552445e964aeec969cafb266b965029c8d6b091ce47950"}, + {file = "jsonnet-0.21.0rc2-cp312-cp312-win_amd64.whl", hash = "sha256:80d171182c169761f744ba50068a4ad35d48e52b91d25bf4c7bb9a72f0a04f71"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3657938f87cb6bc6da20ca631d437b5faf469ca060a7c7def9c8fd2f25a5e06"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3dcebc30cb991b58bc416ee05e9387004d04716d5c0b89714ff042bd069af5c8"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ac52c95482df3ed93c908468ca2f40d4825b6baba284b395ddc47bd663b8c3a"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b34450823a7a1861de892fef9f29de1b4c19e1a79e27d81ffe7e57646cc89d6"}, + {file = "jsonnet-0.21.0rc2-cp313-cp313-win_amd64.whl", hash = "sha256:573fd2580e46f4875ec505f1732f9e804b7063cba790342ed6fdafe9a6b30556"}, + {file = 
"jsonnet-0.21.0rc2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:871ca1411de3626499bda60b330d37f85a592918f99ba4809089bbb8d4f5bfe4"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5d33b25a9c5bf9099100b9b16cb385a2876d891fbe639ee9d476fc75c861903a"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2bac374565c7f89a4675f19fd2b624ed1376519267f4e444f49b6fc0368f6e5"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:fab7bbd88f9159f88a7350701a97bda24de9e3b9eef14c2501ba8b9224160d60"}, + {file = "jsonnet-0.21.0rc2-cp39-cp39-win_amd64.whl", hash = "sha256:ed71ffba0fd233a1bca7b0f7be79730792c5383e562a9dc7da152478d9ee1612"}, + {file = "jsonnet-0.21.0rc2.tar.gz", hash = "sha256:2b83ec4b5a771c3732e0972be23a71f042ad2940db6918d3a52aade69bc394fb"}, ] [[package]] @@ -3820,4 +3844,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "010ffce959bb256880ab5a267048c182e4612b3151f9a94e3bf5d3a7807962fe" +content-hash = "715fc8c896dcfa1b15054deeddcdec557ef93af91b26e1c8e4688fe4dbef5296" diff --git a/pyproject.toml b/pyproject.toml index e7f5c62bd0..e009b0773e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,8 +48,8 @@ types-jwcrypto = "^1.5.0.20240925" pyyaml = "^6.0.2" types-pyyaml = "^6.0.12.20240917" testcontainers = "^4.9.0" -# Jsonnet doesn't support Python 3.13 yet -jsonnet = { version = "^0.20.0", markers = "python_version < '3.13'" } +# Install a release candidate of `jsonnet`, as it supports Python 3.13 +jsonnet = "^0.21.0-rc2" [tool.poetry.group.dev.dependencies] mypy = "==1.13.0" diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index b360162dc1..85cd065a2f 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -3,12 +3,13 @@ from __future__ import annotations import enum import os import shutil 
-import sys from enum import StrEnum from logging import debug from pathlib import Path from typing import TYPE_CHECKING, cast +# Docs are available at https://jsonnet.org/ref/bindings.html#python_api +import _jsonnet import pytest import requests import yaml @@ -92,10 +93,6 @@ def jsonnet_evaluate_file( ext_vars: str | dict[str, str] | None = None, tla_vars: str | dict[str, str] | None = None, ) -> str: - # Jsonnet doesn't support Python 3.13 yet - # Docs are available at https://jsonnet.org/ref/bindings.html#python_api - import _jsonnet - return cast( "str", _jsonnet.evaluate_file( @@ -130,7 +127,6 @@ class SqlExporterProcess(StrEnum): AUTOSCALING = "autoscaling" -@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "collector_name", ["neon_collector", "neon_collector_autoscaling"], @@ -359,7 +355,6 @@ else: self.__proc.wait() -@pytest.mark.xfail(sys.version_info >= (3, 13), reason="Jsonnet doesn't support Python 3.13 yet") @pytest.mark.parametrize( "exporter", [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], From 3168bd0e3ac916e62d2032d2e49cbbd1f49e474a Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Fri, 14 Mar 2025 21:42:09 +0400 Subject: [PATCH 47/71] tests: suppress "Cancelled request finished with an error" in test_timeline_archive (#11241) ## Problem Previous PR https://github.com/neondatabase/neon/pull/11190 didn't suppress `Cancelled request finished with an error` messages, which are also expected, so the test https://github.com/neondatabase/neon/issues/11177 is still flaky. 
## Summary of changes - Suppress `Cancelled request finished with an error` in `test_timeline_archive` --- test_runner/regress/test_timeline_archive.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 11567cafd0..2bad0bb671 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -46,8 +46,11 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): # We make /archival_config requests that are intended to fail. # It's expected that storcon drops requests to other pageservers after # it gets the first error (https://github.com/neondatabase/neon/issues/11177) - ps.allowed_errors.append( - ".*WARN.* path=/v1/tenant/.*/archival_config .*request was dropped before completing", + ps.allowed_errors.extend( + [ + ".*WARN.* path=/v1/tenant/.*/archival_config .*request was dropped before completing", + ".*ERROR.* path=/v1/tenant/.*/archival_config .*Cancelled request finished with an error.*", + ] ) # first try to archive a non existing timeline for an existing tenant: From 53d50c7ea574c3e53827fa83614eeb7479d9ee5c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 14 Mar 2025 18:45:18 +0100 Subject: [PATCH 48/71] pageserver: deflake compaction tests (#11246) These need to set `NoYield`, otherwise they may be preempted by pending L0 compaction. 
--- pageserver/src/tenant.rs | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a06d60268..55b5704d67 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -6559,7 +6559,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6576,7 +6580,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6593,7 +6601,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; let mut writer = tline.writer().await; @@ -6610,7 +6622,11 @@ mod tests { tline.freeze_and_flush().await?; tline - .compact(&CancellationToken::new(), EnumSet::empty(), &ctx) + .compact( + &CancellationToken::new(), + CompactFlags::NoYield.into(), + &ctx, + ) .await?; assert_eq!( @@ -6693,7 +6709,9 @@ mod tests { timeline.freeze_and_flush().await?; if compact { // this requires timeline to be &Arc - timeline.compact(&cancel, EnumSet::empty(), ctx).await?; + timeline + .compact(&cancel, CompactFlags::NoYield.into(), ctx) + .await?; } // this doesn't really need to use the timeline_id target, but it is closer to what it @@ -7399,7 +7417,9 @@ mod tests { // Perform a cycle of flush, compact, and GC tline.freeze_and_flush().await?; - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tline + .compact(&cancel, CompactFlags::NoYield.into(), &ctx) + .await?; tenant .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, 
&ctx) .await?; @@ -7779,7 +7799,9 @@ mod tests { let before_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); - tline.compact(&cancel, EnumSet::empty(), &ctx).await?; + tline + .compact(&cancel, CompactFlags::NoYield.into(), &ctx) + .await?; let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len(); From a674ed8cafe4d378d43938285da866cfa6255082 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 14 Mar 2025 20:08:24 +0000 Subject: [PATCH 49/71] storcon: safety check when completing shard split (#11256) ## Problem There is a rare race between controller graceful deployment and shard splitting where we may incorrectly both abort _and_ complete the split (on different pods), and thereby leave no shards at all in the database. Related: #11254 ## Summary of changes - In complete_shard_split, refuse to delete anything if child shards are not found --- storage_controller/src/persistence.rs | 16 ++++++++++++++++ storage_controller/src/service.rs | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 4a97aac125..85d9c574a1 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -967,10 +967,26 @@ impl Persistence { &self, split_tenant_id: TenantId, old_shard_count: ShardCount, + new_shard_count: ShardCount, ) -> DatabaseResult<()> { use crate::schema::tenant_shards::dsl::*; self.with_measured_conn(DatabaseOperation::CompleteShardSplit, move |conn| { Box::pin(async move { + // Sanity: child shards must still exist, as we're deleting parent shards + let child_shards_query = tenant_shards + .filter(tenant_id.eq(split_tenant_id.to_string())) + .filter(shard_count.eq(new_shard_count.literal() as i32)); + let child_shards = child_shards_query + .load::(conn) + .await?; + if child_shards.len() != new_shard_count.count() as usize { + return Err(DatabaseError::Logical(format!( + 
"Unexpected child shard count {} while completing split to \ + count {new_shard_count:?} on tenant {split_tenant_id}", + child_shards.len() + ))); + } + // Drop parent shards diesel::delete(tenant_shards) .filter(tenant_id.eq(split_tenant_id.to_string())) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index f33408a89b..4e00136e1b 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5753,7 +5753,7 @@ impl Service { // it doesn't match, but that requires more retry logic on this side) self.persistence - .complete_shard_split(tenant_id, old_shard_count) + .complete_shard_split(tenant_id, old_shard_count, new_shard_count) .await?; fail::fail_point!("shard-split-post-complete", |_| Err( From a5b00b87ba5bc41b9dd16776a72d3bd614c95a35 Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Sun, 16 Mar 2025 14:53:27 +0100 Subject: [PATCH 50/71] CI(pre-merge-checks): use step-security/changed-files (#11265) Use Step Security maintained version of `tj-actions/changed-files`. 
https://www.stepsecurity.io/blog/harden-runner-detection-tj-actions-changed-files-action-is-compromised#use-the-stepsecurity-maintained-changed-files-action --- .github/workflows/pre-merge-checks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 1e81550314..9e5c3df52c 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -27,7 +27,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: python-src with: files: | @@ -38,7 +38,7 @@ jobs: poetry.lock pyproject.toml - - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 id: rust-src with: files: | From 228bb753546149135ddb7f4c198d8c8d28fa1c6e Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Sun, 16 Mar 2025 15:04:48 +0100 Subject: [PATCH 51/71] Extend large tenant OLTP workload ... (#11166) ... to better match the workload characteristics of real Neon customers ## Problem We analyzed workloads of large Neon users and want to extend the oltp workload to include characteristics seen in those workloads. 
## Summary of changes - for re-use branch delete inserted rows from last run - adjust expected run-time (time-outs) in GitHub workflow - add queries that exposes the prefetch getpages path - add I/U/D transactions for another table (so far the workload was insert/append-only) - add an explicit vacuum analyze step and measure its time - add reindex concurrently step and measure its time (and take care that this step succeeds even if prior reindex runs have failed or were canceled) - create a second connection string for the pooled connection that removes the `-pooler` suffix from the hostname because we want to run long-running statements (database maintenance) and bypass the pooler which doesn't support unlimited statement timeout ## Test run https://github.com/neondatabase/neon/actions/runs/13851772887/job/38760172415 --- .github/workflows/large_oltp_benchmark.yml | 89 +++++++--- .../IUD_one_transaction.sql | 162 ++++++++++++++++++ .../select_prefetch_webhook.sql | 25 +++ .../test_perf_oltp_large_tenant.py | 78 ++++++++- 4 files changed, 327 insertions(+), 27 deletions(-) create mode 100644 test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql create mode 100644 test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index f33e11cd08..a7c3118e34 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -2,8 +2,8 @@ name: large oltp benchmark on: # uncomment to run on push for debugging your PR - push: - branches: [ bodobolero/synthetic_oltp_workload ] + #push: + # branches: [ bodobolero/synthetic_oltp_workload ] schedule: # * is a special character in YAML so you have to quote this string @@ -12,7 +12,7 @@ on: # │ │ ┌───────────── day of the month (1 - 31) # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '0 15 * * 
*' # run once a day, timezone is utc, avoid conflict with other benchmarks + - cron: '0 15 * * 0,2,4' # run on Sunday, Tuesday, Thursday at 3 PM UTC workflow_dispatch: # adds ability to run this manually defaults: @@ -22,7 +22,7 @@ defaults: concurrency: # Allow only one workflow globally because we need dedicated resources which only exist once group: large-oltp-bench-workflow - cancel-in-progress: true + cancel-in-progress: false jobs: oltp: @@ -31,9 +31,9 @@ jobs: matrix: include: - target: new_branch - custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 - target: reuse_branch - custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4 + custom_scripts: insert_webhooks.sql@200 select_any_webhook_with_skew.sql@300 select_recent_webhook.sql@397 select_prefetch_webhook.sql@3 IUD_one_transaction.sql@100 max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results permissions: contents: write @@ -46,7 +46,6 @@ jobs: PG_VERSION: 16 # pre-determined by pre-determined project TEST_OUTPUT: /tmp/test_output BUILD_TYPE: remote - SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }} PLATFORM: ${{ matrix.target }} runs-on: [ self-hosted, us-east-2, x64 ] @@ -57,8 +56,10 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init - # Increase timeout to 8h, default timeout is 6h - timeout-minutes: 480 + # Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time + # (normally 1h pgbench, 3h vacuum analyze 3.5h re-index) x 2 = 15h, leave some buffer for regressions + # in one run vacuum didn't finish within 12 hours + timeout-minutes: 2880 steps: - uses: actions/checkout@v4 @@ -89,29 +90,45 @@ jobs: - name: Set up Connection String id: 
set-up-connstr run: | - case "${{ matrix.target }}" in - new_branch) - CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} - ;; - reuse_branch) - CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} - ;; - *) - echo >&2 "Unknown target=${{ matrix.target }}" - exit 1 - ;; - esac + case "${{ matrix.target }}" in + new_branch) + CONNSTR=${{ steps.create-neon-branch-oltp-target.outputs.dsn }} + ;; + reuse_branch) + CONNSTR=${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }} + ;; + *) + echo >&2 "Unknown target=${{ matrix.target }}" + exit 1 + ;; + esac - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + CONNSTR_WITHOUT_POOLER="${CONNSTR//-pooler/}" - - name: Benchmark pgbench with custom-scripts + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + echo "connstr_without_pooler=${CONNSTR_WITHOUT_POOLER}" >> $GITHUB_OUTPUT + + - name: Delete rows from prior runs in reuse branch + if: ${{ matrix.target == 'reuse_branch' }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} + PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config + PSQL: /tmp/neon/pg_install/v16/bin/psql + PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib + run: | + echo "$(date '+%Y-%m-%d %H:%M:%S') - Deleting rows in table webhook.incoming_webhooks from prior runs" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${BENCHMARK_CONNSTR}" -c "SET statement_timeout = 0; DELETE FROM webhook.incoming_webhooks WHERE created_at > '2025-02-27 23:59:59+00';" + echo "$(date '+%Y-%m-%d %H:%M:%S') - Finished deleting rows in table webhook.incoming_webhooks from prior runs" + + - name: Benchmark pgbench with custom-scripts uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} test_selection: performance run_in_parallel: false - save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant + save_perf_report: true + extra_params: -m remote_cluster --timeout 7200 -k 
test_perf_oltp_large_tenant_pgbench pg_version: ${{ env.PG_VERSION }} aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} env: @@ -119,6 +136,21 @@ jobs: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + - name: Benchmark database maintenance + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: true + extra_params: -m remote_cluster --timeout 172800 -k test_perf_oltp_large_tenant_maintenance + pg_version: ${{ env.PG_VERSION }} + aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr_without_pooler }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + - name: Delete Neon Branch for large tenant if: ${{ always() && matrix.target == 'new_branch' }} uses: ./.github/actions/neon-branch-delete @@ -127,6 +159,13 @@ jobs: branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + - name: Configure AWS credentials # again because prior steps could have exceeded 5 hours + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours + - name: Create Allure report id: create-allure-report if: ${{ !cancelled() }} diff --git a/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql b/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql new file mode 100644 index 0000000000..4c5b3fbd11 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/IUD_one_transaction.sql @@ -0,0 +1,162 @@ +\set min_id 1 +\set max_id 1500000000 +\set range_size 100 + +-- Use uniform random instead of random_zipfian +\set random_id random(:min_id, :max_id) 
+\set random_mar_id random(1, 65536) +\set random_delete_id random(:min_id, :max_id) + +-- Update exactly one row (if it exists) using the uniformly chosen random_id +UPDATE transaction.transaction + SET state = 'COMPLETED', + settlement_date = CURRENT_DATE, + mar_identifier = (:random_mar_id)::int + WHERE id = (:random_id)::bigint; + +-- Insert exactly one row +INSERT INTO transaction.transaction ( + user_id, + card_id, + business_id, + preceding_transaction_id, + is_last, + is_mocked, + type, + state, + network, + subnetwork, + user_transaction_time, + settlement_date, + request_amount, + amount, + currency_code, + approval_code, + response, + gpa, + gpa_order_unload, + gpa_order, + program_transfer, + fee_transfer, + peer_transfer, + msa_orders, + risk_assessment, + auto_reload, + direct_deposit, + polarity, + real_time_fee_group, + fee, + chargeback, + standin_approved_by, + acquirer_fee_amount, + funded_account_holder, + digital_wallet_token, + network_fees, + card_security_code_verification, + fraud, + cardholder_authentication_data, + currency_conversion, + merchant, + store, + card_acceptor, + acquirer, + pos, + avs, + mar_token, + mar_preceding_related_transaction_token, + mar_business_token, + mar_acting_user_token, + mar_card_token, + mar_duration, + mar_created_time, + issuer_interchange_amount, + offer_orders, + transaction_canonical_id, + mar_identifier, + created_at, + card_acceptor_mid, + card_acceptor_name, + address_verification, + issuing_product, + mar_enhanced_data_token, + standin_reason +) +SELECT + (:random_id % 100000) + 1 AS user_id, + (:random_id % 500000) + 1 AS card_id, + (:random_id % 20000) + 1 AS business_id, + NULL AS preceding_transaction_id, + (:random_id % 2) = 0 AS is_last, + (:random_id % 5) = 0 AS is_mocked, + 'authorization' AS type, + 'PENDING' AS state, + 'VISA' AS network, + 'VISANET' AS subnetwork, + now() - ((:random_id % 100) || ' days')::interval AS user_transaction_time, + now() - ((:random_id % 100) || ' 
days')::interval AS settlement_date, + random() * 1000 AS request_amount, + random() * 1000 AS amount, + 'USD' AS currency_code, + md5((:random_id)::text) AS approval_code, + '{}'::jsonb AS response, + '{}'::jsonb AS gpa, + '{}'::jsonb AS gpa_order_unload, + '{}'::jsonb AS gpa_order, + '{}'::jsonb AS program_transfer, + '{}'::jsonb AS fee_transfer, + '{}'::jsonb AS peer_transfer, + '{}'::jsonb AS msa_orders, + '{}'::jsonb AS risk_assessment, + '{}'::jsonb AS auto_reload, + '{}'::jsonb AS direct_deposit, + '{}'::jsonb AS polarity, + '{}'::jsonb AS real_time_fee_group, + '{}'::jsonb AS fee, + '{}'::jsonb AS chargeback, + NULL AS standin_approved_by, + random() * 100 AS acquirer_fee_amount, + '{}'::jsonb AS funded_account_holder, + '{}'::jsonb AS digital_wallet_token, + '{}'::jsonb AS network_fees, + '{}'::jsonb AS card_security_code_verification, + '{}'::jsonb AS fraud, + '{}'::jsonb AS cardholder_authentication_data, + '{}'::jsonb AS currency_conversion, + '{}'::jsonb AS merchant, + '{}'::jsonb AS store, + '{}'::jsonb AS card_acceptor, + '{}'::jsonb AS acquirer, + '{}'::jsonb AS pos, + '{}'::jsonb AS avs, + md5((:random_id)::text || 'token') AS mar_token, + NULL AS mar_preceding_related_transaction_token, + NULL AS mar_business_token, + NULL AS mar_acting_user_token, + NULL AS mar_card_token, + random() * 1000 AS mar_duration, + now() AS mar_created_time, + random() * 100 AS issuer_interchange_amount, + '{}'::jsonb AS offer_orders, + (:random_id % 500) + 1 AS transaction_canonical_id, + :random_id::integer AS mar_identifier, + now() AS created_at, + NULL AS card_acceptor_mid, + NULL AS card_acceptor_name, + '{}'::jsonb AS address_verification, + 'DEFAULT_PRODUCT' AS issuing_product, + NULL AS mar_enhanced_data_token, + NULL AS standin_reason +FROM (SELECT 1) AS dummy; + +-- Delete exactly one row using the uniformly chosen random_delete_id +WITH to_delete AS ( + SELECT id + FROM transaction.transaction + WHERE id >= (:random_delete_id)::bigint + AND id < 
((:random_delete_id)::bigint + :range_size) + ORDER BY id + LIMIT 1 +) +DELETE FROM transaction.transaction +USING to_delete +WHERE transaction.transaction.id = to_delete.id; \ No newline at end of file diff --git a/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql new file mode 100644 index 0000000000..e0b0e52276 --- /dev/null +++ b/test_runner/performance/large_synthetic_oltp/select_prefetch_webhook.sql @@ -0,0 +1,25 @@ +-- enforce a controlled number of getpages prefetch requests from a range of +-- 40 million first pages (320 GB) of a 500 GiB table +-- the table has 55 million pages + + +-- Zipfian distributions model real-world access patterns where: +-- A few values (popular IDs) are accessed frequently. +-- Many values are accessed rarely. +-- This is useful for simulating realistic workloads + +\set alpha 1.2 +\set min_page 1 +\set max_page 40000000 + +\set zipf_random_page random_zipfian(:min_page, :max_page, :alpha) + +-- Read 500 consecutive pages from a Zipfian-distributed random start page +-- This enforces PostgreSQL prefetching +WITH random_page AS ( + SELECT :zipf_random_page::int AS start_page +) +SELECT MAX(created_at) +FROM webhook.incoming_webhooks +WHERE ctid >= (SELECT format('(%s,1)', start_page)::tid FROM random_page) +AND ctid < (SELECT format('(%s,1)', start_page + 500)::tid FROM random_page); \ No newline at end of file diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py index ae00dbb3b5..842e6a904b 100644 --- a/test_runner/performance/test_perf_oltp_large_tenant.py +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -2,11 +2,13 @@ from __future__ import annotations import os import timeit +from contextlib import closing from pathlib import Path import pytest from fixtures.benchmark_fixture import PgBenchRunResult from fixtures.compare_fixtures import 
PgCompare +from fixtures.log_helper import log from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp @@ -82,9 +84,81 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline, password: None): env.zenbenchmark.record_pg_bench_result(prefix, res) +def run_database_maintenance(env: PgCompare): + with closing(env.pg.connect()) as conn: + with conn.cursor() as cur: + log.info("start vacuum analyze transaction.transaction") + with env.zenbenchmark.record_duration("vacuum_analyze"): + cur.execute("SET statement_timeout = 0;") + cur.execute("SET max_parallel_maintenance_workers = 7;") + cur.execute("SET maintenance_work_mem = '10GB';") + cur.execute("vacuum analyze transaction.transaction;") + log.info("finished vacuum analyze transaction.transaction") + + # recover previously failed or canceled re-indexing + cur.execute( + """ + DO $$ + DECLARE + invalid_index TEXT; + BEGIN + FOR invalid_index IN + SELECT c.relname + FROM pg_class c + JOIN pg_index i ON i.indexrelid = c.oid + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'transaction' + AND i.indisvalid = FALSE + AND c.relname LIKE '%_ccnew%' + LOOP + EXECUTE 'DROP INDEX IF EXISTS transaction.' || invalid_index; + END LOOP; + END $$; + """ + ) + # also recover failed or canceled re-indexing on toast part of table + cur.execute( + """ + DO $$ + DECLARE + invalid_index TEXT; + BEGIN + FOR invalid_index IN + SELECT c.relname + FROM pg_class c + JOIN pg_index i ON i.indexrelid = c.oid + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'pg_toast' + AND i.indisvalid = FALSE + AND c.relname LIKE '%_ccnew%' + AND i.indrelid = ( + SELECT reltoastrelid FROM pg_class + WHERE relname = 'transaction' + AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'transaction') + ) + LOOP + EXECUTE 'DROP INDEX IF EXISTS pg_toast.' 
|| invalid_index; + END LOOP; + END $$; + """ + ) + + log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction") + with env.zenbenchmark.record_duration("reindex concurrently"): + cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;") + log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction") + + @pytest.mark.parametrize("custom_scripts", get_custom_scripts()) @pytest.mark.parametrize("duration", get_durations_matrix()) @pytest.mark.remote_cluster -def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int): +def test_perf_oltp_large_tenant_pgbench( + remote_compare: PgCompare, custom_scripts: str, duration: int +): run_test_pgbench(remote_compare, custom_scripts, duration) - # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration + + +@pytest.mark.remote_cluster +def test_perf_oltp_large_tenant_maintenance(remote_compare: PgCompare): + # run analyze, vacuum, re-index after the test and measure and report its duration + run_database_maintenance(remote_compare) From 8566cad23b576fcac7b055b88a33acc74335323d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Sun, 16 Mar 2025 18:17:58 +0100 Subject: [PATCH 52/71] chore(docs): Refresh RFC guide to suggest using YYYY-MM-DD prefix (#11252) ## Problem Serial/numeric IDs lead to collisions, which is not critical but looks awkward. Previous discussion: https://neondb.slack.com/archives/C033A2WE6BZ/p1741891345869979 ## Summary of changes Suggest using the `YYYY-MM-DD` prefix, which i) has less chance of collision; ii) provides out-of-the-box lexicographic sorting; iii) even if it collides, it's not a big deal -- just two RFCs have been started on the same day. 
--------- Co-authored-by: Alexander Bayandin --- ...e-limits.md => 001-cluster-size-limits.md} | 0 docs/rfcs/README.md | 43 +++++-------------- docs/rfcs/YYYY-MM-DD-copy-me.md | 30 +++++++++++++ 3 files changed, 40 insertions(+), 33 deletions(-) rename docs/rfcs/{cluster-size-limits.md => 001-cluster-size-limits.md} (100%) create mode 100644 docs/rfcs/YYYY-MM-DD-copy-me.md diff --git a/docs/rfcs/cluster-size-limits.md b/docs/rfcs/001-cluster-size-limits.md similarity index 100% rename from docs/rfcs/cluster-size-limits.md rename to docs/rfcs/001-cluster-size-limits.md diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md index f7b0b3a587..094f8d5360 100644 --- a/docs/rfcs/README.md +++ b/docs/rfcs/README.md @@ -1,3 +1,7 @@ +# Neon RFCs + +## Overview + This directory contains Request for Comments documents, or RFCs, for features or concepts that have been proposed. Alternative names: technical design doc, ERD, one-pager @@ -59,37 +63,10 @@ RFC lifecycle: ### RFC template +Use template with `YYYY-MM-DD-copy-me.md` as a starting point. Timestamp prefix helps to avoid awkward 'id' collisions. + +```sh +cp docs/rfcs/YYYY-MM-DD-copy-me.md docs/rfcs/$(date +"%Y-%m-%d")-.md +``` + Note, a lot of the sections are marked as ‘if relevant’. They are included into the template as a reminder and to help inspiration. - -``` -# Name -Created on .. -Implemented on .. - -## Summary - -## Motivation - -## Non Goals (if relevant) - -## Impacted components (e.g. 
pageserver, safekeeper, console, etc) - -## Proposed implementation - -### Reliability, failure modes and corner cases (if relevant) - -### Interaction/Sequence diagram (if relevant) - -### Scalability (if relevant) - -### Security implications (if relevant) - -### Unresolved questions (if relevant) - -## Alternative implementation (if relevant) - -## Pros/cons of proposed approaches (if relevant) - -## Definition of Done (if relevant) - -``` diff --git a/docs/rfcs/YYYY-MM-DD-copy-me.md b/docs/rfcs/YYYY-MM-DD-copy-me.md new file mode 100644 index 0000000000..8487861e6b --- /dev/null +++ b/docs/rfcs/YYYY-MM-DD-copy-me.md @@ -0,0 +1,30 @@ +# Name + +Created on YYYY-MM-DD +Implemented on _TBD_ + +## Summary + +## Motivation + +## Non Goals (if relevant) + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +## Proposed implementation + +### Reliability, failure modes and corner cases (if relevant) + +### Interaction/Sequence diagram (if relevant) + +### Scalability (if relevant) + +### Security implications (if relevant) + +### Unresolved questions (if relevant) + +## Alternative implementation (if relevant) + +## Pros/cons of proposed approaches (if relevant) + +## Definition of Done (if relevant) From 966abd3bd602899a94b0675be9dd8faa8a8a6edf Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Sun, 16 Mar 2025 19:39:54 +0100 Subject: [PATCH 53/71] fix(compute_ctl): Dollar escaping helper fixes (#11263) ## Problem In the previous PR #11045, one edge-case wasn't covered, when an ident contains only one `$`, we were picking `$$` as a 'wrapper'. Yet, when this `$` is at the beginning or at the end of the ident, then we end up with `$$$` in a row which breaks the escaping. ## Summary of changes Start from `x` tag instead of a blank string. 
Slack: https://neondb.slack.com/archives/C08HV951W2W/p1742076675079769?thread_ts=1742004205.461159&cid=C08HV951W2W --- compute_tools/src/pg_helpers.rs | 4 ++-- compute_tools/tests/pg_helpers_tests.rs | 3 ++- test_runner/regress/test_compute_catalog.py | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 802e3e93d9..10d8f2c878 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -208,8 +208,8 @@ impl Escaping for PgIdent { /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`, /// fn pg_quote_dollar(&self) -> (String, String) { - let mut tag: String = "".to_string(); - let mut outer_tag = "x".to_string(); + let mut tag: String = "x".to_string(); + let mut outer_tag = "xx".to_string(); // Find the first suitable tag that is not present in the string. // Postgres' max role/DB name length is 63 bytes, so even in the diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index f2d74ff384..b72c1293ee 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -64,7 +64,8 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor #[test] fn ident_pg_quote_dollar() { let test_cases = vec![ - ("name", ("$$name$$", "x")), + ("name", ("$x$name$x$", "xx")), + ("name$", ("$x$name$$x$", "xx")), ("name$$", ("$x$name$$$x$", "xx")), ("name$$$", ("$x$name$$$$x$", "xx")), ("name$$$$", ("$x$name$$$$$x$", "xx")), diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index 2e7da86d9d..0d3618d1b8 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -16,6 +16,9 @@ TEST_ROLE_NAMES = [ {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"}, {"name": '"role in double quotes"'}, {"name": "'role in single quotes'"}, + {"name": 
"role$"}, + {"name": "role$$"}, + {"name": "role$x$"}, ] TEST_DB_NAMES = [ @@ -59,6 +62,18 @@ TEST_DB_NAMES = [ "name": "'db in single quotes'", "owner": "'role in single quotes'", }, + { + "name": "db name$", + "owner": "role$", + }, + { + "name": "db name$$", + "owner": "role$$", + }, + { + "name": "db name$x$", + "owner": "role$x$", + }, ] From 15e63afe7dde679a1b8c604fee3fec11004079a4 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 17 Mar 2025 08:07:24 +0200 Subject: [PATCH 54/71] Support DEBUG_COMPARE_LOCAL mode for unloggedindex build (#11257) ## Problem In unlogged index build (used for GIST/SPGIST/GIN indexes) files are created on disk and then removed at the end. This contradicts the logic of DEBUG_COMPARE_LOCAL mode. ## Summary of changes Do not create and unlink files in unlogged build in DEBUG_COMPARE_LOCAL mode. Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 6fe95df3dd..ae92be4577 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -4176,8 +4176,10 @@ neon_start_unlogged_build(SMgrRelation reln) * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed?
*/ - if (!IsParallelWorker()) +#ifndef DEBUG_COMPARE_LOCAL + if (!IsParallelWorker()) mdcreate(reln, MAIN_FORKNUM, false); +#endif } /* @@ -4252,8 +4254,10 @@ neon_end_unlogged_build(SMgrRelation reln) forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); mdclose(reln, forknum); +#ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); +#endif } } From 136cae76c23d4aca05c2033ec88547d53c723033 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 17 Mar 2025 07:25:12 +0000 Subject: [PATCH 55/71] fix(ci): correct regex to detect release-compute RC PRs (#11269) ## Problem The regex in `_meta.yml` workflow doesn't detect RC PRs for compute releases: https://neondb.slack.com/archives/C059ZC138NR/p1742164884669389 ## Summary of changes - Fix regex --------- Co-authored-by: Peter Bendel --- .github/workflows/_meta.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index 9e49c1ebc8..bb2f9fa5d9 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -125,5 +125,5 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | - RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy)|(compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Falied to find Build and Test run from RC PR!" | halt_error(1))') + RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" 
| halt_error(1))') echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT From fdf04d4d81f38b3e4901b118af92974de9709f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Mon, 17 Mar 2025 10:26:45 +0100 Subject: [PATCH 56/71] fix(ci): use correct branch ref for checking whether this is a release merge queue (#11270) ## Problem https://github.com/neondatabase/neon/actions/runs/13894288475/job/38871819190 shows the "Add fast-fordward label to PR to trigger fast-forward merge" job being skipped. This is due to not using the right variable for checking which branch the merge queue is merging into. ## Summary of changes Use the `branch` output of the `meta` task for checking the target branch of a merge group. --- .github/workflows/pre-merge-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 9e5c3df52c..3bd81f6538 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -148,7 +148,7 @@ jobs: ${{ always() && github.event_name == 'merge_group' - && contains(fromJson('["release", "release-proxy", "release-compute"]'), github.base_ref) + && contains(fromJson('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch) }} env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} From db30e1669c40b8ef519a6618313c5ab2da2567ac Mon Sep 17 00:00:00 2001 From: Roman Zaynetdinov Date: Mon, 17 Mar 2025 15:53:23 +0200 Subject: [PATCH 57/71] Add /configure_telemetry API endpoint (#11117) Work on https://github.com/neondatabase/cloud/issues/23721 and https://github.com/neondatabase/cloud/issues/23714 Depends on https://github.com/neondatabase/neon/pull/11111 - Add `/configure_telemetry` API endpoint - Support second rsyslog configuration for Postgres logs export - Enable logs export when compute feature is enabled and configure Postgres to send logs to syslog I have used `/configure_telemetry` 
name because in the future I see it also being used for configuring a `pg_tracing` extension to export traces. Let me know if you'd rather have these APIs separate. In this case we can rename it to `/configure_rsyslog`. --- compute/vm-image-spec-bookworm.yaml | 10 ++ compute/vm-image-spec-bullseye.yaml | 10 ++ compute_tools/src/compute.rs | 18 ++- compute_tools/src/config.rs | 8 +- .../compute_audit_rsyslog_template.conf | 2 +- ...pute_rsyslog_postgres_export_template.conf | 10 ++ compute_tools/src/http/openapi_spec.yaml | 30 ++++ compute_tools/src/http/routes/configure.rs | 27 +++- compute_tools/src/http/server.rs | 1 + compute_tools/src/rsyslog.rs | 140 +++++++++++++++++- libs/compute_api/src/requests.rs | 6 + libs/compute_api/src/spec.rs | 3 + 12 files changed, 259 insertions(+), 6 deletions(-) create mode 100644 compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 0cf72b6f74..f63aa88da2 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -39,6 +39,13 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. + # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to + # use a different path for the socket. The symlink actually points to our custom path. + - name: rsyslogd-socket-symlink + user: root + sysvInitAction: sysinit + shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" - name: rsyslogd user: postgres sysvInitAction: respawn @@ -77,6 +84,9 @@ files: # compute_ctl will rewrite this file with the actual configuration, if needed. - filename: compute_rsyslog.conf content: | + # Syslock.Name specifies a non-default pipe location that is writeable for the postgres user. 
+ module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging + *.* /dev/null $IncludeConfig /etc/rsyslog.d/*.conf build: | diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 9deaf3ea55..8b3c681228 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -39,6 +39,13 @@ commands: user: nobody sysvInitAction: respawn shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' + # Rsyslog by default creates a unix socket under /dev/log . That's where Postgres sends logs also. + # We run syslog with postgres user so it can't create /dev/log. Instead we configure rsyslog to + # use a different path for the socket. The symlink actually points to our custom path. + - name: rsyslogd-socket-symlink + user: root + sysvInitAction: sysinit + shell: "ln -s /var/db/postgres/rsyslogpipe /dev/log" - name: rsyslogd user: postgres sysvInitAction: respawn @@ -77,6 +84,9 @@ files: # compute_ctl will rewrite this file with the actual configuration, if needed. - filename: compute_rsyslog.conf content: | + # Syslock.Name specifies a non-default pipe location that is writeable for the postgres user. 
+ module(load="imuxsock" SysSock.Name="/var/db/postgres/rsyslogpipe") # provides support for local system logging + *.* /dev/null $IncludeConfig /etc/rsyslog.d/*.conf build: | diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 58b99dde53..d31472b0c1 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -37,7 +37,10 @@ use crate::logger::startup_context_from_env; use crate::lsn_lease::launch_lsn_lease_bg_task_for_static; use crate::monitor::launch_monitor; use crate::pg_helpers::*; -use crate::rsyslog::{configure_audit_rsyslog, launch_pgaudit_gc}; +use crate::rsyslog::{ + PostgresLogsRsyslogConfig, configure_audit_rsyslog, configure_postgres_logs_export, + launch_pgaudit_gc, +}; use crate::spec::*; use crate::swap::resize_swap; use crate::sync_sk::{check_if_synced, ping_safekeeper}; @@ -617,7 +620,7 @@ impl ComputeNode { }); } - // Configure and start rsyslog if necessary + // Configure and start rsyslog for HIPAA if necessary if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); if remote_endpoint.is_empty() { @@ -632,6 +635,17 @@ impl ComputeNode { launch_pgaudit_gc(log_directory_path); } + // Configure and start rsyslog for Postgres logs export + if self.has_feature(ComputeFeature::PostgresLogsExport) { + if let Some(ref project_id) = pspec.spec.cluster.cluster_id { + let host = PostgresLogsRsyslogConfig::default_host(project_id); + let conf = PostgresLogsRsyslogConfig::new(Some(&host)); + configure_postgres_logs_export(conf)?; + } else { + warn!("not configuring rsyslog for Postgres logs export: project ID is missing") + } + } + // Launch remaining service threads let _monitor_handle = launch_monitor(self); let _configurator_handle = launch_configurator(self); diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index e4acc5471c..290632e4cd 100644 --- a/compute_tools/src/config.rs +++ 
b/compute_tools/src/config.rs @@ -7,7 +7,7 @@ use std::io::prelude::*; use std::path::Path; use compute_api::responses::TlsConfig; -use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; +use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption}; use crate::pg_helpers::{ GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, @@ -216,6 +216,12 @@ pub fn write_postgres_conf( writeln!(file, "neon.disable_logical_replication_subscribers=false")?; } + // We need Postgres to send logs to rsyslog so that we can forward them + // further to customers' log aggregation systems. + if spec.features.contains(&ComputeFeature::PostgresLogsExport) { + writeln!(file, "log_destination='stderr,syslog'")?; + } + // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?; diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index 1937cdc292..9ca7e36738 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -8,4 +8,4 @@ input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Fac global(workDirectory="/var/log/rsyslog") # Forward logs to remote syslog server -*.* @@{remote_endpoint} \ No newline at end of file +*.* @@{remote_endpoint} diff --git a/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf b/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf new file mode 100644 index 0000000000..2580b61fea --- /dev/null +++ b/compute_tools/src/config_template/compute_rsyslog_postgres_export_template.conf @@ -0,0 +1,10 @@ +# Program name comes from postgres' syslog_facility configuration: 
https://www.postgresql.org/docs/current/runtime-config-logging.html#GUC-SYSLOG-IDENT +# Default value is 'postgres'. +if $programname == 'postgres' then {{ + # Forward Postgres logs to telemetry otel collector + action(type="omfwd" target="{logs_export_target}" port="{logs_export_port}" protocol="tcp" + template="RSYSLOG_SyslogProtocol23Format" + action.resumeRetryCount="3" + queue.type="linkedList" queue.size="1000") + stop +}} diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index bbdb7d0917..7c8f72440f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -306,6 +306,36 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /configure_telemetry: + post: + tags: + - Configure + summary: Configure rsyslog + description: | + This API endpoint configures rsyslog to forward Postgres logs + to a specified otel collector. + operationId: configureTelemetry + requestBody: + required: true + content: + application/json: + schema: + type: object + properties: + logs_export_host: + type: string + description: | + Hostname and the port of the otel collector. Leave empty to disable logs forwarding. 
+ Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526 + responses: + 204: + description: "Telemetry configured successfully" + 500: + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 3c5a6a6d41..5c9dd22c3d 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,9 +1,11 @@ use std::sync::Arc; +use axum::body::Body; use axum::extract::State; use axum::response::Response; -use compute_api::requests::ConfigurationRequest; +use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest}; use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; +use compute_api::spec::ComputeFeature; use http::StatusCode; use tokio::task; use tracing::info; @@ -11,6 +13,7 @@ use tracing::info; use crate::compute::{ComputeNode, ParsedSpec}; use crate::http::JsonResponse; use crate::http::extract::Json; +use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export}; // Accept spec in JSON format and request compute configuration. 
If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -92,3 +95,25 @@ pub(in crate::http) async fn configure( JsonResponse::success(StatusCode::OK, body) } + +pub(in crate::http) async fn configure_telemetry( + State(compute): State>, + request: Json, +) -> Response { + if !compute.has_feature(ComputeFeature::PostgresLogsExport) { + return JsonResponse::error( + StatusCode::PRECONDITION_FAILED, + "Postgres logs export feature is not enabled".to_string(), + ); + } + + let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref()); + if let Err(err) = configure_postgres_logs_export(conf) { + return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string()); + } + + Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::from("")) + .unwrap() +} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 10f767e97c..179369e3ef 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -87,6 +87,7 @@ impl From<&Server> for Router> { let authenticated_router = Router::>::new() .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) + .route("/configure_telemetry", post(configure::configure_telemetry)) .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route("/insights", get(insights::get_insights)) diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 7537fafaa5..80594db3f1 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -1,12 +1,15 @@ use std::fs; +use std::io::ErrorKind; use std::path::Path; use std::process::Command; use std::time::Duration; use std::{fs::OpenOptions, io::Write}; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, anyhow}; use tracing::{error, info, instrument, warn}; +const POSTGRES_LOGS_CONF_PATH: &str = 
"/etc/rsyslog.d/postgres_logs.conf"; + fn get_rsyslog_pid() -> Option { let output = Command::new("pgrep") .arg("rsyslogd") @@ -79,6 +82,95 @@ pub fn configure_audit_rsyslog( Ok(()) } +/// Configuration for enabling Postgres logs forwarding from rsyslogd +pub struct PostgresLogsRsyslogConfig<'a> { + pub host: Option<&'a str>, +} + +impl<'a> PostgresLogsRsyslogConfig<'a> { + pub fn new(host: Option<&'a str>) -> Self { + Self { host } + } + + pub fn build(&self) -> Result { + match self.host { + Some(host) => { + if let Some((target, port)) = host.split_once(":") { + Ok(format!( + include_str!( + "config_template/compute_rsyslog_postgres_export_template.conf" + ), + logs_export_target = target, + logs_export_port = port, + )) + } else { + Err(anyhow!("Invalid host format for Postgres logs export")) + } + } + None => Ok("".to_string()), + } + } + + fn current_config() -> Result { + let config_content = match std::fs::read_to_string(POSTGRES_LOGS_CONF_PATH) { + Ok(c) => c, + Err(err) if err.kind() == ErrorKind::NotFound => String::new(), + Err(err) => return Err(err.into()), + }; + Ok(config_content) + } + + /// Returns the default host for otel collector that receives Postgres logs + pub fn default_host(project_id: &str) -> String { + format!( + "config-{}-collector.neon-telemetry.svc.cluster.local:10514", + project_id + ) + } +} + +pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> { + let new_config = conf.build()?; + let current_config = PostgresLogsRsyslogConfig::current_config()?; + + if new_config == current_config { + info!("postgres logs rsyslog configuration is up-to-date"); + return Ok(()); + } + + // When new config is empty we can simply remove the configuration file. 
+ if new_config.is_empty() { + info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH); + match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) { + Ok(_) => {} + Err(err) if err.kind() == ErrorKind::NotFound => {} + Err(err) => return Err(err.into()), + } + restart_rsyslog()?; + return Ok(()); + } + + info!( + "configuring rsyslog for postgres logs export to: {:?}", + conf.host + ); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(POSTGRES_LOGS_CONF_PATH)?; + file.write_all(new_config.as_bytes())?; + + info!( + "rsyslog configuration file {} added successfully. Starting rsyslogd", + POSTGRES_LOGS_CONF_PATH + ); + + restart_rsyslog()?; + Ok(()) +} + #[instrument(skip_all)] async fn pgaudit_gc_main_loop(log_directory: String) -> Result<()> { info!("running pgaudit GC main loop"); @@ -136,3 +228,49 @@ pub fn launch_pgaudit_gc(log_directory: String) { } }); } + +#[cfg(test)] +mod tests { + use crate::rsyslog::PostgresLogsRsyslogConfig; + + #[test] + fn test_postgres_logs_config() { + { + // Verify empty config + let conf = PostgresLogsRsyslogConfig::new(None); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert_eq!(&conf_str, ""); + } + + { + // Verify config + let conf = PostgresLogsRsyslogConfig::new(Some("collector.cvc.local:514")); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + assert!(conf_str.contains("omfwd")); + assert!(conf_str.contains(r#"target="collector.cvc.local""#)); + assert!(conf_str.contains(r#"port="514""#)); + } + + { + // Verify invalid config + let conf = PostgresLogsRsyslogConfig::new(Some("invalid")); + let res = conf.build(); + assert!(res.is_err()); + } + + { + // Verify config with default host + let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123"); + let conf = PostgresLogsRsyslogConfig::new(Some(&host)); + let res = conf.build(); + assert!(res.is_ok()); + let conf_str = res.unwrap(); + 
assert!(conf_str.contains(r#"shy-breeze-123"#)); + assert!(conf_str.contains(r#"port="10514""#)); + } + } +} diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 3fbdfcf83f..d88451c549 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -30,3 +30,9 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } + +/// Request of the /configure_telemetry API +#[derive(Debug, Deserialize, Serialize)] +pub struct ConfigureTelemetryRequest { + pub logs_export_host: Option, +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index af4264f8d2..868a14edeb 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -182,6 +182,9 @@ pub enum ComputeFeature { /// Pre-install and initialize anon extension for every database in the cluster AnonExtension, + /// Allow to configure rsyslog for Postgres logs export + PostgresLogsExport, + /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. From 56149a046a0334a150651e2ee42276ccc6ea3c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 17 Mar 2025 17:28:21 +0100 Subject: [PATCH 58/71] Add test_explicit_timeline_creation_storcon and make it work (#11261) Adds a basic test that makes the storcon issue explicit creation of a timeline on safekeepers (main storcon PR in #11058). It was adapted from `test_explicit_timeline_creation` from #11002. Also, do a bunch of fixes needed to get the test to work (the API definitions weren't correct), and log more stuff when we can't create a new timeline due to no safekeepers being active.
Part of #9011 --------- Co-authored-by: Arseny Sher --- libs/safekeeper_api/src/models.rs | 1 + safekeeper/client/src/mgmt_api.rs | 9 +++----- storage_controller/src/safekeeper_client.rs | 3 +-- storage_controller/src/service.rs | 12 +++++++---- test_runner/fixtures/neon_fixtures.py | 22 ++++++++++++++++++++ test_runner/regress/test_wal_acceptor.py | 23 +++++++++++++++++++++ 6 files changed, 58 insertions(+), 12 deletions(-) diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 6bdc651668..33ff636a79 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -23,6 +23,7 @@ pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mconf: Configuration, + /// In the PG_VERSION_NUM macro format, like 140017. pub pg_version: u32, pub system_id: Option, // By default WAL_SEGMENT_SIZE diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 7ae39ef95e..424cd89221 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -81,13 +81,10 @@ impl Client { } } - pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { - let uri = format!( - "{}/v1/tenant/{}/timeline/{}", - self.mgmt_api_endpoint, req.tenant_id, req.timeline_id - ); + pub async fn create_timeline(&self, req: &TimelineCreateRequest) -> Result { + let uri = format!("{}/v1/tenant/timeline", self.mgmt_api_endpoint); let resp = self.post(&uri, req).await?; - resp.json().await.map_err(Error::ReceiveBody) + Ok(resp) } pub async fn pull_timeline(&self, req: &PullTimelineRequest) -> Result { diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index a44fcc27d2..b30237e404 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -1,6 +1,5 @@ use safekeeper_api::models::{ self, PullTimelineRequest, PullTimelineResponse, 
SafekeeperUtilization, TimelineCreateRequest, - TimelineStatus, }; use safekeeper_client::mgmt_api::{Client, Result}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -60,7 +59,7 @@ impl SafekeeperClient { pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, - ) -> Result { + ) -> Result { measured_request!( "create_timeline", crate::metrics::Method::Post, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 4e00136e1b..38bf959056 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3804,7 +3804,7 @@ impl Service { create_mode: models::TimelineCreateRequestMode, ) -> Result { let timeline_id = timeline_info.timeline_id; - let pg_version = timeline_info.pg_version; + let pg_version = timeline_info.pg_version * 10000; // Initially start_lsn is determined by last_record_lsn in pageserver // response as it does initdb. However, later we persist it and in sk // creation calls replace with the value from the timeline row if it @@ -8723,6 +8723,8 @@ impl Service { pub(crate) async fn safekeepers_for_new_timeline( &self, ) -> Result, ApiError> { + // Number of safekeepers in different AZs we are looking for + let wanted_count = 3; let mut all_safekeepers = { let locked = self.inner.read().unwrap(); locked @@ -8768,15 +8770,17 @@ impl Service { continue; } sks.push(sk_info.clone()); - if sks.len() == 3 { + if sks.len() == wanted_count { break; } } - if sks.len() == 3 { + if sks.len() == wanted_count { Ok(sks) } else { Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find three safekeepers in different AZs for new timeline" + "couldn't find {wanted_count} safekeepers in different AZs for new timeline (found: {}, total active: {})", + sks.len(), + all_safekeepers.len(), ))) } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index deff02f0f9..aba8e04977 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ 
b/test_runner/fixtures/neon_fixtures.py @@ -1321,6 +1321,28 @@ class NeonEnv: for f in futs: f.result() + # Last step: register safekeepers at the storage controller + if ( + self.storage_controller_config is not None + and self.storage_controller_config.get("timelines_onto_safekeepers") is True + ): + for sk_id, sk in enumerate(self.safekeepers): + body = { + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.port.pg, + "http_port": sk.port.http, + "https_port": None, + "version": 5957, + "availability_zone_id": f"us-east-2b-{sk_id}", + } + + self.storage_controller.on_safekeeper_deploy(sk_id, body) + self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 55e38b29a2..89c4a96499 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2039,6 +2039,29 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") +def test_explicit_timeline_creation_storcon(neon_env_builder: NeonEnvBuilder): + """ + Test that having neon.safekeepers starting with g#n: with non zero n enables + generations, which as a side effect disables automatic timeline creation. + Like test_explicit_timeline_creation, but asks the storcon to + create membership conf & timeline. 
+ """ + neon_env_builder.num_safekeepers = 3 + neon_env_builder.storage_controller_config = { + "timelines_onto_safekeepers": True, + } + env = neon_env_builder.init_start() + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create("main", config_lines=config_lines) + + # endpoint should start. + ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) + ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 486ffeef6d58c5ef5b2803a3a8290cc7ea7c6a94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Mon, 17 Mar 2025 17:31:49 +0100 Subject: [PATCH 59/71] fix(ci): don't have neon-test-extensions release tag push depend on compute-node-image build (#11281) ## Problem Failures like https://github.com/neondatabase/neon/actions/runs/13901493608/job/38896940612?pr=11272 are caused by the dependency on `compute-node-image`, which was wrong on release jobs anyway. ## Summary of changes Remove dependency on `compute-node-image` from the job `add-release-tag-to-neon-test-extension-image`. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 409ad6be3d..0980561345 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1014,7 +1014,7 @@ jobs: add-release-tag-to-neon-test-extensions-image: if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} - needs: [ meta, compute-node-image ] + needs: [ meta ] uses: ./.github/workflows/_push-to-container-registry.yml with: image-map: | From a05c99f487e301e1b0392c494edea1b1e4e66071 Mon Sep 17 00:00:00 2001 From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:23:32 -0400 Subject: [PATCH 60/71] fix: removed anon pg extension (#10936) ## Problem Removing the `anon` v1 extension in postgres as described in https://github.com/neondatabase/cloud/issues/22663. This extension is not built for postgres v17 and is out of date when compared to the upstream variant which is v2 (we have v1.4). 
## Summary of changes Removed the `anon` v1 extension from being built or preloaded Related to https://github.com/neondatabase/cloud/issues/22663 --- compute_tools/src/spec.rs | 122 +---------------------- compute_tools/src/spec_apply.rs | 98 +----------------- libs/compute_api/src/spec.rs | 3 - libs/compute_api/tests/cluster_spec.json | 8 -- 4 files changed, 3 insertions(+), 228 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 1d19f2738d..a76af21e9f 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -8,13 +8,12 @@ use compute_api::responses::{ use compute_api::spec::ComputeSpec; use reqwest::StatusCode; use tokio_postgres::Client; -use tracing::{error, info, instrument, warn}; +use tracing::{error, info, instrument}; use crate::config; use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS}; use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; -use crate::pg_helpers::*; // Do control plane request and return response if any. 
In case of error it // returns a bool flag indicating whether it makes sense to retry the request @@ -212,122 +211,3 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> { Ok(()) } - -/// Connect to the database as superuser and pre-create anon extension -/// if it is present in shared_preload_libraries -#[instrument(skip_all)] -pub async fn handle_extension_anon( - spec: &ComputeSpec, - db_owner: &str, - db_client: &mut Client, - grants_only: bool, -) -> Result<()> { - info!("handle extension anon"); - - if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { - if libs.contains("anon") { - if !grants_only { - // check if extension is already initialized using anon.is_initialized() - let query = "SELECT anon.is_initialized()"; - match db_client.query(query, &[]).await { - Ok(rows) => { - if !rows.is_empty() { - let is_initialized: bool = rows[0].get(0); - if is_initialized { - info!("anon extension is already initialized"); - return Ok(()); - } - } - } - Err(e) => { - warn!( - "anon extension is_installed check failed with expected error: {}", - e - ); - } - }; - - // Create anon extension if this compute needs it - // Users cannot create it themselves, because superuser is required. - let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE"; - info!("creating anon extension with query: {}", query); - match db_client.query(query, &[]).await { - Ok(_) => {} - Err(e) => { - error!("anon extension creation failed with error: {}", e); - return Ok(()); - } - } - - // check that extension is installed - query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - let rows = db_client.query(query, &[]).await?; - if rows.is_empty() { - error!("anon extension is not installed"); - return Ok(()); - } - - // Initialize anon extension - // This also requires superuser privileges, so users cannot do it themselves. 
- query = "SELECT anon.init()"; - match db_client.query(query, &[]).await { - Ok(_) => {} - Err(e) => { - error!("anon.init() failed with error: {}", e); - return Ok(()); - } - } - } - - // check that extension is installed, if not bail early - let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'"; - match db_client.query(query, &[]).await { - Ok(rows) => { - if rows.is_empty() { - error!("anon extension is not installed"); - return Ok(()); - } - } - Err(e) => { - error!("anon extension check failed with error: {}", e); - return Ok(()); - } - }; - - let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - - // Grant permissions to db_owner to use anon extension functions - let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - - // This is needed, because some functions are defined as SECURITY DEFINER. - // In Postgres SECURITY DEFINER functions are executed with the privileges - // of the owner. - // In anon extension this it is needed to access some GUCs, which are only accessible to - // superuser. But we've patched postgres to allow db_owner to access them as well. - // So we need to change owner of these functions to db_owner. 
- let query = format!(" - SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};' - from pg_proc p - join pg_namespace nsp ON p.pronamespace = nsp.oid - where nsp.nspname = 'anon';", db_owner); - - info!("change anon extension functions owner to db owner"); - db_client.simple_query(&query).await?; - - // affects views as well - let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - - let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner); - info!("granting anon extension permissions with query: {}", query); - db_client.simple_query(&query).await?; - } - } - - Ok(()) -} diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index e5f7aebbf8..80506b13cb 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -6,7 +6,7 @@ use std::sync::Arc; use anyhow::{Context, Result}; use compute_api::responses::ComputeStatus; -use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role}; +use compute_api::spec::{ComputeAudit, ComputeSpec, Database, PgIdent, Role}; use futures::future::join_all; use tokio::sync::RwLock; use tokio_postgres::Client; @@ -26,7 +26,7 @@ use crate::spec_apply::ApplySpecPhase::{ RunInEachDatabase, }; use crate::spec_apply::PerDatabasePhase::{ - ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension, + ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, }; impl ComputeNode { @@ -238,7 +238,6 @@ impl ComputeNode { let mut phases = vec![ DeleteDBRoleReferences, ChangeSchemaPerms, - HandleAnonExtension, ]; if spec.drop_subscriptions_before_start && !drop_subscriptions_done { @@ -458,7 +457,6 @@ impl Debug for DB { pub enum PerDatabasePhase { DeleteDBRoleReferences, ChangeSchemaPerms, - 
HandleAnonExtension, /// This is a shared phase, used for both i) dropping dangling LR subscriptions /// before dropping the DB, and ii) dropping all subscriptions after creating /// a fresh branch. @@ -1012,98 +1010,6 @@ async fn get_operations<'a>( ] .into_iter(); - Ok(Box::new(operations)) - } - // TODO: remove this completely https://github.com/neondatabase/cloud/issues/22663 - PerDatabasePhase::HandleAnonExtension => { - // Only install Anon into user databases - let db = match &db { - DB::SystemDB => return Ok(Box::new(empty())), - DB::UserDB(db) => db, - }; - // Never install Anon when it's not enabled as feature - if !spec.features.contains(&ComputeFeature::AnonExtension) { - return Ok(Box::new(empty())); - } - - // Only install Anon when it's added in preload libraries - let opt_libs = spec.cluster.settings.find("shared_preload_libraries"); - - let libs = match opt_libs { - Some(libs) => libs, - None => return Ok(Box::new(empty())), - }; - - if !libs.contains("anon") { - return Ok(Box::new(empty())); - } - - let db_owner = db.owner.pg_quote(); - - let operations = vec![ - // Create anon extension if this compute needs it - // Users cannot create it themselves, because superuser is required. - Operation { - query: String::from("CREATE EXTENSION IF NOT EXISTS anon CASCADE"), - comment: Some(String::from("creating anon extension")), - }, - // Initialize anon extension - // This also requires superuser privileges, so users cannot do it themselves. 
- Operation { - query: String::from("SELECT anon.init()"), - comment: Some(String::from("initializing anon extension data")), - }, - Operation { - query: format!("GRANT ALL ON SCHEMA anon TO {}", db_owner), - comment: Some(String::from( - "granting anon extension schema permissions", - )), - }, - Operation { - query: format!( - "GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", - db_owner - ), - comment: Some(String::from( - "granting anon extension schema functions permissions", - )), - }, - // We need this, because some functions are defined as SECURITY DEFINER. - // In Postgres SECURITY DEFINER functions are executed with the privileges - // of the owner. - // In anon extension this it is needed to access some GUCs, which are only accessible to - // superuser. But we've patched postgres to allow db_owner to access them as well. - // So we need to change owner of these functions to db_owner. - Operation { - query: format!( - include_str!("sql/anon_ext_fn_reassign.sql"), - db_owner = db_owner, - ), - comment: Some(String::from( - "change anon extension functions owner to database_owner", - )), - }, - Operation { - query: format!( - "GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", - db_owner, - ), - comment: Some(String::from( - "granting anon extension tables permissions", - )), - }, - Operation { - query: format!( - "GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", - db_owner, - ), - comment: Some(String::from( - "granting anon extension sequences permissions", - )), - }, - ] - .into_iter(); - Ok(Box::new(operations)) } } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 868a14edeb..11615b73a1 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -179,9 +179,6 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. 
ActivityMonitorExperimental, - /// Pre-install and initialize anon extension for every database in the cluster - AnonExtension, - /// Allow to configure rsyslog for Postgres logs export PostgresLogsExport, diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index ccd015ad19..37de24be5b 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -208,7 +208,6 @@ ], "remote_extensions": { "library_index": { - "anon": "anon", "postgis-3": "postgis", "libpgrouting-3.4": "postgis", "postgis_raster-3": "postgis", @@ -217,12 +216,6 @@ "address_standardizer-3": "postgis" }, "extension_data": { - "anon": { - "archive_path": "5834329303/v15/extensions/anon.tar.zst", - "control_data": { - "anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n" - } - }, "postgis": { "archive_path": "5834329303/v15/extensions/postgis.tar.zst", "control_data": { @@ -238,7 +231,6 @@ } }, "custom_extensions": [ - "anon" ], "public_extensions": [ "postgis" From 24f41bee5c08e5433bf297a7b3cb3d7f1d807d9a Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 17 Mar 2025 21:06:42 +0200 Subject: [PATCH 61/71] Update LFC in case of unlogged build (#11262) ## Problem Unlogged build is used for GIST/SPGIST/GIN/HNSW indexes. In this mode we first change relation class to `RELPERSISTENCE_UNLOGGED` and save them on local disk. But we do not save unlogged relations in LFC. It may cause fetching incorrect value from LFC if relfilenode is reused. ## Summary of changes Save modified pages in LFC on second stage of unlogged build (when modified pages are WAL-logged). There is no need to save pages in LFC at first phase because they will be overwritten in any case with assigned LSN at second phase.
Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index ae92be4577..78e42191a4 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -2778,6 +2778,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); return; default: @@ -2866,6 +2869,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + { + for (int i = 0; i < nblocks; i++) + { + lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); + } + } return; default: @@ -3714,6 +3725,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) + lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -3777,6 +3791,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + /* Update LFC in case of unlogged index build */ + if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) 
+ lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); From bb64beffbb62e47ab5fbc61599c8dcca6b58cd1f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 17 Mar 2025 15:42:02 -0400 Subject: [PATCH 62/71] fix(pageserver): log compaction errors with timeline ids (#11231) ## Problem Makes it easier to debug. ## Summary of changes Log compaction errors with timeline ids. Signed-off-by: Alex Chi Z --- pageserver/src/tenant/tasks.rs | 31 +++++++++++++------- pageserver/src/tenant/timeline.rs | 30 +++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 15 ++++++++++ 3 files changed, 54 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 589ac5ae88..034e5f8c91 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -268,7 +268,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { error_run += 1; let backoff = exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS); - log_compaction_error(&err, error_run, backoff, cancel.is_cancelled()); + log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled()); continue; } } @@ -281,10 +281,9 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } -fn log_compaction_error( +pub(crate) fn log_compaction_error( err: &CompactionError, - error_count: u32, - sleep_duration: Duration, + retry_info: Option<(u32, Duration)>, task_cancelled: bool, ) { use CompactionError::*; @@ -318,14 +317,26 @@ fn log_compaction_error( } }; - match level { - Level::ERROR => { - error!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + if let Some((error_count, sleep_duration)) = retry_info { + match level { + Level::ERROR => { + error!( + "Compaction failed {error_count} times, retrying in 
{sleep_duration:?}: {err:#}" + ) + } + Level::INFO => { + info!( + "Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}" + ) + } + level => unimplemented!("unexpected level {level:?}"), } - Level::INFO => { - info!("Compaction failed {error_count} times, retrying in {sleep_duration:?}: {err:#}") + } else { + match level { + Level::ERROR => error!("Compaction failed: {err:#}"), + Level::INFO => info!("Compaction failed: {err:#}"), + level => unimplemented!("unexpected level {level:?}"), } - level => unimplemented!("unexpected level {level:?}"), } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index face2dfdc1..3007d1e58a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -89,6 +89,7 @@ use super::remote_timeline_client::index::{GcCompactionState, IndexPart}; use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use super::secondary::heatmap::HeatMapLayer; use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer}; +use super::tasks::log_compaction_error; use super::upload_queue::NotInitialized; use super::{ AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded, @@ -1856,18 +1857,23 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result { - self.compact_with_options( - cancel, - CompactOptions { - flags, - compact_key_range: None, - compact_lsn_range: None, - sub_compaction: false, - sub_compaction_max_job_size_mb: None, - }, - ctx, - ) - .await + let res = self + .compact_with_options( + cancel, + CompactOptions { + flags, + compact_key_range: None, + compact_lsn_range: None, + sub_compaction: false, + sub_compaction_max_job_size_mb: None, + }, + ctx, + ) + .await; + if let Err(err) = &res { + log_compaction_error(err, None, cancel.is_cancelled()); + } + res } /// Outermost timeline compaction operation; downloads needed layers. 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 300daec9bf..711501caa9 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -56,6 +56,7 @@ use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; +use crate::tenant::tasks::log_compaction_error; use crate::tenant::timeline::{ DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, drop_rlock, @@ -440,6 +441,20 @@ impl GcCompactionQueue { ctx: &RequestContext, gc_block: &GcBlock, timeline: &Arc, + ) -> Result { + let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await; + if let Err(err) = &res { + log_compaction_error(err, None, cancel.is_cancelled()); + } + res + } + + async fn iteration_inner( + &self, + cancel: &CancellationToken, + ctx: &RequestContext, + gc_block: &GcBlock, + timeline: &Arc, ) -> Result { let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else { return Err(CompactionError::AlreadyRunning( From 05ca27c981b766ce66a47a801d4a990340247c9c Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 17 Mar 2025 17:27:27 -0400 Subject: [PATCH 63/71] fix(pagectl/benches): scope context with debug tools (#11285) ## Problem https://github.com/neondatabase/neon/commit/7c462b3417ecd3ae3907f3480f3b8a8c99fc6d7b requires all contexts have scopes. pagectl/benches don't have such scopes. close https://github.com/neondatabase/neon/issues/11280 ## Summary of changes Adding scopes for the tools. 
Signed-off-by: Alex Chi Z --- pageserver/benches/bench_ingest.rs | 3 ++- pageserver/ctl/src/layer_map_analyzer.rs | 3 ++- pageserver/ctl/src/layers.rs | 6 ++++-- pageserver/ctl/src/main.rs | 3 ++- pageserver/src/context.rs | 16 ++++++++++++++++ 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index b1103948d6..272c3e2338 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -57,7 +57,8 @@ async fn ingest( tokio::fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id)).await?; - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); let gate = utils::sync::gate::Gate::default(); diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index b426f977cf..c49c8b58df 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -131,7 +131,8 @@ async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> R pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let storage_path = &cmd.path; let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
pageserver::virtual_file::init( diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 05fb35ff09..293c01eff0 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -76,7 +76,8 @@ async fn read_image_file(path: impl AsRef, ctx: &RequestContext) -> Result } pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); match cmd { LayerCmd::List { path } => { for tenant in fs::read_dir(path.join(TENANTS_SEGMENT_NAME))? { @@ -176,7 +177,8 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { ); pageserver::page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error) + .with_scope_debug_tools(); macro_rules! rewrite_closure { ($($summary_ty:tt)*) => {{ diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 72a120a69b..1d81b839a8 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -208,7 +208,8 @@ async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { virtual_file::SyncMode::Sync, ); page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + let ctx = + RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error).with_scope_debug_tools(); dump_layerfile_from_path(path, true, &ctx).await } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index e2a84d0c24..d2caf030df 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -134,6 +134,9 @@ pub(crate) enum Scope { UnitTest { io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, }, + DebugTools { + io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics, + }, } static GLOBAL_IO_SIZE_METRICS: Lazy = @@ -195,6 +198,12 @@ 
impl Scope { io_size_metrics: &GLOBAL_IO_SIZE_METRICS, } } + + pub(crate) fn new_debug_tools() -> Self { + Scope::DebugTools { + io_size_metrics: &GLOBAL_IO_SIZE_METRICS, + } + } } /// The kind of access to the page cache. @@ -435,6 +444,12 @@ impl RequestContext { .build() } + pub fn with_scope_debug_tools(&self) -> Self { + RequestContextBuilder::new(TaskKind::DebugTool) + .scope(Scope::new_debug_tools()) + .build() + } + pub fn task_kind(&self) -> TaskKind { self.task_kind } @@ -486,6 +501,7 @@ impl RequestContext { Scope::SecondaryTenant { io_size_metrics } => io_size_metrics, #[cfg(test)] Scope::UnitTest { io_size_metrics } => io_size_metrics, + Scope::DebugTools { io_size_metrics } => io_size_metrics, } } } From 0d3d639ef3e9742c2d6e41850fd163e910807a85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 18 Mar 2025 04:37:45 +0100 Subject: [PATCH 64/71] storcon: remove timeouts for safekeeper heartbeating (#11232) PRs #10891 and #10902 have time-bounded the safekeeper heartbeating of the storage controller. Those timeouts were not meant to be permanent, but temporary until we figured out the reasons for the safekeeper heartbeating causing problems. Now they are better understood and resolved. A comment is [here](https://github.com/neondatabase/cloud/issues/24396#issuecomment-2679342929), but most importantly, we've had: * #10954 to send heartbeats concurrently (before the issue was we sent them sequentially, so the total time was number of nodes times time for timeout to be hit, now the total time is the maximum of all things we are heartbeating) * work to actually make heartbeats work and not error, i.e.
JWT rollout for storcon, not sending heartbeats to decommissioned safekeepers, removal of decommissioned safekeepers from the databases Part of https://github.com/neondatabase/cloud/issues/25473 --- storage_controller/src/service.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 38bf959056..019a889d25 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -864,11 +864,9 @@ impl Service { }; tracing::info!("Sending initial heartbeats..."); - // Put a small, but reasonable timeout to get the initial heartbeats of the safekeepers to avoid a storage controller downtime - const SK_TIMEOUT: Duration = Duration::from_secs(5); let (res_ps, res_sk) = tokio::join!( self.heartbeater_ps.heartbeat(Arc::new(nodes_to_heartbeat)), - tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(all_sks)) + self.heartbeater_sk.heartbeat(all_sks) ); let mut online_nodes = HashMap::new(); @@ -887,7 +885,7 @@ impl Service { } let mut online_sks = HashMap::new(); - if let Ok(Ok(deltas)) = res_sk { + if let Ok(deltas) = res_sk { for (node_id, status) in deltas.0 { match status { SafekeeperState::Available { @@ -1123,10 +1121,9 @@ impl Service { locked.safekeepers.clone() }; - const SK_TIMEOUT: Duration = Duration::from_secs(3); let (res_ps, res_sk) = tokio::join!( self.heartbeater_ps.heartbeat(nodes), - tokio::time::timeout(SK_TIMEOUT, self.heartbeater_sk.heartbeat(safekeepers)) + self.heartbeater_sk.heartbeat(safekeepers) ); if let Ok(deltas) = res_ps { @@ -1230,7 +1227,7 @@ impl Service { } } } - if let Ok(Ok(deltas)) = res_sk { + if let Ok(deltas) = res_sk { let mut locked = self.inner.write().unwrap(); let mut safekeepers = (*locked.safekeepers).clone(); for (id, state) in deltas.0 { From 57d51e949db6bc0512ce66884c8ced307ec5094f Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com> Date: Tue, 18 Mar 2025
11:10:11 +0400 Subject: [PATCH 65/71] tests: suppress excessive pageserver errors in test_timeline_ancestor_detach_errors (#11277) ## Problem The test is flaky because of the same reasons as described in https://github.com/neondatabase/neon/issues/11177. The test has already suppressed these `WARN` and `ERROR` log messages, but the regexp didn't match all possible errors. ## Summary of changes - Change regexp to suppress all possible allowed error log messages. --- test_runner/regress/test_timeline_detach_ancestor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 685a32af90..96664f2b8d 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -812,11 +812,13 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard for ps in pageservers.values(): ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + # We make /detach_ancestor requests that are intended to fail. 
+ # It's expected that storcon drops requests to other pageservers after + # it gets the first error (https://github.com/neondatabase/neon/issues/11177) ps.allowed_errors.extend( [ ".* WARN .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: request was dropped before completing", - # rare error logging, which is hard to reproduce without instrumenting responding with random sleep - '.* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error: Conflict\\("no ancestors"\\)', + ".* ERROR .* path=/v1/tenant/.*/timeline/.*/detach_ancestor request_id=.*: Cancelled request finished with an error.*", ] ) From 2cf6ae76fc40a88f2aa1f43d46d5f73e6f3ec6ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 18 Mar 2025 10:00:53 +0100 Subject: [PATCH 66/71] storcon: move safekeeper related stuff out of service.rs (#11288) There is no functional change here. We move safekeeper related code from `service.rs` to `service/safekeeper_service.rs`, so that safekeeper related stuff is contained in a single file. This also helps with preventing `service.rs` from growing even further. Part of #9011. 
--- storage_controller/src/service.rs | 520 +----------------- .../src/service/safekeeper_service.rs | 518 +++++++++++++++++ 2 files changed, 528 insertions(+), 510 deletions(-) create mode 100644 storage_controller/src/service/safekeeper_service.rs diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 019a889d25..61a6c12f47 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,6 +1,7 @@ pub mod chaos_injector; mod context_iterator; pub(crate) mod safekeeper_reconciler; +mod safekeeper_service; use std::borrow::Cow; use std::cmp::Ordering; @@ -27,16 +28,15 @@ use itertools::Itertools; use pageserver_api::controller_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, - TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, - TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, - TenantShardMigrateResponse, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, + TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, + TenantShardMigrateRequest, TenantShardMigrateResponse, }; use pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, SafekeeperInfo, SafekeepersInfo, SecondaryProgress, ShardParameters, - TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, + PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, + TenantConfigPatchRequest, TenantConfigRequest, 
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, @@ -51,18 +51,15 @@ use pageserver_api::upcall_api::{ }; use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::{Certificate, StatusCode}; -use safekeeper_api::membership::{MemberSet, SafekeeperId}; use safekeeper_api::models::SafekeeperUtilization; -use safekeeper_reconciler::{SafekeeperReconcilers, ScheduleRequest}; +use safekeeper_reconciler::SafekeeperReconcilers; use tokio::sync::TryAcquireError; use tokio::sync::mpsc::error::TrySendError; -use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, info_span, instrument, warn}; use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; -use utils::logging::SecretString; use utils::sync::gate::Gate; use utils::{failpoint_support, pausable_failpoint}; @@ -83,8 +80,8 @@ use crate::peer_client::GlobalObservedState; use crate::persistence::split_state::SplitState; use crate::persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseError, DatabaseResult, - MetadataHealthPersistence, Persistence, SafekeeperTimelineOpKind, ShardGenerationState, - TenantFilter, TenantShardPersistence, TimelinePendingOpPersistence, TimelinePersistence, + MetadataHealthPersistence, Persistence, ShardGenerationState, TenantFilter, + TenantShardPersistence, }; use crate::reconciler::{ ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder, ReconcilerPriority, @@ -3646,281 +3643,6 @@ impl Service { .await? } - /// Timeline creation on safekeepers - /// - /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, - /// where `left` contains the list of safekeepers that didn't have a successful response. 
- /// Assumes tenant lock is held while calling this function. - async fn tenant_timeline_create_safekeepers_quorum( - &self, - tenant_id: TenantId, - timeline_id: TimelineId, - pg_version: u32, - timeline_persistence: &TimelinePersistence, - ) -> Result, ApiError> { - // If quorum is reached, return if we are outside of a specified timeout - let jwt = self - .config - .safekeeper_jwt_token - .clone() - .map(SecretString::from); - let mut joinset = JoinSet::new(); - - let safekeepers = { - let locked = self.inner.read().unwrap(); - locked.safekeepers.clone() - }; - - let mut members = Vec::new(); - for sk_id in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk_id as u64); - let Some(safekeeper) = safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find entry for safekeeper with id {sk_id}" - )))?; - }; - members.push(SafekeeperId { - id: sk_id, - host: safekeeper.skp.host.clone(), - pg_port: safekeeper.skp.port as u16, - }); - } - let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; - let mconf = safekeeper_api::membership::Configuration::new(mset); - - let req = safekeeper_api::models::TimelineCreateRequest { - commit_lsn: None, - mconf, - pg_version, - start_lsn: timeline_persistence.start_lsn.0, - system_id: None, - tenant_id, - timeline_id, - wal_seg_size: None, - }; - const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); - for sk in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk as u64); - let safekeepers = safekeepers.clone(); - let jwt = jwt.clone(); - let ssl_ca_cert = self.config.ssl_ca_cert.clone(); - let req = req.clone(); - joinset.spawn(async move { - // Unwrap is fine as we already would have returned error above - let sk_p = safekeepers.get(&sk_id).unwrap(); - let res = sk_p - .with_client_retries( - |client| { - let req = req.clone(); - async move { client.create_timeline(&req).await } - }, - &jwt, - &ssl_ca_cert, - 3, - 3, 
- SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, - &CancellationToken::new(), - ) - .await; - (sk_id, sk_p.skp.host.clone(), res) - }); - } - // After we have built the joinset, we now wait for the tasks to complete, - // but with a specified timeout to make sure we return swiftly, either with - // a failure or success. - let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; - - // Wait until all tasks finish or timeout is hit, whichever occurs - // first. - let mut reconcile_results = Vec::new(); - loop { - if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await - { - let Some(res) = res else { break }; - match res { - Ok(res) => { - tracing::info!( - "response from safekeeper id:{} at {}: {:?}", - res.0, - res.1, - res.2 - ); - reconcile_results.push(res); - } - Err(join_err) => { - tracing::info!("join_err for task in joinset: {join_err}"); - } - } - } else { - tracing::info!( - "timeout for creation call after {} responses", - reconcile_results.len() - ); - break; - } - } - - // Now check now if quorum was reached in reconcile_results. - let total_result_count = reconcile_results.len(); - let remaining = reconcile_results - .into_iter() - .filter_map(|res| res.2.is_err().then_some(res.0)) - .collect::>(); - tracing::info!( - "Got {} non-successful responses from initial creation request of total {total_result_count} responses", - remaining.len() - ); - if remaining.len() >= 2 { - // Failure - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "not enough successful reconciliations to reach quorum, please retry: {} errored", - remaining.len() - ))); - } - - Ok(remaining) - } - - /// Create timeline in controller database and on safekeepers. - /// `timeline_info` is result of timeline creation on pageserver. - /// - /// All actions must be idempotent as the call is retried until success. 
It - /// tries to create timeline in the db and on at least majority of - /// safekeepers + queue creation for safekeepers which missed it in the db - /// for infinite retries; after that, call returns Ok. - /// - /// The idea is that once this is reached as long as we have alive majority - /// of safekeepers it is expected to get eventually operational as storcon - /// will be able to seed timeline on nodes which missed creation by making - /// pull_timeline from peers. On the other hand we don't want to fail - /// timeline creation if one safekeeper is down. - async fn tenant_timeline_create_safekeepers( - self: &Arc, - tenant_id: TenantId, - timeline_info: &TimelineInfo, - create_mode: models::TimelineCreateRequestMode, - ) -> Result { - let timeline_id = timeline_info.timeline_id; - let pg_version = timeline_info.pg_version * 10000; - // Initially start_lsn is determined by last_record_lsn in pageserver - // response as it does initdb. However, later we persist it and in sk - // creation calls replace with the value from the timeline row if it - // previously existed as on retries in theory endpoint might have - // already written some data and advanced last_record_lsn, while we want - // safekeepers to have consistent start_lsn. - let start_lsn = match create_mode { - models::TimelineCreateRequestMode::Bootstrap { .. } => timeline_info.last_record_lsn, - models::TimelineCreateRequestMode::Branch { .. } => timeline_info.last_record_lsn, - models::TimelineCreateRequestMode::ImportPgdata { .. 
} => { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "import pgdata doesn't specify the start lsn, aborting creation on safekeepers" - )))?; - } - }; - // Choose initial set of safekeepers respecting affinity - let sks = self.safekeepers_for_new_timeline().await?; - let sks_persistence = sks.iter().map(|sk| sk.id.0 as i64).collect::>(); - // Add timeline to db - let mut timeline_persist = TimelinePersistence { - tenant_id: tenant_id.to_string(), - timeline_id: timeline_id.to_string(), - start_lsn: start_lsn.into(), - generation: 0, - sk_set: sks_persistence.clone(), - new_sk_set: None, - cplane_notified_generation: 0, - deleted_at: None, - }; - let inserted = self - .persistence - .insert_timeline(timeline_persist.clone()) - .await?; - if !inserted { - if let Some(existent_persist) = self - .persistence - .get_timeline(tenant_id, timeline_id) - .await? - { - // Replace with what we have in the db, to get stuff like the generation right. - // We do still repeat the http calls to the safekeepers. After all, we could have - // crashed right after the wrote to the DB. 
- timeline_persist = existent_persist; - } else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "insertion said timeline already in db, but looking it up, it was gone" - ))); - } - } - // Create the timeline on a quorum of safekeepers - let remaining = self - .tenant_timeline_create_safekeepers_quorum( - tenant_id, - timeline_id, - pg_version, - &timeline_persist, - ) - .await?; - - // For the remaining safekeepers, take care of their reconciliation asynchronously - for &remaining_id in remaining.iter() { - let pending_op = TimelinePendingOpPersistence { - tenant_id: tenant_id.to_string(), - timeline_id: timeline_id.to_string(), - generation: timeline_persist.generation, - op_kind: crate::persistence::SafekeeperTimelineOpKind::Pull, - sk_id: remaining_id.0 as i64, - }; - tracing::info!("writing pending op for sk id {remaining_id}"); - self.persistence.insert_pending_op(pending_op).await?; - } - if !remaining.is_empty() { - let mut locked = self.inner.write().unwrap(); - for remaining_id in remaining { - let Some(sk) = locked.safekeepers.get(&remaining_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {remaining_id}" - ))); - }; - let Ok(host_list) = sks - .iter() - .map(|sk| { - Ok(( - sk.id, - locked - .safekeepers - .get(&sk.id) - .ok_or_else(|| { - ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {remaining_id} to pull from" - )) - })? 
- .base_url(), - )) - }) - .collect::>() - else { - continue; - }; - let req = ScheduleRequest { - safekeeper: Box::new(sk.clone()), - host_list, - tenant_id, - timeline_id, - generation: timeline_persist.generation as u32, - kind: crate::persistence::SafekeeperTimelineOpKind::Pull, - }; - locked.safekeeper_reconcilers.schedule_request(self, req); - } - } - - Ok(SafekeepersInfo { - generation: timeline_persist.generation as u32, - safekeepers: sks, - tenant_id, - timeline_id, - }) - } - pub(crate) async fn tenant_timeline_create( self: &Arc, tenant_id: TenantId, @@ -4614,62 +4336,6 @@ impl Service { status_code } - /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. - async fn tenant_timeline_delete_safekeepers( - self: &Arc, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> Result<(), ApiError> { - let tl = self - .persistence - .get_timeline(tenant_id, timeline_id) - .await?; - let Some(tl) = tl else { - tracing::info!( - "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table, no deletions on safekeepers needed" - ); - return Ok(()); - }; - let all_sks = tl - .new_sk_set - .iter() - .flat_map(|sks| { - sks.iter() - .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) - }) - .chain( - tl.sk_set - .iter() - .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), - ) - .collect::>(); - - // Schedule reconciliations - { - let mut locked = self.inner.write().unwrap(); - for (sk_id, kind) in all_sks { - let sk_id = NodeId(sk_id as u64); - let Some(sk) = locked.safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Couldn't find safekeeper with id {sk_id}" - ))); - }; - - let req = ScheduleRequest { - safekeeper: Box::new(sk.clone()), - // we don't use this for this kind, put a dummy value - host_list: Vec::new(), - tenant_id, - timeline_id, - generation: tl.generation as u32, - kind, - }; - locked.safekeeper_reconcilers.schedule_request(self, req); - } - } - 
Ok(()) - } - /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0. pub(crate) async fn tenant_shard0_node( &self, @@ -8716,172 +8382,6 @@ impl Service { global_observed } - /// Choose safekeepers for the new timeline: 3 in different azs. - pub(crate) async fn safekeepers_for_new_timeline( - &self, - ) -> Result, ApiError> { - // Number of safekeepers in different AZs we are looking for - let wanted_count = 3; - let mut all_safekeepers = { - let locked = self.inner.read().unwrap(); - locked - .safekeepers - .iter() - .filter_map(|sk| { - if sk.1.scheduling_policy() != SkSchedulingPolicy::Active { - // If we don't want to schedule stuff onto the safekeeper, respect that. - return None; - } - let utilization_opt = if let SafekeeperState::Available { - last_seen_at: _, - utilization, - } = sk.1.availability() - { - Some(utilization) - } else { - // non-available safekeepers still get a chance for new timelines, - // but put them last in the list. 
- None - }; - let info = SafekeeperInfo { - hostname: sk.1.skp.host.clone(), - id: NodeId(sk.1.skp.id as u64), - }; - Some((utilization_opt, info, sk.1.skp.availability_zone_id.clone())) - }) - .collect::>() - }; - all_safekeepers.sort_by_key(|sk| { - ( - sk.0.as_ref() - .map(|ut| ut.timeline_count) - .unwrap_or(u64::MAX), - // Use the id to decide on equal scores for reliability - sk.1.id.0, - ) - }); - let mut sks = Vec::new(); - let mut azs = HashSet::new(); - for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { - if !azs.insert(az_id) { - continue; - } - sks.push(sk_info.clone()); - if sks.len() == wanted_count { - break; - } - } - if sks.len() == wanted_count { - Ok(sks) - } else { - Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find {wanted_count} safekeepers in different AZs for new timeline (found: {}, total active: {})", - sks.len(), - all_safekeepers.len(), - ))) - } - } - - pub(crate) async fn safekeepers_list( - &self, - ) -> Result, DatabaseError> { - let locked = self.inner.read().unwrap(); - let mut list = locked - .safekeepers - .iter() - .map(|sk| sk.1.describe_response()) - .collect::, _>>()?; - list.sort_by_key(|v| v.id); - Ok(list) - } - - pub(crate) async fn get_safekeeper( - &self, - id: i64, - ) -> Result { - let locked = self.inner.read().unwrap(); - let sk = locked - .safekeepers - .get(&NodeId(id as u64)) - .ok_or(diesel::result::Error::NotFound)?; - sk.describe_response() - } - - pub(crate) async fn upsert_safekeeper( - &self, - record: crate::persistence::SafekeeperUpsert, - ) -> Result<(), ApiError> { - let node_id = NodeId(record.id as u64); - let use_https = self.config.use_https_safekeeper_api; - - if use_https && record.https_port.is_none() { - return Err(ApiError::PreconditionFailed( - format!( - "cannot upsert safekeeper {node_id}: \ - https is enabled, but https port is not specified" - ) - .into(), - )); - } - - self.persistence.safekeeper_upsert(record.clone()).await?; - { - let mut locked = 
self.inner.write().unwrap(); - let mut safekeepers = (*locked.safekeepers).clone(); - match safekeepers.entry(node_id) { - std::collections::hash_map::Entry::Occupied(mut entry) => entry - .get_mut() - .update_from_record(record) - .expect("all preconditions should be checked before upsert to database"), - std::collections::hash_map::Entry::Vacant(entry) => { - entry.insert( - Safekeeper::from_persistence( - crate::persistence::SafekeeperPersistence::from_upsert( - record, - SkSchedulingPolicy::Pause, - ), - CancellationToken::new(), - use_https, - ) - .expect("all preconditions should be checked before upsert to database"), - ); - } - } - locked.safekeepers = Arc::new(safekeepers); - } - Ok(()) - } - - pub(crate) async fn set_safekeeper_scheduling_policy( - &self, - id: i64, - scheduling_policy: SkSchedulingPolicy, - ) -> Result<(), DatabaseError> { - self.persistence - .set_safekeeper_scheduling_policy(id, scheduling_policy) - .await?; - let node_id = NodeId(id as u64); - // After the change has been persisted successfully, update the in-memory state - { - let mut locked = self.inner.write().unwrap(); - let mut safekeepers = (*locked.safekeepers).clone(); - let sk = safekeepers - .get_mut(&node_id) - .ok_or(DatabaseError::Logical("Not found".to_string()))?; - sk.set_scheduling_policy(scheduling_policy); - - match scheduling_policy { - SkSchedulingPolicy::Active => (), - SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { - locked.safekeeper_reconcilers.cancel_safekeeper(node_id); - } - } - - locked.safekeepers = Arc::new(safekeepers); - } - Ok(()) - } - pub(crate) async fn update_shards_preferred_azs( &self, req: ShardsPreferredAzsRequest, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs new file mode 100644 index 0000000000..b5fb00a469 --- /dev/null +++ b/storage_controller/src/service/safekeeper_service.rs @@ -0,0 +1,518 @@ +use std::collections::{HashMap, HashSet}; +use 
std::sync::Arc; +use std::time::Duration; + +use super::safekeeper_reconciler::ScheduleRequest; +use crate::heartbeater::SafekeeperState; +use crate::persistence::{ + DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, +}; +use crate::safekeeper::Safekeeper; +use http_utils::error::ApiError; +use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; +use safekeeper_api::membership::{MemberSet, SafekeeperId}; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::logging::SecretString; + +use super::Service; + +impl Service { + /// Timeline creation on safekeepers + /// + /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, + /// where `left` contains the list of safekeepers that didn't have a successful response. + /// Assumes tenant lock is held while calling this function. 
+ pub(super) async fn tenant_timeline_create_safekeepers_quorum( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: u32, + timeline_persistence: &TimelinePersistence, + ) -> Result, ApiError> { + // If quorum is reached, return if we are outside of a specified timeout + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + let mut members = Vec::new(); + for sk_id in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk_id as u64); + let Some(safekeeper) = safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find entry for safekeeper with id {sk_id}" + )))?; + }; + members.push(SafekeeperId { + id: sk_id, + host: safekeeper.skp.host.clone(), + pg_port: safekeeper.skp.port as u16, + }); + } + let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mconf = safekeeper_api::membership::Configuration::new(mset); + + let req = safekeeper_api::models::TimelineCreateRequest { + commit_lsn: None, + mconf, + pg_version, + start_lsn: timeline_persistence.start_lsn.0, + system_id: None, + tenant_id, + timeline_id, + wal_seg_size: None, + }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + for sk in timeline_persistence.sk_set.iter() { + let sk_id = NodeId(*sk as u64); + let safekeepers = safekeepers.clone(); + let jwt = jwt.clone(); + let ssl_ca_cert = self.config.ssl_ca_cert.clone(); + let req = req.clone(); + joinset.spawn(async move { + // Unwrap is fine as we already would have returned error above + let sk_p = safekeepers.get(&sk_id).unwrap(); + let res = sk_p + .with_client_retries( + |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + &jwt, + &ssl_ca_cert, + 3, + 3, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + 
&CancellationToken::new(), + ) + .await; + (sk_id, sk_p.skp.host.clone(), res) + }); + } + // After we have built the joinset, we now wait for the tasks to complete, + // but with a specified timeout to make sure we return swiftly, either with + // a failure or success. + let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + + // Wait until all tasks finish or timeout is hit, whichever occurs + // first. + let mut reconcile_results = Vec::new(); + loop { + if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await + { + let Some(res) = res else { break }; + match res { + Ok(res) => { + tracing::info!( + "response from safekeeper id:{} at {}: {:?}", + res.0, + res.1, + res.2 + ); + reconcile_results.push(res); + } + Err(join_err) => { + tracing::info!("join_err for task in joinset: {join_err}"); + } + } + } else { + tracing::info!( + "timeout for creation call after {} responses", + reconcile_results.len() + ); + break; + } + } + + // Now check now if quorum was reached in reconcile_results. + let total_result_count = reconcile_results.len(); + let remaining = reconcile_results + .into_iter() + .filter_map(|res| res.2.is_err().then_some(res.0)) + .collect::>(); + tracing::info!( + "Got {} non-successful responses from initial creation request of total {total_result_count} responses", + remaining.len() + ); + if remaining.len() >= 2 { + // Failure + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "not enough successful reconciliations to reach quorum, please retry: {} errored", + remaining.len() + ))); + } + + Ok(remaining) + } + + /// Create timeline in controller database and on safekeepers. + /// `timeline_info` is result of timeline creation on pageserver. + /// + /// All actions must be idempotent as the call is retried until success. 
It + /// tries to create timeline in the db and on at least majority of + /// safekeepers + queue creation for safekeepers which missed it in the db + /// for infinite retries; after that, call returns Ok. + /// + /// The idea is that once this is reached as long as we have alive majority + /// of safekeepers it is expected to get eventually operational as storcon + /// will be able to seed timeline on nodes which missed creation by making + /// pull_timeline from peers. On the other hand we don't want to fail + /// timeline creation if one safekeeper is down. + pub(super) async fn tenant_timeline_create_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_info: &TimelineInfo, + create_mode: models::TimelineCreateRequestMode, + ) -> Result { + let timeline_id = timeline_info.timeline_id; + let pg_version = timeline_info.pg_version * 10000; + // Initially start_lsn is determined by last_record_lsn in pageserver + // response as it does initdb. However, later we persist it and in sk + // creation calls replace with the value from the timeline row if it + // previously existed as on retries in theory endpoint might have + // already written some data and advanced last_record_lsn, while we want + // safekeepers to have consistent start_lsn. + let start_lsn = match create_mode { + models::TimelineCreateRequestMode::Bootstrap { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::Branch { .. } => timeline_info.last_record_lsn, + models::TimelineCreateRequestMode::ImportPgdata { .. 
} => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "import pgdata doesn't specify the start lsn, aborting creation on safekeepers" + )))?; + } + }; + // Choose initial set of safekeepers respecting affinity + let sks = self.safekeepers_for_new_timeline().await?; + let sks_persistence = sks.iter().map(|sk| sk.id.0 as i64).collect::>(); + // Add timeline to db + let mut timeline_persist = TimelinePersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + start_lsn: start_lsn.into(), + generation: 0, + sk_set: sks_persistence.clone(), + new_sk_set: None, + cplane_notified_generation: 0, + deleted_at: None, + }; + let inserted = self + .persistence + .insert_timeline(timeline_persist.clone()) + .await?; + if !inserted { + if let Some(existent_persist) = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await? + { + // Replace with what we have in the db, to get stuff like the generation right. + // We do still repeat the http calls to the safekeepers. After all, we could have + // crashed right after the wrote to the DB. 
+ timeline_persist = existent_persist; + } else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "insertion said timeline already in db, but looking it up, it was gone" + ))); + } + } + // Create the timeline on a quorum of safekeepers + let remaining = self + .tenant_timeline_create_safekeepers_quorum( + tenant_id, + timeline_id, + pg_version, + &timeline_persist, + ) + .await?; + + // For the remaining safekeepers, take care of their reconciliation asynchronously + for &remaining_id in remaining.iter() { + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: timeline_persist.generation, + op_kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + sk_id: remaining_id.0 as i64, + }; + tracing::info!("writing pending op for sk id {remaining_id}"); + self.persistence.insert_pending_op(pending_op).await?; + } + if !remaining.is_empty() { + let mut locked = self.inner.write().unwrap(); + for remaining_id in remaining { + let Some(sk) = locked.safekeepers.get(&remaining_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id}" + ))); + }; + let Ok(host_list) = sks + .iter() + .map(|sk| { + Ok(( + sk.id, + locked + .safekeepers + .get(&sk.id) + .ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {remaining_id} to pull from" + )) + })? 
+ .base_url(), + )) + }) + .collect::>() + else { + continue; + }; + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + host_list, + tenant_id, + timeline_id, + generation: timeline_persist.generation as u32, + kind: crate::persistence::SafekeeperTimelineOpKind::Pull, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + + Ok(SafekeepersInfo { + generation: timeline_persist.generation as u32, + safekeepers: sks, + tenant_id, + timeline_id, + }) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. + pub(super) async fn tenant_timeline_delete_safekeepers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Result<(), ApiError> { + let tl = self + .persistence + .get_timeline(tenant_id, timeline_id) + .await?; + let Some(tl) = tl else { + tracing::info!( + "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table, no deletions on safekeepers needed" + ); + return Ok(()); + }; + let all_sks = tl + .new_sk_set + .iter() + .flat_map(|sks| { + sks.iter() + .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude)) + }) + .chain( + tl.sk_set + .iter() + .map(|v| (*v, SafekeeperTimelineOpKind::Delete)), + ) + .collect::>(); + + // Schedule reconciliations + { + let mut locked = self.inner.write().unwrap(); + for (sk_id, kind) in all_sks { + let sk_id = NodeId(sk_id as u64); + let Some(sk) = locked.safekeepers.get(&sk_id) else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Couldn't find safekeeper with id {sk_id}" + ))); + }; + + let req = ScheduleRequest { + safekeeper: Box::new(sk.clone()), + // we don't use this for this kind, put a dummy value + host_list: Vec::new(), + tenant_id, + timeline_id, + generation: tl.generation as u32, + kind, + }; + locked.safekeeper_reconcilers.schedule_request(self, req); + } + } + Ok(()) + } + + /// Choose safekeepers for the new timeline: 3 in different azs. 
+ pub(crate) async fn safekeepers_for_new_timeline( + &self, + ) -> Result, ApiError> { + // Number of safekeepers in different AZs we are looking for + let wanted_count = 3; + let mut all_safekeepers = { + let locked = self.inner.read().unwrap(); + locked + .safekeepers + .iter() + .filter_map(|sk| { + if sk.1.scheduling_policy() != SkSchedulingPolicy::Active { + // If we don't want to schedule stuff onto the safekeeper, respect that. + return None; + } + let utilization_opt = if let SafekeeperState::Available { + last_seen_at: _, + utilization, + } = sk.1.availability() + { + Some(utilization) + } else { + // non-available safekeepers still get a chance for new timelines, + // but put them last in the list. + None + }; + let info = SafekeeperInfo { + hostname: sk.1.skp.host.clone(), + id: NodeId(sk.1.skp.id as u64), + }; + Some((utilization_opt, info, sk.1.skp.availability_zone_id.clone())) + }) + .collect::>() + }; + all_safekeepers.sort_by_key(|sk| { + ( + sk.0.as_ref() + .map(|ut| ut.timeline_count) + .unwrap_or(u64::MAX), + // Use the id to decide on equal scores for reliability + sk.1.id.0, + ) + }); + let mut sks = Vec::new(); + let mut azs = HashSet::new(); + for (_sk_util, sk_info, az_id) in all_safekeepers.iter() { + if !azs.insert(az_id) { + continue; + } + sks.push(sk_info.clone()); + if sks.len() == wanted_count { + break; + } + } + if sks.len() == wanted_count { + Ok(sks) + } else { + Err(ApiError::InternalServerError(anyhow::anyhow!( + "couldn't find {wanted_count} safekeepers in different AZs for new timeline (found: {}, total active: {})", + sks.len(), + all_safekeepers.len(), + ))) + } + } + + pub(crate) async fn safekeepers_list( + &self, + ) -> Result, DatabaseError> { + let locked = self.inner.read().unwrap(); + let mut list = locked + .safekeepers + .iter() + .map(|sk| sk.1.describe_response()) + .collect::, _>>()?; + list.sort_by_key(|v| v.id); + Ok(list) + } + + pub(crate) async fn get_safekeeper( + &self, + id: i64, + ) -> Result { + let 
locked = self.inner.read().unwrap(); + let sk = locked + .safekeepers + .get(&NodeId(id as u64)) + .ok_or(diesel::result::Error::NotFound)?; + sk.describe_response() + } + + pub(crate) async fn upsert_safekeeper( + &self, + record: crate::persistence::SafekeeperUpsert, + ) -> Result<(), ApiError> { + let node_id = NodeId(record.id as u64); + let use_https = self.config.use_https_safekeeper_api; + + if use_https && record.https_port.is_none() { + return Err(ApiError::PreconditionFailed( + format!( + "cannot upsert safekeeper {node_id}: \ + https is enabled, but https port is not specified" + ) + .into(), + )); + } + + self.persistence.safekeeper_upsert(record.clone()).await?; + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + match safekeepers.entry(node_id) { + std::collections::hash_map::Entry::Occupied(mut entry) => entry + .get_mut() + .update_from_record(record) + .expect("all preconditions should be checked before upsert to database"), + std::collections::hash_map::Entry::Vacant(entry) => { + entry.insert( + Safekeeper::from_persistence( + crate::persistence::SafekeeperPersistence::from_upsert( + record, + SkSchedulingPolicy::Pause, + ), + CancellationToken::new(), + use_https, + ) + .expect("all preconditions should be checked before upsert to database"), + ); + } + } + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) + } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id: i64, + scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + self.persistence + .set_safekeeper_scheduling_policy(id, scheduling_policy) + .await?; + let node_id = NodeId(id as u64); + // After the change has been persisted successfully, update the in-memory state + { + let mut locked = self.inner.write().unwrap(); + let mut safekeepers = (*locked.safekeepers).clone(); + let sk = safekeepers + .get_mut(&node_id) + .ok_or(DatabaseError::Logical("Not found".to_string()))?; + 
sk.set_scheduling_policy(scheduling_policy); + + match scheduling_policy { + SkSchedulingPolicy::Active => (), + SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => { + locked.safekeeper_reconcilers.cancel_safekeeper(node_id); + } + } + + locked.safekeepers = Arc::new(safekeepers); + } + Ok(()) + } +} From 2dfff6a2a3287083a8057a401fe39ec7898367ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 18 Mar 2025 12:30:49 +0100 Subject: [PATCH 67/71] impr(ci): use ghcr.io as the default container registry (#11210) ## Problem Docker Hub has new rate limits coming up, and to avoid problems coming with those we're switching to GHCR. ## Summary of changes - Push images to GHCR initially and distribute them from there - Use images from GHCR in docker-compose --- .github/scripts/generate_image_maps.py | 4 +- .github/workflows/_build-and-test-locally.yml | 8 +- .github/workflows/_check-codestyle-python.yml | 8 +- .github/workflows/_check-codestyle-rust.yml | 7 +- .../workflows/_push-to-container-registry.yml | 2 +- .github/workflows/build-build-tools-image.yml | 39 ++++- .github/workflows/build_and_test.yml | 144 ++++++++++++------ .github/workflows/cargo-deny.yml | 9 +- .github/workflows/pin-build-tools-image.yml | 8 +- .github/workflows/pre-merge-checks.yml | 11 ++ Dockerfile | 2 +- compute/compute-node.Dockerfile | 2 +- docker-compose/compute_wrapper/Dockerfile | 2 +- docker-compose/docker-compose.yml | 14 +- docker-compose/run-tests.sh | 2 +- 15 files changed, 178 insertions(+), 84 deletions(-) diff --git a/.github/scripts/generate_image_maps.py b/.github/scripts/generate_image_maps.py index f67e07024c..d8f910271b 100644 --- a/.github/scripts/generate_image_maps.py +++ b/.github/scripts/generate_image_maps.py @@ -49,10 +49,10 @@ target_stages = ( for component_name, component_images in components.items(): for stage in target_stages: outputs[f"{component_name}-{stage}"] = { - f"docker.io/neondatabase/{component_image}:{source_tag}": [ + 
f"ghcr.io/neondatabase/{component_image}:{source_tag}": [ f"{registry}/{component_image}:{tag}" for registry, tag in itertools.product(registries[stage], target_tags) - if not (registry == "docker.io/neondatabase" and tag == source_tag) + if not (registry == "ghcr.io/neondatabase" and tag == source_tag) ] for component_image in component_images } diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 6a2070424a..db1ea464e6 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -46,8 +46,8 @@ jobs: container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # Raise locked memory limit for tokio-epoll-uring. # On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12), # io_uring will account the memory of the CQ and SQ as locked. 
@@ -322,8 +322,8 @@ jobs: container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: diff --git a/.github/workflows/_check-codestyle-python.yml b/.github/workflows/_check-codestyle-python.yml index 9ae28a1379..868ac15f3c 100644 --- a/.github/workflows/_check-codestyle-python.yml +++ b/.github/workflows/_check-codestyle-python.yml @@ -15,11 +15,15 @@ defaults: jobs: check-codestyle-python: runs-on: [ self-hosted, small ] + + permissions: + packages: read + container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index c4c76914aa..6d517abe72 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -26,11 +26,14 @@ jobs: arch: ${{ fromJson(inputs.archs) }} runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + permissions: + packages: read + container: image: ${{ inputs.build-tools-image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/_push-to-container-registry.yml b/.github/workflows/_push-to-container-registry.yml index 2dab665f40..949eeca4b1 100644 --- a/.github/workflows/_push-to-container-registry.yml +++ 
b/.github/workflows/_push-to-container-registry.yml @@ -89,7 +89,7 @@ jobs: uses: docker/login-action@v3 with: registry: ghcr.io - username: ${{ github.repository_owner }} + username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Log in to Docker Hub diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 0a7f0cd7a0..4eae242395 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,7 +19,7 @@ on: value: ${{ jobs.check-image.outputs.tag }} image: description: "build-tools image" - value: neondatabase/build-tools:${{ jobs.check-image.outputs.tag }} + value: ghcr.io/neondatabase/build-tools:${{ jobs.check-image.outputs.tag }} defaults: run: @@ -49,9 +49,18 @@ jobs: everything: ${{ steps.set-more-variables.outputs.everything }} found: ${{ steps.set-more-variables.outputs.found }} + permissions: + packages: read + steps: - uses: actions/checkout@v4 + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Set variables id: set-variables env: @@ -75,7 +84,7 @@ jobs: contains(fromJson(steps.set-variables.outputs.debians), 'bullseye') && contains(fromJson(steps.set-variables.outputs.debians), 'bookworm') }} run: | - if docker manifest inspect neondatabase/build-tools:${IMAGE_TAG}; then + if docker manifest inspect ghcr.io/neondatabase/build-tools:${IMAGE_TAG}; then found=true else found=false @@ -93,6 +102,9 @@ jobs: arch: ${{ fromJson(needs.check-image.outputs.archs) }} debian: ${{ fromJson(needs.check-image.outputs.debians) }} + permissions: + packages: write + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: @@ -108,6 +120,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + 
registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -126,18 +144,27 @@ jobs: cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian, matrix.arch) || '' }} tags: | - neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }} + ghcr.io/neondatabase/build-tools:${{ needs.check-image.outputs.tag }}-${{ matrix.debian }}-${{ matrix.arch }} merge-images: needs: [ check-image, build-image ] runs-on: ubuntu-22.04 + permissions: + packages: write + steps: - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Create multi-arch image env: DEFAULT_DEBIAN_VERSION: bookworm @@ -147,14 +174,14 @@ jobs: IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | for debian in ${DEBIANS}; do - tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian}") + tags=("-t" "ghcr.io/neondatabase/build-tools:${IMAGE_TAG}-${debian}") if [ "${EVERYTHING}" == "true" ] && [ "${debian}" == "${DEFAULT_DEBIAN_VERSION}" ]; then - tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + tags+=("-t" "ghcr.io/neondatabase/build-tools:${IMAGE_TAG}") fi for arch in ${ARCHS}; do - tags+=("neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}") + tags+=("ghcr.io/neondatabase/build-tools:${IMAGE_TAG}-${debian}-${arch}") done docker buildx imagetools create "${tags[@]}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0980561345..1762cd9644 100644 --- a/.github/workflows/build_and_test.yml +++ 
b/.github/workflows/build_and_test.yml @@ -89,8 +89,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -209,8 +209,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: - name: Checkout @@ -314,8 +314,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -367,8 +367,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init strategy: fail-fast: false @@ -494,6 +494,9 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + permissions: + packages: write + steps: - uses: actions/checkout@v4 with: @@ -509,6 +512,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -533,7 +542,7 @@ jobs: cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} cache-to: ${{ github.ref_name 
== 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} + ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, meta ] @@ -543,19 +552,21 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read + packages: write steps: - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch image run: | - docker buildx imagetools create -t neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ - -t neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ - neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ - neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }} \ + -t ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm \ + ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-x64 \ + ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }}-bookworm-arm64 compute-node-image-arch: needs: [ check-permissions, build-build-tools-image, meta ] @@ -564,6 +575,7 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read + packages: write strategy: fail-fast: false matrix: @@ -604,6 +616,12 @@ jobs: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - uses: docker/login-action@v3 with: registry: cache.neon.build @@ -627,7 +645,7 @@ 
jobs: cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} + ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg >= 'v16' @@ -647,7 +665,7 @@ jobs: target: extension-tests cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} + ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.meta.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, meta ] @@ -656,6 +674,7 @@ jobs: id-token: write # aws-actions/configure-aws-credentials statuses: write contents: read + packages: write runs-on: ubuntu-22.04 strategy: @@ -674,28 +693,32 @@ jobs: steps: - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ - 
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image if: matrix.version.pg >= 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -t ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }} \ + ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + ghcr.io/neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 vm-compute-node-image-arch: needs: [ 
check-permissions, meta, compute-node-image ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + permissions: + contents: read + packages: write strategy: fail-fast: false matrix: @@ -723,31 +746,34 @@ jobs: - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # Note: we need a separate pull step here because otherwise vm-builder will try to pull, and # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} + docker pull ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ - -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ + -src=ghcr.io/neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + -dst=ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \ -target-arch=linux/${{ matrix.arch }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} + docker push ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ 
matrix.arch }} vm-compute-node-image: needs: [ vm-compute-node-image-arch, meta ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} + permissions: + packages: write runs-on: ubuntu-22.04 strategy: matrix: @@ -760,14 +786,15 @@ jobs: steps: - uses: docker/login-action@v3 with: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ - neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ - neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 + docker buildx imagetools create -t ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \ + ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \ + ghcr.io/neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64 test-images: @@ -785,18 +812,28 @@ jobs: arch: [ x64, arm64 ] pg_version: [v16, v17] + permissions: + packages: read + runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - uses: actions/checkout@v4 - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - # `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. 
+ - uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # `ghcr.io/neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library. # Pick pageserver as currently the only binary with extra "version" features printed in the string to verify. # Regular pageserver version string looks like # Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: [] @@ -807,7 +844,7 @@ jobs: shell: bash # ensure no set -e for better error messages if: ${{ contains(fromJSON('["push-main", "pr", "storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind) }} run: | - pageserver_version=$(docker run --rm neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") + pageserver_version=$(docker run --rm ghcr.io/neondatabase/neon:${{ needs.meta.outputs.build-tag }} "/bin/sh" "-c" "/usr/local/bin/pageserver --version") echo "Pageserver version string: $pageserver_version" @@ -978,18 +1015,21 @@ jobs: acr-registry-name: ${{ vars.AZURE_PROD_REGISTRY_NAME }} secrets: inherit - push-neon-test-extensions-image-ghcr: + push-neon-test-extensions-image-dockerhub: if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + packages: write + id-token: write with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ - "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" ], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ 
- "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" ] } secrets: inherit @@ -998,14 +1038,17 @@ jobs: if: ${{ needs.meta.outputs.run-kind == 'push-main' }} needs: [ meta, compute-node-image ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + packages: write + id-token: write with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}": [ "docker.io/neondatabase/neon-test-extensions-v16:latest", "ghcr.io/neondatabase/neon-test-extensions-v16:latest" ], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}": [ "docker.io/neondatabase/neon-test-extensions-v17:latest", "ghcr.io/neondatabase/neon-test-extensions-v17:latest" ] @@ -1016,14 +1059,17 @@ jobs: if: ${{ needs.meta.outputs.run-kind == 'compute-release' }} needs: [ meta ] uses: ./.github/workflows/_push-to-container-registry.yml + permissions: + packages: write + id-token: write with: image-map: | { - "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.release-pr-run-id }}": [ "docker.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}", "ghcr.io/neondatabase/neon-test-extensions-v16:${{ needs.meta.outputs.build-tag }}" ], - "docker.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [ + "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.release-pr-run-id }}": [ "docker.io/neondatabase/neon-test-extensions-v17:${{ 
needs.meta.outputs.build-tag }}", "ghcr.io/neondatabase/neon-test-extensions-v17:${{ needs.meta.outputs.build-tag }}" ] diff --git a/.github/workflows/cargo-deny.yml b/.github/workflows/cargo-deny.yml index 222f7e9787..a4f476c99a 100644 --- a/.github/workflows/cargo-deny.yml +++ b/.github/workflows/cargo-deny.yml @@ -24,11 +24,14 @@ jobs: runs-on: [self-hosted, small] + permissions: + packages: read + container: - image: ${{ inputs.build-tools-image || 'neondatabase/build-tools:pinned' }} + image: ${{ inputs.build-tools-image || 'ghcr.io/neondatabase/build-tools:pinned' }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index d2588ba0bf..ddeefe0128 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -46,8 +46,8 @@ jobs: FROM_TAG: ${{ inputs.from-tag }} TO_TAG: pinned run: | - docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" - docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" + docker manifest inspect "ghcr.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json" + docker manifest inspect "ghcr.io/neondatabase/build-tools:${TO_TAG}" > "${TO_TAG}.json" if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then skip=true @@ -71,13 +71,13 @@ jobs: with: image-map: | { - "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ + "ghcr.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [ "docker.io/neondatabase/build-tools:pinned-bullseye", "ghcr.io/neondatabase/build-tools:pinned-bullseye", "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye", "${{ vars.AZURE_DEV_REGISTRY_NAME 
}}.azurecr.io/neondatabase/build-tools:pinned-bullseye" ], - "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ + "ghcr.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [ "docker.io/neondatabase/build-tools:pinned-bookworm", "docker.io/neondatabase/build-tools:pinned", "ghcr.io/neondatabase/build-tools:pinned-bookworm", diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 3bd81f6538..85b131bb11 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -19,6 +19,8 @@ permissions: {} jobs: meta: runs-on: ubuntu-22.04 + permissions: + contents: read outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} rust-changed: ${{ steps.rust-src.outputs.any_changed }} @@ -72,6 +74,9 @@ jobs: || needs.meta.outputs.python-changed == 'true' || needs.meta.outputs.rust-changed == 'true' needs: [ meta ] + permissions: + contents: read + packages: write uses: ./.github/workflows/build-build-tools-image.yml with: # Build only one combination to save time @@ -82,6 +87,9 @@ jobs: check-codestyle-python: if: needs.meta.outputs.python-changed == 'true' needs: [ meta, build-build-tools-image ] + permissions: + contents: read + packages: read uses: ./.github/workflows/_check-codestyle-python.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` @@ -91,6 +99,9 @@ jobs: check-codestyle-rust: if: needs.meta.outputs.rust-changed == 'true' needs: [ meta, build-build-tools-image ] + permissions: + contents: read + packages: read uses: ./.github/workflows/_check-codestyle-rust.yml with: # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` diff --git a/Dockerfile b/Dockerfile index 83ad86badb..01540e1925 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ ### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters. 
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used ### inside this image in the real deployments. -ARG REPOSITORY=neondatabase +ARG REPOSITORY=ghcr.io/neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index d5483018b4..bdc73ab174 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -77,7 +77,7 @@ # build_and_test.yml github workflow for how that's done. ARG PG_VERSION -ARG REPOSITORY=neondatabase +ARG REPOSITORY=ghcr.io/neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index b5f0f47ceb..9ef831a9cd 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -1,4 +1,4 @@ -ARG REPOSITORY=neondatabase +ARG REPOSITORY=ghcr.io/neondatabase ARG COMPUTE_IMAGE=compute-node-v14 ARG TAG=latest diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 95d4ff7b2a..493a0a5523 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -29,7 +29,7 @@ services: pageserver: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - AWS_ACCESS_KEY_ID=minio - AWS_SECRET_ACCESS_KEY=password @@ -45,7 +45,7 @@ services: safekeeper1: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 - SAFEKEEPER_ID=1 @@ -75,7 +75,7 @@ services: safekeeper2: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 - 
SAFEKEEPER_ID=2 @@ -105,7 +105,7 @@ services: safekeeper3: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} environment: - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 - SAFEKEEPER_ID=3 @@ -135,7 +135,7 @@ services: storage_broker: restart: always - image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon:${TAG:-latest} ports: - 50051:50051 command: @@ -147,7 +147,7 @@ services: build: context: ./compute_wrapper/ args: - - REPOSITORY=${REPOSITORY:-neondatabase} + - REPOSITORY=${REPOSITORY:-ghcr.io/neondatabase} - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${COMPUTE_TAG:-${TAG:-latest}} - http_proxy=${http_proxy:-} @@ -186,7 +186,7 @@ services: neon-test-extensions: profiles: ["test-extensions"] - image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} + image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}} environment: - PGPASSWORD=cloud_admin entrypoint: diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh index 72ae61b032..3117950cc0 100644 --- a/docker-compose/run-tests.sh +++ b/docker-compose/run-tests.sh @@ -20,4 +20,4 @@ for d in ${LIST}; do done [ -z "${FAILED}" ] && exit 0 echo "${FAILED}" -exit 1 \ No newline at end of file +exit 1 From fd41ab9bb61dc22a1193c050df581689b75b241f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 18 Mar 2025 13:05:08 +0000 Subject: [PATCH 68/71] chore: remove x509-parser (#11247) Both crates seem well maintained. x509-cert is part of the high quality RustCrypto project that we already make heavy use of, and I think it makes sense to reduce the dependencies where possible. 
--- Cargo.lock | 91 +--------------------------------- Cargo.toml | 2 +- compute_tools/Cargo.toml | 2 +- compute_tools/src/tls.rs | 5 +- proxy/Cargo.toml | 6 +-- proxy/src/tls/mod.rs | 28 +++++------ proxy/src/tls/server_config.rs | 11 ++-- workspace_hack/Cargo.toml | 5 +- 8 files changed, 31 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 39ce785a4e..a8e400524e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -167,45 +167,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" -[[package]] -name = "asn1-rs" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" -dependencies = [ - "asn1-rs-derive", - "asn1-rs-impl", - "displaydoc", - "nom", - "num-traits", - "rusticata-macros", - "thiserror 1.0.69", - "time", -] - -[[package]] -name = "asn1-rs-derive" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.100", - "synstructure", -] - -[[package]] -name = "asn1-rs-impl" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.100", -] - [[package]] name = "assert-json-diff" version = "2.0.2" @@ -1813,20 +1774,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "der-parser" -version = "9.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" -dependencies = [ - "asn1-rs", - "displaydoc", - "nom", - "num-bigint", - "num-traits", - "rusticata-macros", -] - [[package]] name = "der_derive" version = "0.7.3" @@ 
-4044,15 +3991,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "oid-registry" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" -dependencies = [ - "asn1-rs", -] - [[package]] name = "once_cell" version = "1.20.2" @@ -5227,7 +5165,7 @@ dependencies = [ "uuid", "walkdir", "workspace_hack", - "x509-parser", + "x509-cert", "zerocopy", ] @@ -5848,15 +5786,6 @@ dependencies = [ "semver", ] -[[package]] -name = "rusticata-macros" -version = "4.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" -dependencies = [ - "nom", -] - [[package]] name = "rustix" version = "0.38.41" @@ -8440,7 +8369,6 @@ dependencies = [ "der 0.7.8", "deranged", "digest", - "displaydoc", "ecdsa 0.16.9", "either", "elliptic-curve 0.13.8", @@ -8568,23 +8496,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "x509-parser" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" -dependencies = [ - "asn1-rs", - "data-encoding", - "der-parser", - "lazy_static", - "nom", - "oid-registry", - "rusticata-macros", - "thiserror 1.0.69", - "time", -] - [[package]] name = "xattr" version = "1.0.0" diff --git a/Cargo.toml b/Cargo.toml index f2a94d2371..9bbc5a1a38 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -215,10 +215,10 @@ urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" rustls-native-certs = "0.8" -x509-parser = "0.16" whoami = "1.5.1" zerocopy = { version = "0.7", features = ["derive"] } json-structural-diff = { version = "0.2.0" } +x509-cert = { version = "0.2.5" } ## TODO replace this with tracing env_logger = "0.11" diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 90951e7ddb..d80ec41d34 100644 --- 
a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -61,7 +61,7 @@ thiserror.workspace = true url.workspace = true uuid.workspace = true walkdir.workspace = true -x509-cert = { version = "0.2.5" } +x509-cert.workspace = true postgres_initdb.workspace = true compute_api.workspace = true diff --git a/compute_tools/src/tls.rs b/compute_tools/src/tls.rs index 5a310d8ac4..8f465c7300 100644 --- a/compute_tools/src/tls.rs +++ b/compute_tools/src/tls.rs @@ -3,7 +3,6 @@ use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration}; use anyhow::{Context, Result, bail}; use compute_api::responses::TlsConfig; use ring::digest; -use spki::ObjectIdentifier; use spki::der::{Decode, PemReader}; use x509_cert::Certificate; @@ -91,13 +90,13 @@ fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Resul } fn verify_key_cert(key: &str, cert: &str) -> Result<()> { - const ECDSA_WITH_SHA256: ObjectIdentifier = ObjectIdentifier::new_unwrap("1.2.840.10045.4.3.2"); + use x509_cert::der::oid::db::rfc5912::ECDSA_WITH_SHA_256; let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?) 
.context("decode cert")?; match cert.signature_algorithm.oid { - ECDSA_WITH_SHA256 => { + ECDSA_WITH_SHA_256 => { let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?; let a = key.public_key().to_sec1_bytes(); diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index b6e3f03a81..2cec510d82 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -70,8 +70,9 @@ reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true rustc-hash.workspace = true -rustls-pemfile.workspace = true rustls.workspace = true +rustls-native-certs.workspace = true +rustls-pemfile.workspace = true scopeguard.workspace = true serde.workspace = true serde_json.workspace = true @@ -99,8 +100,7 @@ url.workspace = true urlencoding.workspace = true utils.workspace = true uuid.workspace = true -rustls-native-certs.workspace = true -x509-parser.workspace = true +x509-cert.workspace = true redis.workspace = true zerocopy.workspace = true diff --git a/proxy/src/tls/mod.rs b/proxy/src/tls/mod.rs index d6ce6bd9fc..7fe71abf48 100644 --- a/proxy/src/tls/mod.rs +++ b/proxy/src/tls/mod.rs @@ -6,7 +6,7 @@ use anyhow::Context; use rustls::pki_types::CertificateDer; use sha2::{Digest, Sha256}; use tracing::{error, info}; -use x509_parser::oid_registry; +use x509_cert::der::{Reader, SliceReader, oid}; /// pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; @@ -41,27 +41,27 @@ pub enum TlsServerEndPoint { impl TlsServerEndPoint { pub fn new(cert: &CertificateDer<'_>) -> anyhow::Result { - let sha256_oids = [ + const SHA256_OIDS: &[oid::ObjectIdentifier] = &[ // I'm explicitly not adding MD5 or SHA1 here... They're bad. - oid_registry::OID_SIG_ECDSA_WITH_SHA256, - oid_registry::OID_PKCS1_SHA256WITHRSA, + oid::db::rfc5912::ECDSA_WITH_SHA_256, + oid::db::rfc5912::SHA_256_WITH_RSA_ENCRYPTION, ]; - let pem = x509_parser::parse_x509_certificate(cert) - .context("Failed to parse PEM object from cerficiate")? 
- .1; + let certificate = SliceReader::new(cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; - info!(subject = %pem.subject, "parsing TLS certificate"); + let subject = certificate.tbs_certificate.subject; + info!(%subject, "parsing TLS certificate"); - let reg = oid_registry::OidRegistry::default().with_all_crypto(); - let oid = pem.signature_algorithm.oid(); - let alg = reg.get(oid); - if sha256_oids.contains(oid) { + let oid = certificate.signature_algorithm.oid; + if SHA256_OIDS.contains(&oid) { let tls_server_end_point: [u8; 32] = Sha256::new().chain_update(cert).finalize().into(); - info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); + info!(%subject, tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding"); Ok(Self::Sha256(tls_server_end_point)) } else { - error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding"); + error!(%subject, "unknown channel binding"); Ok(Self::Undefined) } } diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 4cbd0474c2..eab9940e7d 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -5,6 +5,7 @@ use anyhow::{Context, bail}; use itertools::Itertools; use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use x509_cert::der::{Reader, SliceReader}; use super::{PG_ALPN_PROTOCOL, TlsServerEndPoint}; @@ -131,11 +132,13 @@ impl CertResolver { let first_cert = &cert_chain[0]; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let pem = x509_parser::parse_x509_certificate(first_cert) - .context("Failed to parse PEM object from cerficiate")? 
- .1; - let common_name = pem.subject().to_string(); + let certificate = SliceReader::new(first_cert) + .context("Failed to parse cerficiate")? + .decode::() + .context("Failed to parse cerficiate")?; + + let common_name = certificate.tbs_certificate.subject.to_string(); // We need to get the canonical name for this certificate so we can match them against any domain names // seen within the proxy codebase. diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 6a726f0585..e9eaf4b35e 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -61,7 +61,7 @@ memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } num = { version = "0.4" } -num-bigint = { version = "0.4" } +num-bigint = { version = "0.4", default-features = false, features = ["std"] } num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } @@ -115,7 +115,6 @@ anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } -displaydoc = { version = "0.2" } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } @@ -128,7 +127,7 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num = { version = "0.4" } -num-bigint = { version = "0.4" } +num-bigint = { version = "0.4", default-features = false, features = ["std"] } num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } num-iter = { version = "0.1", 
default-features = false, features = ["i128", "std"] } From eb6efda98bad146e4754eda343ea0bed6648d932 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 18 Mar 2025 14:49:34 +0100 Subject: [PATCH 69/71] impr(ci): move some kinds of tests to PR runs only (#11272) ## Problem The pipelines after release merges are slower than they need to be at the moment. This is because some kinds of tests/checks run on all kinds of pipelines, even though they only matter in some of those. ## Summary of changes Run `check-codestyle-{rust,python,jsonnet}`, `build-and-test-locally` and `trigger-e2e-tests` only on regular PRs, not release PR or pushes to main or release branches. --- .github/workflows/_build-and-test-locally.yml | 4 +- .github/workflows/_check-codestyle-rust.yml | 4 +- .github/workflows/_meta.yml | 2 +- .github/workflows/benchmarking.yml | 12 ++--- .github/workflows/build-build-tools-image.yml | 18 +++---- .github/workflows/build-macos.yml | 2 +- .github/workflows/build_and_test.yml | 52 ++++++++++++------- .github/workflows/neon_extra_builds.yml | 4 +- .github/workflows/pre-merge-checks.yml | 2 +- 9 files changed, 56 insertions(+), 44 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index db1ea464e6..b950187fe1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -39,7 +39,7 @@ env: jobs: build-neon: - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} permissions: id-token: write # aws-actions/configure-aws-credentials contents: read @@ -318,7 +318,7 @@ jobs: contents: read statuses: write needs: [ build-neon ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ 
fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }} container: image: ${{ inputs.build-tools-image }} credentials: diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml index 6d517abe72..83eeb83e45 100644 --- a/.github/workflows/_check-codestyle-rust.yml +++ b/.github/workflows/_check-codestyle-rust.yml @@ -23,8 +23,8 @@ jobs: check-codestyle-rust: strategy: matrix: - arch: ${{ fromJson(inputs.archs) }} - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + arch: ${{ fromJSON(inputs.archs) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} permissions: packages: read diff --git a/.github/workflows/_meta.yml b/.github/workflows/_meta.yml index bb2f9fa5d9..44802f0525 100644 --- a/.github/workflows/_meta.yml +++ b/.github/workflows/_meta.yml @@ -120,7 +120,7 @@ jobs: - name: Get the release PR run ID id: release-pr-run-id - if: ${{ contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} + if: ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ff7db02e42..e080b06cac 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -441,7 +441,7 @@ jobs: strategy: fail-fast: false - matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}} + matrix: ${{fromJSON(needs.generate-matrices.outputs.pgbench-compare-matrix)}} env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" @@ -483,7 +483,7 @@ jobs: aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - name: Create Neon Project - if: 
contains(fromJson('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) + if: contains(fromJSON('["neonvm-captest-new", "neonvm-captest-new-many-tables", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -523,7 +523,7 @@ jobs: # without (neonvm-captest-new) # and with (neonvm-captest-new-many-tables) many relations in the database - name: Create many relations before the run - if: contains(fromJson('["neonvm-captest-new-many-tables"]'), matrix.platform) + if: contains(fromJSON('["neonvm-captest-new-many-tables"]'), matrix.platform) uses: ./.github/actions/run-python-test-set with: build_type: ${{ env.BUILD_TYPE }} @@ -753,7 +753,7 @@ jobs: strategy: fail-fast: false - matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} + matrix: ${{ fromJSON(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -880,7 +880,7 @@ jobs: strategy: fail-fast: false - matrix: ${{ fromJson(needs.generate-matrices.outputs.tpch-compare-matrix) }} + matrix: ${{ fromJSON(needs.generate-matrices.outputs.tpch-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -999,7 +999,7 @@ jobs: strategy: fail-fast: false - matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} + matrix: ${{ fromJSON(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 4eae242395..f7c91e7412 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -79,10 +79,10 @@ jobs: env: IMAGE_TAG: ${{ steps.set-variables.outputs.image-tag }} EVERYTHING: | - ${{ 
contains(fromJson(steps.set-variables.outputs.archs), 'x64') && - contains(fromJson(steps.set-variables.outputs.archs), 'arm64') && - contains(fromJson(steps.set-variables.outputs.debians), 'bullseye') && - contains(fromJson(steps.set-variables.outputs.debians), 'bookworm') }} + ${{ contains(fromJSON(steps.set-variables.outputs.archs), 'x64') && + contains(fromJSON(steps.set-variables.outputs.archs), 'arm64') && + contains(fromJSON(steps.set-variables.outputs.debians), 'bullseye') && + contains(fromJSON(steps.set-variables.outputs.debians), 'bookworm') }} run: | if docker manifest inspect ghcr.io/neondatabase/build-tools:${IMAGE_TAG}; then found=true @@ -99,13 +99,13 @@ jobs: strategy: matrix: - arch: ${{ fromJson(needs.check-image.outputs.archs) }} - debian: ${{ fromJson(needs.check-image.outputs.debians) }} + arch: ${{ fromJSON(needs.check-image.outputs.archs) }} + debian: ${{ fromJSON(needs.check-image.outputs.debians) }} permissions: packages: write - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - uses: actions/checkout@v4 @@ -168,8 +168,8 @@ jobs: - name: Create multi-arch image env: DEFAULT_DEBIAN_VERSION: bookworm - ARCHS: ${{ join(fromJson(needs.check-image.outputs.archs), ' ') }} - DEBIANS: ${{ join(fromJson(needs.check-image.outputs.debians), ' ') }} + ARCHS: ${{ join(fromJSON(needs.check-image.outputs.archs), ' ') }} + DEBIANS: ${{ join(fromJSON(needs.check-image.outputs.debians), ' ') }} EVERYTHING: ${{ needs.check-image.outputs.everything }} IMAGE_TAG: ${{ needs.check-image.outputs.tag }} run: | diff --git a/.github/workflows/build-macos.yml b/.github/workflows/build-macos.yml index 347a511e98..b24a872152 100644 --- a/.github/workflows/build-macos.yml +++ b/.github/workflows/build-macos.yml @@ -40,7 +40,7 @@ jobs: runs-on: macos-15 strategy: matrix: - postgres-version: 
${{ inputs.rebuild_everything && fromJson('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} + postgres-version: ${{ inputs.rebuild_everything && fromJSON('["v14", "v15", "v16", "v17"]') || fromJSON(inputs.pg_versions) }} env: # Use release build only, to have less debug info around # Hence keeping target/ (and general cache size) smaller diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1762cd9644..6e7c1ce0a5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -77,14 +77,17 @@ jobs: secrets: inherit check-codestyle-python: - needs: [ check-permissions, build-build-tools-image ] + needs: [ meta, check-permissions, build-build-tools-image ] + # No need to run on `main` because we this in the merge queue + if: ${{ needs.meta.outputs.run-kind == 'pr' }} uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit check-codestyle-jsonnet: - needs: [ check-permissions, build-build-tools-image ] + needs: [ meta, check-permissions, build-build-tools-image ] + if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} runs-on: [ self-hosted, small ] container: image: ${{ needs.build-build-tools-image.outputs.image }} @@ -156,7 +159,9 @@ jobs: pass_if_unchanged: true check-codestyle-rust: - needs: [ check-permissions, build-build-tools-image ] + needs: [ meta, check-permissions, build-build-tools-image ] + # No need to run on `main` because we this in the merge queue + if: ${{ needs.meta.outputs.run-kind == 'pr' }} uses: ./.github/workflows/_check-codestyle-rust.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -164,8 +169,8 @@ jobs: secrets: inherit check-dependencies-rust: - needs: [ files-changed, build-build-tools-image ] - if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' }} + needs: [ 
meta, files-changed, build-build-tools-image ] + if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }} uses: ./.github/workflows/cargo-deny.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm @@ -173,12 +178,13 @@ jobs: build-and-test-locally: needs: [ meta, build-build-tools-image ] + if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }} strategy: fail-fast: false matrix: arch: [ x64, arm64 ] # Do not build or run tests in debug for release branches - build-type: ${{ fromJson((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} + build-type: ${{ fromJSON((startsWith(github.ref_name, 'release') && github.event_name == 'push') && '["release"]' || '["debug", "release"]') }} include: - build-type: release arch: arm64 @@ -470,14 +476,20 @@ jobs: }) trigger-e2e-tests: - # Depends on jobs that can get skipped + # !failure() && !cancelled() because it depends on jobs that can get skipped if: >- ${{ ( - !github.event.pull_request.draft - || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') - || needs.meta.outputs.run-kind == 'push-main' - ) && !failure() && !cancelled() + ( + needs.meta.outputs.run-kind == 'pr' + && ( + !github.event.pull_request.draft + || contains(github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') + ) + ) + || contains(fromJSON('["push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) + ) + && !failure() && !cancelled() }} needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, meta ] uses: ./.github/workflows/trigger-e2e-tests.yml @@ -492,7 +504,7 @@ jobs: matrix: arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 
'arm64' && 'large-arm64' || 'large')) }} permissions: packages: write @@ -594,7 +606,7 @@ jobs: debian: bookworm arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} steps: - uses: actions/checkout@v4 @@ -715,7 +727,7 @@ jobs: vm-compute-node-image-arch: needs: [ check-permissions, meta, compute-node-image ] if: ${{ contains(fromJSON('["push-main", "pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }} - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} permissions: contents: read packages: write @@ -815,7 +827,7 @@ jobs: permissions: packages: read - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} + runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} steps: - uses: actions/checkout@v4 @@ -929,7 +941,7 @@ jobs: env: SOURCE_TAG: >- ${{ - contains(fromJson('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) + contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), needs.meta.outputs.run-kind) && needs.meta.outputs.release-pr-run-id || needs.meta.outputs.build-tag }} @@ -1434,10 +1446,10 @@ jobs: if: | contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true') - || needs.build-and-test-locally.result == 'skipped' - || needs.check-codestyle-python.result == 'skipped' - || needs.check-codestyle-rust.result == 'skipped' + || (needs.check-dependencies-rust.result == 'skipped' && 
needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr') + || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') + || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') + || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr') || needs.files-changed.result == 'skipped' || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind)) || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 90318747b3..74e9a5a9ab 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -71,8 +71,8 @@ jobs: uses: ./.github/workflows/build-macos.yml with: pg_versions: ${{ needs.files-changed.outputs.postgres_changes }} - rebuild_rust_code: ${{ fromJson(needs.files-changed.outputs.rebuild_rust_code) }} - rebuild_everything: ${{ fromJson(needs.files-changed.outputs.rebuild_everything) }} + rebuild_rust_code: ${{ fromJSON(needs.files-changed.outputs.rebuild_rust_code) }} + rebuild_everything: ${{ fromJSON(needs.files-changed.outputs.rebuild_everything) }} gather-rust-build-stats: needs: [ check-permissions, build-build-tools-image, files-changed ] diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 85b131bb11..bbe4638235 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -159,7 +159,7 @@ jobs: ${{ always() && github.event_name == 'merge_group' - && contains(fromJson('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch) + && 
contains(fromJSON('["release", "release-proxy", "release-compute"]'), needs.meta.outputs.branch) }} env: GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }} From 86fe26c6765837eb067f4785ed0e983b924c6871 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 18 Mar 2025 17:14:08 +0200 Subject: [PATCH 70/71] fix(proxy): Fix testodrome HTTP header handling in proxy (#11292) Relates to #22486 --- proxy/src/serverless/mod.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 00164d631a..9c11f32083 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -437,9 +437,11 @@ async fn request_handler( let testodrome_id = request .headers() .get("X-Neon-Query-ID") - .map(|value| value.to_str().unwrap_or_default().to_string()); + .and_then(|value| value.to_str().ok()) + .map(|s| s.to_string()); if let Some(query_id) = testodrome_id { + info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); ctx.set_testodrome_id(query_id); } @@ -481,6 +483,17 @@ async fn request_handler( ); let span = ctx.span(); + let testodrome_id = request + .headers() + .get("X-Neon-Query-ID") + .and_then(|value| value.to_str().ok()) + .map(|s| s.to_string()); + + if let Some(query_id) = testodrome_id { + info!(parent: &ctx.span(), "testodrome query ID: {query_id}"); + ctx.set_testodrome_id(query_id); + } + sql_over_http::handle(config, ctx, request, backend, http_cancellation_token) .instrument(span) .await From 99639c26b49a0d6d546fd973670c30470ee3a81e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Tue, 18 Mar 2025 16:21:22 +0100 Subject: [PATCH 71/71] fix(ci): update build-tools image references (#11293) ## Problem https://github.com/neondatabase/neon/pull/11210 migrated pushing images to ghcr. Unfortunately, it was incomplete in using images from ghcr, which resulted in a few places referencing the ghcr build-tools image, while trying to use docker hub credentials. 
## Summary of changes Use build-tools image from ghcr consistently. --- .../workflows/_benchmarking_preparation.yml | 6 +- .github/workflows/benchmarking.yml | 72 +++++++++---------- .github/workflows/build_and_test.yml | 4 +- .../build_and_test_with_sanitizers.yml | 4 +- .github/workflows/cloud-regress.yml | 6 +- .github/workflows/ingest_benchmark.yml | 6 +- .github/workflows/large_oltp_benchmark.yml | 6 +- .github/workflows/neon_extra_builds.yml | 4 +- .github/workflows/periodic_pagebench.yml | 6 +- .github/workflows/pg-clients.yml | 8 +-- 10 files changed, 61 insertions(+), 61 deletions(-) diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index 71aef1430e..0703e2c4d6 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -27,10 +27,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index e080b06cac..0cffb3787b 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -87,10 +87,10 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -190,10 +190,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: 
ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -245,10 +245,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -352,7 +352,7 @@ jobs: region_id_default=${{ env.DEFAULT_REGION_ID }} runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' - image_default="neondatabase/build-tools:pinned-bookworm" + image_default="ghcr.io/neondatabase/build-tools:pinned-bookworm" matrix='{ "pg_version" : [ 16 @@ -368,18 +368,18 @@ jobs: "db_size": [ "10gb" ], "runner": ['"$runner_default"'], "image": [ "'"$image_default"'" ], - "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": 
"neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, - { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] + "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", 
"db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "ghcr.io/neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "ghcr.io/neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "ghcr.io/neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new-many-tables","db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, + { "pg_version": 17, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then @@ -457,8 +457,8 @@ jobs: container: image: ${{ matrix.image }} credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 8h, default timeout 
is 6h @@ -642,10 +642,10 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -767,10 +767,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 12h, default timeout is 6h @@ -892,10 +892,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: @@ -1011,10 +1011,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6e7c1ce0a5..bc88da316a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -254,8 +254,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ 
secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} # for changed limits, see comments on `options:` earlier in this file options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 strategy: diff --git a/.github/workflows/build_and_test_with_sanitizers.yml b/.github/workflows/build_and_test_with_sanitizers.yml index e40b02b5d2..389b59c1a5 100644 --- a/.github/workflows/build_and_test_with_sanitizers.yml +++ b/.github/workflows/build_and_test_with_sanitizers.yml @@ -94,8 +94,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index 606e1c0862..566629e15c 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -37,10 +37,10 @@ jobs: runs-on: us-east-2 container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init steps: diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index c20c5890f9..37ee371311 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -67,10 +67,10 @@ jobs: PGCOPYDB_LIB_PATH: /pgcopydb/lib runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ 
secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init timeout-minutes: 1440 diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml index a7c3118e34..fea21877f8 100644 --- a/.github/workflows/large_oltp_benchmark.yml +++ b/.github/workflows/large_oltp_benchmark.yml @@ -50,10 +50,10 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init # Increase timeout to 2 days, default timeout is 6h - database maintenance can take a long time diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 74e9a5a9ab..558aba1e2e 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -90,8 +90,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init env: diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml index f854bf3212..433b969b0c 100644 --- a/.github/workflows/periodic_pagebench.yml +++ b/.github/workflows/periodic_pagebench.yml @@ -34,10 +34,10 @@ jobs: pull-requests: write runs-on: [ self-hosted, small ] container: - image: neondatabase/build-tools:pinned-bookworm + image: ghcr.io/neondatabase/build-tools:pinned-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ 
secrets.GITHUB_TOKEN }} options: --init timeout-minutes: 360 # Set the timeout to 6 hours env: diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index abc90c7fe1..cb5ae556d8 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -53,8 +53,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init --user root services: clickhouse: @@ -153,8 +153,8 @@ jobs: container: image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} options: --init --user root steps: