pq bench: avoid repeated conversion to_i128

per-second RPS
pq bench: proper shutdown
2026-06-01 12:30:38 +00:00 · 2023-11-02 17:43:59 +00:00 · 2023-11-02 17:11:37 +00:00 · 2023-11-02 17:07:00 +00:00 · 2023-11-02 17:06:36 +00:00 · 2023-11-02 16:28:16 +01:00
48 changed files with 1976 additions and 693 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -5,4 +5,6 @@ self-hosted-runner:
    - small
    - us-east-2
 config-variables:
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -203,6 +203,10 @@ runs:
        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
      run: |
+        if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
+          exit 0
+        fi
+
        export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}

        ./scripts/pysync
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -340,11 +340,11 @@ jobs:

          # Run separate tests for real Azure Blob Storage
          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
-          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
-          export REMOTE_STORAGE_AZURE_CONTAINER=neon-github-sandbox
-          export REMOTE_STORAGE_AZURE_REGION=eastus2
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure

@@ -433,7 +433,7 @@ jobs:
          rerun_flaky: true
          pg_version: ${{ matrix.pg_version }}
        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
+          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty

      - name: Merge and upload coverage data
@@ -468,7 +468,7 @@ jobs:
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
-          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
+          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2932,6 +2932,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -3198,6 +3208,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "pagectl"
 version = "0.1.0"
@@ -3283,10 +3299,12 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
+ "tokio-stream",
 "tokio-tar",
 "tokio-util",
 "toml_edit",
 "tracing",
+ "tracing-subscriber",
 "url",
 "utils",
 "walkdir",
@@ -3561,7 +3579,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3574,7 +3592,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3585,7 +3603,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3603,7 +3621,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -5407,7 +5425,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=problame/copy-both-duplex-public#5c462bd3500e657c014ef087e4eef2c1a8f0ebda"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -5764,6 +5782,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -161,11 +161,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -202,7 +202,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="problame/copy-both-duplex-public" }

 ################# Binary contents sections

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -18,7 +18,7 @@ use utils::{

 use crate::reltag::RelTag;
 use anyhow::bail;
-use bytes::{BufMut, Bytes, BytesMut};
+use bytes::{Buf, BufMut, Bytes, BytesMut};

 /// The state of a tenant in this pageserver.
 ///
@@ -807,6 +807,36 @@ impl PagestreamBeMessage {

        bytes.into()
    }
+
+    /// Parses one backend message: a one-byte tag followed by the payload.
+    /// Only tags 102 (GetPage) and 103 (Error) are decoded; every other tag
+    /// is reported as `Err` instead of panicking, so callers can add context.
+    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
+        const PAGE_SZ: usize = 8192; // expected length of a GetPage payload
+        let mut buf = buf.reader();
+        let msg_tag = buf.read_u8()?;
+        // Reading the tag advanced the reader, so this is the payload only.
+        let payload = buf.get_ref();
+        match msg_tag {
+            100 | 101 | 104 => bail!("deserialization of tag {} is not implemented", msg_tag),
+            102 => {
+                if payload.len() != PAGE_SZ {
+                    bail!("invalid page size: {}", payload.len());
+                }
+                Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+                    page: payload.clone(),
+                }))
+            }
+            103 => {
+                let cstr = std::ffi::CStr::from_bytes_until_nul(payload)?;
+                let rust_str = cstr.to_str()?;
+                Ok(PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: rust_str.to_owned(),
+                }))
+            }
+            _ => bail!("unknown tag: {:?}", msg_tag),
+        }
+    }
 }

 #[cfg(test)]
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -112,7 +112,7 @@ impl RemotePath {
        self.0.file_name()
    }

-    pub fn join(&self, segment: &Utf8Path) -> Self {
+    pub fn join<P: AsRef<Utf8Path>>(&self, segment: P) -> Self {
        Self(self.0.join(segment))
    }

--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -82,6 +82,8 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+tokio-stream.workspace = true
+tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/src/bin/getpage_bench_http.rs
+++ b/pageserver/src/bin/getpage_bench_http.rs
@@ -0,0 +1,245 @@
+use clap::Parser;
+use hyper::client::conn::Parts;
+use hyper::client::HttpConnector;
+use hyper::{Body, Client, Uri};
+use pageserver::{repository, tenant};
+use rand::prelude::*;
+use std::env::args;
+use std::future::Future;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+use tokio::sync::mpsc::{channel, Sender};
+use tokio::sync::Mutex as AsyncMutex;
+use tokio::task::JoinHandle;
+
+struct Key(repository::Key); // local newtype so this binary can implement `FromStr`

+impl std::str::FromStr for Key {
+    type Err = anyhow::Error;

+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        Ok(Key(repository::Key::from_hex(s)?)) // parse the hex form, then wrap it
+    }
+}
+
+struct KeyRange {
+    start: Key, // lower bound of the range
+    end: Key, // upper bound; used as the exclusive end when sampling keys
+}

+impl KeyRange {
+    fn len(&self) -> i128 { // span of the range, computed on the keys' i128 form
+        self.end.0.to_i128() - self.start.0.to_i128()
+    }
+}
+
+#[derive(clap::Parser)]
+struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    ps_endpoint: String, // base URL of the pageserver HTTP API
+    // tenant_id: String,
+    // timeline_id: String,
+    num_tasks: usize, // concurrent request tasks spawned per timeline
+    num_requests: usize, // requests issued by each task
+    tenants: Option<Vec<String>>, // explicit tenant ids; when absent they are listed over HTTP
+    #[clap(long)]
+    pick_n_tenants: Option<usize>, // when listing tenants, keep the first N (asserts N exist)
+}
+
+#[derive(Debug, Default)]
+struct Stats {
+    completed_requests: AtomicU64, // reset to 0 by the reporter task each time it prints
+}

+impl Stats {
+    fn inc(&self) { // record one finished request
+        self.completed_requests.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+#[tokio::main]
+async fn main() {
+    let args: &'static Args = Box::leak(Box::new(Args::parse())); // leaked to get a 'static borrow

+    let client = Client::new();

+    let tenants = if let Some(tenants) = &args.tenants {
+        tenants.clone()
+    } else {
+        // let tenant_id = "b97965931096047b2d54958756baee7b";
+        // let timeline_id = "2868f84a8d166779e4c651b116c45059";

+        let resp = client
+            .get(Uri::try_from(&format!("{}/v1/tenant", args.ps_endpoint)).unwrap())
+            .await
+            .unwrap();

+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let tenants: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let mut out = Vec::new();
+        for t in tenants.as_array().unwrap() {
+            if let Some(limit) = args.pick_n_tenants {
+                if out.len() >= limit {
+                    break;
+                }
+            }
+            out.push(t.get("id").unwrap().as_str().unwrap().to_owned());
+        }
+        if let Some(limit) = args.pick_n_tenants {
+            assert_eq!(out.len(), limit); // fail if fewer tenants exist than were requested
+        }
+        out
+    };

+    let mut tenant_timelines = Vec::new(); // (tenant_id, timeline_id) pairs to benchmark
+    for tenant_id in tenants {
+        let resp = client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline",
+                    args.ps_endpoint, tenant_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();

+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let timelines: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        for t in timelines.as_array().unwrap() {
+            let timeline_id = t.get("timeline_id").unwrap().as_str().unwrap().to_owned();
+            tenant_timelines.push((tenant_id.clone(), timeline_id));
+        }
+    }
+    println!("tenant_timelines:\n{:?}", tenant_timelines);

+    let mut stats = Arc::new(Stats::default());

+    tokio::spawn({ // background reporter: prints the request rate roughly once per second
+        let stats = Arc::clone(&stats);
+        async move {
+            loop {
+                let start = std::time::Instant::now();
+                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); // read and reset
+                let elapsed = start.elapsed();
+                println!(
+                    "RPS: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64()
+                );
+            }
+        }
+    });

+    let mut tasks = Vec::new();
+    for (tenant_id, timeline_id) in tenant_timelines {
+        let t = tokio::spawn(timeline(
+            args,
+            client.clone(),
+            tenant_id,
+            timeline_id,
+            Arc::clone(&stats),
+        ));
+        tasks.push(t);
+    }

+    for t in tasks { // wait for every per-timeline benchmark; a task panic is propagated
+        t.await.unwrap();
+    }
+}
+
+/// Benchmarks one timeline over the pageserver HTTP API: fetches its keyspace,
+/// then runs `args.num_tasks` tasks that each issue `args.num_requests`
+/// getpage requests for random keys, and prints the overall request rate.
+fn timeline(
+    args: &'static Args,
+    client: Client<HttpConnector, Body>,
+    tenant_id: String,
+    timeline_id: String,
+    stats: Arc<Stats>,
+) -> impl Future<Output = ()> {
+    async move {
+        let resp = client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline/{}/keyspace",
+                    args.ps_endpoint, tenant_id, timeline_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();
+        if !resp.status().is_success() {
+            panic!("Failed to get keyspace: {resp:?}");
+        }
+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let keyspace: serde_json::Value = serde_json::from_slice(&body).unwrap();

+        let lsn = Arc::new(keyspace["at_lsn"].as_str().unwrap().to_owned());

+        // Each entry of "keys" is a two-element array of hex keys: [start, end].
+        let ranges = keyspace["keys"]
+            .as_array()
+            .unwrap()
+            .iter()
+            .map(|r| {
+                let r = r.as_array().unwrap();
+                assert_eq!(r.len(), 2);
+                let start = Key::from_str(r[0].as_str().unwrap()).unwrap();
+                let end = Key::from_str(r[1].as_str().unwrap()).unwrap();
+                KeyRange { start, end }
+            })
+            .collect::<Vec<_>>();
+        let ranges = Arc::new(ranges);

+        let mut tasks = Vec::<JoinHandle<()>>::new();

+        let start = std::time::Instant::now();

+        for _ in 0..args.num_tasks {
+            let ranges = ranges.clone();
+            let lsn = lsn.clone();
+            let client = client.clone();
+            let tenant_id = tenant_id.clone();
+            let timeline_id = timeline_id.clone();
+            let stats = Arc::clone(&stats);
+            let task = tokio::spawn(async move {
+                for _ in 0..args.num_requests {
+                    let key = {
+                        let mut rng = rand::thread_rng();
+                        // Pick a range with probability proportional to its
+                        // length, then a uniformly random key inside it.
+                        let r = ranges.choose_weighted(&mut rng, |r| r.len()).unwrap();
+                        rng.gen_range(r.start.0.to_i128()..r.end.0.to_i128())
+                    };
+                    let url = format!(
+                        "{}/v1/tenant/{}/timeline/{}/getpage?key={:036x}&lsn={}",
+                        args.ps_endpoint, tenant_id, timeline_id, key, lsn
+                    );
+                    let uri = url.parse::<Uri>().unwrap();
+                    let resp = client.get(uri).await.unwrap();
+                    // Count only successful requests, like the keyspace fetch above.
+                    if !resp.status().is_success() {
+                        panic!("Failed to get page: {resp:?}");
+                    }
+                    // Drain the body so the request is complete before it is counted.
+                    hyper::body::to_bytes(resp).await.unwrap();
+                    stats.inc();
+                }
+            });
+            tasks.push(task);
+        }

+        for task in tasks {
+            task.await.unwrap();
+        }

+        let elapsed = start.elapsed();
+        println!(
+            "RPS: {:.0}",
+            (args.num_requests * args.num_tasks) as f64 / elapsed.as_secs_f64()
+        );
+    }
+}
--- a/pageserver/src/bin/getpage_bench_libpq.rs
+++ b/pageserver/src/bin/getpage_bench_libpq.rs
@@ -0,0 +1,373 @@
+use anyhow::Context;
+use clap::Parser;
+use futures::{SinkExt, TryStreamExt};
+use hyper::client::conn::Parts;
+use hyper::client::HttpConnector;
+use hyper::{Client, Uri};
+use pageserver::page_cache::PAGE_SZ;
+use pageserver::pgdatadir_mapping::{is_rel_block_key, key_to_rel_block};
+use pageserver::{repository, tenant};
+use pageserver_api::models::{
+    PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
+};
+use pageserver_api::reltag::RelTag;
+use rand::prelude::*;
+use scopeguard::defer;
+use std::env::args;
+use std::future::Future;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+use tokio::sync::mpsc::{channel, Sender};
+use tokio::sync::Mutex as AsyncMutex;
+use tokio::task::JoinHandle;
+use tokio_stream::{Stream, StreamExt};
+use utils::completion;
+use utils::lsn::Lsn;
+
+struct Key(repository::Key); // local newtype so this binary can implement `FromStr`

+impl std::str::FromStr for Key {
+    type Err = anyhow::Error;

+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        Ok(Key(repository::Key::from_hex(s)?)) // parse the hex form, then wrap it
+    }
+}
+
+struct KeyRange {
+    start: i128, // lower bound, stored as the key's i128 form
+    end: i128, // upper bound; used as the exclusive end when sampling keys
+}

+impl KeyRange {
+    fn len(&self) -> i128 { // span of the range; used as its sampling weight
+        self.end - self.start
+    }
+}
+
+struct RelTagBlockNo { // target of one getpage request
+    rel_tag: RelTag, // sent as the request's `rel`
+    block_no: u32, // sent as the request's `blkno`
+}
+
+#[derive(clap::Parser)]
+struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    ps_endpoint: String, // HTTP API base URL; the pagestream address is hard-coded in getpage_client
+    // tenant_id: String,
+    // timeline_id: String,
+    num_tasks: usize, // concurrent pagestream clients spawned per timeline
+    num_requests: usize, // requests issued by each client
+    tenants: Option<Vec<String>>, // explicit tenant ids; when absent they are listed over HTTP
+    #[clap(long)]
+    pick_n_tenants: Option<usize>, // when listing tenants, keep the first N (asserts N exist)
+}
+
+#[derive(Debug, Default)]
+struct Stats {
+    completed_requests: AtomicU64, // reset to 0 by the reporter task each time it prints
+}

+impl Stats {
+    fn inc(&self) { // record one finished request
+        self.completed_requests.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+#[tokio::main]
+async fn main() {
+    let args: &'static Args = Box::leak(Box::new(Args::parse())); // leaked to get a 'static borrow

+    // std::env::set_var("RUST_LOG", "info,tokio_postgres=trace");
+    // tracing_subscriber::fmt::init();

+    let client = Client::new(); // HTTP client, used only to discover tenants, timelines and keyspaces

+    let tenants = if let Some(tenants) = &args.tenants {
+        tenants.clone()
+    } else {
+        // let tenant_id = "b97965931096047b2d54958756baee7b";
+        // let timeline_id = "2868f84a8d166779e4c651b116c45059";

+        let resp = client
+            .get(Uri::try_from(&format!("{}/v1/tenant", args.ps_endpoint)).unwrap())
+            .await
+            .unwrap();

+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let tenants: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let mut out = Vec::new();
+        for t in tenants.as_array().unwrap() {
+            if let Some(limit) = args.pick_n_tenants {
+                if out.len() >= limit {
+                    break;
+                }
+            }
+            out.push(t.get("id").unwrap().as_str().unwrap().to_owned());
+        }
+        if let Some(limit) = args.pick_n_tenants {
+            assert_eq!(out.len(), limit); // fail if fewer tenants exist than were requested
+        }
+        out
+    };

+    let mut tenant_timelines = Vec::new(); // (tenant_id, timeline_id) pairs to benchmark
+    for tenant_id in tenants {
+        let resp = client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline",
+                    args.ps_endpoint, tenant_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();

+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let timelines: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        for t in timelines.as_array().unwrap() {
+            let timeline_id = t.get("timeline_id").unwrap().as_str().unwrap().to_owned();
+            tenant_timelines.push((tenant_id.clone(), timeline_id));
+        }
+    }
+    println!("tenant_timelines:\n{:?}", tenant_timelines);

+    let mut stats = Arc::new(Stats::default());

+    tokio::spawn({ // background reporter: prints the request rate roughly once per second
+        let stats = Arc::clone(&stats);
+        async move {
+            loop {
+                let start = std::time::Instant::now();
+                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed); // read and reset
+                let elapsed = start.elapsed();
+                println!(
+                    "RPS: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64()
+                );
+            }
+        }
+    });

+    let mut tasks = Vec::new();
+    for (tenant_id, timeline_id) in tenant_timelines {
+        let stats = Arc::clone(&stats);
+        let t = tokio::spawn(timeline(
+            args,
+            client.clone(),
+            tenant_id,
+            timeline_id,
+            stats,
+        ));
+        tasks.push(t);
+    }

+    for t in tasks { // wait for every per-timeline benchmark; a task panic is propagated
+        t.await.unwrap();
+    }
+}
+
+/// Benchmarks one timeline over the libpq pagestream protocol: fetches the
+/// keyspace over HTTP, keeps the ranges made of relation-block keys, then runs
+/// `args.num_tasks` tasks that each send `args.num_requests` getpage requests.
+fn timeline(
+    args: &'static Args,
+    http_client: Client<HttpConnector, hyper::Body>,
+    tenant_id: String,
+    timeline_id: String,
+    stats: Arc<Stats>,
+) -> impl Future<Output = ()> + Send + Sync {
+    async move {
+        let resp = http_client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline/{}/keyspace",
+                    args.ps_endpoint, tenant_id, timeline_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();
+        if !resp.status().is_success() {
+            panic!("Failed to get keyspace: {resp:?}");
+        }
+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let keyspace: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let lsn: Lsn = keyspace["at_lsn"].as_str().unwrap().parse().unwrap();

+        // Each entry of "keys" is a two-element array of hex keys: [start, end].
+        let ranges = keyspace["keys"]
+            .as_array()
+            .unwrap()
+            .iter()
+            .filter_map(|r| {
+                let r = r.as_array().unwrap();
+                assert_eq!(r.len(), 2);
+                let start = Key::from_str(r[0].as_str().unwrap()).unwrap();
+                let end = Key::from_str(r[1].as_str().unwrap()).unwrap();
+                // filter out non-relblock keys
+                // Ranges that mix rel-block and other keys are not supported yet.
+                match (is_rel_block_key(start.0), is_rel_block_key(end.0)) {
+                    (true, true) => Some(KeyRange {
+                        start: start.0.to_i128(),
+                        end: end.0.to_i128(),
+                    }),
+                    (true, false) | (false, true) => {
+                        unimplemented!("split up range")
+                    }
+                    (false, false) => None,
+                }
+            })
+            .collect::<Vec<_>>();
+        let ranges = Arc::new(ranges);

+        let mut tasks = Vec::<JoinHandle<()>>::new();

+        for _ in 0..args.num_tasks {
+            let ranges = ranges.clone();
+            let tenant_id = tenant_id.clone();
+            let timeline_id = timeline_id.clone();
+            let task = tokio::spawn({
+                let stats = Arc::clone(&stats);
+                async move {
+                    // Every task talks to the pageserver over its own connection.
+                    let mut client =
+                        getpage_client::Client::new(tenant_id.clone(), timeline_id.clone())
+                            .await
+                            .unwrap();
+                    for _ in 0..args.num_requests {
+                        let key = {
+                            let mut rng = rand::thread_rng();
+                            // Pick a range with probability proportional to its
+                            // length, then a uniformly random key inside it.
+                            let r = ranges.choose_weighted(&mut rng, |r| r.len()).unwrap();
+                            let key: i128 = rng.gen_range(r.start..r.end);
+                            let key = repository::Key::from_i128(key);
+                            // XXX filter these out when we iterate the keyspace
+                            assert!(
+                                is_rel_block_key(key),
+                                "we filter non-relblock keys out above"
+                            );
+                            let (rel_tag, block_no) =
+                                key_to_rel_block(key).expect("we just checked");
+                            RelTagBlockNo { rel_tag, block_no }
+                        };
+                        client
+                            .getpage(key, lsn)
+                            .await
+                            .with_context(|| {
+                                format!("getpage for tenant {} timeline {}", tenant_id, timeline_id)
+                            })
+                            .unwrap();
+                        stats.inc();
+                    }
+                    client.shutdown().await;
+                }
+            });
+            tasks.push(task);
+        }

+        // Wait for all tasks; a panic in any of them is propagated.
+        for task in tasks {
+            task.await.unwrap();
+        }
+    }
+}
+
+mod getpage_client {
+    use std::pin::Pin;

+    use futures::SinkExt;
+    use pageserver_api::models::{
+        PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
+        PagestreamGetPageResponse,
+    };
+    use tokio::task::JoinHandle;
+    use tokio_stream::StreamExt;
+    use tokio_util::sync::CancellationToken;
+    use utils::lsn::Lsn;

+    use crate::RelTagBlockNo;

+    pub(crate) struct Client {
+        copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>, // pagestream duplex
+        cancel_on_client_drop: Option<tokio_util::sync::DropGuard>, // cancels `conn_task` on drop
+        conn_task: JoinHandle<()>, // task driving the tokio_postgres connection
+    }

+    impl Client {
+        pub fn new(
+            tenant_id: String,
+            timeline_id: String,
+        ) -> impl std::future::Future<Output = anyhow::Result<Self>> + Send {
+            async move {
+                let (client, connection) =
+                    tokio_postgres::connect("host=localhost port=64000", postgres::NoTls).await?;

+                let conn_task_cancel = CancellationToken::new();
+                let conn_task = tokio::spawn({
+                    let conn_task_cancel = conn_task_cancel.clone();
+                    async move {
+                        tokio::select! {
+                            _ = conn_task_cancel.cancelled() => {
+                                return;
+                            }
+                            res = connection => {
+                                res.unwrap();
+                            }
+                        }
+                    }
+                });

+                let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = client
+                    .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
+                    .await?;

+                Ok(Self {
+                    copy_both: Box::pin(copy_both),
+                    conn_task,
+                    cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
+                })
+            }
+        }

+        pub async fn shutdown(mut self) {
+            let _ = self.cancel_on_client_drop.take(); // dropping the guard cancels `conn_task`
+            self.conn_task.await.unwrap();
+        }

+        pub async fn getpage(
+            &mut self,
+            key: RelTagBlockNo,
+            lsn: Lsn,
+        ) -> anyhow::Result<PagestreamGetPageResponse> {
+            let req = PagestreamGetPageRequest {
+                latest: false,
+                rel: key.rel_tag,
+                blkno: key.block_no,
+                lsn,
+            };
+            let req = PagestreamFeMessage::GetPage(req);
+            let req: bytes::Bytes = req.serialize();
+            // `send_all` consumes a stream of `Result`s, hence the `Ok` wrapper.
+            let mut req = tokio_stream::once(Ok(req));

+            self.copy_both.send_all(&mut req).await?;

+            let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
+            let next = next.ok_or_else(|| anyhow::anyhow!("pagestream ended before reply"))??;

+            match PagestreamBeMessage::deserialize(next)? {
+                PagestreamBeMessage::Exists(_) => todo!(),
+                PagestreamBeMessage::Nblocks(_) => todo!(),
+                PagestreamBeMessage::GetPage(p) => Ok(p),
+                PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
+                PagestreamBeMessage::DbSize(_) => todo!(),
+            }
+        }
+    }
+}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1479,8 +1479,6 @@ threshold = "20m"
            Some(DiskUsageEvictionTaskConfig {
                max_usage_pct: Percent::new(80).unwrap(),
                min_avail_bytes: 0,
-                target_avail_bytes: None,
-                target_usage_pct: None,
                period: Duration::from_secs(10),
                #[cfg(feature = "testing")]
                mock_statvfs: None,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -67,40 +67,16 @@ use crate::{
 pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: Percent,
    pub min_avail_bytes: u64,
-
-    // Control how far we will go when evicting: when usage exceeds max_usage_pct or min_avail_bytes,
-    // we will keep evicting layers until we reach the target.  The resulting disk usage should look
-    // like a sawtooth bouncing between the upper max/min line and the lower target line.
-    #[serde(default)]
-    pub target_usage_pct: Option<Percent>,
-    #[serde(default)]
-    pub target_avail_bytes: Option<u64>,
-
    #[serde(with = "humantime_serde")]
    pub period: Duration,
    #[cfg(feature = "testing")]
    pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
 }

-#[derive(Default)]
-enum Status {
-    /// We are within disk limits, and not currently doing any eviction
-    #[default]
-    Idle,
-    /// Disk limits have been exceeded: we will evict soon
-    UnderPressure,
-    /// We are currently doing an eviction pass.
-    Evicting,
-}
-
 #[derive(Default)]
 pub struct State {
    /// Exclude http requests and background task from running at the same time.
    mutex: tokio::sync::Mutex<()>,
-
-    /// Publish the current status of eviction work, for visibility to other subsystems
-    /// that modify their behavior if disk pressure is high or if eviction is going on.
-    status: std::sync::RwLock<Status>,
 }

 pub fn launch_disk_usage_global_eviction_task(
@@ -200,9 +176,7 @@ async fn disk_usage_eviction_task(
 }

 pub trait Usage: Clone + Copy + std::fmt::Debug {
-    fn pressure(&self) -> f64;
-    fn over_pressure(&self) -> bool;
-    fn no_pressure(&self) -> bool;
+    fn has_pressure(&self) -> bool;
    fn add_available_bytes(&mut self, bytes: u64);
 }

@@ -215,19 +189,13 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
-
-    if usage_pre.over_pressure() {
-        *state.status.write().unwrap() = Status::Evicting;
-    }
-
    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
    match res {
        Ok(outcome) => {
            debug!(?outcome, "disk_usage_eviction_iteration finished");
-            let new_status = match outcome {
+            match outcome {
                IterationOutcome::NoPressure | IterationOutcome::Cancelled => {
                    // nothing to do, select statement below will handle things
-                    Status::Idle
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
@@ -237,30 +205,21 @@ async fn disk_usage_eviction_task_iteration(

                    debug!(?after, "disk usage");

-                    if after.over_pressure() {
+                    if after.has_pressure() {
                        // Don't bother doing an out-of-order iteration here now.
                        // In practice, the task period is set to a value in the tens-of-seconds range,
                        // which will cause another iteration to happen soon enough.
                        // TODO: deltas between the three different usages would be helpful,
                        // consider MiB, GiB, TiB
                        warn!(?outcome, ?after, "disk usage still high");
-                        Status::UnderPressure
                    } else {
                        info!(?outcome, ?after, "disk usage pressure relieved");
-                        Status::Idle
                    }
                }
-            };
-
-            *state.status.write().unwrap() = new_status;
+            }
        }
        Err(e) => {
            error!("disk_usage_eviction_iteration failed: {:#}", e);
-            *state.status.write().unwrap() = if usage_pre.over_pressure() {
-                Status::UnderPressure
-            } else {
-                Status::Idle
-            };
        }
    }

@@ -326,10 +285,8 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

    debug!(?usage_pre, "disk usage");

-    if !usage_pre.over_pressure() {
+    if !usage_pre.has_pressure() {
        return Ok(IterationOutcome::NoPressure);
-    } else {
-        *state.status.write().unwrap() = Status::Evicting;
    }

    warn!(
@@ -377,7 +334,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let mut warned = None;
    let mut usage_planned = usage_pre;
    for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
-        if usage_planned.no_pressure() {
+        if !usage_planned.has_pressure() {
            debug!(
                no_candidates_evicted = i,
                "took enough candidates for pressure to be relieved"
@@ -687,57 +644,22 @@ mod filesystem_level_usage {
    }

    impl super::Usage for Usage<'_> {
-        /// Does the pressure exceed 1.0, i.e. has the disk usage exceeded upper bounds?
-        ///
-        /// This is the condition for starting eviction.
-        fn over_pressure(&self) -> bool {
-            self.pressure() >= 1.0
-        }
+        fn has_pressure(&self) -> bool {
+            let usage_pct =
+                (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64;

-        /// Is the pressure <0, ie.. has disk usage gone below the target bound?
-        ///
-        /// This is the condition for dropping out of eviction.
-        fn no_pressure(&self) -> bool {
-            self.pressure() <= 0.0
-        }
+            let pressures = [
+                (
+                    "min_avail_bytes",
+                    self.avail_bytes < self.config.min_avail_bytes,
+                ),
+                (
+                    "max_usage_pct",
+                    usage_pct >= self.config.max_usage_pct.get() as u64,
+                ),
+            ];

-        fn pressure(&self) -> f64 {
-            let max_usage = std::cmp::min(
-                self.total_bytes - self.config.min_avail_bytes,
-                (self.total_bytes as f64 * (self.config.max_usage_pct.get() as f64 / 100.0)) as u64,
-            );
-
-            let mut target_usage = max_usage;
-            if let Some(target_avail_bytes) = self.config.target_avail_bytes {
-                target_usage = std::cmp::min(target_usage, self.total_bytes - target_avail_bytes);
-            }
-            if let Some(target_usage_pct) = self.config.target_usage_pct {
-                target_usage = std::cmp::min(
-                    target_usage,
-                    (self.total_bytes as f64 * (target_usage_pct.get() as f64 / 100.0)) as u64,
-                );
-            };
-
-            let usage = self.total_bytes - self.avail_bytes;
-            eprintln!(
-                "pressure: {} {}, current {}",
-                target_usage, max_usage, usage
-            );
-            if target_usage == max_usage {
-                // We are configured with a zero sized range: treat anything at+beyond limit as pressure 1.0, else 0.0
-                if usage >= max_usage {
-                    1.0
-                } else {
-                    0.0
-                }
-            } else if usage <= target_usage {
-                // No pressure.
-                0.0
-            } else {
-                // We are above target: pressure is the ratio of how much we exceed target to the size of the gap
-                let range_size = (max_usage - target_usage) as f64;
-                (usage - target_usage) as f64 / range_size
-            }
+            pressures.into_iter().any(|(_, has_pressure)| has_pressure)
        }

        fn add_available_bytes(&mut self, bytes: u64) {
@@ -791,8 +713,6 @@ mod filesystem_level_usage {
            config: &DiskUsageEvictionTaskConfig {
                max_usage_pct: Percent::new(85).unwrap(),
                min_avail_bytes: 0,
-                target_avail_bytes: None,
-                target_usage_pct: None,
                period: Duration::MAX,
                #[cfg(feature = "testing")]
                mock_statvfs: None,
@@ -801,24 +721,24 @@ mod filesystem_level_usage {
            avail_bytes: 0,
        };

-        assert!(usage.over_pressure(), "expected pressure at 100%");
+        assert!(usage.has_pressure(), "expected pressure at 100%");

        usage.add_available_bytes(14_000);
-        assert!(usage.over_pressure(), "expected pressure at 86%");
+        assert!(usage.has_pressure(), "expected pressure at 86%");

        usage.add_available_bytes(999);
-        assert!(usage.over_pressure(), "expected pressure at 85.001%");
+        assert!(usage.has_pressure(), "expected pressure at 85.001%");

        usage.add_available_bytes(1);
-        assert!(usage.over_pressure(), "expected pressure at precisely 85%");
+        assert!(usage.has_pressure(), "expected pressure at precisely 85%");

        usage.add_available_bytes(1);
-        assert!(!usage.over_pressure(), "no pressure at 84.999%");
+        assert!(!usage.has_pressure(), "no pressure at 84.999%");

        usage.add_available_bytes(999);
-        assert!(!usage.over_pressure(), "no pressure at 84%");
+        assert!(!usage.has_pressure(), "no pressure at 84%");

        usage.add_available_bytes(16_000);
-        assert!(!usage.over_pressure());
+        assert!(!usage.has_pressure());
    }
 }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -392,13 +392,19 @@ paths:
            type: string
            format: date-time
          description: A timestamp to get the LSN
+        - name: version
+          in: query
+          required: false
+          schema:
+            type: integer
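+          description: Response format version. Values greater than 1 return an LsnByTimestampResponse object; when omitted, 0 or 1, the legacy plain string is returned.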
      responses:
        "200":
          description: OK
          content:
            application/json:
              schema:
-                type: string
+                $ref: "#/components/schemas/LsnByTimestampResponse"
        "400":
          description: Error when no tenant id found in path, no timeline id or invalid timestamp
          content:
@@ -1384,6 +1390,19 @@ components:
          type: string
          format: hex

+    LsnByTimestampResponse:
+      type: object
+      required:
+        - lsn
+        - kind
+      properties:
+        lsn:
+          type: string
+          format: hex
+        kind:
+          type: string
+          enum: [past, present, future, nodata]
+
    Error:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -8,7 +8,7 @@ use std::sync::Arc;
 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
 use humantime::format_rfc3339;
-use hyper::header::CONTENT_TYPE;
+use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
@@ -17,6 +17,7 @@ use pageserver_api::models::{
    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
+use serde_with::{serde_as, DisplayFromStr};
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -484,6 +485,8 @@ async fn get_lsn_by_timestamp_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

+    let version: Option<u8> = parse_query_param(&request, "version")?;
+
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let timestamp_raw = must_get_query_param(&request, "timestamp")?;
    let timestamp = humantime::parse_rfc3339(&timestamp_raw)
@@ -495,13 +498,32 @@ async fn get_lsn_by_timestamp_handler(
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

-    let result = match result {
-        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
-        LsnForTimestamp::Future(_lsn) => "future".into(),
-        LsnForTimestamp::Past(_lsn) => "past".into(),
-        LsnForTimestamp::NoData(_lsn) => "nodata".into(),
-    };
-    json_response(StatusCode::OK, result)
+    if version.unwrap_or(0) > 1 {
+        #[serde_as]
+        #[derive(serde::Serialize)]
+        struct Result {
+            #[serde_as(as = "DisplayFromStr")]
+            lsn: Lsn,
+            kind: &'static str,
+        }
+        let (lsn, kind) = match result {
+            LsnForTimestamp::Present(lsn) => (lsn, "present"),
+            LsnForTimestamp::Future(lsn) => (lsn, "future"),
+            LsnForTimestamp::Past(lsn) => (lsn, "past"),
+            LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
+        };
+        json_response(StatusCode::OK, Result { lsn, kind })
+    } else {
+        // FIXME: this is a temporary crutch not to break backwards compatibility
+        // See https://github.com/neondatabase/neon/pull/5608
+        let result = match result {
+            LsnForTimestamp::Present(lsn) => format!("{lsn}"),
+            LsnForTimestamp::Future(_lsn) => "future".into(),
+            LsnForTimestamp::Past(_lsn) => "past".into(),
+            LsnForTimestamp::NoData(_lsn) => "nodata".into(),
+        };
+        json_response(StatusCode::OK, result)
+    }
 }

 async fn get_timestamp_of_lsn_handler(
@@ -659,6 +681,45 @@ async fn tenant_ignore_handler(
    json_response(StatusCode::OK, ())
 }

+/// Handler for `POST /v1/tenant/:tenant_id/duplicate`: duplicates the tenant named in the path
+/// as the `new_tenant_id` given in the JSON body (`TenantCreateRequest`); replies `201 Created`.
+async fn tenant_duplicate_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let src_tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    // Authorize before reading the body, so unauthorized callers cannot make us parse it.
+    check_permission(&request, None)?;
+
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let new_tenant_id = request_data.new_tenant_id;
+
+    let _timer = STORAGE_TIME_GLOBAL
+        .get_metric_with_label_values(&[StorageTimeOperation::DuplicateTenant.into()])
+        .expect("bug")
+        .start_timer();
+
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+    let state = get_state(&request);
+    let generation = get_request_generation(state, request_data.generation)?;
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    mgr::duplicate_tenant(
+        state.conf,
+        tenant_conf,
+        src_tenant_id,
+        new_tenant_id,
+        generation,
+        state.tenant_resources(),
+        &ctx,
+    )
+    .instrument(info_span!("tenant_duplicate", %src_tenant_id, tenant_id = %new_tenant_id))
+    .await?;
+
+    json_response(StatusCode::CREATED, TenantCreateResponse(new_tenant_id))
+}
+
 async fn tenant_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -767,6 +828,10 @@ async fn tenant_size_handler(
        .map_err(ApiError::InternalServerError)?;

    let mut sizes = None;
+    let accepts_html = headers
+        .get(header::ACCEPT)
+        .map(|v| v == "text/html")
+        .unwrap_or_default();
    if !inputs_only.unwrap_or(false) {
        let storage_model = inputs
            .calculate_model()
@@ -774,11 +839,11 @@ async fn tenant_size_handler(
        let size = storage_model.calculate();

        // If request header expects html, return html
-        if headers["Accept"] == "text/html" {
+        if accepts_html {
            return synthetic_size_html_response(inputs, storage_model, size);
        }
        sizes = Some(size);
-    } else if headers["Accept"] == "text/html" {
+    } else if accepts_html {
        return Err(ApiError::BadRequest(anyhow!(
            "inputs_only parameter is incompatible with html output request"
        )));
@@ -929,7 +994,7 @@ fn synthetic_size_html_response(
 pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
    let response = Response::builder()
        .status(status)
-        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .header(header::CONTENT_TYPE, "text/html")
        .body(Body::from(data.as_bytes().to_vec()))
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
@@ -1310,7 +1375,7 @@ async fn getpage_at_lsn_handler(
        Result::<_, ApiError>::Ok(
            Response::builder()
                .status(StatusCode::OK)
-                .header(CONTENT_TYPE, "application/octet-stream")
+                .header(header::CONTENT_TYPE, "application/octet-stream")
                .body(hyper::Body::from(page))
                .unwrap(),
        )
@@ -1452,22 +1517,10 @@ async fn disk_usage_eviction_run(
    }

    impl crate::disk_usage_eviction_task::Usage for Usage {
-        fn over_pressure(&self) -> bool {
+        fn has_pressure(&self) -> bool {
            self.config.evict_bytes > self.freed_bytes
        }

-        fn no_pressure(&self) -> bool {
-            !self.over_pressure()
-        }
-
-        fn pressure(&self) -> f64 {
-            if self.over_pressure() {
-                1.0
-            } else {
-                0.0
-            }
-        }
-
        fn add_available_bytes(&mut self, bytes: u64) {
            self.freed_bytes += bytes;
        }
@@ -1714,6 +1767,9 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/ignore", |r| {
            api_handler(r, tenant_ignore_handler)
        })
+        .post("/v1/tenant/:tenant_id/duplicate", |r| {
+            api_handler(r, tenant_duplicate_handler)
+        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -149,6 +149,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    }
 }

+// FIXME: DO NOT ADD new query methods like this: their callers go on to parse the timeline id
+// out of the same directory name. Instead, create a type `UninitMark(TimelineId)` and parse the
+// name only once.
+
 pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -51,6 +51,9 @@ pub enum StorageTimeOperation {

    #[strum(serialize = "create tenant")]
    CreateTenant,
+
+    #[strum(serialize = "duplicate tenant")]
+    DuplicateTenant,
 }

 pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
@@ -1388,27 +1391,22 @@ impl TimelineMetrics {
        }
    }

-    pub fn record_new_file_metrics(&self, sz: u64) {
+    pub(crate) fn record_new_file_metrics(&self, sz: u64) {
        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }

-    pub fn resident_physical_size_sub(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
        self.resident_physical_size_gauge.sub(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
    }

-    pub fn resident_physical_size_add(&self, sz: u64) {
+    pub(crate) fn resident_physical_size_add(&self, sz: u64) {
        self.resident_physical_size_gauge.add(sz);
        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
    }

-    pub fn resident_physical_size_set(&self, sz: u64) {
-        self.resident_physical_size_gauge.set(sz);
-        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
-    }
-
    pub fn resident_physical_size_get(&self) -> u64 {
        self.resident_physical_size_gauge.get()
    }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -675,8 +675,9 @@ impl Timeline {

        result.add_key(CONTROLFILE_KEY);
        result.add_key(CHECKPOINT_KEY);
-        result.add_key(AUX_FILES_KEY);
-
+        if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
+            result.add_key(AUX_FILES_KEY);
+        }
        Ok(result.to_keyspace())
    }

@@ -1693,6 +1694,7 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+/// Guaranteed to return `Ok()` if [`is_rel_block_key`] returns `true` for `key`.
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
@@ -1708,7 +1710,8 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
 }

-fn is_rel_block_key(key: Key) -> bool {
+/// See [`key_to_rel_block`].
+pub fn is_rel_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1057,8 +1057,8 @@ impl Tenant {
                    TimelineId::try_from(timeline_uninit_mark_file.file_stem())
                        .with_context(|| {
                            format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
-                        )
+                                "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
+                            )
                        })?;
                let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
                if let Err(e) =
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -4,8 +4,10 @@
 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use rand::{distributions::Alphanumeric, Rng};
 use std::collections::{hash_map, HashMap};
+use std::str::FromStr;
 use std::sync::Arc;
 use tokio::fs;
+use tokio::io::AsyncSeekExt;

 use anyhow::Context;
 use once_cell::sync::Lazy;
@@ -26,9 +28,11 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
+use crate::tenant::span::debug_assert_current_span_has_tenant_id;
+use crate::tenant::storage_layer::{DeltaLayer, ImageLayer, LayerFileName};
 use crate::tenant::{
-    create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
-    TenantState,
+    create_tenant_files, remote_timeline_client, AttachMarkerMode, AttachedTenantConf,
+    CreateTenantFilesMode, IndexPart, Tenant, TenantState,
 };
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

@@ -695,6 +699,159 @@ pub(crate) async fn create_tenant(
    }).await
 }

+/// Duplicates `src_tenant_id` as `new_tenant_id` by copying its objects in remote storage:
+/// each object of each source timeline is downloaded into a local temp dir, layer files are
+/// passed through `rewrite_tenant_timeline` with the new tenant id, and the result is uploaded
+/// under the new tenant's remote path. Finally the new tenant is attached via `attach_tenant`.
+pub(crate) async fn duplicate_tenant(
+    conf: &'static PageServerConf,
+    tenant_conf: TenantConfOpt,
+    src_tenant_id: TenantId,
+    new_tenant_id: TenantId,
+    generation: Generation,
+    resources: TenantSharedResources,
+    ctx: &RequestContext,
+) -> Result<(), TenantMapInsertError> {
+    debug_assert_current_span_has_tenant_id();
+    // TODO: would be nice to use tenant_map_insert here, but, we're not ready to create a Tenant object yet
+    let tempdir = path_with_suffix_extension(
+        conf.tenants_path().join(&new_tenant_id.to_string()),
+        &format!("duplication.{TEMP_FILE_SUFFIX}"),
+    );
+    tokio::fs::remove_dir_all(&tempdir)
+        .await
+        .or_else(|e| match e.kind() {
+            std::io::ErrorKind::NotFound => Ok(()),
+            _ => Err(e),
+        })
+        .context("pre-run clean up tempdir")?;
+    tokio::fs::create_dir(&tempdir)
+        .await
+        .context("create tempdir")?;
+
+    // Copy the tenant's data in S3
+    let remote_storage = resources
+        .remote_storage
+        .as_ref()
+        .context("only works with remote storage")?;
+
+    let remote_src_timelines =
+        remote_timeline_client::list_remote_timelines(remote_storage, src_tenant_id)
+            .await
+            .context("list src timelines")?;
+
+    info!(?remote_src_timelines, "got src timelines");
+
+    for timeline_id in remote_src_timelines {
+        async {
+            // Shadows the outer `tempdir`: this timeline's objects are staged in a subdirectory.
+            let tempdir = tempdir.join(&timeline_id.to_string());
+            tokio::fs::create_dir(&tempdir)
+                .await
+                .context("create tempdir for timeline")?;
+
+            let remote_src_tl =
+                remote_timeline_client::remote_timeline_path(&src_tenant_id, &timeline_id);
+            let remote_dst_tl =
+                remote_timeline_client::remote_timeline_path(&new_tenant_id, &timeline_id);
+
+            let object_names = remote_storage
+                .list_prefixes(Some(&remote_src_tl))
+                .await
+                .context("list timeline remote prefix")?;
+
+            for name in object_names {
+                async {
+                    let name = name.object_name().context(
+                        "list_prefixes return values should always have object_name()=Some",
+                    )?;
+                    let remote_src_obj = remote_src_tl.join(name);
+                    let remote_dst_obj = remote_dst_tl.join(name);
+
+                    let tmp_obj_filepath = tempdir.join(name);
+                    let mut tmp_obj_file = tokio::fs::OpenOptions::new()
+                        .read(true)
+                        .write(true)
+                        .create_new(true)
+                        .open(&tmp_obj_filepath)
+                        .await
+                        .context("create temp file")?;
+                    let mut tmp_dl = remote_storage
+                        .download(&remote_src_obj)
+                        .await
+                        .context("start download")?;
+                    let tmp_obj_size =
+                        tokio::io::copy(&mut tmp_dl.download_stream, &mut tmp_obj_file)
+                            .await
+                            .context("do the download")?;
+                    // NOTE(review): the upload below reuses `tmp_obj_size`, measured before any rewrite;
+                    // presumably `rewrite_tenant_timeline` keeps the file length unchanged — confirm.
+                    if name == IndexPart::FILE_NAME {
+                        // needs no patching
+                    } else {
+                        let name = LayerFileName::from_str(name).map_err(|e: String| {
+                            anyhow::anyhow!("unknown key in timeline s3 prefix: {name:?}: {e}")
+                        })?;
+                        match name {
+                            LayerFileName::Image(_) => {
+                                ImageLayer::rewrite_tenant_timeline(
+                                    &tmp_obj_filepath,
+                                    new_tenant_id,
+                                    timeline_id, /* leave as is */
+                                    ctx,
+                                )
+                                .await
+                                .context("rewrite tenant timeline")?;
+                            }
+                            LayerFileName::Delta(_) => {
+                                DeltaLayer::rewrite_tenant_timeline(
+                                    &tmp_obj_filepath,
+                                    new_tenant_id,
+                                    timeline_id, /* leave as is */
+                                    ctx,
+                                )
+                                .await
+                                .context("rewrite tenant timeline")?;
+                            }
+                        }
+                    }
+
+                    info!(?remote_dst_obj, "uploading");
+                    tmp_obj_file
+                        .seek(std::io::SeekFrom::Start(0))
+                        .await
+                        .context("seek tmp file to beginning for upload")?;
+                    remote_storage
+                        .upload(
+                            tmp_obj_file,
+                            tmp_obj_size as usize,
+                            &remote_dst_obj,
+                            tmp_dl.metadata,
+                        )
+                        .await
+                        .context("upload modified")?;
+                    tokio::fs::remove_file(tmp_obj_filepath)
+                        .await
+                        .context("remove temp file")?;
+                    anyhow::Ok(())
+                }
+                .instrument(info_span!("copy object", object_name=?name))
+                .await
+                .context("copy object")?;
+            }
+            anyhow::Ok(())
+        }
+        .instrument(info_span!("copy_timeline", timeline_id=%timeline_id))
+        .await?;
+    }
+    // Reached only on success: an error above returns early and leaves `tempdir` behind.
+    tokio::fs::remove_dir_all(&tempdir)
+        .await
+        .context("post-run clean up tempdir")?;
+
+    attach_tenant(conf, new_tenant_id, generation, tenant_conf, resources, ctx).await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum SetNewTenantConfigError {
    #[error(transparent)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -627,7 +627,7 @@ impl RemoteTimelineClient {
    ///
    /// Launch an upload operation in the background.
    ///
-    pub fn schedule_layer_file_upload(
+    pub(crate) fn schedule_layer_file_upload(
        self: &Arc<Self>,
        layer_file_name: &LayerFileName,
        layer_metadata: &LayerFileMetadata,
@@ -635,6 +635,17 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        self.schedule_layer_file_upload0(upload_queue, layer_file_name, layer_metadata);
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
+    }
+
+    fn schedule_layer_file_upload0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        layer_file_name: &LayerFileName,
+        layer_metadata: &LayerFileMetadata,
+    ) {
        upload_queue
            .latest_files
            .insert(layer_file_name.clone(), layer_metadata.clone());
@@ -643,21 +654,15 @@ impl RemoteTimelineClient {
        let op = UploadOp::UploadLayer(layer_file_name.clone(), layer_metadata.clone());
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
-
        info!("scheduled layer file upload {layer_file_name}");
-
-        // Launch the task immediately, if possible
-        self.launch_queued_tasks(upload_queue);
-        Ok(())
    }

    /// Launch a delete operation in the background.
    ///
-    /// The operation does not modify local state but assumes the local files have already been
-    /// deleted, and is used to mirror those changes to remote.
+    /// The operation does not modify local filesystem state.
    ///
    /// Note: This schedules an index file upload before the deletions.  The
-    /// deletion won't actually be performed, until any previously scheduled
+    /// deletion won't actually be performed, until all previously scheduled
    /// upload operations, and the index file upload, have completed
    /// successfully.
    pub fn schedule_layer_file_deletion(
@@ -667,61 +672,133 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

+        let with_generations =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, &names);
+
+        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+
+        // Launch the tasks immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+        Ok(())
+    }
+
+    /// Unlinks the layer files from `index_part.json` but does not yet schedule deletion for the
+    /// layer files, leaving them dangling.
+    ///
+    /// The files will be leaked in remote storage unless [`Self::schedule_deletion_of_unlinked`]
+    /// is invoked on them.
+    #[allow(unused)] // will be used by PR#4938
+    pub(crate) fn schedule_unlinking_of_layers_from_index_part(
+        self: &Arc<Self>,
+        names: Vec<LayerFileName>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        // The returned (name, generation) pairs are deliberately dropped: once the next
+        // index_part.json is uploaded the layer files are "dangling", which is fine here.
+        self.schedule_unlinking_of_layers_from_index_part0(upload_queue, &names);
+
+        self.launch_queued_tasks(upload_queue);
+
+        Ok(())
+    }
+
+    /// Update the remote index file, removing the to-be-deleted files from the index,
+    /// allowing scheduling of actual deletions later.
+    fn schedule_unlinking_of_layers_from_index_part0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        names: &[LayerFileName],
+    ) -> Vec<(LayerFileName, Generation)> {
        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

-        // Update the remote index file, removing the to-be-deleted files from the index,
-        // before deleting the actual files.
-        //
-        // Once we start removing files from upload_queue.latest_files, there's
-        // no going back! Otherwise, some of the files would already be removed
-        // from latest_files, but not yet scheduled for deletion. Use a closure
-        // to syntactically forbid ? or bail! calls here.
-        let no_bail_here = || {
-            // Decorate our list of names with each name's generation, dropping
-            // makes that are unexpectedly missing from our metadata.
-            let with_generations: Vec<_> = names
-                .into_iter()
-                .filter_map(|name| {
-                    // Remove from latest_files, learning the file's remote generation in the process
-                    let meta = upload_queue.latest_files.remove(&name);
+        // Decorate our list of names with each name's generation, dropping
+        // names that are unexpectedly missing from our metadata.
+        let with_generations: Vec<_> = names
+            .iter()
+            .filter_map(|name| {
+                // Remove from latest_files, learning the file's remote generation in the process
+                let meta = upload_queue.latest_files.remove(name);

-                    if let Some(meta) = meta {
-                        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                        Some((name, meta.generation))
-                    } else {
-                        // This can only happen if we forgot to to schedule the file upload
-                        // before scheduling the delete. Log it because it is a rare/strange
-                        // situation, and in case something is misbehaving, we'd like to know which
-                        // layers experienced this.
-                        info!(
-                            "Deleting layer {name} not found in latest_files list, never uploaded?"
-                        );
-                        None
-                    }
-                })
-                .collect();
+                if let Some(meta) = meta {
+                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
+                    Some((name.to_owned(), meta.generation))
+                } else {
+                    // This can only happen if we forgot to schedule the file upload
+                    // before scheduling the delete. Log it because it is a rare/strange
+                    // situation, and in case something is misbehaving, we'd like to know which
+                    // layers experienced this.
+                    info!("Deleting layer {name} not found in latest_files list, never uploaded?");
+                    None
+                }
+            })
+            .collect();

-            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata);
-            }
+        // after unlinking files from the upload_queue.latest_files we must always schedule an
+        // index_part update, because that needs to be uploaded before we can actually delete the
+        // files.
+        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
+            self.schedule_index_upload(upload_queue, metadata);
+        }

-            for (name, gen) in &with_generations {
-                info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
-            }
+        with_generations
+    }

-            // schedule the actual deletions
-            let op = UploadOp::Delete(Delete {
-                layers: with_generations,
-            });
-            self.calls_unfinished_metric_begin(&op);
-            upload_queue.queued_operations.push_back(op);
+    /// Schedules deletion for layer files which have previously been unlinked from the
+    /// `index_part.json` with [`Self::schedule_unlinking_of_layers_from_index_part`].
+    #[allow(unused)] // will be used by Layer::drop in PR#4938
+    pub(crate) fn schedule_deletion_of_unlinked(
+        self: &Arc<Self>,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?; // `?` propagates the error to the caller
+
+        self.schedule_deletion_of_unlinked0(upload_queue, layers); // queues one UploadOp::Delete
+        self.launch_queued_tasks(upload_queue); // the `0` helper only queues, so launch here
+        Ok(())
+    }
+
+    fn schedule_deletion_of_unlinked0(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+        with_generations: Vec<(LayerFileName, Generation)>,
+    ) {
+        for (name, gen) in &with_generations {
+            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+        }
+
+        // Queue one Delete op for all layers; launching queued tasks is left to the caller.
+        let op = UploadOp::Delete(Delete {
+            layers: with_generations,
+        });
+        self.calls_unfinished_metric_begin(&op);
+        upload_queue.queued_operations.push_back(op);
+    }
+
+    /// Schedules a compaction update to the remote `index_part.json`.
+    ///
+    /// `compacted_from` represents the L0 names which have been `compacted_to` L1 layers.
+    pub(crate) fn schedule_compaction_update(
+        self: &Arc<Self>,
+        compacted_from: &[LayerFileName],
+        compacted_to: &[(LayerFileName, LayerFileMetadata)],
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        for (name, m) in compacted_to {
+            self.schedule_layer_file_upload0(upload_queue, name, m);
+        }
+
+        let with_generations =
+            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, compacted_from);
+        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+        self.launch_queued_tasks(upload_queue);

-            // Launch the tasks immediately, if possible
-            self.launch_queued_tasks(upload_queue);
-        };
-        no_bail_here();
        Ok(())
    }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -844,6 +844,49 @@ impl Drop for DeltaLayerWriter {
    }
 }

+impl DeltaLayer {
+    /// Rewrites the summary's tenant/timeline ids in place; on error, assume `path` is corrupt.
+    pub(crate) async fn rewrite_tenant_timeline(
+        path: &Utf8Path,
+        new_tenant: TenantId,
+        new_timeline: TimelineId,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?; // the summary is read from block 0
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let mut file = file.file; // take the file back out of the block reader for writing
+        if actual_summary.magic != DELTA_FILE_MAGIC {
+            bail!("File '{}' is not a delta layer", path);
+        }
+        let new_summary = Summary {
+            tenant_id: new_tenant,
+            timeline_id: new_timeline,
+            ..actual_summary
+        };
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf)?;
+        if buf.spilled() {
+            // The code in ImageLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            anyhow::bail!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            );
+        }
+        file.seek(SeekFrom::Start(0)).await?; // overwrite starting at the beginning of the file
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl DeltaLayerInner {
    pub(super) async fn load(
        path: &Utf8Path,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -436,6 +436,49 @@ impl ImageLayer {
    }
 }

+impl ImageLayer {
+    /// Rewrites the summary's tenant/timeline ids in place; on error, assume `path` is corrupt.
+    pub(crate) async fn rewrite_tenant_timeline(
+        path: &Utf8Path,
+        new_tenant: TenantId,
+        new_timeline: TimelineId,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?; // the summary is read from block 0
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let mut file = file.file; // take the file back out of the block reader for writing
+        if actual_summary.magic != IMAGE_FILE_MAGIC {
+            bail!("File '{}' is not an image layer", path);
+        }
+        let new_summary = Summary {
+            tenant_id: new_tenant,
+            timeline_id: new_timeline,
+            ..actual_summary
+        };
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf)?;
+        if buf.spilled() {
+            // The code in ImageLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            anyhow::bail!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            );
+        }
+        file.seek(SeekFrom::Start(0)).await?; // overwrite starting at the beginning of the file
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl ImageLayerInner {
    pub(super) async fn load(
        path: &Utf8Path,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -159,11 +159,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
            // loop, #3501. There are also additional "allowed_errors" in tests.
-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
+            if first && random_init_delay(period, &cancel).await.is_err() {
+                break;
            }

            let started_at = Instant::now();
@@ -183,7 +180,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
+            if !first {
+                // The first iteration is typically much slower, because all tenants compete for the
+                // compaction semaphore to run, and because of concurrent startup work like initializing
+                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
+                warn_when_period_overrun(
+                    started_at.elapsed(),
+                    period,
+                    BackgroundLoopKind::Compaction,
+                );
+            }

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -192,6 +198,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
+
+            first = false;
        }
    }
    .await;
@@ -223,11 +231,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

            let period = tenant.get_gc_period();

-            if first {
-                first = false;
-                if random_init_delay(period, &cancel).await.is_err() {
-                    break;
-                }
+            if first && random_init_delay(period, &cancel).await.is_err() {
+                break;
            }

            let started_at = Instant::now();
@@ -251,7 +256,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                }
            };

-            warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            if !first {
+                // The first iteration is typically much slower, because all tenants compete for the
+                // compaction semaphore to run, and because of concurrent startup work like initializing
+                // logical sizes.  To avoid routinely spamming warnings, we suppress this log on first iteration.
+                warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
+            }

            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
@@ -260,6 +270,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
            {
                break;
            }
+
+            first = false;
        }
    }
    .await;
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1871,7 +1871,7 @@ impl Timeline {
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
        );
-        self.metrics.resident_physical_size_set(total_physical_size);
+        self.metrics.resident_physical_size_add(total_physical_size);

        timer.stop_and_record();
        Ok(())
@@ -3079,6 +3079,7 @@ impl Timeline {
        Ok(false)
    }

+    #[tracing::instrument(skip_all, fields(%lsn, %force))]
    async fn create_image_layers(
        &self,
        partitioning: &KeyPartitioning,
@@ -3869,22 +3870,21 @@ impl Timeline {
        // now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map.
        let mut duplicated_layers = HashSet::new();

+        let mut uploaded_layers = Vec::with_capacity(new_layers.len());
        let mut insert_layers = Vec::new();
        let mut remove_layers = Vec::new();

-        for l in new_layers {
+        for l in &new_layers {
            let new_delta_path = l.path();

            let metadata = new_delta_path.metadata().with_context(|| {
                format!("read file metadata for new created layer {new_delta_path}")
            })?;

-            if let Some(remote_client) = &self.remote_client {
-                remote_client.schedule_layer_file_upload(
-                    &l.filename(),
-                    &LayerFileMetadata::new(metadata.len(), self.generation),
-                )?;
-            }
+            uploaded_layers.push((
+                l.filename(),
+                LayerFileMetadata::new(metadata.len(), self.generation),
+            ));

            // update metrics, including the timeline's physical size
            self.metrics.record_new_file_metrics(metadata.len());
@@ -3897,7 +3897,7 @@ impl Timeline {
                LayerResidenceStatus::Resident,
                LayerResidenceEventReason::LayerCreate,
            );
-            let l = l as Arc<dyn PersistentLayer>;
+            let l = l.to_owned() as Arc<dyn PersistentLayer>;
            if guard.contains(&l) {
                tracing::error!(layer=%l, "duplicated L1 layer");
                duplicated_layers.insert(l.layer_desc().key());
@@ -3929,13 +3929,12 @@ impl Timeline {
            &self.metrics,
        )?;

-        drop_wlock(guard);
-
-        // Also schedule the deletions in remote storage
-        if let Some(remote_client) = &self.remote_client {
-            remote_client.schedule_layer_file_deletion(layer_names_to_delete)?;
+        if let Some(remote_client) = self.remote_client.as_ref() {
+            remote_client.schedule_compaction_update(&layer_names_to_delete, &uploaded_layers)?;
        }

+        drop_wlock(guard);
+
        Ok(())
    }

--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -26,8 +26,7 @@ use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey;
 use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::SubscribeSafekeeperInfoRequest;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
-use storage_broker::BrokerClientChannel;
-use storage_broker::Streaming;
+use storage_broker::{BrokerClientChannel, Code, Streaming};
 use tokio::select;
 use tracing::*;

@@ -137,8 +136,17 @@ pub(super) async fn connection_manager_loop_step(
            broker_update = broker_subscription.message() => {
                match broker_update {
                    Ok(Some(broker_update)) => connection_manager_state.register_timeline_update(broker_update),
-                    Err(e) => {
-                        error!("broker subscription failed: {e}");
+                    Err(status) => {
+                        match status.code() {
+                            Code::Unknown if status.message().contains("stream closed because of a broken pipe") => {
+                                // tonic's error handling doesn't provide a clear code for disconnections: we get
+                                // "h2 protocol error: error reading a body from connection: stream closed because of a broken pipe"
+                                info!("broker disconnected: {status}");
+                            },
+                            _ => {
+                                warn!("broker subscription failed: {status}");
+                            }
+                        }
                        return ControlFlow::Continue(());
                    }
                    Ok(None) => {
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -18,7 +18,8 @@ use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use std::sync::{RwLock, RwLockWriteGuard};
+use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+use tokio::time::Instant;

 ///
 /// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -110,7 +111,7 @@ impl OpenFiles {
    ///
    /// On return, we hold a lock on the slot, and its 'tag' has been updated
    /// recently_used has been set. It's all ready for reuse.
-    fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
+    async fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
        //
        // Run the clock algorithm to find a slot to replace.
        //
@@ -142,7 +143,7 @@ impl OpenFiles {
                }
                retries += 1;
            } else {
-                slot_guard = slot.inner.write().unwrap();
+                slot_guard = slot.inner.write().await;
                index = next;
                break;
            }
@@ -153,7 +154,7 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
-            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
+            // the normal path of dropping VirtualFile uses `Close`, use `CloseByReplace` here to
            // distinguish the two.
            STORAGE_IO_TIME_METRIC
                .get(StorageIoOperation::CloseByReplace)
@@ -208,6 +209,29 @@ impl CrashsafeOverwriteError {
    }
 }

+/// Observes the wall-clock duration of `$body` for the storage I/O operation `$op`.
+///
+/// Unlike `observe_closure_duration`, `$body` is expanded inline rather than wrapped in a
+/// closure, so it may contain `.await`; the elapsed time is recorded after it completes.
+macro_rules! observe_duration {
+    ($op:expr, $($body:tt)*) => {{
+        let instant = Instant::now();
+        let result = $($body)*;
+        let elapsed = instant.elapsed().as_secs_f64();
+        STORAGE_IO_TIME_METRIC
+            .get($op)
+            .observe(elapsed);
+        result
+    }}
+}
+
+macro_rules! with_file {
+    ($this:expr, $op:expr, | $ident:ident | $($body:tt)*) => {{
+        let $ident = $this.lock_file().await?; // `?` returns from the enclosing fn on error
+        observe_duration!($op, $($body)*) // timing starts after the file guard is acquired
+    }};
+}
+
 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
@@ -244,11 +268,9 @@ impl VirtualFile {
            tenant_id = "*".to_string();
            timeline_id = "*".to_string();
        }
-        let (handle, mut slot_guard) = get_open_files().find_victim_slot();
+        let (handle, mut slot_guard) = get_open_files().find_victim_slot().await;

-        let file = STORAGE_IO_TIME_METRIC
-            .get(StorageIoOperation::Open)
-            .observe_closure_duration(|| open_options.open(path))?;
+        let file = observe_duration!(StorageIoOperation::Open, open_options.open(path))?;

        // Strip all options other than read and write.
        //
@@ -331,22 +353,24 @@ impl VirtualFile {

    /// Call File::sync_all() on the underlying File.
    pub async fn sync_all(&self) -> Result<(), Error> {
-        self.with_file(StorageIoOperation::Fsync, |file| file.sync_all())
-            .await?
+        with_file!(self, StorageIoOperation::Fsync, |file| file
+            .as_ref()
+            .sync_all())
    }

    pub async fn metadata(&self) -> Result<fs::Metadata, Error> {
-        self.with_file(StorageIoOperation::Metadata, |file| file.metadata())
-            .await?
+        with_file!(self, StorageIoOperation::Metadata, |file| file
+            .as_ref()
+            .metadata())
    }

-    /// Helper function that looks up the underlying File for this VirtualFile,
-    /// opening it and evicting some other File if necessary. It calls 'func'
-    /// with the physical File.
-    async fn with_file<F, R>(&self, op: StorageIoOperation, mut func: F) -> Result<R, Error>
-    where
-        F: FnMut(&File) -> R,
-    {
+    /// Helper function internal to `VirtualFile` that looks up the underlying File,
+    /// opens it and evicts some other File if necessary. Returns a guard that holds
+    /// a read lock on the slot and gives access to the physical `File` via `as_ref()`.
+    ///
+    /// Callers go through the `with_file!` macro instead of passing a closure, as Rust
+    /// doesn't support async closures that take parameters with lifetimes.
+    async fn lock_file(&self) -> Result<FileGuard<'_>, Error> {
        let open_files = get_open_files();

        let mut handle_guard = {
@@ -356,27 +380,23 @@ impl VirtualFile {
            // We only need to hold the handle lock while we read the current handle. If
            // another thread closes the file and recycles the slot for a different file,
            // we will notice that the handle we read is no longer valid and retry.
-            let mut handle = *self.handle.read().unwrap();
+            let mut handle = *self.handle.read().await;
            loop {
                // Check if the slot contains our File
                {
                    let slot = &open_files.slots[handle.index];
-                    let slot_guard = slot.inner.read().unwrap();
-                    if slot_guard.tag == handle.tag {
-                        if let Some(file) = &slot_guard.file {
-                            // Found a cached file descriptor.
-                            slot.recently_used.store(true, Ordering::Relaxed);
-                            return Ok(STORAGE_IO_TIME_METRIC
-                                .get(op)
-                                .observe_closure_duration(|| func(file)));
-                        }
+                    let slot_guard = slot.inner.read().await;
+                    if slot_guard.tag == handle.tag && slot_guard.file.is_some() {
+                        // Found a cached file descriptor.
+                        slot.recently_used.store(true, Ordering::Relaxed);
+                        return Ok(FileGuard { slot_guard });
                    }
                }

                // The slot didn't contain our File. We will have to open it ourselves,
                // but before that, grab a write lock on handle in the VirtualFile, so
                // that no other thread will try to concurrently open the same file.
-                let handle_guard = self.handle.write().unwrap();
+                let handle_guard = self.handle.write().await;

                // If another thread changed the handle while we were not holding the lock,
                // then the handle might now be valid again. Loop back to retry.
@@ -390,17 +410,10 @@ impl VirtualFile {

        // We need to open the file ourselves. The handle in the VirtualFile is
        // now locked in write-mode. Find a free slot to put it in.
-        let (handle, mut slot_guard) = open_files.find_victim_slot();
+        let (handle, mut slot_guard) = open_files.find_victim_slot().await;

        // Open the physical file
-        let file = STORAGE_IO_TIME_METRIC
-            .get(StorageIoOperation::Open)
-            .observe_closure_duration(|| self.open_options.open(&self.path))?;
-
-        // Perform the requested operation on it
-        let result = STORAGE_IO_TIME_METRIC
-            .get(op)
-            .observe_closure_duration(|| func(&file));
+        let file = observe_duration!(StorageIoOperation::Open, self.open_options.open(&self.path))?;

        // Store the File in the slot and update the handle in the VirtualFile
        // to point to it.
@@ -408,7 +421,9 @@ impl VirtualFile {

        *handle_guard = handle;

-        Ok(result)
+        return Ok(FileGuard {
+            slot_guard: slot_guard.downgrade(),
+        });
    }

    pub fn remove(self) {
@@ -423,11 +438,9 @@ impl VirtualFile {
                self.pos = offset;
            }
            SeekFrom::End(offset) => {
-                self.pos = self
-                    .with_file(StorageIoOperation::Seek, |mut file| {
-                        file.seek(SeekFrom::End(offset))
-                    })
-                    .await??
+                self.pos = with_file!(self, StorageIoOperation::Seek, |file| file
+                    .as_ref()
+                    .seek(SeekFrom::End(offset)))?
            }
            SeekFrom::Current(offset) => {
                let pos = self.pos as i128 + offset as i128;
@@ -515,9 +528,9 @@ impl VirtualFile {
    }

    pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
-        let result = self
-            .with_file(StorageIoOperation::Read, |file| file.read_at(buf, offset))
-            .await?;
+        let result = with_file!(self, StorageIoOperation::Read, |file| file
+            .as_ref()
+            .read_at(buf, offset));
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["read", &self.tenant_id, &self.timeline_id])
@@ -527,9 +540,9 @@ impl VirtualFile {
    }

    async fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
-        let result = self
-            .with_file(StorageIoOperation::Write, |file| file.write_at(buf, offset))
-            .await?;
+        let result = with_file!(self, StorageIoOperation::Write, |file| file
+            .as_ref()
+            .write_at(buf, offset));
        if let Ok(size) = result {
            STORAGE_IO_SIZE
                .with_label_values(&["write", &self.tenant_id, &self.timeline_id])
@@ -539,6 +552,18 @@ impl VirtualFile {
    }
 }

+struct FileGuard<'a> {
+    slot_guard: RwLockReadGuard<'a, SlotInner>, // read lock on the slot holding the open file
+}
+
+impl<'a> AsRef<File> for FileGuard<'a> {
+    fn as_ref(&self) -> &File {
+        // This unwrap is safe because we only create `FileGuard`s
+        // if we know that the file is Some.
+        self.slot_guard.file.as_ref().unwrap()
+    }
+}
+
 #[cfg(test)]
 impl VirtualFile {
    pub(crate) async fn read_blk(
@@ -571,20 +596,39 @@ impl VirtualFile {
 impl Drop for VirtualFile {
    /// If a VirtualFile is dropped, close the underlying file if it was open.
    fn drop(&mut self) {
-        let handle = self.handle.get_mut().unwrap();
+        let handle = self.handle.get_mut();

-        // We could check with a read-lock first, to avoid waiting on an
-        // unrelated I/O.
-        let slot = &get_open_files().slots[handle.index];
-        let mut slot_guard = slot.inner.write().unwrap();
-        if slot_guard.tag == handle.tag {
-            slot.recently_used.store(false, Ordering::Relaxed);
-            // there is also operation "close-by-replace" for closes done on eviction for
-            // comparison.
-            STORAGE_IO_TIME_METRIC
-                .get(StorageIoOperation::Close)
-                .observe_closure_duration(|| drop(slot_guard.file.take()));
+        fn clean_slot(slot: &Slot, mut slot_guard: RwLockWriteGuard<'_, SlotInner>, tag: u64) {
+            if slot_guard.tag == tag {
+                slot.recently_used.store(false, Ordering::Relaxed);
+                // there is also the `CloseByReplace` operation for closes done on eviction for
+                // comparison.
+                STORAGE_IO_TIME_METRIC
+                    .get(StorageIoOperation::Close)
+                    .observe_closure_duration(|| drop(slot_guard.file.take()));
+            }
        }
+
+        // We don't have async drop so we cannot directly await the lock here.
+        // Instead, first do a best-effort attempt at closing the underlying
+        // file descriptor by using `try_write`, and if that fails, spawn
+        // a tokio task to do it asynchronously: we just want it to be
+        // cleaned up eventually.
+        // Most of the time, the `try_write` should succeed though,
+        // as we have `&mut self` access. In other words, if the slot
+        // is still occupied by our file, there should be no access from
+        // other I/O operations; the only other possible place to lock
+        // the slot is the clock algorithm looking for free slots.
+        let slot = &get_open_files().slots[handle.index];
+        if let Ok(slot_guard) = slot.inner.try_write() {
+            clean_slot(slot, slot_guard, handle.tag);
+        } else {
+            let tag = handle.tag;
+            tokio::spawn(async move {
+                let slot_guard = slot.inner.write().await;
+                clean_slot(slot, slot_guard, tag);
+            });
+        };
    }
 }

--- a/poetry.lock
+++ b/poetry.lock
@@ -2447,20 +2447,20 @@ test = ["websockets"]

 [[package]]
 name = "werkzeug"
-version = "2.2.3"
+version = "3.0.1"
 description = "The comprehensive WSGI web application library."
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
-    {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
+    {file = "werkzeug-3.0.1-py3-none-any.whl", hash = "sha256:90a285dc0e42ad56b34e696398b8122ee4c681833fb35b8334a095d82c56da10"},
+    {file = "werkzeug-3.0.1.tar.gz", hash = "sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc"},
 ]

 [package.dependencies]
 MarkupSafe = ">=2.1.1"

 [package.extras]
-watchdog = ["watchdog"]
+watchdog = ["watchdog (>=2.3)"]

 [[package]]
 name = "wrapt"
@@ -2719,4 +2719,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "c5981d8d7c2deadd47c823bc35f86f830c8e320b653d2d3718bade1f4d2dabca"
+content-hash = "74649cf47c52f21b01b096a42044750b1c9677576b405be0489c2909127a9bf1"
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -4,10 +4,11 @@ use proxy::config::AuthenticationConfig;
 use proxy::config::HttpConfig;
 use proxy::console;
 use proxy::http;
-use proxy::metrics;
+use proxy::usage_metrics;

 use anyhow::bail;
 use proxy::config::{self, ProxyConfig};
+use proxy::serverless;
 use std::pin::pin;
 use std::{borrow::Cow, net::SocketAddr};
 use tokio::net::TcpListener;
@@ -129,14 +130,16 @@ async fn main() -> anyhow::Result<()> {
        cancellation_token.clone(),
    ));

-    if let Some(wss_address) = args.wss {
-        let wss_address: SocketAddr = wss_address.parse()?;
-        info!("Starting wss on {wss_address}");
-        let wss_listener = TcpListener::bind(wss_address).await?;
+    // TODO: rename the argument to something like serverless.
+    // It now covers more than just websockets, it also covers SQL over HTTP.
+    if let Some(serverless_address) = args.wss {
+        let serverless_address: SocketAddr = serverless_address.parse()?;
+        info!("Starting wss on {serverless_address}");
+        let serverless_listener = TcpListener::bind(serverless_address).await?;

-        client_tasks.spawn(http::websocket::task_main(
+        client_tasks.spawn(serverless::task_main(
            config,
-            wss_listener,
+            serverless_listener,
            cancellation_token.clone(),
        ));
    }
@@ -144,11 +147,11 @@ async fn main() -> anyhow::Result<()> {
    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
-    maintenance_tasks.spawn(http::server::task_main(http_listener));
+    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
-        maintenance_tasks.spawn(metrics::task_main(metrics_config));
+        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
    }

    let maintenance = loop {
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -2,10 +2,7 @@
 //! Other modules should use stuff from this module instead of
 //! directly relying on deps like `reqwest` (think loose coupling).

-pub mod conn_pool;
-pub mod server;
-pub mod sql_over_http;
-pub mod websocket;
+pub mod health_server;

 use std::{sync::Arc, time::Duration};

--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -14,14 +14,15 @@ pub mod console;
 pub mod error;
 pub mod http;
 pub mod logging;
-pub mod metrics;
 pub mod parse;
 pub mod protocol2;
 pub mod proxy;
 pub mod sasl;
 pub mod scram;
+pub mod serverless;
 pub mod stream;
 pub mod url;
+pub mod usage_metrics;
 pub mod waiters;

 /// Handle unix signals appropriately.
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -8,9 +8,9 @@ use crate::{
    config::{AuthenticationConfig, ProxyConfig, TlsConfig},
    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
    http::StatusCode,
-    metrics::{Ids, USAGE_METRICS},
    protocol2::WithClientIp,
    stream::{PqStream, Stream},
+    usage_metrics::{Ids, USAGE_METRICS},
 };
 use anyhow::{bail, Context};
 use async_trait::async_trait;
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -1,235 +1,36 @@
-use crate::{
-    cancellation::CancelMap,
-    config::ProxyConfig,
-    error::io_error,
-    protocol2::{ProxyProtocolAccept, WithClientIp},
-    proxy::{
-        handle_client, ClientMode, NUM_CLIENT_CONNECTION_CLOSED_COUNTER,
-        NUM_CLIENT_CONNECTION_OPENED_COUNTER,
-    },
-};
+//! Routers for our serverless APIs
+//!
+//! Handles both SQL over HTTP and SQL over Websockets.
+
+mod conn_pool;
+mod sql_over_http;
+mod websocket;
+
 use anyhow::bail;
-use bytes::{Buf, Bytes};
-use futures::{Sink, Stream, StreamExt};
+use hyper::StatusCode;
+pub use reqwest_middleware::{ClientWithMiddleware, Error};
+pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
+
+use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
+use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};
+use crate::{cancellation::CancelMap, config::ProxyConfig};
+use futures::StreamExt;
 use hyper::{
    server::{
        accept,
        conn::{AddrIncoming, AddrStream},
    },
-    upgrade::Upgraded,
-    Body, Method, Request, Response, StatusCode,
+    Body, Method, Request, Response,
 };
-use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
-use pin_project_lite::pin_project;

-use std::{
-    future::ready,
-    pin::Pin,
-    sync::Arc,
-    task::{ready, Context, Poll},
-};
+use std::task::Poll;
+use std::{future::ready, sync::Arc};
 use tls_listener::TlsListener;
-use tokio::{
-    io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
-    net::TcpListener,
-};
+use tokio::net::TcpListener;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, warn, Instrument};
 use utils::http::{error::ApiError, json::json_response};

-// TODO: use `std::sync::Exclusive` once it's stabilized.
-// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
-use sync_wrapper::SyncWrapper;
-
-use super::{conn_pool::GlobalConnPool, sql_over_http};
-
-pin_project! {
-    /// This is a wrapper around a [`WebSocketStream`] that
-    /// implements [`AsyncRead`] and [`AsyncWrite`].
-    pub struct WebSocketRw {
-        #[pin]
-        stream: SyncWrapper<WebSocketStream<Upgraded>>,
-        bytes: Bytes,
-    }
-}
-
-impl WebSocketRw {
-    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
-        Self {
-            stream: stream.into(),
-            bytes: Bytes::new(),
-        }
-    }
-}
-
-impl AsyncWrite for WebSocketRw {
-    fn poll_write(
-        self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-        buf: &[u8],
-    ) -> Poll<io::Result<usize>> {
-        let mut stream = self.project().stream.get_pin_mut();
-
-        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
-        match stream.as_mut().start_send(Message::Binary(buf.into())) {
-            Ok(()) => Poll::Ready(Ok(buf.len())),
-            Err(e) => Poll::Ready(Err(io_error(e))),
-        }
-    }
-
-    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
-        stream.poll_flush(cx).map_err(io_error)
-    }
-
-    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
-        let stream = self.project().stream.get_pin_mut();
-        stream.poll_close(cx).map_err(io_error)
-    }
-}
-
-impl AsyncRead for WebSocketRw {
-    fn poll_read(
-        mut self: Pin<&mut Self>,
-        cx: &mut Context<'_>,
-        buf: &mut ReadBuf<'_>,
-    ) -> Poll<io::Result<()>> {
-        if buf.remaining() > 0 {
-            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
-            let len = std::cmp::min(bytes.len(), buf.remaining());
-            buf.put_slice(&bytes[..len]);
-            self.consume(len);
-        }
-
-        Poll::Ready(Ok(()))
-    }
-}
-
-impl AsyncBufRead for WebSocketRw {
-    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
-        // Please refer to poll_fill_buf's documentation.
-        const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
-
-        let mut this = self.project();
-        loop {
-            if !this.bytes.chunk().is_empty() {
-                let chunk = (*this.bytes).chunk();
-                return Poll::Ready(Ok(chunk));
-            }
-
-            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
-            match res.transpose().map_err(io_error)? {
-                Some(message) => match message {
-                    Message::Ping(_) => {}
-                    Message::Pong(_) => {}
-                    Message::Text(text) => {
-                        // We expect to see only binary messages.
-                        let error = "unexpected text message in the websocket";
-                        warn!(length = text.len(), error);
-                        return Poll::Ready(Err(io_error(error)));
-                    }
-                    Message::Frame(_) => {
-                        // This case is impossible according to Frame's doc.
-                        panic!("unexpected raw frame in the websocket");
-                    }
-                    Message::Binary(chunk) => {
-                        assert!(this.bytes.is_empty());
-                        *this.bytes = Bytes::from(chunk);
-                    }
-                    Message::Close(_) => return EOF,
-                },
-                None => return EOF,
-            }
-        }
-    }
-
-    fn consume(self: Pin<&mut Self>, amount: usize) {
-        self.project().bytes.advance(amount);
-    }
-}
-
-async fn serve_websocket(
-    websocket: HyperWebsocket,
-    config: &'static ProxyConfig,
-    cancel_map: &CancelMap,
-    session_id: uuid::Uuid,
-    hostname: Option<String>,
-) -> anyhow::Result<()> {
-    let websocket = websocket.await?;
-    handle_client(
-        config,
-        cancel_map,
-        session_id,
-        WebSocketRw::new(websocket),
-        ClientMode::Websockets { hostname },
-    )
-    .await?;
-    Ok(())
-}
-
-async fn ws_handler(
-    mut request: Request<Body>,
-    config: &'static ProxyConfig,
-    conn_pool: Arc<GlobalConnPool>,
-    cancel_map: Arc<CancelMap>,
-    session_id: uuid::Uuid,
-    sni_hostname: Option<String>,
-) -> Result<Response<Body>, ApiError> {
-    let host = request
-        .headers()
-        .get("host")
-        .and_then(|h| h.to_str().ok())
-        .and_then(|h| h.split(':').next())
-        .map(|s| s.to_string());
-
-    // Check if the request is a websocket upgrade request.
-    if hyper_tungstenite::is_upgrade_request(&request) {
-        info!(session_id = ?session_id, "performing websocket upgrade");
-
-        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
-            .map_err(|e| ApiError::BadRequest(e.into()))?;
-
-        tokio::spawn(
-            async move {
-                if let Err(e) =
-                    serve_websocket(websocket, config, &cancel_map, session_id, host).await
-                {
-                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
-                }
-            }
-            .in_current_span(),
-        );
-
-        // Return the response so the spawned future can continue.
-        Ok(response)
-    // TODO: that deserves a refactor as now this function also handles http json client besides websockets.
-    // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
-    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
-        sql_over_http::handle(
-            request,
-            sni_hostname,
-            conn_pool,
-            session_id,
-            &config.http_config,
-        )
-        .await
-    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
-        Response::builder()
-            .header("Allow", "OPTIONS, POST")
-            .header("Access-Control-Allow-Origin", "*")
-            .header(
-                "Access-Control-Allow-Headers",
-                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
-            )
-            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
-            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
-            .body(Body::empty())
-            .map_err(|e| ApiError::InternalServerError(e.into()))
-    } else {
-        json_response(StatusCode::BAD_REQUEST, "query is not supported")
-    }
-}
-
 pub async fn task_main(
    config: &'static ProxyConfig,
    ws_listener: TcpListener,
@@ -239,7 +40,7 @@ pub async fn task_main(
        info!("websocket server has shut down");
    }

-    let conn_pool: Arc<GlobalConnPool> = GlobalConnPool::new(config);
+    let conn_pool = conn_pool::GlobalConnPool::new(config);

    // shutdown the connection pool
    tokio::spawn({
@@ -300,13 +101,15 @@ pub async fn task_main(
                            let cancel_map = Arc::new(CancelMap::default());
                            let session_id = uuid::Uuid::new_v4();

-                            ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
-                                .instrument(info_span!(
-                                    "ws-client",
-                                    session = %session_id,
-                                    %peer_addr,
-                                ))
-                                .await
+                            request_handler(
+                                req, config, conn_pool, cancel_map, session_id, sni_name,
+                            )
+                            .instrument(info_span!(
+                                "serverless",
+                                session = %session_id,
+                                %peer_addr,
+                            ))
+                            .await
                        }
                    },
                )))
@@ -359,3 +162,65 @@ where
        self.inner.call(req)
    }
 }
+
+/// Routes one incoming request on the serverless listener.
+///
+/// * Websocket upgrade requests are upgraded; the resulting connection is
+///   served by `websocket::serve_websocket` in a spawned task, and the upgrade
+///   response is returned right away.
+/// * `POST /sql` is passed to `sql_over_http::handle`.
+/// * `OPTIONS /sql` gets an empty-body response carrying the `Allow` and
+///   `Access-Control-*` headers.
+/// * Anything else is answered with `400 Bad Request`.
+///
+/// `sni_hostname` is only used by the SQL-over-HTTP branch; the websocket branch
+/// uses the hostname taken from the `Host` header instead.
+async fn request_handler(
+    mut request: Request<Body>,
+    config: &'static ProxyConfig,
+    conn_pool: Arc<conn_pool::GlobalConnPool>,
+    cancel_map: Arc<CancelMap>,
+    session_id: uuid::Uuid,
+    sni_hostname: Option<String>,
+) -> Result<Response<Body>, ApiError> {
+    // Hostname from the `Host` header, with everything from the first ':' on dropped.
+    // `None` if the header is missing or is not valid text.
+    let host = request
+        .headers()
+        .get("host")
+        .and_then(|h| h.to_str().ok())
+        .and_then(|h| h.split(':').next())
+        .map(|s| s.to_string());
+
+    // Check if the request is a websocket upgrade request.
+    if hyper_tungstenite::is_upgrade_request(&request) {
+        info!(session_id = ?session_id, "performing websocket upgrade");
+
+        // A failed upgrade is reported to the client as a bad request.
+        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
+            .map_err(|e| ApiError::BadRequest(e.into()))?;
+
+        // Serve the websocket in the background; errors are only logged.
+        tokio::spawn(
+            async move {
+                if let Err(e) =
+                    websocket::serve_websocket(websocket, config, &cancel_map, session_id, host)
+                        .await
+                {
+                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
+                }
+            }
+            .in_current_span(),
+        );
+
+        // Return the response so the spawned future can continue.
+        Ok(response)
+    } else if request.uri().path() == "/sql" && request.method() == Method::POST {
+        sql_over_http::handle(
+            request,
+            sni_hostname,
+            conn_pool,
+            session_id,
+            &config.http_config,
+        )
+        .await
+    } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
+        // Answer for `OPTIONS /sql`: advertises the allowed methods and headers.
+        Response::builder()
+            .header("Allow", "OPTIONS, POST")
+            .header("Access-Control-Allow-Origin", "*")
+            .header(
+                "Access-Control-Allow-Headers",
+                "Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
+            )
+            .header("Access-Control-Max-Age", "86400" /* 24 hours */)
+            .status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
+            .body(Body::empty())
+            .map_err(|e| ApiError::InternalServerError(e.into()))
+    } else {
+        json_response(StatusCode::BAD_REQUEST, "query is not supported")
+    }
+}
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -22,8 +22,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
    auth, console,
-    metrics::{Ids, MetricCounter, USAGE_METRICS},
    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
+    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};

@@ -191,22 +191,39 @@ impl GlobalConnPool {
        // ok return cached connection if found and establish a new one otherwise
        let new_client = if let Some(client) = client {
            if client.inner.is_closed() {
-                info!("pool: cached connection '{conn_info}' is closed, opening a new one");
-                connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
+                let conn_id = uuid::Uuid::new_v4();
+                info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
+                connect_to_compute(
+                    self.proxy_config,
+                    conn_info,
+                    conn_id,
+                    session_id,
+                    latency_timer,
+                )
+                .await
            } else {
                info!("pool: reusing connection '{conn_info}'");
                client.session.send(session_id)?;
                latency_timer.pool_hit();
                latency_timer.success();
                return Ok(Client {
+                    conn_id: client.conn_id,
                    inner: Some(client),
                    span: Span::current(),
                    pool,
                });
            }
        } else {
-            info!("pool: opening a new connection '{conn_info}'");
-            connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
+            let conn_id = uuid::Uuid::new_v4();
+            info!(%conn_id, "pool: opening a new connection '{conn_info}'");
+            connect_to_compute(
+                self.proxy_config,
+                conn_info,
+                conn_id,
+                session_id,
+                latency_timer,
+            )
+            .await
        };

        match &new_client {
@@ -243,6 +260,7 @@ impl GlobalConnPool {
        }

        new_client.map(|inner| Client {
+            conn_id: inner.conn_id,
            inner: Some(inner),
            span: Span::current(),
            pool,
@@ -250,16 +268,18 @@ impl GlobalConnPool {
    }

    fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
+        let conn_id = client.conn_id;
+
        // We want to hold this open while we return. This ensures that the pool can't close
        // while we are in the middle of returning the connection.
        let closed = self.closed.read();
        if *closed {
-            info!("pool: throwing away connection '{conn_info}' because pool is closed");
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is closed");
            return Ok(());
        }

        if client.inner.is_closed() {
-            info!("pool: throwing away connection '{conn_info}' because connection is closed");
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed");
            return Ok(());
        }

@@ -291,9 +311,9 @@ impl GlobalConnPool {

        // do logging outside of the mutex
        if returned {
-            info!("pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
+            info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}");
        } else {
-            info!("pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
+            info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}");
        }

        Ok(())
@@ -340,6 +360,7 @@ impl GlobalConnPool {
 struct TokioMechanism<'a> {
+    // The three fields below are forwarded to `connect_to_compute_once` by the
+    // `ConnectMechanism` impl for this type.
    conn_info: &'a ConnInfo,
    session_id: uuid::Uuid,
+    // Identifier of the connection being opened; generated by the caller with
+    // `uuid::Uuid::new_v4()` before connecting.
    conn_id: uuid::Uuid,
 }

 #[async_trait]
@@ -353,7 +374,14 @@ impl ConnectMechanism for TokioMechanism<'_> {
        node_info: &console::CachedNodeInfo,
        timeout: time::Duration,
    ) -> Result<Self::Connection, Self::ConnectError> {
-        connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
+        connect_to_compute_once(
+            node_info,
+            self.conn_info,
+            timeout,
+            self.conn_id,
+            self.session_id,
+        )
+        .await
    }

    fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
@@ -366,6 +394,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
 async fn connect_to_compute(
    config: &config::ProxyConfig,
    conn_info: &ConnInfo,
+    conn_id: uuid::Uuid,
    session_id: uuid::Uuid,
    latency_timer: LatencyTimer,
 ) -> anyhow::Result<ClientInner> {
@@ -401,6 +430,7 @@ async fn connect_to_compute(

    crate::proxy::connect_to_compute(
        &TokioMechanism {
+            conn_id,
            conn_info,
            session_id,
        },
@@ -416,6 +446,7 @@ async fn connect_to_compute_once(
    node_info: &console::CachedNodeInfo,
    conn_info: &ConnInfo,
    timeout: time::Duration,
+    conn_id: uuid::Uuid,
    mut session: uuid::Uuid,
 ) -> Result<ClientInner, tokio_postgres::Error> {
    let mut config = (*node_info.config).clone();
@@ -430,7 +461,6 @@ async fn connect_to_compute_once(

    let (tx, mut rx) = tokio::sync::watch::channel(session);

-    let conn_id = uuid::Uuid::new_v4();
    let span = info_span!(parent: None, "connection", %conn_id);
    span.in_scope(|| {
        info!(%conn_info, %session, "new connection");
@@ -484,6 +514,7 @@ async fn connect_to_compute_once(
        inner: client,
        session: tx,
        ids,
+        conn_id,
    })
 }

@@ -491,6 +522,7 @@ struct ClientInner {
    inner: tokio_postgres::Client,
    session: tokio::sync::watch::Sender<uuid::Uuid>,
    ids: Ids,
+    conn_id: uuid::Uuid,
 }

 impl Client {
@@ -500,12 +532,14 @@ impl Client {
 }

 pub struct Client {
+    // Copy of the wrapped `ClientInner`'s `conn_id`, set when the `Client` is built.
+    conn_id: uuid::Uuid,
    span: Span,
+    // Expected to be `Some` while the client is in use (accessors `expect` it).
    inner: Option<ClientInner>,
+    // Connection info plus pool handle; `Discard` borrows this slot and may empty it.
    pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
 }

 pub struct Discard<'a> {
+    // Connection id copied from the owning `Client`; only used in log messages here.
+    conn_id: uuid::Uuid,
+    // Mutable borrow of the owning `Client`'s `pool` field.
    pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
 }

@@ -514,6 +548,7 @@ impl Client {
        let Self {
            inner,
            pool,
+            conn_id,
            span: _,
        } = self;
        (
@@ -521,7 +556,10 @@ impl Client {
                .as_mut()
                .expect("client inner should not be removed")
                .inner,
-            Discard { pool },
+            Discard {
+                pool,
+                conn_id: *conn_id,
+            },
        )
    }

@@ -537,13 +575,13 @@ impl Discard<'_> {
    pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
+        // Unless the connection reports `Idle`, empty the borrowed pool slot and log
+        // that the connection is being thrown away. Nothing happens (and nothing is
+        // logged) if the slot was already empty.
+        // NOTE(review): presumably an emptied slot keeps the connection from being
+        // returned to the pool later — confirm against the code that returns clients.
        if status != ReadyForQueryStatus::Idle {
            if let Some((conn_info, _)) = self.pool.take() {
-                info!("pool: throwing away connection '{conn_info}' because connection is not idle")
+                info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle")
            }
        }
    }
    pub fn discard(&mut self) {
+        // Unconditionally empty the borrowed pool slot; the log line is only emitted
+        // the first time, while the slot still holds a value.
        if let Some((conn_info, _)) = self.pool.take() {
-            info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
+            info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
        }
    }
 }
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -0,0 +1,146 @@
+use crate::{
+    cancellation::CancelMap,
+    config::ProxyConfig,
+    error::io_error,
+    proxy::{handle_client, ClientMode},
+};
+use bytes::{Buf, Bytes};
+use futures::{Sink, Stream};
+use hyper::upgrade::Upgraded;
+use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
+use pin_project_lite::pin_project;
+
+use std::{
+    pin::Pin,
+    task::{ready, Context, Poll},
+};
+use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
+use tracing::warn;
+
+// TODO: use `std::sync::Exclusive` once it's stabilized.
+// Tracking issue: https://github.com/rust-lang/rust/issues/98407.
+use sync_wrapper::SyncWrapper;
+
+pin_project! {
+    /// This is a wrapper around a [`WebSocketStream`] that
+    /// implements [`AsyncRead`] and [`AsyncWrite`].
+    ///
+    /// Written bytes are sent as binary websocket messages, and the payloads of
+    /// received binary messages are exposed as a continuous byte stream.
+    pub struct WebSocketRw {
+        // The underlying websocket; it is only ever accessed through
+        // `SyncWrapper::get_pin_mut`.
+        #[pin]
+        stream: SyncWrapper<WebSocketStream<Upgraded>>,
+        // Not-yet-consumed remainder of the most recently received binary message.
+        bytes: Bytes,
+    }
+}
+
+impl WebSocketRw {
+    /// Wraps `stream`, starting with an empty read buffer.
+    pub fn new(stream: WebSocketStream<Upgraded>) -> Self {
+        Self {
+            stream: stream.into(),
+            bytes: Bytes::new(),
+        }
+    }
+}
+
+impl AsyncWrite for WebSocketRw {
+    /// Waits until the websocket sink is ready, then hands it the whole of `buf`
+    /// as a single binary message and reports `buf.len()` bytes as written.
+    /// Sink errors are converted with `io_error`.
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        let mut stream = self.project().stream.get_pin_mut();
+
+        // `start_send` may only be called after `poll_ready` has returned `Ready(Ok)`.
+        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+        match stream.as_mut().start_send(Message::Binary(buf.into())) {
+            Ok(()) => Poll::Ready(Ok(buf.len())),
+            Err(e) => Poll::Ready(Err(io_error(e))),
+        }
+    }
+
+    /// Delegates to the sink's `poll_flush`, mapping errors with `io_error`.
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let stream = self.project().stream.get_pin_mut();
+        stream.poll_flush(cx).map_err(io_error)
+    }
+
+    /// Delegates to the sink's `poll_close`, mapping errors with `io_error`.
+    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
+        let stream = self.project().stream.get_pin_mut();
+        stream.poll_close(cx).map_err(io_error)
+    }
+}
+
+impl AsyncRead for WebSocketRw {
+    /// Copies as many buffered bytes as fit into `buf`, filling the internal
+    /// buffer through `poll_fill_buf` first. If `buf` has no remaining capacity
+    /// this returns `Ready(Ok(()))` without touching the stream.
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        if buf.remaining() > 0 {
+            let bytes = ready!(self.as_mut().poll_fill_buf(cx))?;
+            // An empty `bytes` (EOF) makes `len` zero, so nothing is copied.
+            let len = std::cmp::min(bytes.len(), buf.remaining());
+            buf.put_slice(&bytes[..len]);
+            self.consume(len);
+        }
+
+        Poll::Ready(Ok(()))
+    }
+}
+
+impl AsyncBufRead for WebSocketRw {
+    /// Returns the unread part of the current binary message, pulling messages
+    /// from the websocket until one with a non-empty payload is available.
+    ///
+    /// * Ping and pong messages are skipped.
+    /// * A text message is logged and turned into an I/O error.
+    /// * A close message, or the end of the stream, is reported as EOF
+    ///   (an empty slice).
+    ///
+    /// # Panics
+    ///
+    /// Panics if the stream yields a raw `Message::Frame`.
+    fn poll_fill_buf(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<&[u8]>> {
+        // Please refer to poll_fill_buf's documentation.
+        const EOF: Poll<io::Result<&[u8]>> = Poll::Ready(Ok(&[]));
+
+        let mut this = self.project();
+        loop {
+            // Serve leftover bytes from the previous message before polling for more.
+            if !this.bytes.chunk().is_empty() {
+                let chunk = (*this.bytes).chunk();
+                return Poll::Ready(Ok(chunk));
+            }
+
+            let res = ready!(this.stream.as_mut().get_pin_mut().poll_next(cx));
+            match res.transpose().map_err(io_error)? {
+                Some(message) => match message {
+                    Message::Ping(_) => {}
+                    Message::Pong(_) => {}
+                    Message::Text(text) => {
+                        // We expect to see only binary messages.
+                        let error = "unexpected text message in the websocket";
+                        warn!(length = text.len(), error);
+                        return Poll::Ready(Err(io_error(error)));
+                    }
+                    Message::Frame(_) => {
+                        // This case is impossible according to Frame's doc.
+                        panic!("unexpected raw frame in the websocket");
+                    }
+                    Message::Binary(chunk) => {
+                        // The check at the top of the loop already established that
+                        // the buffer holds no unread bytes at this point.
+                        assert!(this.bytes.is_empty());
+                        *this.bytes = Bytes::from(chunk);
+                    }
+                    Message::Close(_) => return EOF,
+                },
+                None => return EOF,
+            }
+        }
+    }
+
+    /// Marks `amount` bytes of the buffered message as read.
+    fn consume(self: Pin<&mut Self>, amount: usize) {
+        self.project().bytes.advance(amount);
+    }
+}
+
+/// Waits for the websocket upgrade to complete, then runs `handle_client` over
+/// the connection (wrapped in [`WebSocketRw`]) in `ClientMode::Websockets` mode.
+///
+/// `hostname` is passed along inside the `ClientMode::Websockets` value.
+/// An error from either the upgrade or `handle_client` is returned to the caller.
+pub async fn serve_websocket(
+    websocket: HyperWebsocket,
+    config: &'static ProxyConfig,
+    cancel_map: &CancelMap,
+    session_id: uuid::Uuid,
+    hostname: Option<String>,
+) -> anyhow::Result<()> {
+    let websocket = websocket.await?;
+    handle_client(
+        config,
+        cancel_map,
+        session_id,
+        WebSocketRw::new(websocket),
+        ClientMode::Websockets { hostname },
+    )
+    .await?;
+    Ok(())
+}
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,7 @@ backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
-Werkzeug = "^2.2.3"
+Werkzeug = "^3.0.1"
 pytest-order = "^1.1.0"
 allure-pytest = "^2.13.2"
 pytest-asyncio = "^0.21.0"
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -15,28 +15,15 @@ The script fetches the durations of benchmarks from the database and stores it i

 BENCHMARKS_DURATION_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, test,
-        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration_ms) as percentile_ms
-    FROM
-        (
-            SELECT
-                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp,
-                (jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'duration')::int as duration_ms
-            FROM
-                regress_test_results
-            WHERE
-                reference = 'refs/heads/main'
-        ) data
+        DISTINCT parent_suite, suite, name,
+        PERCENTILE_DISC(%s) WITHIN GROUP (ORDER BY duration) as percentile_ms
+    FROM results
    WHERE
-        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND parent_suite = 'test_runner.performance'
        AND status = 'passed'
    GROUP BY
-        parent_suite, suite, test
+        parent_suite, suite, name
    ;
 """

@@ -44,68 +31,69 @@ BENCHMARKS_DURATION_QUERY = """
 # the total duration varies from 8 to 40 minutes.
 # We use some pre-collected durations as a fallback to have a better distribution.
 FALLBACK_DURATION = {
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 57.0,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 28.0,
-    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 71.0,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 27.0,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 11.0,
-    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 30.0,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 40.0,
-    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 5.0,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.0,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 10.0,
-    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 19.0,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.0,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 30.0,
-    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 60.0,
-    "test_runner/performance/test_compaction.py::test_compaction": 77.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.0,
-    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.0,
-    "test_runner/performance/test_copy.py::test_copy[neon]": 12.0,
-    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.0,
-    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 284.0,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 11.0,
-    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 7.0,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 85.0,
-    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 29.0,
-    "test_runner/performance/test_layer_map.py::test_layer_map": 44.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 16.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 67.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 67.0,
-    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 80.0,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 102.0,
-    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.0,
-    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 9.0,
-    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 4.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 80.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 68.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 11.0,
-    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 10.0,
-    "test_runner/performance/test_startup.py::test_startup_simple": 2.0,
-    "test_runner/performance/test_startup.py::test_startup": 539.0,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 375.0,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 370.0,
-    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 94.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 164.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 274.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 949.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 142.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 151.0,
-    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 182.0,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 13.0,
-    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.0,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
+    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
+    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
+    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
+    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
+    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
+    "test_runner/performance/test_compaction.py::test_compaction": 110.222,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
+    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
+    "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
+    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
+    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
+    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
+    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
+    "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
+    "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
+    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
+    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
+    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
+    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
+    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
+    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
+    "test_runner/performance/test_startup.py::test_startup": 890.114,
+    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
+    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
+    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
+    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
 }


@@ -130,7 +118,7 @@ def main(args: argparse.Namespace):
        res = FALLBACK_DURATION

    for row in rows:
-        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}"
+        pytest_name = f"{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['name']}"
        duration = row["percentile_ms"] / 1000
        logging.info(f"\t{pytest_name}: {duration}")
        res[pytest_name] = duration
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -9,28 +9,15 @@ from typing import DefaultDict, Dict
 import psycopg2
 import psycopg2.extras

-# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
 FLAKY_TESTS_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
-    FROM
-        (
-            SELECT
-                reference,
-                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
-            FROM
-                regress_test_results
-        ) data
+        DISTINCT parent_suite, suite, name
+    FROM results
    WHERE
-        timestamp > CURRENT_DATE - INTERVAL '%s' day
+        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND (
            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
-            OR retries_status_change::boolean
+            OR flaky
        )
    ;
 """
@@ -63,12 +50,14 @@ def main(args: argparse.Namespace):
        if row["parent_suite"] != "test_runner.regress":
            continue

-        deparametrized_test = row["deparametrized_test"]
-        dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
-        parametrized_test = deparametrized_test.replace(
-            "[",
-            f"[{build_type}-pg{pg_version}{dash_if_needed}",
-        )
+        if row["name"].endswith("]"):
+            parametrized_test = row["name"].replace(
+                "[",
+                f"[{build_type}-pg{pg_version}-",
+            )
+        else:
+            parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}]"
+
        res[row["parent_suite"]][row["suite"]][parametrized_test] = True

        logging.info(
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -4,7 +4,7 @@ use std::task::{Context, Poll};
 use std::time::Duration;
 use tonic::codegen::StdError;
 use tonic::transport::{ClientTlsConfig, Endpoint};
-use tonic::{transport::Channel, Code, Status};
+use tonic::{transport::Channel, Status};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};

 use proto::{
@@ -23,6 +23,7 @@ pub mod proto {
 pub mod metrics;

 // Re-exports to avoid direct tonic dependency in user crates.
+pub use tonic::Code;
 pub use tonic::Request;
 pub use tonic::Streaming;

--- a/test_runner/duplicate_tenant.py
+++ b/test_runner/duplicate_tenant.py
@@ -0,0 +1,43 @@
+# Usage from top of repo (args: <source tenant id> <number of copies> <number of threads>):
+#  poetry run python3 test_runner/duplicate_tenant.py b97965931096047b2d54958756baee7b 10 4
+from queue import Queue
+import sys
+import threading
+
+import requests
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.types import TenantId
+
+initial_tenant = sys.argv[1]
+ncopies = int(sys.argv[2])
+numthreads = int(sys.argv[3])
+
+
+# class DuckTypedNeonEnv:
+#     pass
+
+
+# cli = NeonCli(DuckTypedNeonEnv())
+
+q = Queue()
+for i in range(0, ncopies):
+    q.put(i)
+
+for i in range(0, numthreads):
+    q.put(None)
+
+
+def create():  # worker loop: issues one tenant-duplicate request per queue item
+    while True:
+        if q.get() is None:  # None is the stop sentinel; one is enqueued per thread
+            break
+        new_tenant = TenantId.generate()
+        res = requests.post(
+            f"http://localhost:9898/v1/tenant/{initial_tenant}/duplicate",
+            json={"new_tenant_id": str(new_tenant)},
+        )
+        res.raise_for_status()
+
+
+for i in range(0, numthreads):
+    threading.Thread(target=create).start()
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1719,6 +1719,11 @@ class NeonPageserver(PgProtocol):
                break

            if error_or_warn.search(line):
+                # Is this a torn log line?  This happens when force-killing a process and restarting
+                # Example: "2023-10-25T09:38:31.752314Z  WARN deletion executo2023-10-25T09:38:31.875947Z  INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
+                if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
+                    continue
+
                # It's an ERROR or WARN. Is it in the allow-list?
                for a in self.allowed_errors:
                    if re.match(a, line):
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -215,6 +215,25 @@ class PageserverHttpClient(requests.Session):
        assert isinstance(new_tenant_id, str)
        return TenantId(new_tenant_id)

+    def tenant_duplicate(
+        self, src_tenant_id: TenantId, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
+    ) -> TenantId:
+        if conf is not None:
+            assert "new_tenant_id" not in conf.keys()  # conf is merged into the request body below
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{src_tenant_id}/duplicate",
+            json={
+                "new_tenant_id": str(new_tenant_id),
+                **(conf or {}),
+            },
+        )
+        self.verbose_error(res)  # NOTE(review): if this raises on error statuses, the 409 check below never runs - confirm
+        if res.status_code == 409:
+            raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
+        new_tenant_id = res.json()  # the response body is asserted to be a JSON string on the next line
+        assert isinstance(new_tenant_id, str)
+        return TenantId(new_tenant_id)
+
    def tenant_attach(
        self,
        tenant_id: TenantId,
@@ -441,13 +460,13 @@ class PageserverHttpClient(requests.Session):
        assert res_json is None

    def timeline_get_lsn_by_timestamp(
-        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp
+        self, tenant_id: TenantId, timeline_id: TimelineId, timestamp, version: int
    ):
        log.info(
            f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
        )
        res = self.get(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}&version={version}",
        )
        self.verbose_error(res)
        res_json = res.json()
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -4,8 +4,10 @@ First make a release build. The `-s` flag silences a lot of output, and makes it
 easier to see if you have compile errors without scrolling up.
 `BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing" make -s -j8`

+You may also need to run `./scripts/pysync`.
+
 Then run the tests
-`NEON_BIN=./target/release poetry run pytest test_runner/performance"`
+`DEFAULT_PG_VERSION=15 NEON_BIN=./target/release poetry run pytest test_runner/performance`

 Some handy pytest flags for local development:
 - `-x` tells pytest to stop on first error
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -8,6 +8,71 @@ from fixtures.types import Lsn
 from fixtures.utils import query_scalar


+#
+# Test pageserver get_lsn_by_timestamp API with version=1 (results are plain strings such as "future"/"past")
+#
+def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+
+    new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
+    endpoint_main = env.endpoints.create_start("test_lsn_mapping")
+    log.info("postgres is running on 'test_lsn_mapping' branch")
+
+    cur = endpoint_main.connect().cursor()
+    # Create table, and insert rows, each in a separate transaction
+    # Disable synchronous_commit to make this initialization go faster.
+    #
+    # Each row contains current insert LSN and the current timestamp, when
+    # the row was inserted.
+    cur.execute("SET synchronous_commit=off")
+    cur.execute("CREATE TABLE foo (x integer)")
+    tbl = []
+    for i in range(1000):
+        cur.execute("INSERT INTO foo VALUES(%s)", (i,))
+        # Get the timestamp at UTC
+        after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
+        tbl.append([i, after_timestamp])
+
+    # Execute one more transaction with synchronous_commit enabled, to flush
+    # all the previous transactions
+    cur.execute("SET synchronous_commit=on")
+    cur.execute("INSERT INTO foo VALUES (-1)")
+
+    # Wait until WAL is received by pageserver
+    wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+
+    with env.pageserver.http_client() as client:
+        # Check edge cases: timestamp in the future
+        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
+        result = client.timeline_get_lsn_by_timestamp(
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
+        )
+        assert result == "future"
+
+        # timestamp too far in the past
+        probe_timestamp = tbl[0][1] - timedelta(hours=10)
+        result = client.timeline_get_lsn_by_timestamp(
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
+        )
+        assert result == "past"
+
+        # Probe a bunch of timestamps in the valid range
+        for i in range(1, len(tbl), 100):
+            probe_timestamp = tbl[i][1]
+            lsn = client.timeline_get_lsn_by_timestamp(
+                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 1
+            )
+            # Call get_lsn_by_timestamp to get the LSN
+            # Launch a new read-only node at that LSN, and check that only the rows
+            # that were supposed to be committed at that point in time are visible.
+            endpoint_here = env.endpoints.create_start(
+                branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
+            )
+            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
+
+            endpoint_here.stop_and_destroy()
+
+
 #
 # Test pageserver get_lsn_by_timestamp API
 #
@@ -45,23 +110,24 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
        # Check edge cases: timestamp in the future
        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
-        assert result == "future"
+        assert result["kind"] == "future"

        # timestamp too the far history
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
-        assert result == "past"
+        assert result["kind"] == "past"

        # Probe a bunch of timestamps in the valid range
        for i in range(1, len(tbl), 100):
            probe_timestamp = tbl[i][1]
-            lsn = client.timeline_get_lsn_by_timestamp(
-                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z"
+            result = client.timeline_get_lsn_by_timestamp(
+                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
            )
+            lsn = result["lsn"]
            # Call get_lsn_by_timestamp to get the LSN
            # Launch a new read-only node at that LSN, and check that only the rows
            # that were supposed to be committed at that point in time are visible.
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -1,5 +1,6 @@
 import json
 import subprocess
+import time
 from typing import Any, List, Optional, Tuple

 import psycopg2
@@ -364,10 +365,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):

    pid1 = get_pid(200, "http")["rows"][0]["pid"]

+    time.sleep(0.02)
+
    # query should be on the same connection
    rows = get_pid(200, "http")["rows"]
    assert rows == [{"pid": pid1}]

+    time.sleep(0.02)
+
    # incorrect password should not work
    res = get_pid(400, "foobar")
    assert "password authentication failed for user" in res["message"]
@@ -378,10 +383,14 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
    assert pid1 != pid2

+    time.sleep(0.02)
+
    # query should be on an existing connection
    pid = get_pid(200, "http2")["rows"][0]["pid"]
    assert pid in [pid1, pid2]

+    time.sleep(0.02)
+
    # old password should not work
    res = get_pid(400, "http")
    assert "password authentication failed for user" in res["message"]
@@ -419,6 +428,7 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
        )

    pid1 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
+    time.sleep(0.02)
    query(200, "BEGIN")
    pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
    assert pid1 != pid2
--- a/test_runner/regress/test_tenant_duplicate.py
+++ b/test_runner/regress/test_tenant_duplicate.py
@@ -0,0 +1,62 @@
+import time
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    last_flush_lsn_upload,
+)
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+)
+from fixtures.types import TenantId
+from fixtures.log_helper import log
+
+
+def test_tenant_duplicate(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    Duplicate the initial tenant via the pageserver HTTP API, delete the
+    original, and check that the rows written before the duplication are
+    readable from the copy. Finally, restart the pageserver.
+    """
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    env = neon_env_builder.init_start()
+
+    # write some rows to the original tenant and remember the LSN to read at later
+    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep_main:
+        ep_main.safe_psql("CREATE TABLE foo (i int);")
+        ep_main.safe_psql("INSERT INTO foo VALUES (1), (2), (3);")
+        last_flush_lsn = last_flush_lsn_upload(
+            env, ep_main, env.initial_tenant, env.initial_timeline
+        )
+
+    new_tenant_id = TenantId.generate()
+    # timeline id remains unchanged with tenant_duplicate
+    # TODO: implement a remapping scheme so timeline ids remain globally unique
+    new_timeline_id = env.initial_timeline
+
+    log.info(f"Duplicate tenant/timeline will be: {new_tenant_id}/{new_timeline_id}")
+
+    ps_http = env.pageserver.http_client()
+
+    ps_http.tenant_duplicate(env.initial_tenant, new_tenant_id)
+
+    # delete the original tenant before reading from the duplicate
+    ps_http.tenant_delete(env.initial_tenant)
+
+    env.neon_cli.map_branch("duplicate", new_tenant_id, new_timeline_id)
+
+    # start read-only replica and validate
+    with env.endpoints.create_start(
+        "duplicate", tenant_id=new_tenant_id, lsn=last_flush_lsn
+    ) as ep_dup:
+        with ep_dup.connect() as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT * FROM foo ORDER BY i;")
+                # without `assert`, the comparison result would be silently discarded
+                assert cur.fetchall() == [(1,), (2,), (3,)]
+
+    # ensure restarting PS works
+    env.pageserver.stop()
+    env.pageserver.start()
+
Author	SHA1	Message	Date
Christian Schwarz	2d37857351	pq bench: avoid repeated conversion to_i128	2023-11-02 17:43:59 +00:00
Christian Schwarz	ddfce0cfa5	per-second RPS	2023-11-02 17:11:37 +00:00
Christian Schwarz	d52a622115	pq bench: proper shutdown	2023-11-02 17:07:00 +00:00
Christian Schwarz	a066eecda0	http bench: slightly improved stats	2023-11-02 17:06:36 +00:00
Christian Schwarz	94e94af6c7	WIP: libpq-based client depends on https://github.com/neondatabase/rust-postgres/pull/25	2023-11-02 16:28:16 +01:00
Christian Schwarz	df7346eaff	Revert "CP tokio_epoll_uring for read path" This reverts commit `82d9c68667`.	2023-11-02 11:32:48 +01:00
Christian Schwarz	76efb1b79b	Revert "CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking" This reverts commit `55cdf6c7ff`.	2023-11-02 11:32:41 +01:00
Christian Schwarz	2f656c6691	rename getpage_bench to getpage_bench_http	2023-11-02 10:59:54 +01:00
Christian Schwarz	bb5b5cbdac	WIP: benchmark that does random getpage requests against the keyspace backup of pageserver.toml d =1 pg_distrib_dir ='/home/admin/neon-main/pg_install' http_auth_type ='Trust' pg_auth_type ='Trust' listen_http_addr ='127.0.0.1:9898' listen_pg_addr ='127.0.0.1:64000' broker_endpoint ='http://127.0.0.1:50051/' #control_plane_api ='http://127.0.0.1:1234/' # Initial configuration file created by 'pageserver --init' #listen_pg_addr = '127.0.0.1:64000' #listen_http_addr = '127.0.0.1:9898' #wait_lsn_timeout = '60 s' #wal_redo_timeout = '60 s' #max_file_descriptors = 10000 #page_cache_size = 160000 # initial superuser role name to use when creating a new tenant #initial_superuser_name = 'cloud_admin' #broker_endpoint = 'http://127.0.0.1:50051' #log_format = 'plain' #concurrent_tenant_size_logical_size_queries = '1' #metric_collection_interval = '10 min' #cached_metric_collection_interval = '0s' #synthetic_size_calculation_interval = '10 min' #disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"} #background_task_maximum_delay = '10s' [tenant_config] #checkpoint_distance = 268435456 # in bytes #checkpoint_timeout = 10 m #compaction_target_size = 134217728 # in bytes #compaction_period = '20 s' #compaction_threshold = 10 #gc_period = '1 hr' #gc_horizon = 67108864 #image_creation_threshold = 3 #pitr_interval = '7 days' #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '24 hour' #gc_feedback = false # make it determinsitic gc_period = '0s' checkpoint_timeout = '3650 day' compaction_period = '20 s' compaction_threshold = 10 compaction_target_size = 134217728 checkpoint_distance = 268435456 image_creation_threshold = 3 [remote_storage] local_path = '/home/admin/neon-main/bench_repo_dir/repo/remote_storage_local_fs'	2023-10-26 17:53:03 +00:00
Christian Schwarz	55cdf6c7ff	CP: use hacked-together open_at for async VirtualFile open calls instead of spawn_blocking This makes Delta/Image ::load fns fully tokio-epoll-uring	2023-10-26 17:40:33 +00:00
Christian Schwarz	82d9c68667	CP tokio_epoll_uring for read path	2023-10-26 17:22:23 +00:00
Christian Schwarz	bc91c40f56	Revert "revert recent VirtualFile asyncification changes (#5291 )" This reverts commit `ab1f37e908`.	2023-10-26 17:22:10 +00:00
Christian Schwarz	c5f58ef3f7	API to duplicate a tenant	2023-10-26 16:30:11 +00:00
Christian Schwarz	bb8531d920	Revert "WIP cleanup unused RemoteStorage fields + half-baked copy_file impl" This reverts commit `7553bbe3f5`.	2023-10-26 15:44:26 +00:00
Christian Schwarz	7553bbe3f5	WIP cleanup unused RemoteStorage fields + half-baked copy_file impl	2023-10-26 15:44:03 +00:00
dependabot[bot]	378daa358b	build(deps): bump werkzeug from 2.2.3 to 3.0.1 (#5665 )	2023-10-25 22:50:35 +00:00
Alexander Bayandin	85f4514e7d	Get env var for real Azure tests from GitHub (#5662 ) ## Problem We'll need to switch `REMOTE_STORAGE_AZURE_REGION` from the current `eastus2` region to something `eu-central-1`-like. This may require changing `AZURE_STORAGE_ACCESS_KEY`. To make it possible to switch from one place (not to break a lot of builds on CI), move `REMOTE_STORAGE_AZURE_CONTAINER` and `REMOTE_STORAGE_AZURE_REGION` to GitHub Variables. See https://github.com/neondatabase/neon/settings/variables/actions ## Summary of changes - Get values for `REMOTE_STORAGE_AZURE_CONTAINER` & `REMOTE_STORAGE_AZURE_REGION` from GitHub Variables	2023-10-25 22:54:23 +01:00
Joonas Koivunen	f70019797c	refactor(rtc): schedule compaction update (#5649 ) a single operation instead of N uploads and 1 deletion scheduling with write(layer_map) lock releasing in the between. Compaction update will make for a much better place to change how the operation will change in future compared to more general file based operations. builds upon #5645. solves the problem of difficult to see hopeful correctness w.r.t. other `index_part.json` changing operations. Co-authored-by: Shany Pozin <shany@neon.tech>	2023-10-25 22:25:43 +01:00
Joonas Koivunen	325258413a	fix: trampling on global physical size metric (#5663 ) All loading (attached, or from disk) timelines overwrite the global gauge for physical size. The `_set` method cannot be used safely, so remove it and just "add" the physical size.	2023-10-25 19:29:12 +01:00
Konstantin Knizhnik	4ddbc0e46d	Ignore missed AUX_FILES_KEY when generating image layer (#5660 ) ## Problem Logical replication requires new AUX_FILES_KEY which is definitely absent in existed database. We do not have function to check if key exists in our KV storage. So I have to handle the error in `list_aux_files` method. But this key is also included in key space range and accessed y `create_image_layer` method. ## Summary of changes Check if AUX_FILES_KEY exists before including it in keyspace. --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech> Co-authored-by: Shany Pozin <shany@neon.tech> Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>	2023-10-25 18:35:23 +01:00
Arpad Müller	a673e4e7a9	Optionally return json from get_lsn_by_timestamp (#5608 ) This does two things: first a minor refactor to not use HTTP/1.x style header names and also to not panic if some certain requests had no "Accept" header. As a second thing, it addresses the third bullet point from #3689: > Change `get_lsn_by_timestamp` API method to return LSN even if we only found commit before the specified timestamp. This is done by adding a version parameter to the `get_lsn_by_timestamp` API call and making its behaviour depend on the version number. Part of #3414 (but doesn't address it in its entirety). --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-10-25 18:46:34 +02:00
bojanserafimov	c155cc0c3f	Fix test instructions readme (#5644 )	2023-10-25 11:53:04 -04:00
Conrad Ludgate	32126d705b	proxy refactor serverless (#4685 ) ## Problem Our serverless backend was a bit jumbled. As a comment indicated, we were handling SQL-over-HTTP in our `websocket.rs` file. I've extracted out the `sql_over_http` and `websocket` files from the `http` module and put them into a new module called `serverless`. ## Summary of changes ```sh mkdir proxy/src/serverless mv proxy/src/http/{conn_pool,sql_over_http,websocket}.rs proxy/src/serverless/ mv proxy/src/http/server.rs proxy/src/http/health_server.rs mv proxy/src/metrics proxy/src/usage_metrics.rs ``` I have also extracted the hyper server and handler from websocket.rs into `serverless.rs`	2023-10-25 15:43:03 +01:00
John Spray	5683ae9eab	pageserver: suppress some of the most common spurious warnings (#5658 ) Two of the most common spurious log messages: - broker connections terminate & we log at error severity. Unfortunately tonic gives us an "Unknown" error so to suppress these we're doing string matching. It's hacky but worthwhile for operations. - the first iteration of tenant background tasks tends to over-run its schedule and emit a warning. Ultimately we should fix these to run on time, but for now we are not benefiting from polluting our logs with the warnings.	2023-10-25 14:55:37 +01:00
Alexander Bayandin	4778b6a12e	Switch to querying new tests results DB (#5616 ) ## Problem We started to store test results in a new format in https://github.com/neondatabase/neon/pull/4549. This PR switches scripts to query this db. (we can completely remove old DB/ingestions scripts in a couple of weeks after the PR merged) ## Summary of changes - `scripts/benchmark_durations.py` query new database - `scripts/flaky_tests.py` query new database	2023-10-25 14:25:13 +01:00
John Spray	8b8be7bed4	tests: don't fail tests on torn log lines (#5655 ) ## Problem Tests that force-kill and restart a service can generate torn log lines that might match WARN\|ERROR, but not match the allow expression that a test has loaded, e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5651/6638398772/index.html#suites/7538959189f4501983ddd9e167836c8b/d272ba8a73e6945c ## Summary of changes Ignore log lines which match a regex for torn log lines on restart: they have two timestamps and the second line is an "INFO version"... message.	2023-10-25 13:29:30 +01:00
Conrad Ludgate	a461c459d8	fix http pool test (#5653 ) ## Problem We defer the returning of connections to the connection pool. It's possible for our test to be faster than the returning of connections - which then gets a differing process ID because it opens a new connection. ## Summary of changes 1. Delay the tests just a little (20ms) to give more chance for connections to return. 2. Correlate connection IDs with the connection logs a bit more	2023-10-25 13:20:45 +01:00
Joonas Koivunen	4ae2d1390d	refactor(remote_timeline_client): Split deletion into unlinking + deletion (#5645 ) Quest: #4745. Prerequisite for #4938. Original https://github.com/neondatabase/neon/pull/4938#issuecomment-1777150665. The new Layer implementation has so far been using `RemoteTimelineClient::schedule_layer_file_deletion` from `Layer::drop` but it was noticed that this could mean that the L0s compaction wanted to remove could linger in the index part for longer time or be left there for longer time. Solution is to split the `RemoteTimelineClient::schedule_layer_file_deletion` into two parts: - unlinking from index_part.json, to be called from end of compaction and gc - scheduling of actual deletions, to be called from `Layer::drop` The added methods are added unused.	2023-10-25 15:01:19 +03:00
Joonas Koivunen	c5949e1fd6	misc smaller improvements (#5527 ) - finally add an `#[instrument]` to Timeline::create_image_layers, making it easier to see that something is happening because we create image layers - format some macro context code - add a warning not to create new validation functions a la parse do not validate Split off from #5198.	2023-10-25 14:59:43 +03:00