build(deps-dev): bump zipp from 3.8.1 to 3.19.1

Bumps [zipp](https://github.com/jaraco/zipp) from 3.8.1 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.8.1...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com>
refactor: postgres_backend: replace abstract shutdown_watcher with CancellationToken (#8295 )
2026-05-22 15:41:15 +00:00 · 2024-07-09 18:12:57 +00:00 · 2024-07-09 21:11:11 +03:00 · 2024-07-09 12:42:09 -05:00 · 2024-07-09 12:42:09 -05:00 · 2024-07-09 18:41:37 +01:00
47 changed files with 2040 additions and 1176 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -115,6 +115,7 @@ runs:
        export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
        export DEFAULT_PG_VERSION=${PG_VERSION#v}
        export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
+        export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}

        if [ "${BUILD_TYPE}" = "remote" ]; then
          export REMOTE_ENV=1
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -99,7 +99,14 @@ jobs:
        # Set --sparse-ordering option of pytest-order plugin
        # to ensure tests are running in order of appears in the file.
        # It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
-        extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
+        extra_params:
+          -m remote_cluster
+          --sparse-ordering
+          --timeout 5400
+          --ignore test_runner/performance/test_perf_olap.py
+          --ignore test_runner/performance/test_perf_pgvector_queries.py
+          --ignore test_runner/performance/test_logical_replication.py
+          --ignore test_runner/performance/test_physical_replication.py
      env:
        BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -125,6 +132,69 @@ jobs:
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

+  replication-tests:
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 14
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
+      options: --init
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+
+    - name: Run benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_logical_replication.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 5400
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Run benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_physical_replication.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 5400
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
  generate-matrices:
    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
    # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -1336,6 +1336,7 @@ jobs:
        env:
          BUCKET: neon-github-public-dev
          PREFIX: artifacts/latest
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        run: |
          # Update compatibility snapshot for the release
          for pg_version in v14 v15 v16; do
@@ -1349,7 +1350,7 @@ jobs:

          # Update Neon artifact for the release (reuse already uploaded artifact)
          for build_type in debug release; do
-            OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
+            OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID}
            FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst

            S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1397,9 +1397,9 @@ dependencies = [

 [[package]]
 name = "crc32c"
-version = "0.6.5"
+version = "0.6.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
+checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
 dependencies = [
 "rustc_version",
 ]
@@ -1651,6 +1651,16 @@ dependencies = [
 "rusticata-macros",
 ]

+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+ "powerfmt",
+ "serde",
+]
+
 [[package]]
 name = "desim"
 version = "0.1.0"
@@ -3008,9 +3018,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
+checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0"
 dependencies = [
 "bytes",
 "crossbeam-utils",
@@ -3026,9 +3036,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
+checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
@@ -3038,9 +3048,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.21"
+version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
+checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec"
 dependencies = [
 "libc",
 "measured",
@@ -3275,6 +3285,12 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -3667,6 +3683,7 @@ dependencies = [
 "sysinfo",
 "tenant_size_model",
 "thiserror",
+ "tikv-jemallocator",
 "tokio",
 "tokio-epoll-uring",
 "tokio-io-timeout",
@@ -4077,6 +4094,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls 0.25.0",
+ "tokio-util",
 "tracing",
 "workspace_hack",
 ]
@@ -4117,6 +4135,12 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -5396,9 +5420,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c"
+checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
 dependencies = [
 "serde_derive",
 ]
@@ -5415,9 +5439,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.183"
+version = "1.0.203"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816"
+checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
 dependencies = [
 "proc-macro2",
 "quote",
@@ -6107,12 +6131,15 @@ dependencies = [

 [[package]]
 name = "time"
-version = "0.3.21"
+version = "0.3.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
+checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
 dependencies = [
+ "deranged",
 "itoa",
 "js-sys",
+ "num-conv",
+ "powerfmt",
 "serde",
 "time-core",
 "time-macros",
@@ -6120,16 +6147,17 @@ dependencies = [

 [[package]]
 name = "time-core"
-version = "0.1.1"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"

 [[package]]
 name = "time-macros"
-version = "0.2.9"
+version = "0.2.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b"
+checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
 dependencies = [
+ "num-conv",
 "time-core",
 ]

@@ -7427,13 +7455,12 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
+ "deranged",
 "either",
 "fail",
 "futures-channel",
- "futures-core",
 "futures-executor",
 "futures-io",
- "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
 "hashbrown 0.14.5",
@@ -7451,7 +7478,9 @@ dependencies = [
 "num-traits",
 "once_cell",
 "parquet",
+ "proc-macro2",
 "prost",
+ "quote",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
@@ -7468,6 +7497,7 @@ dependencies = [
 "syn 1.0.109",
 "syn 2.0.52",
 "sync_wrapper",
+ "tikv-jemalloc-sys",
 "time",
 "time-macros",
 "tokio",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -111,8 +111,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.21", features=["lasso"] }
-measured-process = { version = "0.0.21" }
+measured = { version = "0.0.22", features=["lasso"] }
+measured-process = { version = "0.0.22" }
 memoffset = "0.8"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
 notify = "6.0.0"
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -798,7 +798,11 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin` name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let connstr = self.connstr.clone();
+        let mut connstr = self.connstr.clone();
+        connstr
+            .query_pairs_mut()
+            .append_pair("application_name", "apply_config");
+
        let mut client = match Client::connect(connstr.as_str(), NoTls) {
            Err(e) => match e.code() {
                Some(&SqlState::INVALID_PASSWORD)
@@ -867,6 +871,11 @@ impl ComputeNode {

        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
+            let mut connstr = connstr.clone();
+            connstr
+                .query_pairs_mut()
+                .append_pair("application_name", "migrations");
+
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
            handle_migrations(&mut client).context("apply_config handle_migrations")
        });
@@ -1386,7 +1395,9 @@ pub fn forward_termination_signal() {
    let pg_pid = PG_PID.load(Ordering::SeqCst);
    if pg_pid != 0 {
        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
-        kill(pg_pid, Signal::SIGQUIT).ok();
+        // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
+        // ROs to get a list of running xacts faster instead of going through the CLOG.
+        // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
+        kill(pg_pid, Signal::SIGINT).ok();
    }
 }
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -11,6 +11,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod extension_server;
+mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
--- a/compute_tools/src/migration.rs
+++ b/compute_tools/src/migration.rs
@@ -0,0 +1,100 @@
+use anyhow::{Context, Result};
+use postgres::Client;
+use tracing::info;
+
+pub(crate) struct MigrationRunner<'m> {
+    client: &'m mut Client,
+    migrations: &'m [&'m str],
+}
+
+impl<'m> MigrationRunner<'m> {
+    pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
+        Self { client, migrations }
+    }
+
+    fn get_migration_id(&mut self) -> Result<i64> {
+        let query = "SELECT id FROM neon_migration.migration_id";
+        let row = self
+            .client
+            .query_one(query, &[])
+            .context("run_migrations get migration_id")?;
+
+        Ok(row.get::<&str, i64>("id"))
+    }
+
+    fn update_migration_id(&mut self) -> Result<()> {
+        let setval = format!(
+            "UPDATE neon_migration.migration_id SET id={}",
+            self.migrations.len()
+        );
+
+        self.client
+            .simple_query(&setval)
+            .context("run_migrations update id")?;
+
+        Ok(())
+    }
+
+    fn prepare_migrations(&mut self) -> Result<()> {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        self.client.simple_query(query)?;
+
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        self.client.simple_query(query)?;
+
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        self.client.simple_query(query)?;
+
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        self.client.simple_query(query)?;
+
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        self.client.simple_query(query)?;
+
+        Ok(())
+    }
+
+    pub fn run_migrations(mut self) -> Result<()> {
+        self.prepare_migrations()?;
+
+        let mut current_migration: usize = self.get_migration_id()? as usize;
+        let starting_migration_id = current_migration;
+
+        let query = "BEGIN";
+        self.client
+            .simple_query(query)
+            .context("run_migrations begin")?;
+
+        while current_migration < self.migrations.len() {
+            let migration = self.migrations[current_migration];
+
+            if migration.starts_with("-- SKIP") {
+                info!("Skipping migration id={}", current_migration);
+            } else {
+                info!(
+                    "Running migration id={}:\n{}\n",
+                    current_migration, migration
+                );
+                self.client.simple_query(migration).with_context(|| {
+                    format!("run_migration current_migration={}", current_migration)
+                })?;
+            }
+
+            current_migration += 1;
+        }
+
+        self.update_migration_id()?;
+
+        let query = "COMMIT";
+        self.client
+            .simple_query(query)
+            .context("run_migrations commit")?;
+
+        info!(
+            "Ran {} migrations",
+            (self.migrations.len() - starting_migration_id)
+        );
+
+        Ok(())
+    }
+}
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
 use crate::logger::inlinify;
+use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -791,69 +792,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
        include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
    ];

-    let mut func = || {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        client.simple_query(query)?;
-
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        client.simple_query(query)?;
-
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        client.simple_query(query)?;
-
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        client.simple_query(query)?;
-
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        client.simple_query(query)?;
-        Ok::<_, anyhow::Error>(())
-    };
-    func().context("handle_migrations prepare")?;
-
-    let query = "SELECT id FROM neon_migration.migration_id";
-    let row = client
-        .query_one(query, &[])
-        .context("handle_migrations get migration_id")?;
-    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
-    let starting_migration_id = current_migration;
-
-    let query = "BEGIN";
-    client
-        .simple_query(query)
-        .context("handle_migrations begin")?;
-
-    while current_migration < migrations.len() {
-        let migration = &migrations[current_migration];
-        if migration.starts_with("-- SKIP") {
-            info!("Skipping migration id={}", current_migration);
-        } else {
-            info!(
-                "Running migration id={}:\n{}\n",
-                current_migration, migration
-            );
-            client.simple_query(migration).with_context(|| {
-                format!("handle_migrations current_migration={}", current_migration)
-            })?;
-        }
-        current_migration += 1;
-    }
-    let setval = format!(
-        "UPDATE neon_migration.migration_id SET id={}",
-        migrations.len()
-    );
-    client
-        .simple_query(&setval)
-        .context("handle_migrations update id")?;
-
-    let query = "COMMIT";
-    client
-        .simple_query(query)
-        .context("handle_migrations commit")?;
-
-    info!(
-        "Ran {} migrations",
-        (migrations.len() - starting_migration_id)
-    );
+    MigrationRunner::new(client, &migrations).run_migrations()?;

    Ok(())
 }
--- a/docs/rfcs/034-direct-io-for-pageserver.md
+++ b/docs/rfcs/034-direct-io-for-pageserver.md
@@ -1,340 +0,0 @@
-# Direct IO For Pageserver
-
-## Summary
-
-This document is a proposal and implementation plan for direct IO in Pageserver.
-
-## Terminology / Glossary
-
-**kernel page cache**: the kernel's page cache is a write-back cache for filesystem contents.
-The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k).
-The cache lives in kernel memory and is not directly accessible through userspace.
-
-**Buffered IO**: the application's read/write system calls go through the kernel page cache.
-For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents
-at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict
-other pages to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes
-from/to the offset `4` (`5000 = 4096 + 4`) within the cached page. If it's a write, the kernel keeps
-track of the fact that the page is now "dirty" in some ancillary structure.
-
-**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The moficiations
-made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel
-asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant
-ones are a) explicit request by userspace (`fsync`) and b) memory pressure.
-
-**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity.
-If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations.
-Before reusing a page like that, the page has to be written back (writeback, see above).
-The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only
-way to get that memory is by eviction & re-using a dirty page cache page.
-Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`.
-I refer to this effect as the "malloc latency backscatter" caused by buffered IO.
-
-**Direct IO** allows application's read/write system calls to bypass the kernel page cache. The filesystem
-is still involved because it is ultimately in charge of mapping the concept of files & offsets within them
-to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers
-and file offsets (statx `Dio_mem_align` / `Dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155). The IO operations will fail at runtime if the alignment requirements
-are not met.
-
-**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and
-fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers,
-kernel page cache, and memory management subsystems (cf "writeback"). In direct IO, all of it is done by
-the application.
-It takes more effort by the application to program with direct instead of buffered IO.
-The return is precise control over and a clear distinction between consumption/modification of memory vs disk.
-
-**Pageserver PageCache**: Pageserver has an additioanl `PageCache` (referred to as PageCache from here on, as opposed to "kernel page cache").
-Its caching unit is 8KiB which is the Postgres page size.
-Currently, it is tiny (128MiB), very much like Postgres's `shared_buffers`.
-A miss in PageCache is filled from the filesystem using buffered IO, issued through the `VirtualFile` layer in Pageserver.
-
-**VirtualFile** is Pageserver's abstraction for file IO, very similar to the faciltiy in Postgres that bears the same name.
-Its historical purpose appears to be working around open file descriptor limitations, which is practically irrelevant on Linux.
-However, the faciltiy in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of
-IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`).
-
-## History Of Caching In Pageserver
-
-For multiple years, Pageserver's `PageCache` was used for all data path read _and write_ IO.
-It performed write-back to the kernel using buffered IO.
-
-We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994).
-
-The introduction of `tokio-epoll-uring` required converting the code base to used owned IO buffers.
-The `PageCache` pages are usable as owned IO buffers.
-
-We then introduced vectored `Timeline::get` (cf RFC 30).
-The implementation bypasses PS `PageCache` for delta and image layer data block reads.
-(The disk btree embedded in delta & image layers is still `PageCache`'d).
-
-Most recently, and still ongoing, is [Epic: Bypass PageCache for user data blocks #7386](https://github.com/neondatabase/neon/issues/7386).
-The goal there is to eliminate the remaining caching of user data blocks in PS `PageCache`.
-
-The outcome of the above will be that
-1. all data blocks are read through VirtualFile and
-2. all indirect blocks (=disk btree blocks) are cached in the PS `PageCache`.
-The norm will be very low baseline replacement rates in PS `PageCache`.
-High baseline replacement rates will be treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS).
-It will be remediated by the storage controller, migrating tenants away to relieve pressure.
-(Such a migration mechanism in storage controller is not part of this project.)
-
-In the future, we may elminate the `PageCache` even for indirect blocks.
-For example with an LRU cache that has as unit the entire disk btree content
-instead of individual blocks.
-
-## Motivation
-
-Even though we have eliminated PS `PageCache` complexities and overheads, we are still using the kernel page cache for all IO.
-
-In this RFC, we propose switching to direct IO and lay out a plan to do it.
-
-The motivation for using direct IO:
-
-Predictable VirtualFile operation latencies.
-    * for reads: currently kernel page cache hit/miss determines fast/slow
-    * for appends: immediate back-pressure from disk instead of kernel page cache
-    * for in-place updates: we don't do in-place updates in Pageserver
-    * file fsync: will become practically constant cost because no writeback needs to happen
-
-Predictabile latencies, generally.
-    * avoid *malloc latency backscatter* caused by buffered writes (see glossary section)
-
-Efficiency
-* Direct IO avoids one layer of memory-to-memory copy.
-* We already do not rely / do not want to rely on the kernel page cache for batching of small IOs into bigger ones:
-    * writes: we do large streaming writes and/or have implemented batching in userspace.
-    * reads:
-    * intra-request: vectored get (RFC 30) takes care of merging reads => no block is read twice
-    * inter-request, e.g., getpage request for adjacent pages last-modified at nearly the same time
-        * (ideally these would come in as one vectored get request)
-        * generally, we accept making such reads *predictably* slow rather than *maybe* fast,
-            depending on how busy the kernel page cache is.
-
-Explicitness & Tangibility of resource usage.
-* It is desriable and valuable to be *explicit* about the main resources we use. For example:
-* We can build true observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?").
-* We can build accounting & QoS by implementing an IO scheduler that is tenant aware.
-
-## Definition of Done
-
-All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache.
-
-In particular, the "data path" includes the wal ingest path and anything on the `Timline::get` / `Timline::get_vectored` path.
-
-The production Pageserver config are tuned such that we get equivalent hit rates for the indirect blocks in layers (disk btree blocks) in the PS PageCache compared to what we previously got from the kernel page cache.
-
-The CPU utilization is equivalent or ideally lower.
-
-There are no regressions to ingest latency.
-
-Getpage & basebackup latencies under high memory pressure are equivalent to when we used with kernel page cache.
-Getpage & basebackup latencies under low memory pressure will be worse than when we used kernel page cache, but they are predictable, i.e., proportional to number of layers & blocks visited per layer.
-
-## Non-Goals
-
-We're not eliminating the remaining use of PS `PageCache` as part of this work.
-
-## Impacted Components
-
-Pageserver.
-
-## Proposed Implementation
-
-The work breaks down into the following high-level items:
-
-* Risk assessment: determine that our production filesystem (ext4) and Linux kernel version allows mixing direct IO and buffered IO.
-* Alignment requirements: make all VirtualFile follow IO alignment requirements (`Dio_mem_align` / `Dio_offset_align`).
-* Add Pageserver config option to configure direct vs buffered IO.
-* Determine new production configuration for PS PageCache size: when we roll out direct IO, it needs to hold the working set of indirect blocks.
-* Performance evaluation, esp avoiding regressions.
-
-The risk assessment is to understand
-1. the impact of an implementation bug where we issue some but not all IOs using direct IO, as well as
-2. the degree to which this project can be safely partially completed, i.e., if we cannot convert all code paths in the time alotted.
-
-The bulk of the design & coding work is to ensure adherence to the alignment requirements.
-
-Our automated benchmarks are insufficient to rule out performance regressions.
-Manual benchmarking / new automated benchmarks will be required for the last two items (new PS PageCache size, avoiding regressions).
-The metrics we care about were already listed in the "Definition of Done" section of this document.
-More details on benchmarking later in this doc (Phase 3).
-
-### Meeting Direct IO Alignment Requirements
-
-We need to fix all the places where we do tiny and/or unaligned IOs.
-Otherwise the kernel will fail the operation with an error.
-We can implement a fallback to buffered IO for a transitory period, to avoid user impact.
-But the **goal is to systematically ensure that we issue properly aligned IOs to the kernel/filesystem**.
-
-Ideally, we'd use the Rust type system to compile-time-ensure that we only use VirtualFile with aligned buffers.
-Feasibility of this will be explored early in the project.
-
-An alternative is to add runtime checks and potentially a runtime fallback to buffered IO so we avoid user-facing downtime.
-
-Genearlly, this work is open-ended (=> hard to estimate!).
-It is a fixpoint iteration on the code base until all the places are fixed.
-The runtime-check based approach is more amenable to doing this incrementally over many commits.
-The value of a type-system-based approach can still be realized retroactively, and it will avoid regressions.
-
-From some [early scoping experiments in January](https://www.notion.so/neondatabase/2024-01-30-benchmark-tokio-epoll-uring-less-Page-Cache-O_DIRECT-request-local-page-cache-aa026802b5214c58b17518d7f6a4219b?pvs=4),
-we know the broad categories of changes required:
-
- Tiny IOs
-    - example: writes: blob_io BUFFERED=false writer for ImageLayer
-    - example reads: blob_io / vectored_blob_io
-    - We have to move the IO buffer from inside the kernel into userspace. The perf upside is huge because we avoid the syscalls.
-    - Will very likely be caught by runtime checking.
-    - recipe for writes: use streaming IO abstractions that do IO using aligned buffers (see below)
-    - recipe for reads: shot-lived IO buffers from buffer pool (see below)
-
- Larger IOs that are unaligned
-    - typical case for this would be a Vec or Bytes that’s short-lived and used as an IoBuf / IoBufMut
-    - These are not guaranteed to be sufficiently aligned, and often are not.
-    - => need to replace with buffers that are guaranteed aligned
-    - recipe:
-      - generally these short-lived buffers should have a bounded size, it's a pre-existing design flaw if they don't
-      - if they have bounded size: can use buffer pool (see below)
-      - unbounded size: try hard to convert these to bounded size or better use streaming IO (see below)
-      - generally, unbounded size buffers are an accepted risk to timely completion of this project
-
- *Accidentally* aligned IOs
-    - Like `Larger IOs` section above, but, for some reason, they're aligned.
-    - The runtime-check won't detect them.
-    - example: current PageCache slots are sometimes aligned
-    - recipe: ???
-      - for PageCache slots: malloc the page cache slots are with correct algignment.
-
-### Buffer Pool
-
-The **buffer pool** mentioned to above will be a load-bearing component.
-Its basic function is to provide callers with a memory buffer of adequate alignment and size (statx `Dio_mem_align` / `Dio_offset_align`).
-Callers `get()` a buffer from the pool. Size is specified at `get` time and is fixed (not growable).
-Callers own the buffer and are responsible for filling it with valid data.
-They then use it to perform the IO.
-Either the IO completes and returns the buffer, or the caller loses interest, which hands over ownership to tokio-epoll-uring until IO completion.
-The buffer may be re-used, but eventually it gets dropped.
-The drop handler returns the buffer to the buffer pool.
-
-The buffer pool enforces a maximum amount of IO memory by stalling `get()` calls if all buffers are in use.
-This ensures `page_cache + buffer_pools + slop < user memory` where slop is all other memory allocations.
-
-The buffer pool buffers can optionally be wrapped by the **streaming IO abstraction** in `owned_bufers_io::write` for use as the IO buffer.
-This guarantees that the streaming IOs are issued from aligned buffers.
-
-The tricky part is buffers whose size isn't know ahead of time.
-The buffer pool can't provide such buffers.
-One workaround is to use slop space (such as a Vec) to collect all the data, then memcpy it into buffer pool buffers like so:
-```rust
-let vec = ... /* code that produces variable amount of data */;
-for chunk in vec.chunks(bufpool.buffer_size()) {
-    let buf = bufpool.get();
-    assert_eq!(buf.len(), bufpool.buffer_size());
-    buf.copy_from_slice(chunk);
-    file.write_at(..., buf, ...);
-}
-```
-However, the `vec` in that code still needs to be sized in multiples of the filesystem block size.
-The best way to ensure this is to completely refactor to `owned_bufers_io::write`, which also avoids the double-copying.
-
-If we **have** to do writes of non-block-size-multiple length, the solution is to do read-modify-write for the unaligned parts.
-We don't have infrastructure for this yet.
-It would be best to avoid this, and from my scoping work in January, I cannot remember a need for it.
-
-In the future, we might want to use [io_uring registered buffers](https://unixism.net/loti/ref-iouring/io_uring_register.html).
-It's out of reach at this time because we use tokio-epoll-uring in thread-local executor mode, meaning we'd have to register
-each buffer with all thread-local executors. However, above API requirements for the buffer pool implicitly require the buffer
-handle that's returned by `get()` to be a custom smart pointer type. We will be able to extend it in the future to include the
-io_uring registered buffer index without having to touch the entire code base.
-
-## Execution
-
-### Phase 1
-In this phase we build a bunch of foundational pieces. The work is parallelizable to some extend.
-
-* Explore type-system level way to find all unaligned IO/s
-    * idea: create custom IO buffer marker traits / types , e.g. extend IoBuf / IoBufMut to IoBufAligned and IoBufMutAligned.
-    * could take this as a general opportunity to clean up the owned buffers APIs
-* Runtime-check for alignment requirements
-* Perf simulation mode: pad VirtualFile op latencies to typical NVMe latencies
-    * Such low latencies are tricky to precisely simulate, as, e.g., tokio doesn’t guarantee that timer resolution.
-    * Maybe do a fake direct IO to some fake file in addition to the buffered IO? Doubles amount of tokio-epoll-uring traffic but it’s probably closest to reality.
-    * Can we make this safely usable in production?
-* Pageserver config changes to expose the new mdoes:
-  ```rust
-    ...
-    virtual_file_direct_io: enum  {
-        #[default]
-        Disabled,
-        Evaluate {
-            check_alignment: no | log | error
-            pad_timing: enum {
-                No,
-                TokioSleep,
-                FakeFile { path: PathBuf }
-            }
-        },
-        Enabled {
-            on_alignment_error: error | fallback_to_buffered
-        }
-    }
-    ...
-  ```
-* VirtualFile API to support direct IO
-    * What's better: Require all callers to be explicit vs just always do direct IO?
-* Buffer pool design & implementation
-  * Explore designs / prior art to avoid contention on the global buffer pool
-  * No implicit global state, create the instance in `main()` and pass it through the app. `RequestContext` is the way to go.
-  * Explore further `RequestContext` integration: two-staged pool, with a tiny pool in the `RequestContext`
-    to avoid contention on the global pool.
-  * Should be able to draw from PS PageCache as a last resort mechanism to avoid OOMs
-    (PageCache thrashing will alert operators!)
-    * Longer-term, should have model of worst-case / p9X peak buffer usage per request
-      and admit not more requests than what configured buffer pool size allows.
-      Out of scope of this project, though.
-
-
-## Phase 2
-In this phase, we do the bulk of the coding work, leveraging the runtime check to get feedback.
-Also, we use the performance simulator mode to get a worst-case estimate on the perf impact.
-
-* Leverage runtime check for alignment (= monitor for its `warn!` logs)
-  - in regress test CI => matrix build like we did for tokio-epoll-uring/vectored get/compaction algorithms
-  - in staging
-  - in benchmarks (pre-prod, nightly staging)
-  - in production?
-
-* Find & fix unaligned IOs.
-  * See section `Meeting Direct IO Alignment Requirements`
-  * This is the bulk of the work, and it's hard to estimate because we may have to refactor
-    existing code away from bad practices such as unbounded allocation / not using streaming IO.
-
-* Use performance simulator mode to get worst-case estimate for perf impact **early**
-  * in manual testing on a developer-managed EC2 instance
-  * in staging / pre-prod => work with QA team
-
-## Phase 3: Performance
-Functionally we're ready, now we have to understand the performance impact and ensure there are no regressions.
-Also, we left room for optimization with the buffer pool implementation so let's improve there as well.
-
-* Perf testing to validate perf requirements listed in "Definition of Done" section
-
-* Understand where the bottlenecks are.
-  * Manual testing is advisable for this => recommended to set up an EC2 instance with
-    a local Grafana + Prometheus + node_exporter stack.
-  * This work is time-consuming and open-ended. Get help if inexperienced.
-
-Pagebench, pgbench, and nightly prodlike cloudbench, are workload *drivers*.
-They are
-* sufficient for producing the metrics listed in "Definition of Done",
-* representative enough to detect severe regressions,
-* expose bottlenecks.
-
-However, we do not have sufficient automation for
-* creating high memory pressure secenario (e.g. with cgroups)
-* quantifying and recording before-and-after resource consumption (*CPU utilization, memory, IO*)
-* recording pageserver metrics.
-Hence, diligent perf testing will require **setting up a manually managed testbench in EC2** that resembles prod,
-with a local prometheus + grafana stack + node_exporter +scraping of the local pageserver.
-In the past, I have found having such a testbench to be most effective and flexible for diligent benchmarking.
-
-For the high memory pressure configuration, it might make sense to extend `neon_local` to manage a cgroup hierarchy.
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -13,11 +13,7 @@ use std::{

 use measured::{
    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
-    metric::{
-        group::{Encoding, MetricValue},
-        name::MetricNameEncoder,
-        Metric, MetricType, MetricVec,
-    },
+    metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec},
    text::TextEncoder,
    LabelGroup,
 };
@@ -144,6 +140,7 @@ impl<const N: usize> HyperLogLogState<N> {
        })
    }
 }
+
 impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
    for HyperLogLogState<N>
 {
@@ -182,12 +179,13 @@ impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEnc
            .into_iter()
            .enumerate()
            .try_for_each(|(hll_shard, val)| {
-                enc.write_metric_value(
-                    name.by_ref(),
+                CounterState::new(val as u64).collect_into(
+                    &(),
                    labels.by_ref().compose_with(HllShardLabel {
                        hll_shard: hll_shard as i64,
                    }),
-                    MetricValue::Int(val as i64),
+                    name.by_ref(),
+                    enc,
                )
            })
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -9,7 +9,7 @@ use measured::{
    metric::{
        counter::CounterState,
        gauge::GaugeState,
-        group::{Encoding, MetricValue},
+        group::Encoding,
        name::{MetricName, MetricNameEncoder},
        MetricEncoding, MetricFamilyEncoding,
    },
@@ -171,8 +171,11 @@ fn write_gauge<Enc: Encoding>(
    labels: impl LabelGroup,
    name: impl MetricNameEncoder,
    enc: &mut Enc,
-) -> Result<(), Enc::Err> {
-    enc.write_metric_value(name, labels, MetricValue::Int(x))
+) -> Result<(), Enc::Err>
+where
+    GaugeState: MetricEncoding<Enc>,
+{
+    GaugeState::new(x).collect_into(&(), labels, name, enc)
 }

 #[derive(Default)]
@@ -544,15 +547,6 @@ impl<T: Encoding> Encoding for Inc<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
-
-    fn write_metric_value(
-        &mut self,
-        name: impl MetricNameEncoder,
-        labels: impl LabelGroup,
-        value: MetricValue,
-    ) -> Result<(), Self::Err> {
-        self.0.write_metric_value(name, labels, value)
-    }
 }

 impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
@@ -579,15 +573,6 @@ impl<T: Encoding> Encoding for Dec<T> {
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
-
-    fn write_metric_value(
-        &mut self,
-        name: impl MetricNameEncoder,
-        labels: impl LabelGroup,
-        value: MetricValue,
-    ) -> Result<(), Self::Err> {
-        self.0.write_metric_value(name, labels, value)
-    }
 }

 /// Write the dec counter to the encoder
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,59 +1,42 @@
-use std::{ops::RangeInclusive, str::FromStr};
+//! See docs/rfcs/031-sharding-static.md for an overview of sharding.
+//!
+//! This module contains a variety of types used to represent the concept of sharding
+//! a Neon tenant across multiple physical shards.  Since there are quite a few of these,
+//! we provide an summary here.
+//!
+//! Types used to describe shards:
+//! - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+//!   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
+//!   a shard suffix.
+//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+//!   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
+//!   tenant, such as layer files.
+//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
+//!   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+//!   four hex digits.  An unsharded tenant is `0000`.
+//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+//!
+//! Types used to describe the parameters for data distribution in a sharded tenant:
+//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+//!   multiple shards.  Its value is given in 8kiB pages.
+//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+//!   always zero: this is provided for future upgrades that might introduce different
+//!   data distribution schemes.
+//!
+//! Examples:
+//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+//!   and their slugs are 0004, 0104, 0204, and 0304.

 use crate::{key::Key, models::ShardParameters};
-use hex::FromHex;
 use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
-use utils::id::TenantId;

-/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
-///
-/// This module contains a variety of types used to represent the concept of sharding
-/// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
-/// we provide an summary here.
-///
-/// Types used to describe shards:
-/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
-///   a shard suffix.
-/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
-///   tenant, such as layer files.
-/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
-///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-///   four hex digits.  An unsharded tenant is `0000`.
-/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-///
-/// Types used to describe the parameters for data distribution in a sharded tenant:
-/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-///   multiple shards.  Its value is given in 8kiB pages.
-/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-///   always zero: this is provided for future upgrades that might introduce different
-///   data distribution schemes.
-///
-/// Examples:
-/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-///   and their slugs are 0004, 0104, 0204, and 0304.
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardNumber(pub u8);
-
-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(u8);
-
-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
+#[doc(inline)]
+pub use ::utils::shard::*;

 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
@@ -65,362 +48,6 @@ pub struct ShardIdentity {
    layout: ShardLayout,
 }

-/// Formatting helper, for generating the `shard_id` label in traces.
-struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-impl ShardCount {
-    pub const MAX: Self = Self(u8::MAX);
-
-    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
-    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
-    ///
-    /// This method returns the actual number of shards, i.e. if our internal value is
-    /// zero, we return 1 (unsharded tenants have 1 shard).
-    pub fn count(&self) -> u8 {
-        if self.0 > 0 {
-            self.0
-        } else {
-            1
-        }
-    }
-
-    /// The literal internal value: this is **not** the number of shards in the
-    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
-    /// [`Self::count`] if you want to know the cardinality of shards.
-    pub fn literal(&self) -> u8 {
-        self.0
-    }
-
-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
-    pub fn is_unsharded(&self) -> bool {
-        self.0 == 0
-    }
-
-    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
-    /// [`Self::literal`] would return.
-    pub const fn new(val: u8) -> Self {
-        Self(val)
-    }
-}
-
-impl ShardNumber {
-    pub const MAX: Self = Self(u8::MAX);
-}
-
-impl TenantShardId {
-    pub fn unsharded(tenant_id: TenantId) -> Self {
-        Self {
-            tenant_id,
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
-    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
-    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
-        RangeInclusive::new(
-            Self {
-                tenant_id,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            },
-            Self {
-                tenant_id,
-                shard_number: ShardNumber::MAX,
-                shard_count: ShardCount::MAX,
-            },
-        )
-    }
-
-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-
-    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
-        self.shard_number == ShardNumber(0)
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
-    }
-
-    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
-    /// is useful when logging from code that is already in a span that includes tenant ID, to
-    /// keep messages reasonably terse.
-    pub fn to_index(&self) -> ShardIndex {
-        ShardIndex {
-            shard_number: self.shard_number,
-            shard_count: self.shard_count,
-        }
-    }
-
-    /// Calculate the children of this TenantShardId when splitting the overall tenant into
-    /// the given number of shards.
-    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
-        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
-        let mut child_shards = Vec::new();
-        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
-            // Key mapping is based on a round robin mapping of key hash modulo shard count,
-            // so our child shards are the ones which the same keys would map to.
-            if shard_number % effective_old_shard_count == self.shard_number.0 {
-                child_shards.push(TenantShardId {
-                    tenant_id: self.tenant_id,
-                    shard_number: ShardNumber(shard_number),
-                    shard_count: new_shard_count,
-                })
-            }
-        }
-
-        child_shards
-    }
-}
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
-    }
-}
-
-impl std::fmt::Display for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
-        } else {
-            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
-            // is distinct from the normal single shard case (shard count == 1).
-            self.tenant_id.fmt(f)
-        }
-    }
-}
-
-impl std::fmt::Debug for TenantShardId {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for TenantShardId {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
-        if s.len() == 32 {
-            // Legacy case: no shard specified
-            Ok(Self {
-                tenant_id: TenantId::from_str(s)?,
-                shard_number: ShardNumber(0),
-                shard_count: ShardCount(0),
-            })
-        } else if s.len() == 37 {
-            let bytes = s.as_bytes();
-            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
-            Ok(Self {
-                tenant_id,
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 18]> for TenantShardId {
-    fn from(b: [u8; 18]) -> Self {
-        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
-
-        Self {
-            tenant_id: TenantId::from(tenant_id_bytes),
-            shard_number: ShardNumber(b[16]),
-            shard_count: ShardCount(b[17]),
-        }
-    }
-}
-
-impl ShardIndex {
-    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            shard_number: number,
-            shard_count: count,
-        }
-    }
-    pub fn unsharded() -> Self {
-        Self {
-            shard_number: ShardNumber(0),
-            shard_count: ShardCount(0),
-        }
-    }
-
-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
-    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
-    }
-
-    /// For use in constructing remote storage paths: concatenate this with a TenantId
-    /// to get a fully qualified TenantShardId.
-    ///
-    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
-    /// that the legacy pre-sharding remote key format is preserved.
-    pub fn get_suffix(&self) -> String {
-        if self.is_unsharded() {
-            "".to_string()
-        } else {
-            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-        }
-    }
-}
-
-impl std::fmt::Display for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
-    }
-}
-
-impl std::fmt::Debug for ShardIndex {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // Debug is the same as Display: the compact hex representation
-        write!(f, "{}", self)
-    }
-}
-
-impl std::str::FromStr for ShardIndex {
-    type Err = hex::FromHexError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        // Expect format: 1 byte shard number, 1 byte shard count
-        if s.len() == 4 {
-            let bytes = s.as_bytes();
-            let mut shard_parts: [u8; 2] = [0u8; 2];
-            hex::decode_to_slice(bytes, &mut shard_parts)?;
-            Ok(Self {
-                shard_number: ShardNumber(shard_parts[0]),
-                shard_count: ShardCount(shard_parts[1]),
-            })
-        } else {
-            Err(hex::FromHexError::InvalidStringLength)
-        }
-    }
-}
-
-impl From<[u8; 2]> for ShardIndex {
-    fn from(b: [u8; 2]) -> Self {
-        Self {
-            shard_number: ShardNumber(b[0]),
-            shard_count: ShardCount(b[1]),
-        }
-    }
-}
-
-impl Serialize for TenantShardId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
-            let mut packed: [u8; 18] = [0; 18];
-            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
-            packed[16] = self.shard_number.0;
-            packed[17] = self.shard_count.0;
-
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for TenantShardId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = TenantShardId;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 18])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 18] = Deserialize::deserialize(s)?;
-                Ok(TenantShardId::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                TenantShardId::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                18,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
@@ -585,77 +212,6 @@ impl ShardIdentity {
    }
 }

-impl Serialize for ShardIndex {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        if serializer.is_human_readable() {
-            serializer.collect_str(self)
-        } else {
-            // Binary encoding is not used in index_part.json, but is included in anticipation of
-            // switching various structures (e.g. inter-process communication, remote metadata) to more
-            // compact binary encodings in future.
-            let mut packed: [u8; 2] = [0; 2];
-            packed[0] = self.shard_number.0;
-            packed[1] = self.shard_count.0;
-            packed.serialize(serializer)
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for ShardIndex {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        struct IdVisitor {
-            is_human_readable_deserializer: bool,
-        }
-
-        impl<'de> serde::de::Visitor<'de> for IdVisitor {
-            type Value = ShardIndex;
-
-            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
-                if self.is_human_readable_deserializer {
-                    formatter.write_str("value in form of hex string")
-                } else {
-                    formatter.write_str("value in form of integer array([u8; 2])")
-                }
-            }
-
-            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-            where
-                A: serde::de::SeqAccess<'de>,
-            {
-                let s = serde::de::value::SeqAccessDeserializer::new(seq);
-                let id: [u8; 2] = Deserialize::deserialize(s)?;
-                Ok(ShardIndex::from(id))
-            }
-
-            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
-            where
-                E: serde::de::Error,
-            {
-                ShardIndex::from_str(v).map_err(E::custom)
-            }
-        }
-
-        if deserializer.is_human_readable() {
-            deserializer.deserialize_str(IdVisitor {
-                is_human_readable_deserializer: true,
-            })
-        } else {
-            deserializer.deserialize_tuple(
-                2,
-                IdVisitor {
-                    is_human_readable_deserializer: false,
-                },
-            )
-        }
-    }
-}
-
 /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
 /// in order to be able to serve basebackup requests without peer communication).
 fn key_is_shard0(key: &Key) -> bool {
@@ -737,7 +293,9 @@ pub fn describe(

 #[cfg(test)]
 mod tests {
-    use utils::Hex;
+    use std::str::FromStr;
+
+    use utils::{id::TenantId, Hex};

    use super::*;

--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -13,6 +13,7 @@ rustls.workspace = true
 serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true
 tokio-rustls.workspace = true
 tracing.workspace = true

@@ -23,4 +24,4 @@ workspace_hack.workspace = true
 once_cell.workspace = true
 rustls-pemfile.workspace = true
 tokio-postgres.workspace = true
-tokio-postgres-rustls.workspace = true
+tokio-postgres-rustls.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -16,6 +16,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
@@ -400,21 +401,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    }

    /// Wrapper for run_message_loop() that shuts down socket when we are done
-    pub async fn run<F, S>(
+    pub async fn run(
        mut self,
        handler: &mut impl Handler<IO>,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S + Clone,
-        S: Future,
-    {
-        let ret = self
-            .run_message_loop(handler, shutdown_watcher.clone())
-            .await;
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
+        let ret = self.run_message_loop(handler, cancel).await;

        tokio::select! {
-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                // do nothing; we most likely got already stopped by shutdown and will log it next.
            }
            _ = self.framed.shutdown() => {
@@ -444,21 +439,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        }
    }

-    async fn run_message_loop<F, S>(
+    async fn run_message_loop(
        &mut self,
        handler: &mut impl Handler<IO>,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
+        cancel: &CancellationToken,
+    ) -> Result<(), QueryError> {
        trace!("postgres backend to {:?} started", self.peer_addr);

        tokio::select!(
            biased;

-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received during handshake");
                return Err(QueryError::Shutdown)
@@ -473,7 +464,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let mut query_string = Bytes::new();
        while let Some(msg) = tokio::select!(
            biased;
-            _ = shutdown_watcher() => {
+            _ = cancel.cancelled() => {
                // We were requested to shut down.
                tracing::info!("shutdown request received in run_message_loop");
                return Err(QueryError::Shutdown)
@@ -485,7 +476,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            let result = self.process_message(handler, msg, &mut query_string).await;
            tokio::select!(
                biased;
-                _ = shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                    // We were requested to shut down.
                    tracing::info!("shutdown request received during response flush");

--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -3,13 +3,14 @@ use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
 use pq_proto::{BeMessage, RowDescriptor};
 use std::io::Cursor;
-use std::{future, sync::Arc};
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::net::{TcpListener, TcpStream};
 use tokio_postgres::config::SslMode;
 use tokio_postgres::tls::MakeTlsConnect;
 use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
 use tokio_postgres_rustls::MakeRustlsConnect;
+use tokio_util::sync::CancellationToken;

 // generate client, server test streams
 async fn make_tcp_pair() -> (TcpStream, TcpStream) {
@@ -50,7 +51,7 @@ async fn simple_select() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, future::pending::<()>).await
+        pgbackend.run(&mut handler, &CancellationToken::new()).await
    });

    let conf = Config::new();
@@ -102,7 +103,7 @@ async fn simple_select_ssl() {

    tokio::spawn(async move {
        let mut handler = TestHandler {};
-        pgbackend.run(&mut handler, future::pending::<()>).await
+        pgbackend.run(&mut handler, &CancellationToken::new()).await
    });

    let client_cfg = rustls::ClientConfig::builder()
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,6 +26,8 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;

+pub mod shard;
+
 mod hex;
 pub use hex::Hex;

--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -0,0 +1,451 @@
+//! See `pageserver_api::shard` for description on sharding.
+
+use std::{ops::RangeInclusive, str::FromStr};
+
+use hex::FromHex;
+use serde::{Deserialize, Serialize};
+
+use crate::id::TenantId;
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardNumber(pub u8);
+
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
+pub struct ShardCount(pub u8);
+
+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+/// Formatting helper, for generating the `shard_id` label in traces.
+pub struct ShardSlug<'a>(&'a TenantShardId);
+
+/// TenantShardId globally identifies a particular shard in a particular tenant.
+///
+/// These are written as `<TenantId>-<ShardSlug>`, for example:
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
+/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
+/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
+///
+/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible with TenantId: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardCount {
+    pub const MAX: Self = Self(u8::MAX);
+
+    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
+    /// legacy format for TenantShardId that excludes the shard suffix", also known
+    /// as [`TenantShardId::unsharded`].
+    ///
+    /// This method returns the actual number of shards, i.e. if our internal value is
+    /// zero, we return 1 (unsharded tenants have 1 shard).
+    pub fn count(&self) -> u8 {
+        if self.0 > 0 {
+            self.0
+        } else {
+            1
+        }
+    }
+
+    /// The literal internal value: this is **not** the number of shards in the
+    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
+    /// [`Self::count`] if you want to know the cardinality of shards.
+    pub fn literal(&self) -> u8 {
+        self.0
+    }
+
+    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
+    /// uses the legacy format for `TenantShardId`. See also the documentation for
+    /// [`Self::count`].
+    pub fn is_unsharded(&self) -> bool {
+        self.0 == 0
+    }
+
+    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
+    /// [`Self::literal`] would return.
+    pub const fn new(val: u8) -> Self {
+        Self(val)
+    }
+}
+
+impl ShardNumber {
+    pub const MAX: Self = Self(u8::MAX);
+}
+
+impl TenantShardId {
+    pub fn unsharded(tenant_id: TenantId) -> Self {
+        Self {
+            tenant_id,
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The range of all TenantShardId that belong to a particular TenantId.  This is useful when
+    /// you have a BTreeMap of TenantShardId, and are querying by TenantId.
+    pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive<Self> {
+        RangeInclusive::new(
+            Self {
+                tenant_id,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            },
+            Self {
+                tenant_id,
+                shard_number: ShardNumber::MAX,
+                shard_count: ShardCount::MAX,
+            },
+        )
+    }
+
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
+    }
+
+    /// Convenience for code that has special behavior on the 0th shard.
+    pub fn is_shard_zero(&self) -> bool {
+        self.shard_number == ShardNumber(0)
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
+    }
+
+    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
+    /// is useful when logging from code that is already in a span that includes tenant ID, to
+    /// keep messages reasonably terse.
+    pub fn to_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_number: self.shard_number,
+            shard_count: self.shard_count,
+        }
+    }
+
+    /// Calculate the children of this TenantShardId when splitting the overall tenant into
+    /// the given number of shards.
+    pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
+        let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
+        let mut child_shards = Vec::new();
+        for shard_number in 0..ShardNumber(new_shard_count.0).0 {
+            // Key mapping is based on a round robin mapping of key hash modulo shard count,
+            // so our child shards are the ones which the same keys would map to.
+            if shard_number % effective_old_shard_count == self.shard_number.0 {
+                child_shards.push(TenantShardId {
+                    tenant_id: self.tenant_id,
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: new_shard_count,
+                })
+            }
+        }
+
+        child_shards
+    }
+}
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
+impl std::fmt::Display for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        if self.shard_count != ShardCount(0) {
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+        } else {
+            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
+            // is distinct from the normal single shard case (shard count == 1).
+            self.tenant_id.fmt(f)
+        }
+    }
+}
+
+impl std::fmt::Debug for TenantShardId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for TenantShardId {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count
+        if s.len() == 32 {
+            // Legacy case: no shard specified
+            Ok(Self {
+                tenant_id: TenantId::from_str(s)?,
+                shard_number: ShardNumber(0),
+                shard_count: ShardCount(0),
+            })
+        } else if s.len() == 37 {
+            let bytes = s.as_bytes();
+            let tenant_id = TenantId::from_hex(&bytes[0..32])?;
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?;
+            Ok(Self {
+                tenant_id,
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 18]> for TenantShardId {
+    fn from(b: [u8; 18]) -> Self {
+        let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap();
+
+        Self {
+            tenant_id: TenantId::from(tenant_id_bytes),
+            shard_number: ShardNumber(b[16]),
+            shard_count: ShardCount(b[17]),
+        }
+    }
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 1 byte shard number, 1 byte shard count
+        if s.len() == 4 {
+            let bytes = s.as_bytes();
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(bytes, &mut shard_parts)?;
+            Ok(Self {
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    fn from(b: [u8; 2]) -> Self {
+        Self {
+            shard_number: ShardNumber(b[0]),
+            shard_count: ShardCount(b[1]),
+        }
+    }
+}
+
+impl Serialize for TenantShardId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
+            let mut packed: [u8; 18] = [0; 18];
+            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
+            packed[16] = self.shard_number.0;
+            packed[17] = self.shard_count.0;
+
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for TenantShardId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = TenantShardId;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 18])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 18] = Deserialize::deserialize(s)?;
+                Ok(TenantShardId::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                TenantShardId::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                18,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
+impl Serialize for ShardIndex {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to more
+            // compact binary encodings in future.
+            let mut packed: [u8; 2] = [0; 2];
+            packed[0] = self.shard_number.0;
+            packed[1] = self.shard_count.0;
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 2])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 2] = Deserialize::deserialize(s)?;
+                Ok(ShardIndex::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                ShardIndex::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                2,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -62,6 +62,7 @@ sync_wrapper.workspace = true
 sysinfo.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
+tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
 tokio-epoll-uring.workspace = true
 tokio-io-timeout.workspace = true
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -47,6 +47,9 @@ use utils::{
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
 const PID_FILE_NAME: &str = "pageserver.pid";

 const FEATURES: &[&str] = &[
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -267,7 +267,7 @@ async fn page_service_conn_main(
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
-        .run(&mut conn_handler, task_mgr::shutdown_watcher)
+        .run(&mut conn_handler, &task_mgr::shutdown_token())
        .await
    {
        Ok(()) => {
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -854,13 +854,14 @@ impl Timeline {
        result.add_key(DBDIR_KEY);

        // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf)?;
+        let dbdir = self.list_dbdirs(lsn, ctx).await?;
+        let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect();

-        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
-        dbs.sort_unstable();
-        for (spcnode, dbnode) in dbs {
-            result.add_key(relmap_file_key(spcnode, dbnode));
+        dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b));
+        for ((spcnode, dbnode), has_relmap_file) in dbs {
+            if has_relmap_file {
+                result.add_key(relmap_file_key(spcnode, dbnode));
+            }
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
@@ -919,6 +920,9 @@ impl Timeline {
            result.add_key(AUX_FILES_KEY);
        }

+        // Add extra keyspaces in the test cases. Some test cases write keys into the storage without
+        // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace`
+        // and the keys will not be garbage-colllected.
        #[cfg(test)]
        {
            let guard = self.extra_test_dense_keyspace.load();
@@ -927,13 +931,48 @@ impl Timeline {
            }
        }

-        Ok((
-            result.to_keyspace(),
-            /* AUX sparse key space */
-            SparseKeySpace(KeySpace {
-                ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()],
-            }),
-        ))
+        let dense_keyspace = result.to_keyspace();
+        let sparse_keyspace = SparseKeySpace(KeySpace {
+            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+        });
+
+        if cfg!(debug_assertions) {
+            // Verify if the sparse keyspaces are ordered and non-overlapping.
+
+            // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each
+            // category of sparse keys are split into their own image/delta files. If there
+            // are overlapping keyspaces, they will be automatically merged by keyspace accum,
+            // and we want the developer to keep the keyspaces separated.
+
+            let ranges = &sparse_keyspace.0.ranges;
+
+            // TODO: use a single overlaps_with across the codebase
+            fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
+                !(a.end <= b.start || b.end <= a.start)
+            }
+            for i in 0..ranges.len() {
+                for j in 0..i {
+                    if overlaps_with(&ranges[i], &ranges[j]) {
+                        panic!(
+                            "overlapping sparse keyspace: {}..{} and {}..{}",
+                            ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end
+                        );
+                    }
+                }
+            }
+            for i in 1..ranges.len() {
+                assert!(
+                    ranges[i - 1].end <= ranges[i].start,
+                    "unordered sparse keyspace: {}..{} and {}..{}",
+                    ranges[i - 1].start,
+                    ranges[i - 1].end,
+                    ranges[i].start,
+                    ranges[i].end
+                );
+            }
+        }
+
+        Ok((dense_keyspace, sparse_keyspace))
    }

    /// Get cached size of relation if it not updated after specified LSN
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -728,6 +728,9 @@ impl From<CreateImageLayersError> for CompactionError {
    fn from(e: CreateImageLayersError) -> Self {
        match e {
            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Other(e) => {
+                CompactionError::Other(e.context("create image layers"))
+            }
            _ => CompactionError::Other(e.into()),
        }
    }
--- a/poetry.lock
+++ b/poetry.lock
@@ -3133,18 +3133,18 @@ multidict = ">=4.0"

 [[package]]
 name = "zipp"
-version = "3.8.1"
+version = "3.19.1"
 description = "Backport of pathlib-compatible object wrapper for zip files"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
 files = [
-    {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"},
-    {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"},
+    {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"},
+    {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"},
 ]

 [package.extras]
-docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"]
-testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]

 [[package]]
 name = "zstandard"
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -6,8 +6,9 @@ use anyhow::Context;
 use once_cell::sync::Lazy;
 use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
-use std::{convert::Infallible, future};
+use std::convert::Infallible;
 use tokio::net::{TcpListener, TcpStream};
+use tokio_util::sync::CancellationToken;
 use tracing::{error, info, info_span, Instrument};

 static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
@@ -67,7 +68,9 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {

 async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
    let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
-    pgbackend.run(&mut MgmtHandler, future::pending::<()>).await
+    pgbackend
+        .run(&mut MgmtHandler, &CancellationToken::new())
+        .await
 }

 /// A message received by `mgmt` when a compute node is ready.
--- a/proxy/src/jemalloc.rs
+++ b/proxy/src/jemalloc.rs
@@ -3,8 +3,8 @@ use std::marker::PhantomData;
 use measured::{
    label::NoLabels,
    metric::{
-        gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
-        MetricEncoding, MetricFamilyEncoding, MetricType,
+        gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding,
+        MetricFamilyEncoding, MetricType,
    },
    text::TextEncoder,
    LabelGroup, MetricGroup,
@@ -100,7 +100,7 @@ macro_rules! jemalloc_gauge {
                enc: &mut TextEncoder<W>,
            ) -> Result<(), std::io::Error> {
                if let Ok(v) = mib.read() {
-                    enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
+                    GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?;
                }
                Ok(())
            }
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock};

 use lasso::ThreadedRodeo;
 use measured::{
-    label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet},
+    label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet},
    metric::{histogram::Thresholds, name::MetricName},
    Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
    LabelGroup, MetricGroup,
@@ -577,6 +577,32 @@ impl LabelGroup for ThreadPoolWorkerId {
    }
 }

+impl LabelGroupSet for ThreadPoolWorkers {
+    type Group<'a> = ThreadPoolWorkerId;
+
+    fn cardinality(&self) -> Option<usize> {
+        Some(self.0)
+    }
+
+    fn encode_dense(&self, value: Self::Unique) -> Option<usize> {
+        Some(value)
+    }
+
+    fn decode_dense(&self, value: usize) -> Self::Group<'_> {
+        ThreadPoolWorkerId(value)
+    }
+
+    type Unique = usize;
+
+    fn encode(&self, value: Self::Group<'_>) -> Option<Self::Unique> {
+        Some(value.0)
+    }
+
+    fn decode(&self, value: &Self::Unique) -> Self::Group<'_> {
+        ThreadPoolWorkerId(*value)
+    }
+}
+
 impl LabelSet for ThreadPoolWorkers {
    type Value<'a> = ThreadPoolWorkerId;

--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -838,8 +838,9 @@ async fn query_to_json<T: GenericClient>(
        "finished reading rows"
    );

-    let mut fields = vec![];
-    let mut columns = vec![];
+    let columns_len = row_stream.columns().len();
+    let mut fields = Vec::with_capacity(columns_len);
+    let mut columns = Vec::with_capacity(columns_len);

    for c in row_stream.columns() {
        fields.push(json!({
--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -4,9 +4,10 @@
 //!
 use anyhow::{Context, Result};
 use postgres_backend::QueryError;
-use std::{future, time::Duration};
+use std::time::Duration;
 use tokio::net::TcpStream;
 use tokio_io_timeout::TimeoutReader;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{auth::Scope, measured_stream::MeasuredStream};

@@ -100,7 +101,7 @@ async fn handle_socket(
    // libpq protocol between safekeeper and walproposer / pageserver
    // We don't use shutdown.
    pgbackend
-        .run(&mut conn_handler, future::pending::<()>)
+        .run(&mut conn_handler, &CancellationToken::new())
        .await
 }

--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4062,7 +4062,14 @@ impl Service {
                placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking

                // There is no way to know what the tenant's config was: revert to defaults
-                config: TenantConfig::default(),
+                //
+                // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration
+                //
+                // we write to both v1+v2 storage, so that the test case can use either storage format for testing
+                config: TenantConfig {
+                    switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation),
+                    ..TenantConfig::default()
+                },
            })
            .await?;

--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -47,7 +47,7 @@ pub async fn find_large_objects(
    ignore_deltas: bool,
    concurrency: usize,
 ) -> anyhow::Result<LargeObjectListing> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
+    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
    let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));

    let objects_stream = tenants.map_ok(|tenant_shard_id| {
--- a/storage_scrubber/src/garbage.rs
+++ b/storage_scrubber/src/garbage.rs
@@ -140,7 +140,7 @@ async fn find_garbage_inner(
    node_kind: NodeKind,
 ) -> anyhow::Result<GarbageList> {
    // Construct clients for S3 and for Console API
-    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?;
+    let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?;
    let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config));

    // Build a set of console-known tenants, for quickly eliminating known-active tenants without having
@@ -432,7 +432,7 @@ pub async fn purge_garbage(
    );

    let (s3_client, target) =
-        init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?;
+        init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;

    // Sanity checks on the incoming list
    if garbage_list.active_tenant_count == 0 {
--- a/storage_scrubber/src/lib.rs
+++ b/storage_scrubber/src/lib.rs
@@ -15,17 +15,10 @@ use std::fmt::Display;
 use std::sync::Arc;
 use std::time::Duration;

-use anyhow::Context;
-use aws_config::environment::EnvironmentVariableCredentialsProvider;
-use aws_config::imds::credentials::ImdsCredentialsProvider;
-use aws_config::meta::credentials::CredentialsProviderChain;
-use aws_config::profile::ProfileFileCredentialsProvider;
-use aws_config::retry::RetryConfig;
-use aws_config::sso::SsoCredentialsProvider;
-use aws_config::BehaviorVersion;
-use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep};
-use aws_sdk_s3::{Client, Config};
-use aws_smithy_async::rt::sleep::TokioSleep;
+use anyhow::{anyhow, Context};
+use aws_sdk_s3::config::Region;
+use aws_sdk_s3::error::DisplayErrorContext;
+use aws_sdk_s3::Client;

 use camino::{Utf8Path, Utf8PathBuf};
 use clap::ValueEnum;
@@ -242,85 +235,53 @@ impl ConsoleConfig {
    }
 }

-pub fn init_logging(file_name: &str) -> WorkerGuard {
-    let (file_writer, guard) =
-        tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name));
-
-    let file_logs = fmt::Layer::new()
-        .with_target(false)
-        .with_ansi(false)
-        .with_writer(file_writer);
+pub fn init_logging(file_name: &str) -> Option<WorkerGuard> {
    let stderr_logs = fmt::Layer::new()
        .with_target(false)
        .with_writer(std::io::stderr);
-    tracing_subscriber::registry()
-        .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
-        .with(file_logs)
-        .with(stderr_logs)
-        .init();

-    guard
-}
-
-pub fn init_s3_client(bucket_region: Region) -> Client {
-    let credentials_provider = {
-        // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
-        let chain = CredentialsProviderChain::first_try(
-            "env",
-            EnvironmentVariableCredentialsProvider::new(),
-        )
-        // uses "AWS_PROFILE" / `aws sso login --profile <profile>`
-        .or_else(
-            "profile-sso",
-            ProfileFileCredentialsProvider::builder().build(),
-        );
-
-        // Use SSO if we were given an account ID
-        match std::env::var("SSO_ACCOUNT_ID").ok() {
-            Some(sso_account) => chain.or_else(
-                "sso",
-                SsoCredentialsProvider::builder()
-                    .account_id(sso_account)
-                    .role_name("PowerUserAccess")
-                    .start_url("https://neondb.awsapps.com/start")
-                    .region(bucket_region.clone())
-                    .build(),
-            ),
-            None => chain,
-        }
-        .or_else(
-            // Finally try IMDS
-            "imds",
-            ImdsCredentialsProvider::builder().build(),
-        )
+    let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") {
+        Ok(s) => s == "1" || s.to_lowercase() == "true",
+        Err(_) => false,
    };

-    let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
-
-    let mut builder = Config::builder()
-        .behavior_version(
-            #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
-            BehaviorVersion::v2023_11_09(),
-        )
-        .region(bucket_region)
-        .retry_config(RetryConfig::adaptive().with_max_attempts(3))
-        .sleep_impl(SharedAsyncSleep::from(sleep_impl))
-        .credentials_provider(credentials_provider);
-
-    if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") {
-        builder = builder.endpoint_url(endpoint)
+    if disable_file_logging {
+        tracing_subscriber::registry()
+            .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
+            .with(stderr_logs)
+            .init();
+        None
+    } else {
+        let (file_writer, guard) =
+            tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name));
+        let file_logs = fmt::Layer::new()
+            .with_target(false)
+            .with_ansi(false)
+            .with_writer(file_writer);
+        tracing_subscriber::registry()
+            .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
+            .with(stderr_logs)
+            .with(file_logs)
+            .init();
+        Some(guard)
    }
-
-    Client::from_conf(builder.build())
 }

-fn init_remote(
+pub async fn init_s3_client(bucket_region: Region) -> Client {
+    let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28())
+        .region(bucket_region)
+        .load()
+        .await;
+    Client::new(&config)
+}
+
+async fn init_remote(
    bucket_config: BucketConfig,
    node_kind: NodeKind,
 ) -> anyhow::Result<(Arc<Client>, RootTarget)> {
    let bucket_region = Region::new(bucket_config.region);
    let delimiter = "/".to_string();
-    let s3_client = Arc::new(init_s3_client(bucket_region));
+    let s3_client = Arc::new(init_s3_client(bucket_region).await);

    let s3_root = match node_kind {
        NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
@@ -345,7 +306,7 @@ async fn list_objects_with_retries(
    s3_target: &S3Target,
    continuation_token: Option<String>,
 ) -> anyhow::Result<aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output> {
-    for _ in 0..MAX_RETRIES {
+    for trial in 0..MAX_RETRIES {
        match s3_client
            .list_objects_v2()
            .bucket(&s3_target.bucket_name)
@@ -357,16 +318,22 @@ async fn list_objects_with_retries(
        {
            Ok(response) => return Ok(response),
            Err(e) => {
+                if trial == MAX_RETRIES - 1 {
+                    return Err(e)
+                        .with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
+                }
                error!(
-                    "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}",
-                    s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter
+                    "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}",
+                    s3_target.bucket_name,
+                    s3_target.prefix_in_bucket,
+                    s3_target.delimiter,
+                    DisplayErrorContext(e),
                );
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        }
    }
-
-    anyhow::bail!("Failed to list objects {MAX_RETRIES} times")
+    Err(anyhow!("unreachable unless MAX_RETRIES==0"))
 }

 async fn download_object_with_retries(
--- a/storage_scrubber/src/main.rs
+++ b/storage_scrubber/src/main.rs
@@ -196,7 +196,7 @@ async fn main() -> anyhow::Result<()> {
            concurrency,
        } => {
            let downloader =
-                SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?;
+                SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?;
            downloader.download().await
        }
        Command::PageserverPhysicalGc {
--- a/storage_scrubber/src/pageserver_physical_gc.rs
+++ b/storage_scrubber/src/pageserver_physical_gc.rs
@@ -160,7 +160,7 @@ pub async fn pageserver_physical_gc(
    min_age: Duration,
    mode: GcMode,
 ) -> anyhow::Result<GcSummary> {
-    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
+    let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;

    let tenants = if tenant_ids.is_empty() {
        futures::future::Either::Left(stream_tenants(&s3_client, &target))
--- a/storage_scrubber/src/scan_pageserver_metadata.rs
+++ b/storage_scrubber/src/scan_pageserver_metadata.rs
@@ -199,7 +199,7 @@ pub async fn scan_metadata(
    bucket_config: BucketConfig,
    tenant_ids: Vec<TenantShardId>,
 ) -> anyhow::Result<MetadataSummary> {
-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?;
+    let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;

    let tenants = if tenant_ids.is_empty() {
        futures::future::Either::Left(stream_tenants(&s3_client, &target))
--- a/storage_scrubber/src/scan_safekeeper_metadata.rs
+++ b/storage_scrubber/src/scan_safekeeper_metadata.rs
@@ -106,7 +106,7 @@ pub async fn scan_safekeeper_metadata(
    let timelines = client.query(&query, &[]).await?;
    info!("loaded {} timelines", timelines.len());

-    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?;
+    let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?;
    let console_config = ConsoleConfig::from_env()?;
    let cloud_admin_api_client = CloudAdminApiClient::new(console_config);

--- a/storage_scrubber/src/tenant_snapshot.rs
+++ b/storage_scrubber/src/tenant_snapshot.rs
@@ -28,13 +28,13 @@ pub struct SnapshotDownloader {
 }

 impl SnapshotDownloader {
-    pub fn new(
+    pub async fn new(
        bucket_config: BucketConfig,
        tenant_id: TenantId,
        output_path: Utf8PathBuf,
        concurrency: usize,
    ) -> anyhow::Result<Self> {
-        let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?;
+        let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
        Ok(Self {
            s3_client,
            s3_root,
@@ -215,7 +215,8 @@ impl SnapshotDownloader {
    }

    pub async fn download(&self) -> anyhow::Result<()> {
-        let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?;
+        let (s3_client, target) =
+            init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?;

        // Generate a stream of TenantShardId
        let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?;
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -0,0 +1,263 @@
+from __future__ import annotations
+
+import time
+from typing import TYPE_CHECKING, cast
+
+import requests
+
+if TYPE_CHECKING:
+    from typing import Any, Dict, Literal, Optional, Union
+
+    from fixtures.pg_version import PgVersion
+
+
+def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]:
+    return {
+        "PGHOST": params["host"],
+        "PGDATABASE": params["database"],
+        "PGUSER": params["role"],
+        "PGPASSWORD": params["password"],
+    }
+
+
+class NeonAPI:
+    def __init__(self, neon_api_key: str, neon_api_base_url: str):
+        self.__neon_api_key = neon_api_key
+        self.__neon_api_base_url = neon_api_base_url.strip("/")
+
+    def __request(
+        self, method: Union[str, bytes], endpoint: str, **kwargs: Any
+    ) -> requests.Response:
+        if "headers" not in kwargs:
+            kwargs["headers"] = {}
+        kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
+
+        return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
+
+    def create_project(
+        self,
+        pg_version: Optional[PgVersion] = None,
+        name: Optional[str] = None,
+        branch_name: Optional[str] = None,
+        branch_role_name: Optional[str] = None,
+        branch_database_name: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        data: Dict[str, Any] = {
+            "project": {
+                "branch": {},
+            },
+        }
+        if name:
+            data["project"]["name"] = name
+        if pg_version:
+            data["project"]["pg_version"] = int(pg_version)
+        if branch_name:
+            data["project"]["branch"]["name"] = branch_name
+        if branch_role_name:
+            data["project"]["branch"]["role_name"] = branch_role_name
+        if branch_database_name:
+            data["project"]["branch"]["database_name"] = branch_database_name
+
+        resp = self.__request(
+            "POST",
+            "/projects",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+            json=data,
+        )
+
+        assert resp.status_code == 201
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_project_details(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+        )
+        assert resp.status_code == 200
+        return cast("Dict[str, Any]", resp.json())
+
+    def delete_project(
+        self,
+        project_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "DELETE",
+            f"/projects/{project_id}",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def start_endpoint(
+        self,
+        project_id: str,
+        endpoint_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints/{endpoint_id}/start",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def suspend_endpoint(
+        self,
+        project_id: str,
+        endpoint_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints/{endpoint_id}/suspend",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def restart_endpoint(
+        self,
+        project_id: str,
+        endpoint_id: str,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints/{endpoint_id}/restart",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def create_endpoint(
+        self,
+        project_id: str,
+        branch_id: str,
+        endpoint_type: Literal["read_write", "read_only"],
+        settings: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        data: Dict[str, Any] = {
+            "endpoint": {
+                "branch_id": branch_id,
+            },
+        }
+
+        if endpoint_type:
+            data["endpoint"]["type"] = endpoint_type
+        if settings:
+            data["endpoint"]["settings"] = settings
+
+        resp = self.__request(
+            "POST",
+            f"/projects/{project_id}/endpoints",
+            headers={
+                "Accept": "application/json",
+                "Content-Type": "application/json",
+            },
+            json=data,
+        )
+
+        assert resp.status_code == 201
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_connection_uri(
+        self,
+        project_id: str,
+        branch_id: Optional[str] = None,
+        endpoint_id: Optional[str] = None,
+        database_name: str = "neondb",
+        role_name: str = "neondb_owner",
+        pooled: bool = True,
+    ) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/connection_uri",
+            params={
+                "branch_id": branch_id,
+                "endpoint_id": endpoint_id,
+                "database_name": database_name,
+                "role_name": role_name,
+                "pooled": pooled,
+            },
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_branches(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/branches",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_endpoints(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/endpoints",
+            headers={
+                "Accept": "application/json",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def get_operations(self, project_id: str) -> Dict[str, Any]:
+        resp = self.__request(
+            "GET",
+            f"/projects/{project_id}/operations",
+            headers={
+                "Accept": "application/json",
+                "Authorization": f"Bearer {self.__neon_api_key}",
+            },
+        )
+
+        assert resp.status_code == 200
+
+        return cast("Dict[str, Any]", resp.json())
+
+    def wait_for_operation_to_finish(self, project_id: str):
+        has_running = True
+        while has_running:
+            has_running = False
+            operations = self.get_operations(project_id)["operations"]
+            for op in operations:
+                if op["status"] in {"scheduling", "running", "cancelling"}:
+                    has_running = True
+            time.sleep(0.5)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -87,6 +87,8 @@ from fixtures.utils import (
 )
 from fixtures.utils import AuxFileStore as AuxFileStore  # reexport

+from .neon_api import NeonAPI
+
 """
 This file contains pytest fixtures. A fixture is a test resource that can be
 summoned by placing its name in the test's arguments.
@@ -184,6 +186,25 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite
    yield versioned_dir


+@pytest.fixture(scope="session")
+def neon_api_key() -> str:
+    api_key = os.getenv("NEON_API_KEY")
+    if not api_key:
+        raise AssertionError("Set the NEON_API_KEY environment variable")
+
+    return api_key
+
+
+@pytest.fixture(scope="session")
+def neon_api_base_url() -> str:
+    return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2")
+
+
+@pytest.fixture(scope="session")
+def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI:
+    return NeonAPI(neon_api_key, neon_api_base_url)
+
+
 def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]:
    """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar.

@@ -2862,14 +2883,21 @@ class PgBin:
        env.update(env_add)
        return env

-    def run(
+    def _log_env(self, env: dict[str, str]) -> None:
+        env_s = {}
+        for k, v in env.items():
+            if k.startswith("PG") and k != "PGPASSWORD":
+                env_s[k] = v
+        log.debug(f"Environment: {env_s}")
+
+    def run_nonblocking(
        self,
        command: List[str],
        env: Optional[Env] = None,
        cwd: Optional[Union[str, Path]] = None,
-    ):
+    ) -> subprocess.Popen[Any]:
        """
-        Run one of the postgres binaries.
+        Run one of the postgres binaries, not waiting for it to finish

        The command should be in list form, e.g. ['pgbench', '-p', '55432']

@@ -2880,11 +2908,34 @@ class PgBin:

        If you want stdout/stderr captured to files, use `run_capture` instead.
        """
-
        self._fixpath(command)
        log.info(f"Running command '{' '.join(command)}'")
        env = self._build_env(env)
-        subprocess.run(command, env=env, cwd=cwd, check=True)
+        self._log_env(env)
+        return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True)
+
+    def run(
+        self,
+        command: List[str],
+        env: Optional[Env] = None,
+        cwd: Optional[Union[str, Path]] = None,
+    ) -> None:
+        """
+        Run one of the postgres binaries, waiting for it to finish
+
+        The command should be in list form, e.g. ['pgbench', '-p', '55432']
+
+        All the necessary environment variables will be set.
+
+        If the first argument (the command name) doesn't include a path (no '/'
+        characters present), then it will be edited to include the correct path.
+
+        If you want stdout/stderr captured to files, use `run_capture` instead.
+        """
+        proc = self.run_nonblocking(command, env, cwd)
+        proc.wait()
+        if proc.returncode != 0:
+            raise subprocess.CalledProcessError(proc.returncode, proc.args)

    def run_capture(
        self,
@@ -2904,6 +2955,7 @@ class PgBin:
        self._fixpath(command)
        log.info(f"Running command '{' '.join(command)}'")
        env = self._build_env(env)
+        self._log_env(env)
        base_path, _, _ = subprocess_capture(
            self.log_dir,
            command,
--- a/test_runner/performance/test_logical_replication.py
+++ b/test_runner/performance/test_logical_replication.py
@@ -1,8 +1,24 @@
-import time
+from __future__ import annotations

+import time
+import traceback
+from typing import TYPE_CHECKING
+
+import psycopg2
+import psycopg2.extras
 import pytest
+from fixtures.benchmark_fixture import MetricReport
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import AuxFileStore, NeonEnv, PgBin, logical_replication_sync
+from fixtures.neon_api import connection_parameters_to_env
+from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync
+from fixtures.pg_version import PgVersion
+
+if TYPE_CHECKING:
+    from fixtures.benchmark_fixture import NeonBenchmarker
+    from fixtures.neon_api import NeonAPI
+    from fixtures.neon_fixtures import NeonEnv, PgBin
+    from fixtures.pg_version import PgVersion


@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2])
@@ -26,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
    vanilla_pg.safe_psql("truncate table pgbench_history")

    connstr = endpoint.connstr().replace("'", "''")
-    print(f"connstr='{connstr}'")
    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")

    # Wait logical replication channel to be established
@@ -42,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg
    sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
    sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
    assert sum_master == sum_replica
+
+
+def check_pgbench_still_running(pgbench, label=""):
+    rc = pgbench.poll()
+    if rc is not None:
+        raise RuntimeError(f"{label} pgbench terminated early with return code {rc}")
+
+
+def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600):
+    start = time.time()
+    pub_cur.execute("SELECT pg_current_wal_flush_lsn()")
+    pub_lsn = Lsn(pub_cur.fetchall()[0][0])
+    while (time.time() - start) < timeout_sec:
+        sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription")
+        res = sub_cur.fetchall()[0][0]
+        if res:
+            log.info(f"subscriber_lsn={res}")
+            sub_lsn = Lsn(res)
+            log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}")
+            if sub_lsn >= pub_lsn:
+                return time.time() - start
+        time.sleep(0.5)
+    raise TimeoutError(f"Logical replication sync took more than {timeout_sec} sec")
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_subscriber_lag(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
+    on subscriber. Periodically restarts subscriber while still running the inserts, and
+    measures how long sync takes after restart.
+    """
+    test_duration_min = 60
+    sync_interval_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60 * 2}"
+
+    pub_project = neon_api.create_project(pg_version)
+    pub_project_id = pub_project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(pub_project_id)
+    error_occurred = False
+    try:
+        sub_project = neon_api.create_project(pg_version)
+        sub_project_id = sub_project["project"]["id"]
+        sub_endpoint_id = sub_project["endpoints"][0]["id"]
+        neon_api.wait_for_operation_to_finish(sub_project_id)
+        try:
+            pub_env = connection_parameters_to_env(
+                pub_project["connection_uris"][0]["connection_parameters"]
+            )
+            sub_env = connection_parameters_to_env(
+                sub_project["connection_uris"][0]["connection_parameters"]
+            )
+            pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
+            sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
+
+            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+
+            pub_conn = psycopg2.connect(pub_connstr)
+            sub_conn = psycopg2.connect(sub_connstr)
+            pub_conn.autocommit = True
+            sub_conn.autocommit = True
+            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
+                sub_cur.execute("truncate table pgbench_accounts")
+                sub_cur.execute("truncate table pgbench_history")
+
+                pub_cur.execute(
+                    "create publication pub1 for table pgbench_accounts, pgbench_history"
+                )
+                sub_cur.execute(
+                    f"create subscription sub1 connection '{pub_connstr}' publication pub1"
+                )
+
+                initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
+            pub_conn.close()
+            sub_conn.close()
+
+            zenbenchmark.record(
+                "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
+            )
+
+            pub_workload = pg_bin.run_nonblocking(
+                ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
+            )
+            try:
+                sub_workload = pg_bin.run_nonblocking(
+                    ["pgbench", "-c10", pgbench_duration, "-S"],
+                    env=sub_env,
+                )
+                try:
+                    start = time.time()
+                    while time.time() - start < test_duration_min * 60:
+                        time.sleep(sync_interval_min * 60)
+                        check_pgbench_still_running(pub_workload, "pub")
+                        check_pgbench_still_running(sub_workload, "sub")
+
+                        with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
+                            sub_connstr
+                        ) as sub_conn:
+                            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
+                                lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
+                        log.info(f"Replica lagged behind master by {lag} seconds")
+                        zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
+                        sub_workload.terminate()
+                        neon_api.restart_endpoint(
+                            sub_project_id,
+                            sub_endpoint_id,
+                        )
+                        neon_api.wait_for_operation_to_finish(sub_project_id)
+                        sub_workload = pg_bin.run_nonblocking(
+                            ["pgbench", "-c10", pgbench_duration, "-S"],
+                            env=sub_env,
+                        )
+
+                        # Measure storage to make sure replication information isn't bloating storage
+                        sub_storage = neon_api.get_project_details(sub_project_id)["project"][
+                            "synthetic_storage_size"
+                        ]
+                        pub_storage = neon_api.get_project_details(pub_project_id)["project"][
+                            "synthetic_storage_size"
+                        ]
+                        zenbenchmark.record(
+                            "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
+                        )
+                        zenbenchmark.record(
+                            "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
+                        )
+
+                finally:
+                    sub_workload.terminate()
+            finally:
+                pub_workload.terminate()
+        except Exception as e:
+            error_occurred = True
+            log.error(f"Caught exception {e}")
+            log.error(traceback.format_exc())
+        finally:
+            if not error_occurred:
+                neon_api.delete_project(sub_project_id)
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred
+        neon_api.delete_project(pub_project_id)
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_publisher_restart(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects
+    on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and
+    measures how long sync takes after restart.
+    """
+    test_duration_min = 60
+    sync_interval_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60 * 2}"
+
+    pub_project = neon_api.create_project(pg_version)
+    pub_project_id = pub_project["project"]["id"]
+    pub_endpoint_id = pub_project["endpoints"][0]["id"]
+    neon_api.wait_for_operation_to_finish(pub_project_id)
+    error_occurred = False
+    try:
+        sub_project = neon_api.create_project(pg_version)
+        sub_project_id = sub_project["project"]["id"]
+        neon_api.wait_for_operation_to_finish(sub_project_id)
+        try:
+            pub_env = connection_parameters_to_env(
+                pub_project["connection_uris"][0]["connection_parameters"]
+            )
+            sub_env = connection_parameters_to_env(
+                sub_project["connection_uris"][0]["connection_parameters"]
+            )
+            pub_connstr = pub_project["connection_uris"][0]["connection_uri"]
+            sub_connstr = sub_project["connection_uris"][0]["connection_uri"]
+
+            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env)
+            pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env)
+
+            pub_conn = psycopg2.connect(pub_connstr)
+            sub_conn = psycopg2.connect(sub_connstr)
+            pub_conn.autocommit = True
+            sub_conn.autocommit = True
+            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
+                sub_cur.execute("truncate table pgbench_accounts")
+                sub_cur.execute("truncate table pgbench_history")
+
+                pub_cur.execute(
+                    "create publication pub1 for table pgbench_accounts, pgbench_history"
+                )
+                sub_cur.execute(
+                    f"create subscription sub1 connection '{pub_connstr}' publication pub1"
+                )
+
+                initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur)
+            pub_conn.close()
+            sub_conn.close()
+
+            zenbenchmark.record(
+                "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER
+            )
+
+            pub_workload = pg_bin.run_nonblocking(
+                ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env
+            )
+            try:
+                sub_workload = pg_bin.run_nonblocking(
+                    ["pgbench", "-c10", pgbench_duration, "-S"],
+                    env=sub_env,
+                )
+                try:
+                    start = time.time()
+                    while time.time() - start < test_duration_min * 60:
+                        time.sleep(sync_interval_min * 60)
+                        check_pgbench_still_running(pub_workload, "pub")
+                        check_pgbench_still_running(sub_workload, "sub")
+
+                        pub_workload.terminate()
+                        neon_api.restart_endpoint(
+                            pub_project_id,
+                            pub_endpoint_id,
+                        )
+                        neon_api.wait_for_operation_to_finish(pub_project_id)
+                        pub_workload = pg_bin.run_nonblocking(
+                            ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
+                            env=pub_env,
+                        )
+                        with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect(
+                            sub_connstr
+                        ) as sub_conn:
+                            with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur:
+                                lag = measure_logical_replication_lag(sub_cur, pub_cur)
+
+                        log.info(f"Replica lagged behind master by {lag} seconds")
+                        zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
+
+                        # Measure storage to make sure replication information isn't bloating storage
+                        sub_storage = neon_api.get_project_details(sub_project_id)["project"][
+                            "synthetic_storage_size"
+                        ]
+                        pub_storage = neon_api.get_project_details(pub_project_id)["project"][
+                            "synthetic_storage_size"
+                        ]
+                        zenbenchmark.record(
+                            "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER
+                        )
+                        zenbenchmark.record(
+                            "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER
+                        )
+
+                finally:
+                    sub_workload.terminate()
+            finally:
+                pub_workload.terminate()
+        except Exception as e:
+            error_occurred = True
+            log.error(f"Caught exception {e}")
+            log.error(traceback.format_exc())
+        finally:
+            if not error_occurred:
+                neon_api.delete_project(sub_project_id)
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred
+        neon_api.delete_project(pub_project_id)
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -0,0 +1,296 @@
+from __future__ import annotations
+
+import csv
+import os
+import subprocess
+import time
+import traceback
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import psycopg2
+import psycopg2.extras
+import pytest
+from fixtures.benchmark_fixture import MetricReport
+from fixtures.common_types import Lsn
+from fixtures.log_helper import log
+from fixtures.neon_api import connection_parameters_to_env
+from fixtures.pg_version import PgVersion
+
+if TYPE_CHECKING:
+    from typing import Any, List, Optional
+
+    from fixtures.benchmark_fixture import NeonBenchmarker
+    from fixtures.neon_api import NeonAPI
+    from fixtures.neon_fixtures import PgBin
+
+
+# Granularity of ~0.5 sec
+def measure_replication_lag(master, replica, timeout_sec=600):
+    start = time.time()
+    master.execute("SELECT pg_current_wal_flush_lsn()")
+    master_lsn = Lsn(master.fetchall()[0][0])
+    while (time.time() - start) < timeout_sec:
+        replica.execute("select pg_last_wal_replay_lsn()")
+        replica_lsn = replica.fetchall()[0][0]
+        if replica_lsn:
+            if Lsn(replica_lsn) >= master_lsn:
+                return time.time() - start
+        time.sleep(0.5)
+    raise TimeoutError(f"Replication sync took more than {timeout_sec} sec")
+
+
+def check_pgbench_still_running(pgbench):
+    rc = pgbench.poll()
+    if rc is not None:
+        raise RuntimeError(f"Pgbench terminated early with return code {rc}")
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_ro_replica_lag(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    test_duration_min = 60
+    sync_interval_min = 10
+
+    pgbench_duration = f"-T{test_duration_min * 60 * 2}"
+
+    project = neon_api.create_project(pg_version)
+    project_id = project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+    error_occurred = False
+    try:
+        branch_id = project["branch"]["id"]
+        master_connstr = project["connection_uris"][0]["connection_uri"]
+        master_env = connection_parameters_to_env(
+            project["connection_uris"][0]["connection_parameters"]
+        )
+
+        replica = neon_api.create_endpoint(
+            project_id,
+            branch_id,
+            endpoint_type="read_only",
+            settings={"pg_settings": {"hot_standby_feedback": "on"}},
+        )
+        replica_env = master_env.copy()
+        replica_env["PGHOST"] = replica["endpoint"]["host"]
+        neon_api.wait_for_operation_to_finish(project_id)
+
+        replica_connstr = neon_api.get_connection_uri(
+            project_id,
+            endpoint_id=replica["endpoint"]["id"],
+        )["uri"]
+
+        pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env)
+
+        master_workload = pg_bin.run_nonblocking(
+            ["pgbench", "-c10", pgbench_duration, "-Mprepared"],
+            env=master_env,
+        )
+        try:
+            replica_workload = pg_bin.run_nonblocking(
+                ["pgbench", "-c10", pgbench_duration, "-S"],
+                env=replica_env,
+            )
+            try:
+                start = time.time()
+                while time.time() - start < test_duration_min * 60:
+                    check_pgbench_still_running(master_workload)
+                    check_pgbench_still_running(replica_workload)
+                    time.sleep(sync_interval_min * 60)
+                    with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect(
+                        replica_connstr
+                    ) as conn_replica:
+                        with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica:
+                            lag = measure_replication_lag(cur_master, cur_replica)
+                    log.info(f"Replica lagged behind master by {lag} seconds")
+                    zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER)
+            finally:
+                replica_workload.terminate()
+        finally:
+            master_workload.terminate()
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception: {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred  # Fail the test if an error occurred
+        neon_api.delete_project(project_id)
+
+
+def report_pgbench_aggregate_intervals(
+    output_dir: Path,
+    prefix: str,
+    zenbenchmark: NeonBenchmarker,
+):
+    for filename in os.listdir(output_dir):
+        if filename.startswith(prefix):
+            # The file will be in the form <prefix>_<node>.<pid>
+            # So we first lop off the .<pid>, and then lop off the prefix and the _
+            node = filename.split(".")[0][len(prefix) + 1 :]
+            with open(output_dir / filename) as f:
+                reader = csv.reader(f, delimiter=" ")
+                for line in reader:
+                    num_transactions = int(line[1])
+                    if num_transactions == 0:
+                        continue
+                    sum_latency = int(line[2])
+                    sum_lag = int(line[3])
+                    zenbenchmark.record(
+                        f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER
+                    )
+                    zenbenchmark.record(
+                        f"{node}_avg_latency",
+                        sum_latency / num_transactions,
+                        "s",
+                        MetricReport.LOWER_IS_BETTER,
+                    )
+                    zenbenchmark.record(
+                        f"{node}_avg_lag",
+                        sum_lag / num_transactions,
+                        "s",
+                        MetricReport.LOWER_IS_BETTER,
+                    )
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(2 * 60 * 60)
+def test_replication_start_stop(
+    pg_bin: PgBin,
+    test_output_dir: Path,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    """
+    Cycles through different configurations of read replicas being enabled disabled. The whole time,
+    there's a pgbench read/write workload going on the master. For each replica, we either turn it
+    on or off, and see how long it takes to catch up after some set amount of time of replicating
+    the pgbench.
+    """
+
+    prefix = "pgbench_agg"
+    num_replicas = 2
+    configuration_test_time_sec = 10 * 60
+    pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}"
+    error_occurred = False
+
+    project = neon_api.create_project(pg_version)
+    project_id = project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+    try:
+        branch_id = project["branch"]["id"]
+        master_connstr = project["connection_uris"][0]["connection_uri"]
+        master_env = connection_parameters_to_env(
+            project["connection_uris"][0]["connection_parameters"]
+        )
+
+        replicas = []
+        for _ in range(num_replicas):
+            replicas.append(
+                neon_api.create_endpoint(
+                    project_id,
+                    branch_id,
+                    endpoint_type="read_only",
+                    settings={"pg_settings": {"hot_standby_feedback": "on"}},
+                )
+            )
+            neon_api.wait_for_operation_to_finish(project_id)
+
+        replica_connstr = [
+            neon_api.get_connection_uri(
+                project_id,
+                endpoint_id=replicas[i]["endpoint"]["id"],
+            )["uri"]
+            for i in range(num_replicas)
+        ]
+        replica_env = [master_env.copy() for _ in range(num_replicas)]
+        for i in range(num_replicas):
+            replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"]
+
+        pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env)
+
+        # Sync replicas
+        with psycopg2.connect(master_connstr) as conn_master:
+            with conn_master.cursor() as cur_master:
+                for i in range(num_replicas):
+                    conn_replica = psycopg2.connect(replica_connstr[i])
+                    measure_replication_lag(cur_master, conn_replica.cursor())
+
+        master_pgbench = pg_bin.run_nonblocking(
+            [
+                "pgbench",
+                "-c10",
+                pgbench_duration,
+                "-Mprepared",
+                "--log",
+                f"--log-prefix={test_output_dir}/{prefix}_master",
+                f"--aggregate-interval={configuration_test_time_sec}",
+            ],
+            env=master_env,
+        )
+        replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)]
+
+        # Use the bits of iconfig to tell us which configuration we are on. For example
+        # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is
+        # alive.
+        for iconfig in range((1 << num_replicas) - 1, -1, -1):
+
+            def replica_enabled(iconfig: int = iconfig):
+                return bool((iconfig >> 1) & 1)
+
+            # Change configuration
+            for ireplica in range(num_replicas):
+                if replica_enabled() and replica_pgbench[ireplica] is None:
+                    replica_pgbench[ireplica] = pg_bin.run_nonblocking(
+                        [
+                            "pgbench",
+                            "-c10",
+                            "-S",
+                            pgbench_duration,
+                            "--log",
+                            f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}",
+                            f"--aggregate-interval={configuration_test_time_sec}",
+                        ],
+                        env=replica_env[ireplica],
+                    )
+                elif not replica_enabled() and replica_pgbench[ireplica] is not None:
+                    pgb = replica_pgbench[ireplica]
+                    assert pgb is not None
+                    pgb.terminate()
+                    pgb.wait()
+                    replica_pgbench[ireplica] = None
+
+                    neon_api.suspend_endpoint(
+                        project_id,
+                        replicas[ireplica]["endpoint"]["id"],
+                    )
+                    neon_api.wait_for_operation_to_finish(project_id)
+
+            time.sleep(configuration_test_time_sec)
+
+            with psycopg2.connect(master_connstr) as conn_master:
+                with conn_master.cursor() as cur_master:
+                    for ireplica in range(num_replicas):
+                        replica_conn = psycopg2.connect(replica_connstr[ireplica])
+                        lag = measure_replication_lag(cur_master, replica_conn.cursor())
+                        zenbenchmark.record(
+                            f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER
+                        )
+                        log.info(
+                            f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}"
+                        )
+        master_pgbench.terminate()
+    except Exception as e:
+        error_occurred = True
+        log.error(f"Caught exception {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not error_occurred
+        neon_api.delete_project(project_id)
+        # Only report results if we didn't error out
+        report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark)
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -8,8 +8,11 @@ from typing import TYPE_CHECKING, cast

 import pytest
 from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
    NeonEnvBuilder,
    check_restored_datadir_content,
+    tenant_get_shards,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import s3_storage
@@ -21,6 +24,97 @@ if TYPE_CHECKING:
    from pytest import CaptureFixture


+TENANT_CONF = {
+    # Scaled down thresholds so that we are exercising the pageserver beyond just writing
+    # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files.
+    "pitr_interval": "60s",
+    "checkpoint_distance": f"{8 * 1024 * 1024}",
+    "compaction_target_size": f"{8 * 1024 * 1024}",
+}
+
+# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
+# # There should have been compactions mid-test as well, this final check is in addition those.
+# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant):
+#     pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True)
+
+
+def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint):
+    """
+    After running some opaque tests that create interesting content in a timeline, run
+    some generic integrity checks that the storage stack is able to reproduce the written
+    data properly.
+    """
+
+    ignored_files: Optional[list[str]] = None
+
+    # Neon handles unlogged relations in a special manner. During a
+    # basebackup, we ship the init fork as the main fork. This presents a
+    # problem in that the endpoint's data directory and the basebackup will
+    # have differences and will fail the eventual file comparison.
+    #
+    # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
+    # support for setting the persistence of a table in 9.5. The reason that
+    # this doesn't affect versions < 15 (but probably would between 9.1 and
+    # 9.5) is that all the regression tests that deal with unlogged tables
+    # up until that point dropped the unlogged tables or set them to logged
+    # at some point during the test.
+    #
+    # In version 15, Postgres grew support for unlogged sequences, and with
+    # that came a few more regression tests. These tests did not all drop
+    # the unlogged tables/sequences prior to finishing.
+    #
+    # But unlogged sequences came with a bug in that, sequences didn't
+    # inherit the persistence of their "parent" tables if they had one. This
+    # was fixed and backported to 15, thus exacerbating our problem a bit.
+    #
+    # So what we can do is just ignore file differences between the data
+    # directory and basebackup for unlogged relations.
+    results = cast(
+        "list[tuple[str, str]]",
+        endpoint.safe_psql(
+            """
+        SELECT
+            relkind,
+            pg_relation_filepath(
+                pg_filenode_relation(reltablespace, relfilenode)
+            ) AS unlogged_relation_paths
+        FROM pg_class
+        WHERE relpersistence = 'u'
+        """,
+            dbname=db_name,
+        ),
+    )
+
+    unlogged_relation_files: list[str] = []
+    for r in results:
+        unlogged_relation_files.append(r[1])
+        # This is related to the following Postgres commit:
+        #
+        # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
+        # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+        # Date:   2023-08-23 09:21:31 -0500
+        #
+        # Use the buffer cache when initializing an unlogged index.
+        #
+        # This patch was backpatched to 16. Without it, the LSN in the
+        # page header would be 0/0 in the data directory, which wouldn't
+        # match the LSN generated during the basebackup, thus creating
+        # a difference.
+        if env.pg_version <= PgVersion.V15 and r[0] == "i":
+            unlogged_relation_files.append(f"{r[1]}_init")
+
+    ignored_files = unlogged_relation_files
+
+    check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
+
+    # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
+    # There should have been compactions mid-test as well, this final check is in addition those.
+    for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
+        pageserver.http_client().timeline_checkpoint(
+            shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
+        )
+
+
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
@pytest.mark.timeout(600)
@@ -45,7 +139,10 @@ def test_pg_regress(

    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start("main")
@@ -84,67 +181,7 @@ def test_pg_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

-        ignored_files: Optional[list[str]] = None
-
-        # Neon handles unlogged relations in a special manner. During a
-        # basebackup, we ship the init fork as the main fork. This presents a
-        # problem in that the endpoint's data directory and the basebackup will
-        # have differences and will fail the eventual file comparison.
-        #
-        # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
-        # support for setting the persistence of a table in 9.5. The reason that
-        # this doesn't affect versions < 15 (but probably would between 9.1 and
-        # 9.5) is that all the regression tests that deal with unlogged tables
-        # up until that point dropped the unlogged tables or set them to logged
-        # at some point during the test.
-        #
-        # In version 15, Postgres grew support for unlogged sequences, and with
-        # that came a few more regression tests. These tests did not all drop
-        # the unlogged tables/sequences prior to finishing.
-        #
-        # But unlogged sequences came with a bug in that, sequences didn't
-        # inherit the persistence of their "parent" tables if they had one. This
-        # was fixed and backported to 15, thus exacerbating our problem a bit.
-        #
-        # So what we can do is just ignore file differences between the data
-        # directory and basebackup for unlogged relations.
-        results = cast(
-            "list[tuple[str, str]]",
-            endpoint.safe_psql(
-                """
-            SELECT
-                relkind,
-                pg_relation_filepath(
-                    pg_filenode_relation(reltablespace, relfilenode)
-                ) AS unlogged_relation_paths
-            FROM pg_class
-            WHERE relpersistence = 'u'
-            """,
-                dbname=DBNAME,
-            ),
-        )
-
-        unlogged_relation_files: list[str] = []
-        for r in results:
-            unlogged_relation_files.append(r[1])
-            # This is related to the following Postgres commit:
-            #
-            # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
-            # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
-            # Date:   2023-08-23 09:21:31 -0500
-            #
-            # Use the buffer cache when initializing an unlogged index.
-            #
-            # This patch was backpatched to 16. Without it, the LSN in the
-            # page header would be 0/0 in the data directory, which wouldn't
-            # match the LSN generated during the basebackup, thus creating
-            # a difference.
-            if env.pg_version <= PgVersion.V15 and r[0] == "i":
-                unlogged_relation_files.append(f"{r[1]}_init")
-
-        ignored_files = unlogged_relation_files
-
-        check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
+    post_checks(env, test_output_dir, DBNAME, endpoint)


 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
@@ -159,16 +196,20 @@ def test_isolation(
    pg_distrib_dir: Path,
    shard_count: Optional[int],
 ):
+    DBNAME = "isolation_regression"
+
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+    )

    # Connect to postgres and create a database called "regression".
    # isolation tests use prepared transactions, so enable them
    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
-    endpoint.safe_psql("CREATE DATABASE isolation_regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_isolation_regress to run in.
    runpath = test_output_dir / "regress"
@@ -202,6 +243,9 @@ def test_isolation(
    with capsys.disabled():
        pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)

+    # This fails with a mismatch on `pg_multixact/offsets/0000`
+    # post_checks(env, test_output_dir, DBNAME, endpoint)
+

 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
@@ -215,15 +259,19 @@ def test_sql_regress(
    pg_distrib_dir: Path,
    shard_count: Optional[int],
 ):
+    DBNAME = "regression"
+
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("CREATE DATABASE regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
@@ -258,4 +306,4 @@ def test_sql_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

-        check_restored_datadir_content(test_output_dir, env, endpoint)
+    post_checks(env, test_output_dir, DBNAME, endpoint)
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -1,7 +1,11 @@
+from __future__ import annotations
+
 import random
 import time
+from typing import TYPE_CHECKING

-from fixtures.neon_fixtures import NeonEnv
+if TYPE_CHECKING:
+    from fixtures.neon_fixtures import NeonEnv


 def test_physical_replication(neon_simple_env: NeonEnv):
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -720,9 +720,30 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
    They should have the same effect.
    """

+    def assert_size_approx_equal_for_lease_test(size_lease, size_branch):
+        """
+        Tests that evaluate sizes are checking the pageserver space consumption
+        that sits many layers below the user input.  The exact space needed
+        varies slightly depending on postgres behavior.
+
+        Rather than expecting postgres to be determinstic and occasionally
+        failing the test, we permit sizes for the same data to vary by a few pages.
+        """
+
+        # FIXME(yuchen): The delta is too large, used as temp solution to pass the test reliably.
+        # Investigate and reduce the threshold.
+        threshold = 22 * 8272
+
+        log.info(
+            f"delta: size_branch({size_branch}) -  size_lease({size_lease}) = {size_branch - size_lease}"
+        )
+
+        assert size_lease == pytest.approx(size_branch, abs=threshold)
+
    conf = {
        "pitr_interval": "0s" if zero_gc else "3600s",
        "gc_period": "0s",
+        "compaction_period": "0s",
    }

    env = neon_env_builder.init_start(initial_tenant_conf=conf)
@@ -734,7 +755,7 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
    tenant, timeline = env.neon_cli.create_tenant(conf=conf)
    lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease")

-    assert_size_approx_equal(lease_res, ro_branch_res)
+    assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res)


 def insert_with_action(
@@ -754,7 +775,11 @@ def insert_with_action(
    """

    client = env.pageserver.http_client()
-    with env.endpoints.create_start("main", tenant_id=tenant) as ep:
+    with env.endpoints.create_start(
+        "main",
+        tenant_id=tenant,
+        config_lines=["autovacuum=off"],
+    ) as ep:
        initial_size = client.tenant_size(tenant)
        log.info(f"initial size: {initial_size}")

--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -152,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):

    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

+    size_limit_mb = 30
+
    endpoint_main = env.endpoints.create(
        "test_timeline_size_quota_on_startup",
        # Set small limit for the test
-        config_lines=["neon.max_cluster_size=30MB"],
+        config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"],
    )
    endpoint_main.start()

@@ -165,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):

            # Insert many rows. This query must fail because of space limit
            try:
-                for _i in range(5000):
-                    cur.execute(
-                        """
-                        INSERT INTO foo
-                            SELECT 'long string to consume some space' || g
-                            FROM generate_series(1, 100) g
-                    """
-                    )

-                # If we get here, the timeline size limit failed
-                log.error("Query unexpectedly succeeded")
+                def write_rows(count):
+                    for _i in range(count):
+                        cur.execute(
+                            """
+                            INSERT INTO foo
+                                SELECT 'long string to consume some space' || g
+                                FROM generate_series(1, 100) g
+                        """
+                        )
+
+                # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to
+                # the safekeeper, then try to write some more.  We expect either the initial writes or the ones after
+                # the wait_for_last_flush_lsn to generate an exception.
+                #
+                # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562)
+                write_rows(2500)
+                wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+                logical_size = env.pageserver.http_client().timeline_detail(
+                    env.initial_tenant, new_timeline_id
+                )["current_logical_size"]
+                assert logical_size > size_limit_mb * 1024 * 1024
+                write_rows(2500)
+
+                # If we get here, the timeline size limit failed.  Find out from the pageserver how large it
+                # thinks the timeline is.
+                wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+                logical_size = env.pageserver.http_client().timeline_detail(
+                    env.initial_tenant, new_timeline_id
+                )["current_logical_size"]
+                log.error(
+                    f"Query unexpectedly succeeded, pageserver logical size is {logical_size}"
+                )
                raise AssertionError()

            except psycopg2.errors.DiskFull as err:
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -30,13 +30,12 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures-channel = { version = "0.3", features = ["sink"] }
-futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
-futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 hashbrown = { version = "0.14", features = ["raw"] }
@@ -69,6 +68,7 @@ sha2 = { version = "0.10", features = ["asm"] }
 smallvec = { version = "1", default-features = false, features = ["const_new", "write"] }
 subtle = { version = "2" }
 sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
+tikv-jemalloc-sys = { version = "0.5" }
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
@@ -106,7 +106,9 @@ num-integer = { version = "0.1", features = ["i128"] }
 num-traits = { version = "0.2", features = ["i128", "libm"] }
 once_cell = { version = "1" }
 parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] }
+proc-macro2 = { version = "1" }
 prost = { version = "0.11" }
+quote = { version = "1" }
 regex = { version = "1" }
 regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
 regex-syntax = { version = "0.8" }
Author	SHA1	Message	Date
dependabot[bot]	c70eb6ea15	build(deps-dev): bump zipp from 3.8.1 to 3.19.1 Bumps [zipp](https://github.com/jaraco/zipp) from 3.8.1 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.8.1...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: indirect ... Signed-off-by: dependabot[bot] <support@github.com>	2024-07-09 18:12:57 +00:00
Christian Schwarz	3f7aebb01c	refactor: postgres_backend: replace abstract shutdown_watcher with CancellationToken (#8295 ) Preliminary refactoring while working on https://github.com/neondatabase/neon/issues/7427 and specifically https://github.com/neondatabase/neon/pull/8286	2024-07-09 21:11:11 +03:00
Tristan Partin	abc330e095	Add an application_name to more Neon connections Helps identify connections in the logs.	2024-07-09 12:42:09 -05:00
Tristan Partin	6d3cb222ee	Refactor how migrations are ran Just a small improvement I noticed while looking at fixing CVE-2024-4317 in Neon.	2024-07-09 12:42:09 -05:00
Alex Chi Z	b1fe8259b4	fix(storage-scrubber): use default AWS authentication (#8299 ) part of https://github.com/neondatabase/cloud/issues/14024 close https://github.com/neondatabase/neon/issues/7665 Things running in k8s container use this authentication: https://docs.aws.amazon.com/sdkref/latest/guide/feature-container-credentials.html while we did not configure the client to use it. This pull request simply uses the default s3 client credential chain for storage scrubber. It might break compatibility with minio. ## Summary of changes * Use default AWS credential provider chain. * Improvements for s3 errors, we now have detailed errors and correct backtrace on last trial of the operation. --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2024-07-09 18:41:37 +01:00
Conrad Ludgate	4a5b55c834	chore: fix nightly build (#8142 ) ## Problem `cargo +nightly check` fails ## Summary of changes Updates `measured`, `time`, and `crc32c`. * `measured`: updated to fix https://github.com/rust-lang/rust/issues/125763. * `time`: updated to fix https://github.com/rust-lang/rust/issues/125319 * `crc32c`: updated to remove some nightly feature detection with a removed nightly feature	2024-07-09 18:25:49 +01:00
Alex Chi Z	73fa3c014b	chore(storage-scrubber): allow disable file logging (#8297 ) part of https://github.com/neondatabase/cloud/issues/14024, k8s does not always have a volume available for logging, and I'm running into weird permission errors... While I could spend time figuring out how to create temp directories for logging, I think it would be better to just disable file logging as k8s containers are ephemeral and we cannot retrieve anything on the fs after the container gets removed. ## Summary of changes `PAGESERVER_DISABLE_FILE_LOGGING=1` -> file logging disabled Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-07-09 17:11:37 +01:00
Luca BRUNO	c196cf6ac1	proxy/http: avoid spurious vector reallocations This tweaks the rows-to-JSON rendering logic in order to avoid allocating 0-sized temporary vectors and later growing them to insert elements. As the exact size is known in advance, both vectors can be built with an exact capacity upfront. This will avoid further vector growing/reallocation in the rendering hotpath. Signed-off-by: Luca BRUNO <lucab@lucabruno.net>	2024-07-09 15:20:00 +01:00
Alexander Bayandin	8b15864f59	CI(promote-compatibility-data): take into account commit sha (#8283 ) ## Problem In https://github.com/neondatabase/neon/pull/8161, we changed the path to Neon artefacts by adding commit sha to it, but we missed adding these changes to `promote-compatibility-data` job that we use for backward/forward- compatibility testing. ## Summary of changes - Add commit sha to `promote-compatibility-data`	2024-07-09 08:39:10 +00:00
Yuchen Liang	d9c1068cf4	tests: increase approx size equal threshold to avoid `test_lsn_lease_size` flakiness (#8282 ) ## Summary of changes Increase the `assert_size_approx_equal` threshold to avoid flakiness of `test_lsn_lease_size`. Still needs more investigation to fully resolve #8293. - Also set `autovacuum=off` for the endpoint we are running in the test. Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-07-08 21:50:13 +01:00
John Spray	811eb88b89	tests: stabilize test_timeline_size_quota_on_startup (#8255 ) ## Problem `test_timeline_size_quota_on_startup` assumed that writing data beyond the size limit would always be blocked. This is not so: the limit is only enforced if feedback makes it back from the pageserver to the safekeeper + compute. Closes: https://github.com/neondatabase/neon/issues/6562 ## Summary of changes - Modify the test to wait for the pageserver to catch up. The size limit was never actually being enforced robustly, the original version of this test was just writing much more than 30MB and about 98% of the time getting lucky such that the feedback happened to arrive before the tests for loop was done. - If the test fails, log the logical size as seen by the pageserver.	2024-07-08 20:06:34 +00:00
Alex Chi Z	df3dc6e4c1	fix(pageserver): write to both v1+v2 for aux tenant import (#8316 ) close https://github.com/neondatabase/neon/issues/8202 ref https://github.com/neondatabase/neon/pull/6560 For tenant imports, we now write the aux files into both v1+v2 storage, so that the test case can pick either one for testing. Given the API is only used for testing, this looks like a safe change. Signed-off-by: Alex Chi Z <chi@neon.tech>	2024-07-08 20:05:59 +01:00
John Spray	daea26a22f	tests: use smaller layers in test_pg_regress (#8232 ) ## Problem Debug-mode runs of test_pg_regress are rather slow since https://github.com/neondatabase/neon/pull/8105, and occasionally exceed their 600s timeout. ## Summary of changes - Use 8MiB layer files, avoiding large ephemeral layers On a hetzner AX102, this takes the runtime from 230s to 190s. Which hopefully will be enough to get the runtime on github runners more reliably below its 600s timeout. This has the side benefit of exercising more of the pageserver stack (including compaction) under a workload that exercises a more diverse set of postgres functionality than most of our tests.	2024-07-08 19:05:35 +00:00
Alexey Kondratov	84b039e615	compute_ctl: Use 'fast' shutdown for Postgres termination (#8289 ) ## Problem We currently use 'immediate' mode in the most commonly used shutdown path, when the control plane calls a `compute_ctl` API to terminate Postgres inside compute without waiting for the actual pod / VM termination. Yet, 'immediate' shutdown doesn't create a shutdown checkpoint and ROs have bad times figuring out the list of running xacts during next start. ## Summary of changes Use 'fast' mode, which creates a shutdown checkpoint that is important for ROs to get a list of running xacts faster instead of going through the CLOG. On the control plane side, we poll this `compute_ctl` termination API for 10s, it should be enough as we don't really write any data at checkpoint time. If it times out, we anyway switch to the slow k8s-based termination. See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. The default VM shutdown hook already uses `fast` mode, see [1] [1] `c9fd8d7693/vm-image-spec.yaml (L30-L31)` Related to #6211	2024-07-08 19:54:02 +02:00
Yuchen Liang	a68edad913	refactor: move part of sharding API from `pageserver_api` to `utils` (#8254 ) ## Problem LSN Leases introduced in #8084 is a new API that is made shard-aware from day 1. To support ephemeral endpoint in #7994 without linking Postgres C API against `compute_ctl`, part of the sharding needs to reside in `utils`. ## Summary of changes - Create a new `shard` module in utils crate. - Move more interface related part of tenant sharding API to utils and re-export them in pageserver_api. Signed-off-by: Yuchen Liang <yuchen@neon.tech>	2024-07-08 15:43:10 +01:00
John Spray	fcdf060816	pageserver: respect has_relmap_file in collect_keyspace (#8276 ) ## Problem Rarely, a dbdir entry can exist with no `relmap_file_key` data. This causes compaction to fail, because it assumes that if the database exists, then so does the relmap file. Basebackup already handled this using a boolean to record whether such a key exists, but `collect_keyspace` didn't. ## Summary of changes - Respect the flag for whether a relfilemap exists in collect_keyspace - The reproducer for this issue will merge separately in https://github.com/neondatabase/neon/pull/8232	2024-07-08 15:39:41 +01:00
Tristan Partin	1c57f6bac3	Add long running replication tests These tests will help verify that replication, both physical and logical, works as expected in Neon. Co-authored-by: Sasha Krassovsky <sasha@neon.tech>	2024-07-08 07:30:22 -07:00
Tristan Partin	b54dd9af15	Add PgBin.run_nonblocking() Allows a process to run without blocking program execution, which can be useful for certain test scenarios. Co-authored-by: Sasha Krassovsky <sasha@neon.tech>	2024-07-08 07:30:22 -07:00
Tristan Partin	118847cd41	Log PG environment variables when a PgBin runs Useful for debugging situations like connecting to databases. Co-authored-by: Sasha Krassovsky <sasha@neon.tech>	2024-07-08 07:30:22 -07:00
Tristan Partin	f2ec542954	Add Neon HTTP API test fixture This is a Python binding to the Neon HTTP API. It isn't complete, but can be extended as necessary. Co-authored-by: Sasha Krassovsky <sasha@neon.tech>	2024-07-08 07:30:22 -07:00
Tristan Partin	2a3410d1c3	Hide import behind TYPE_CHECKING No need to import it if we aren't type checking anything.	2024-07-08 07:30:22 -07:00
John Spray	1121a1cbac	pageserver: switch to jemalloc (#8307 ) ## Problem - Resident memory on long running pageserver processes tends to climb: memory fragmentation is suspected. - Total resident memory may be a limiting factor for running on smaller nodes. ## Summary of changes - As a low-energy experiment, switch the pageserver to use jemalloc (not a net-new dependency, proxy already use it) - Decide at end of week whether to revert before next release.	2024-07-08 14:10:42 +01:00
Alex Chi Z	154ba5e1b4	fix(pageserver): ensure sparse keyspace is ordered (#8285 ) ## Problem Sparse keyspaces were constructed with ranges out of order: this didn't break things obviously, but meant that users of KeySpace functions that assume ordering would assert out. Closes https://github.com/neondatabase/neon/issues/8277 ## Summary of changes make sure the sparse keyspace has ordered keyspace parts	2024-07-08 14:05:49 +01:00