Proxy release 2025-05-20 06:01 UTC

pageserver: don't pass config to PageHandler (#11973)
## Problem

The gRPC page service API will require decoupling the `PageHandler` from the libpq protocol implementation. As preparation for this, avoid passing in the entire server config to `PageHandler`, and instead explicitly pass in the relevant fields.

Touches https://github.com/neondatabase/neon/issues/11728.

## Summary of changes

* Change `PageHandler` to take a `GetVectoredConcurrentIo` instead of the entire config.
* Change `IoConcurrency::spawn_from_conf` to take a `GetVectoredConcurrentIo`.
2026-01-30 16:50:37 +00:00 · 2025-05-20 06:01:25 +00:00 · 2025-05-19 15:47:40 +00:00 · 2025-05-19 11:17:45 +00:00 · 2025-05-19 10:56:03 +00:00 · 2025-05-19 10:10:55 +00:00
97 changed files with 2229 additions and 637 deletions
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -49,10 +49,6 @@ inputs:
    description: 'A JSON object with project settings'
    required: false
    default: '{}'
-  default_endpoint_settings:
-    description: 'A JSON object with the default endpoint settings'
-    required: false
-    default: '{}'

 outputs:
  dsn:
@@ -139,21 +135,6 @@ runs:
            -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
            -d "{\"scheduling\": \"Essential\"}"
        fi
-        # XXX
-        # This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API.
-        # https://github.com/neondatabase/cloud/issues/27108
-        if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then
-          PROJECT_DATA=$(curl -X GET \
-              "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \
-              -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-              -d "{\"scheduling\": \"Essential\"}"
-          )
-          NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}")
-          curl -X POST --fail \
-                "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \
-                -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
-                --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}"
-        fi
        

      env:
@@ -171,4 +152,3 @@ runs:
        PSQL: ${{ inputs.psql_path }}
        LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }}
        PROJECT_SETTINGS: ${{ inputs.project_settings }}
-        DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -279,18 +279,14 @@ jobs:
          # run all non-pageserver tests
          ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'

-          # run pageserver tests with different settings
-          for get_vectored_concurrent_io in sequential sidecar-task; do
-            for io_engine in std-fs tokio-epoll-uring ; do
-                for io_mode in buffered direct direct-rw ; do
-                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \
-                  ${cov_prefix} \
-                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
-              done
-            done
-          done
+          # run pageserver tests
+          # (When developing new pageserver features gated by config fields, we commonly make the rust
+          # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME.
+          # Then run the nextest invocation below for all relevant combinations. Singling out the
+          # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.)
+          NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring  \
+          ${cov_prefix} \
+          cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'

          # Run separate tests for real S3
          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
@@ -405,8 +401,6 @@ jobs:
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
-          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

      # Temporary disable this step until we figure out why it's so flaky
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,8 +323,6 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
-          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
          SYNC_BETWEEN_TESTS: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
@@ -965,7 +963,7 @@ jobs:
          fi

      - name: Verify docker-compose example and test extensions
-        timeout-minutes: 20
+        timeout-minutes: 60
        env:
          TAG: >-
            ${{
--- a/.github/workflows/cloud-extensions.yml
+++ b/.github/workflows/cloud-extensions.yml
@@ -35,7 +35,7 @@ jobs:
      matrix:
        pg-version: [16, 17]

-    runs-on: [ self-hosted, small ]
+    runs-on: us-east-2
    container:
      # We use the neon-test-extensions image here as it contains the source code for the extensions.
      image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest
@@ -71,20 +71,7 @@ jobs:
          region_id: ${{ inputs.region_id || 'aws-us-east-2' }}
          postgres_version: ${{ matrix.pg-version }}
          project_settings: ${{ steps.project-settings.outputs.settings }}
-          # We need these settings to get the expected output results.
-          # We cannot use the environment variables e.g. PGTZ due to
-          # https://github.com/neondatabase/neon/issues/1287
-          default_endpoint_settings: >
-            {
-              "pg_settings": {
-                "DateStyle": "Postgres,MDY",
-                "TimeZone": "America/Los_Angeles",
-                "compute_query_id": "off",
-                "neon.allow_unstable_extensions": "on"
-              }
-            }
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
-          admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }}

      - name: Run the regression tests
        run: /run-tests.sh -r /ext-src
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -63,8 +63,10 @@ jobs:

      - name: Filter out only v-string for build matrix
        id: postgres_changes
+        env:
+          CHANGES: ${{ steps.files_changed.outputs.changes }}
        run: |
-          v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
+          v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c)
          echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}"

  check-macos-build:
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1112,6 +1112,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"

+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
 [[package]]
 name = "cgroups-rs"
 version = "0.3.3"
@@ -1306,7 +1312,7 @@ dependencies = [
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "notify",
 "num_cpus",
 "once_cell",
@@ -1429,7 +1435,7 @@ dependencies = [
 "humantime-serde",
 "hyper 0.14.30",
 "jsonwebtoken",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "pageserver_api",
 "pageserver_client",
@@ -3512,9 +3518,9 @@ dependencies = [

 [[package]]
 name = "libc"
-version = "0.2.169"
+version = "0.2.172"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
+checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"

 [[package]]
 name = "libloading"
@@ -3788,6 +3794,16 @@ version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"

+[[package]]
+name = "neon-shmem"
+version = "0.1.0"
+dependencies = [
+ "nix 0.30.1",
+ "tempfile",
+ "thiserror 1.0.69",
+ "workspace_hack",
+]
+
 [[package]]
 name = "never-say-never"
 version = "6.6.666"
@@ -3821,12 +3837,13 @@ dependencies = [

 [[package]]
 name = "nix"
-version = "0.27.1"
+version = "0.30.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053"
+checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6"
 dependencies = [
 "bitflags 2.8.0",
 "cfg-if",
+ "cfg_aliases",
 "libc",
 "memoffset 0.9.0",
 ]
@@ -4280,7 +4297,7 @@ dependencies = [
 "jsonwebtoken",
 "md5",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "num-traits",
 "num_cpus",
 "once_cell",
@@ -4331,6 +4348,7 @@ dependencies = [
 "toml_edit",
 "tracing",
 "tracing-utils",
+ "twox-hash",
 "url",
 "utils",
 "uuid",
@@ -4355,7 +4373,7 @@ dependencies = [
 "humantime",
 "humantime-serde",
 "itertools 0.10.5",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "postgres_backend",
 "postgres_ffi",
@@ -4416,6 +4434,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_page_api"
+version = "0.1.0"
+dependencies = [
+ "prost 0.13.3",
+ "tonic",
+ "tonic-build",
+ "workspace_hack",
+]
+
 [[package]]
 name = "papaya"
 version = "0.2.1"
@@ -7898,7 +7926,7 @@ dependencies = [
 "humantime",
 "jsonwebtoken",
 "metrics",
- "nix 0.27.1",
+ "nix 0.30.1",
 "once_cell",
 "pem",
 "pin-project-lite",
@@ -8474,6 +8502,7 @@ dependencies = [
 "log",
 "memchr",
 "nix 0.26.4",
+ "nix 0.30.1",
 "nom",
 "num",
 "num-bigint",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
+    "pageserver/page_api",
    "proxy",
    "safekeeper",
    "safekeeper/client",
@@ -23,6 +24,7 @@ members = [
    "libs/postgres_ffi",
    "libs/safekeeper_api",
    "libs/desim",
+    "libs/neon-shmem",
    "libs/utils",
    "libs/consumption_metrics",
    "libs/postgres_backend",
@@ -127,7 +129,7 @@ md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
 memoffset = "0.9"
-nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
+nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
 # Do not update to >= 7.0.0, at least. The update will have a significant impact
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
@@ -251,6 +253,7 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
+pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.86.0
+ENV RUSTC_VERSION=1.87.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/patches/rum.patch
+++ b/compute/patches/rum.patch
@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
 			 RelationGetRelationName(index));
 
 +#ifdef NEON_SMGR
-+	smgr_start_unlogged_build(index->rd_smgr);
+	smgr_start_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
 	initRumState(&buildstate.rumstate, index);
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
 	rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
 
 +#ifdef NEON_SMGR
-+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+	smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
 +#endif
 +
 	/*
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
 	}
 
 +#ifdef NEON_SMGR
-+	smgr_end_unlogged_build(index->rd_smgr);
+	smgr_end_unlogged_build(RelationGetSmgr(index));
 +#endif
 +
 	/*
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -14,7 +14,7 @@

 use std::ffi::OsStr;
 use std::io::Write;
-use std::os::unix::prelude::AsRawFd;
+use std::os::fd::AsFd;
 use std::os::unix::process::CommandExt;
 use std::path::Path;
 use std::process::Command;
@@ -356,7 +356,7 @@ where
            let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
            // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
            // remains locked after exec.
-            nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
+            nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
                .expect("remove FD_CLOEXEC");
            // Don't run drop(file), it would close the file before we actually exec.
            std::mem::forget(file);
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -8,7 +8,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeSet, HashMap};
 use std::fs::File;
-use std::os::fd::AsRawFd;
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
@@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
 };
-use nix::fcntl::{FlockArg, flock};
+use nix::fcntl::{Flock, FlockArg};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -749,16 +748,16 @@ struct TimelineTreeEl {

 /// A flock-based guard over the neon_local repository directory
 struct RepoLock {
-    _file: File,
+    _file: Flock<File>,
 }

 impl RepoLock {
    fn new() -> Result<Self> {
        let repo_dir = File::open(local_env::base_path())?;
-        let repo_dir_fd = repo_dir.as_raw_fd();
-        flock(repo_dir_fd, FlockArg::LockExclusive)?;
-
-        Ok(Self { _file: repo_dir })
+        match Flock::lock(repo_dir, FlockArg::LockExclusive) {
+            Ok(f) => Ok(Self { _file: f }),
+            Err((_, e)) => Err(e).context("flock error"),
+        }
    }
 }

--- a/docker-compose/ext-src/alter_db.sh
+++ b/docker-compose/ext-src/alter_db.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Usage: alter_db.sh [DATABASE] -- apply the settings needed to get the expected output results.
+# We cannot use the environment variables e.g. PGTZ due to
+# https://github.com/neondatabase/neon/issues/1287
+export DATABASE=${1:-contrib_regression}
+psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \
+     -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \
+     -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'"
--- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh
+++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh
@@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/}
 TESTS=${TESTS/sqli_connection/}
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression
 ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS}

--- a/docker-compose/ext-src/pgrag-src/regular-test.sh
+++ b/docker-compose/ext-src/pgrag-src/regular-test.sh
@@ -3,6 +3,7 @@ set -ex
 cd "$(dirname "${0}")"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag"
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin'    --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions
--- a/docker-compose/ext-src/pgx_ulid-src/Makefile
+++ b/docker-compose/ext-src/pgx_ulid-src/Makefile
@@ -20,5 +20,6 @@ installcheck: regression-test
 regression-test:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)"
 	$(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS)
--- a/docker-compose/ext-src/plv8-src/regular-test.sh
+++ b/docker-compose/ext-src/plv8-src/regular-test.sh
@@ -3,6 +3,7 @@ set -ex
 cd "$(dirname ${0})"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')"
 REGRESS="${REGRESS/startup_perms/}"
--- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
+++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
 installcheck:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15"
 	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
--- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
+++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile
@@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress
 installcheck:
 	dropdb --if-exists contrib_regression
 	createdb contrib_regression
+	../alter_db.sh
 	psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en"
 	$(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS)
--- a/docker-compose/ext-src/rum-src/regular-test.sh
+++ b/docker-compose/ext-src/rum-src/regular-test.sh
@@ -3,5 +3,6 @@ set -ex
 cd "$(dirname ${0})"
 dropdb --if-exist contrib_regression
 createdb contrib_regression
+. ../alter_db.sh
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        if var(REAL_S3_ENV).is_ok() {
            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
        }
+
+        #[cfg(target_os = "linux")]
        assert!(body.contains("process_threads"));
    }

--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "neon-shmem"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+thiserror.workspace = true
+nix.workspace=true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[target.'cfg(target_os = "macos")'.dependencies]
+tempfile = "3.14.0"
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -0,0 +1,418 @@
+//! Shared memory utilities for neon communicator
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Creates a fresh shared memory area named `name`. Processes that want to communicate
+    /// through it must be fork()'d after this call, so that they all inherit the ShmemHandle.
+    ///
+    /// Dropping the ShmemHandle only unmaps the memory from the current process; other
+    /// processes can keep using it.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // First create the anonymous backing file, then hand its descriptor over to
+        // new_with_fd() to build the handle on top of it.
+        let backing_fd = create_backing_file(name)?;
+        Self::new_with_fd(backing_fd, initial_size, max_size)
+    }
+
+    /// Set up the area on top of backing file `fd`. `initial_size` and `max_size` are user-data
+    /// sizes; the SharedStruct header comes on top. Panics if a size is out of range.
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // The high-order bit of the size is reserved for the RESIZE_IN_PROGRESS flag, and the
+        // header adds a little on top. Anything near that limit would exhaust memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {} too large", max_size);
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the header.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Size the file before mapping it: if this fails nothing is mapped yet, so the early
+        // return cannot leak a mapping (there is no ShmemHandle yet whose Drop would unmap it).
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Reserve address space for the full 'max_size' with mmap. (Error's Display impl
+        // appends the errno itself, so the message below carries no "{e}" placeholder.)
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // Returns a shared reference to the header stored at the start of the mapping.
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area; the call panics if it is.
+    ///
+    /// Only one process/thread may resize the area at a time. A concurrent call is detected
+    /// through the RESIZE_IN_PROGRESS bit in 'current_size' and returns an Error.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the RESIZE_IN_PROGRESS bit in 'current_size'. The low bits
+        // keep the old size, so readers that mask the flag off see it until the resize is done.
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current size of the user data area, i.e. the segment size minus the header.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let raw = self.shared().current_size.load(Ordering::Relaxed);
+        let with_header = raw & !RESIZE_IN_PROGRESS;
+        with_header - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create() to create an
+/// anonymous in-memory file. On macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+/// Grow the backing file behind `fd` to `size` bytes. Callers pass the total size, i.e. the
+/// user-visible size plus the header.
+///
+/// The errno is appended by `Error`'s Display impl, so the messages here don't embed it.
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {}", i);
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                ptr.write(SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                })
+            }
+        }
+
+        pub fn wait(&self) {
+            // AcqRel / Acquire rather than Relaxed: memory written before the barrier by one
+            // process must be visible to the others once they get past it.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+            let generation = old / self.num_procs;
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -235,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
    ScatteredLsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
@@ -305,6 +305,7 @@ impl From<OtelExporterProtocol> for tracing_utils::Protocol {
 pub struct TimelineImportConfig {
    pub import_job_concurrency: NonZeroUsize,
    pub import_job_soft_size_limit: NonZeroUsize,
+    pub import_job_checkpoint_threshold: NonZeroUsize,
 }

 pub mod statvfs {
@@ -639,23 +640,15 @@ impl Default for ConfigToml {
            tenant_config: TenantConfigToml::default(),
            no_sync: None,
            wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
-            page_service_pipelining: if !cfg!(test) {
-                PageServicePipeliningConfig::Serial
-            } else {
-                // Do not turn this into the default until scattered reads have been
-                // validated and rolled-out fully.
-                PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
+            page_service_pipelining: PageServicePipeliningConfig::Pipelined(
+                PageServicePipeliningConfigPipelined {
                    max_batch_size: NonZeroUsize::new(32).unwrap(),
                    execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
                    batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
-                })
-            },
-            get_vectored_concurrent_io: if !cfg!(test) {
-                GetVectoredConcurrentIo::Sequential
-            } else {
-                GetVectoredConcurrentIo::SidecarTask
-            },
-            enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") {
+                },
+            ),
+            get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask,
+            enable_read_path_debugging: if cfg!(feature = "testing") {
                Some(true)
            } else {
                None
@@ -669,6 +662,7 @@ impl Default for ConfigToml {
            timeline_import_config: TimelineImportConfig {
                import_job_concurrency: NonZeroUsize::new(128).unwrap(),
                import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
+                import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
            },
        }
    }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -910,6 +910,11 @@ impl Key {
        self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff
    }

+    #[inline(always)]
+    pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool {
+        self.is_rel_block_key() && self.field4 == rel
+    }
+
    #[inline(always)]
    pub fn is_rel_dir_key(&self) -> bool {
        self.field1 == 0x00
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -336,14 +336,30 @@ impl TimelineCreateRequest {

 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 pub enum ShardImportStatus {
-    InProgress,
+    InProgress(Option<ShardImportProgress>),
    Done,
    Error(String),
 }
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum ShardImportProgress {
+    V1(ShardImportProgressV1),
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub struct ShardImportProgressV1 {
+    /// Total number of jobs in the import plan
+    pub jobs: usize,
+    /// Number of jobs completed
+    pub completed: usize,
+    /// Hash of the plan
+    pub import_plan_hash: u64,
+}
+
 impl ShardImportStatus {
    pub fn is_terminal(&self) -> bool {
        match self {
-            ShardImportStatus::InProgress => false,
+            ShardImportStatus::InProgress(_) => false,
            ShardImportStatus::Done | ShardImportStatus::Error(_) => true,
        }
    }
@@ -1803,7 +1819,6 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
-    use std::sync::LazyLock;

    #[derive(
        Copy,
@@ -1851,15 +1866,7 @@ pub mod virtual_file {

    impl IoMode {
        pub fn preferred() -> Self {
-            // The default behavior when running Rust unit tests without any further
-            // flags is to use the newest behavior (DirectRw).
-            // The CI uses the environment variable to unit tests for all different modes.
-            // NB: the Python regression & perf tests have their own defaults management
-            // that writes pageserver.toml; they do not use this variable.
-            static ENV_OVERRIDE: LazyLock<Option<IoMode>> = LazyLock::new(|| {
-                utils::env::var_serde_json_string("NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE")
-            });
-            ENV_OVERRIDE.unwrap_or(IoMode::DirectRw)
+            IoMode::DirectRw
        }
    }

--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -4,6 +4,7 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
+use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};

 use crate::controller_api::NodeRegisterRequest;
@@ -63,9 +64,17 @@ pub struct ValidateResponseTenant {
    pub valid: bool,
 }

+#[derive(Serialize, Deserialize)]
+pub struct TimelineImportStatusRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub generation: Generation,
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct PutTimelineImportStatusRequest {
    pub tenant_shard_id: TenantShardId,
    pub timeline_id: TimelineId,
    pub status: ShardImportStatus,
+    pub generation: Generation,
 }
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,7 +1,7 @@
 use std::borrow::Cow;
 use std::fs::{self, File};
 use std::io::{self, Write};
-use std::os::fd::AsRawFd;
+use std::os::fd::AsFd;

 use camino::{Utf8Path, Utf8PathBuf};

@@ -210,13 +210,13 @@ pub fn overwrite(

 /// Syncs the filesystem for the given file descriptor.
 #[cfg_attr(target_os = "macos", allow(unused_variables))]
-pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> {
+pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> {
    // Linux guarantees durability for syncfs.
    // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
    #[cfg(target_os = "linux")]
    {
        use anyhow::Context;
-        nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?;
+        nix::unistd::syncfs(fd).context("syncfs")?;
    }
    #[cfg(target_os = "macos")]
    {
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -11,9 +11,9 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
        #[cfg(all(target_os = "linux", target_env = "gnu"))]
        {
            nix::fcntl::renameat2(
-                None,
+                nix::fcntl::AT_FDCWD,
                src,
-                None,
+                nix::fcntl::AT_FDCWD,
                dst,
                nix::fcntl::RenameFlags::RENAME_NOREPLACE,
            )
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -1,6 +1,6 @@
 //! A module to create and read lock files.
 //!
-//! File locking is done using [`fcntl::flock`] exclusive locks.
+//! File locking is done using [`nix::fcntl::Flock`] exclusive locks.
 //! The only consumer of this module is currently
 //! [`pid_file`](crate::pid_file). See the module-level comment
 //! there for potential pitfalls with lock files that are used
@@ -9,26 +9,25 @@
 use std::fs;
 use std::io::{Read, Write};
 use std::ops::Deref;
-use std::os::unix::prelude::AsRawFd;

 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno::EAGAIN;
-use nix::fcntl;
+use nix::fcntl::{Flock, FlockArg};

 use crate::crashsafe;

-/// A handle to an open and unlocked, but not-yet-written lock file.
+/// A handle to an open and flocked, but not-yet-written lock file.
 /// Returned by [`create_exclusive`].
 #[must_use]
 pub struct UnwrittenLockFile {
    path: Utf8PathBuf,
-    file: fs::File,
+    file: Flock<fs::File>,
 }

 /// Returned by [`UnwrittenLockFile::write_content`].
 #[must_use]
-pub struct LockFileGuard(fs::File);
+pub struct LockFileGuard(Flock<fs::File>);

 impl Deref for LockFileGuard {
    type Target = fs::File;
@@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLo
        .open(lock_file_path)
        .context("open lock file")?;

-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
    match res {
-        Ok(()) => Ok(UnwrittenLockFile {
+        Ok(lock_file) => Ok(UnwrittenLockFile {
            path: lock_file_path.to_owned(),
            file: lock_file,
        }),
-        Err(EAGAIN) => anyhow::bail!("file is already locked"),
-        Err(e) => Err(e).context("flock error"),
+        Err((_, EAGAIN)) => anyhow::bail!("file is already locked"),
+        Err((_, e)) => Err(e).context("flock error"),
    }
 }

@@ -105,32 +101,37 @@ pub enum LockFileRead {
 /// Check the [`LockFileRead`] variants for details.
 pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
-    let mut lock_file = match res {
+    let lock_file = match res {
        Ok(f) => f,
        Err(e) => match e.kind() {
            std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
            _ => return Err(e).context("open lock file"),
        },
    };
-    let res = fcntl::flock(
-        lock_file.as_raw_fd(),
-        fcntl::FlockArg::LockExclusiveNonblock,
-    );
+    let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock);
    // We need the content regardless of lock success / failure.
    // But, read it after flock so that, if it succeeded, the content is consistent.
-    let mut content = String::new();
-    lock_file
-        .read_to_string(&mut content)
-        .context("read lock file")?;
    match res {
-        Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
-            LockFileGuard(lock_file),
-            content,
-        )),
-        Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
-            not_locked_file: lock_file,
-            content,
-        }),
-        Err(e) => Err(e).context("flock error"),
+        Ok(mut locked_file) => {
+            let mut content = String::new();
+            locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::NotHeldByAnyProcess(
+                LockFileGuard(locked_file),
+                content,
+            ))
+        }
+        Err((mut not_locked_file, EAGAIN)) => {
+            let mut content = String::new();
+            not_locked_file
+                .read_to_string(&mut content)
+                .context("read lock file")?;
+            Ok(LockFileRead::LockedByOtherProcess {
+                not_locked_file,
+                content,
+            })
+        }
+        Err((_, e)) => Err(e).context("flock error"),
    }
 }
--- a/libs/utils/src/tracing_span_assert.rs
+++ b/libs/utils/src/tracing_span_assert.rs
@@ -127,12 +127,12 @@ macro_rules! __check_fields_present {

            match check_fields_present0($extractors) {
                Ok(FoundEverything) => Ok(()),
-                Ok(Unconfigured) if cfg!(test) => {
+                Ok(Unconfigured) if cfg!(feature = "testing") => {
                    // allow unconfigured in tests
                    Ok(())
                },
                Ok(Unconfigured) => {
-                    panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer")
+                    panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#)
                },
                Err(missing) => Err(missing)
            }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -96,6 +96,7 @@ strum.workspace = true
 strum_macros.workspace = true
 wal_decoder.workspace = true
 smallvec.workspace = true
+twox-hash.workspace = true

 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "pageserver_page_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+prost.workspace = true
+tonic.workspace = true
+workspace_hack.workspace = true
+
+[build-dependencies]
+tonic-build.workspace = true
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -0,0 +1,13 @@
+use std::env;
+use std::path::PathBuf;
+
+/// Generates Rust code from .proto Protobuf schemas, along with a binary file
+/// descriptor set for Protobuf schema reflection.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
+    tonic_build::configure()
+        .bytes(["."])
+        .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
+        .compile_protos(&["proto/page_service.proto"], &["proto"])
+        .map_err(|err| err.into())
+}
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -0,0 +1,233 @@
+// Page service, presented by pageservers for computes.
+//
+// This is the compute read path. It primarily serves page versions at given
+// LSNs, but also base backups, SLRU segments, and relation metadata.
+//
+// EXPERIMENTAL: this is still under development and subject to change.
+//
+// Request metadata headers:
+// - authorization: JWT token ("Bearer <token>"), if auth is enabled
+// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
+// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
+// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
+//
+// The service can be accessed via e.g. grpcurl:
+//
+//    ```
+//    grpcurl \
+//      -plaintext \
+//      -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
+//      -H "neon-shard-id: 0b10" \
+//      -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
+//      -H "authorization: Bearer $JWT" \
+//      -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' \
+//      localhost:51051 page_api.PageService/CheckRelExists
+//    ```
+//
+// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
+// However, this will require reconnecting when changing modes.
+//
+// TODO: write implementation guidance on
+// - Health checks
+// - Tracing, OpenTelemetry
+// - Compression
+
+syntax = "proto3";
+package page_api;
+
+service PageService {
+  // Returns whether a relation exists.
+  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
+
+  // Fetches a base backup.
+  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
+
+  // Returns the total size of a database, as # of bytes.
+  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
+
+  // Fetches pages.
+  //
+  // This is implemented as a bidirectional streaming RPC for performance. Unary
+  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
+  // authentication, and so on -- with streaming, we only pay these costs during
+  // the initial stream setup. This ~doubles throughput in benchmarks. Other
+  // RPCs use regular unary requests, since they are not as frequent and
+  // performance-critical, and this simplifies implementation.
+  //
+  // NB: a status response (e.g. errors) will terminate the stream. The stream
+  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
+  // Most errors are therefore sent as GetPageResponse.status instead.
+  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
+
+  // Returns the size of a relation, as # of blocks.
+  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
+
+  // Fetches an SLRU segment.
+  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+}
+
+// The LSN a request should read at.
+message ReadLsn {
+  // The request's read LSN. Required.
+  uint64 request_lsn = 1;
+  // If given, the caller guarantees that the page has not been modified since
+  // this LSN. Must be smaller than or equal to request_lsn. This allows the
+  // Pageserver to serve an old page without waiting for the request LSN to
+  // arrive. Valid for all request types.
+  //
+  // It is undefined behaviour to make a request such that the page was, in
+  // fact, modified between request_lsn and not_modified_since_lsn. The
+  // Pageserver might detect it and return an error, or it might return the old
+  // page version or the new page version. Setting not_modified_since_lsn equal
+  // to request_lsn is always safe, but can lead to unnecessary waiting.
+  uint64 not_modified_since_lsn = 2;
+}
+
+// A relation identifier.
+message RelTag {
+  uint32 spc_oid = 1;
+  uint32 db_oid = 2;
+  uint32 rel_number = 3;
+  uint32 fork_number = 4;
+}
+
+// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
+// other shards will error.
+message CheckRelExistsRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message CheckRelExistsResponse {
+  bool exists = 1;
+}
+
+// Requests a base backup at a given LSN.
+message GetBaseBackupRequest {
+  // The LSN to fetch a base backup at.
+  ReadLsn read_lsn = 1;
+  // If true, logical replication slots will not be created.
+  bool replica = 2;
+}
+
+// Base backup response chunk, returned as an ordered stream.
+message GetBaseBackupResponseChunk {
+  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
+  // gRPC message size limit.
+  bytes chunk = 1;
+}
+
+// Requests the size of a database, as # of bytes. Only valid on shard 0, other
+// shards will error.
+message GetDbSizeRequest {
+  ReadLsn read_lsn = 1;
+  uint32 db_oid = 2;
+}
+
+message GetDbSizeResponse {
+  uint64 num_bytes = 1;
+}
+
+// Requests one or more pages.
+message GetPageRequest {
+  // A request ID. Will be included in the response. Should be unique for
+  // in-flight requests on the stream.
+  uint64 request_id = 1;
+  // The request class.
+  GetPageClass request_class = 2;
+  // The LSN to read at.
+  ReadLsn read_lsn = 3;
+  // The relation to read from.
+  RelTag rel = 4;
+  // Page numbers to read. Must belong to the remote shard.
+  //
+  // Multiple pages will be executed as a single batch by the Pageserver,
+  // amortizing layer access costs and parallelizing them. This may increase the
+  // latency of any individual request, but improves the overall latency and
+  // throughput of the batch as a whole.
+  //
+  // TODO: this causes an allocation in the common single-block case. The sender
+  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
+  // into a heap-allocated Vec. Consider optimizing this.
+  //
+  // TODO: we might be able to avoid a sort or something if we mandate that these
+  // are always in order. But we can't currently rely on this on the server, because
+  // of compatibility with the libpq protocol handler.
+  repeated uint32 block_number = 5;
+}
+
+// A GetPageRequest class. Primarily intended for observability, but may also be
+// used for prioritization in the future.
+enum GetPageClass {
+  // Unknown class. For forwards compatibility: used when the client sends a
+  // class that the server doesn't know about.
+  GET_PAGE_CLASS_UNKNOWN = 0;
+  // A normal request. This is the default.
+  GET_PAGE_CLASS_NORMAL = 1;
+  // A prefetch request. NB: can only be classified on pg < 18.
+  GET_PAGE_CLASS_PREFETCH = 2;
+  // A background request (e.g. vacuum).
+  GET_PAGE_CLASS_BACKGROUND = 3;
+}
+
+// A GetPage response.
+//
+// A batch response will contain all of the requested pages. We could eagerly
+// emit individual pages as soon as they are ready, but on a readv() Postgres
+// holds buffer pool locks on all pages in the batch and we'll only return once
+// the entire batch is ready, so no one can make use of the individual pages.
+message GetPageResponse {
+  // The original request's ID.
+  uint64 request_id = 1;
+  // The response status code.
+  GetPageStatus status = 2;
+  // A string describing the status, if any.
+  string reason = 3;
+  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  repeated bytes page_image = 4;
+}
+
+// A GetPageResponse status code. Since we use a bidirectional stream, we don't
+// want to send errors as gRPC statuses, since this would terminate the stream.
+enum GetPageStatus {
+  // Unknown status. For forwards compatibility: used when the server sends a
+  // status code that the client doesn't know about.
+  GET_PAGE_STATUS_UNKNOWN = 0;
+  // The request was successful.
+  GET_PAGE_STATUS_OK = 1;
+  // The page did not exist. The tenant/timeline/shard has already been
+  // validated during stream setup.
+  GET_PAGE_STATUS_NOT_FOUND = 2;
+  // The request was invalid.
+  GET_PAGE_STATUS_INVALID = 3;
+  // The tenant is rate limited. Slow down and retry later.
+  GET_PAGE_STATUS_SLOW_DOWN = 4;
+  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
+  // layer download. This could free up the server task to process other
+  // requests while the layer download is in progress.
+}
+
+// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
+// shard 0, other shards will error.
+message GetRelSizeRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message GetRelSizeResponse {
+  uint32 num_blocks = 1;
+}
+
+// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+message GetSlruSegmentRequest {
+  ReadLsn read_lsn = 1;
+  uint32 kind = 2;
+  uint32 segno = 3;
+}
+
+// Returns an SLRU segment.
+//
+// These are up to 32 pages (256 KB), so we can send them as a single response.
+message GetSlruSegmentResponse {
+  bytes segment = 1;
+}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -0,0 +1,19 @@
+//! This crate provides the Pageserver's page API. It contains:
+//!
+//! * proto/page_service.proto: the Protobuf schema for the page API.
+//! * proto: auto-generated Protobuf types for gRPC.
+//!
+//! This crate is used by both the client and the server. Try to keep it slim.
+
+// Code generated by protobuf.
+pub mod proto {
+    tonic::include_proto!("page_api");
+
+    /// File descriptor set for Protobuf schema reflection. This allows using
+    /// e.g. grpcurl with the API.
+    pub const FILE_DESCRIPTOR_SET: &[u8] =
+        tonic::include_file_descriptor_set!("page_api_descriptor");
+
+    pub use page_service_client::PageServiceClient;
+    pub use page_service_server::{PageService, PageServiceServer};
+}
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -65,6 +65,9 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,

+    #[clap(long)]
+    only_relnode: Option<u32>,
+
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -206,7 +209,12 @@ async fn main_impl(
                    for r in partitioning.keys.ranges.iter() {
                        let mut i = r.start;
                        while i != r.end {
-                            if i.is_rel_block_key() {
+                            let mut include = true;
+                            include &= i.is_rel_block_key();
+                            if let Some(only_relnode) = args.only_relnode {
+                                include &= i.is_rel_block_of_rel(only_relnode);
+                            }
+                            if include {
                                filtered.add_key(i);
                            }
                            i = i.next();
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -144,7 +144,7 @@ where
        replica,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf,
+            timeline.conf.get_vectored_concurrent_io,
            timeline
                .gate
                .enter()
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
    PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
-    ValidateRequest, ValidateRequestTenant, ValidateResponse,
+    TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse,
 };
 use reqwest::Certificate;
 use serde::Serialize;
@@ -51,13 +51,15 @@ pub trait StorageControllerUpcallApi {
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
+        generation: Generation,
        status: ShardImportStatus,
    ) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
    fn get_timeline_import_status(
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
-    ) -> impl Future<Output = Result<Option<ShardImportStatus>, RetryForeverError>> + Send;
+        generation: Generation,
+    ) -> impl Future<Output = Result<ShardImportStatus, RetryForeverError>> + Send;
 }

 impl StorageControllerUpcallClient {
@@ -102,6 +104,7 @@ impl StorageControllerUpcallClient {
        &self,
        url: &url::Url,
        request: R,
+        method: reqwest::Method,
    ) -> Result<T, RetryForeverError>
    where
        R: Serialize,
@@ -111,7 +114,7 @@ impl StorageControllerUpcallClient {
            || async {
                let response = self
                    .http_client
-                    .post(url.clone())
+                    .request(method.clone(), url.clone())
                    .json(&request)
                    .send()
                    .await?;
@@ -220,7 +223,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
            register: register.clone(),
        };

-        let response: ReAttachResponse = self.retry_http_forever(&url, request).await?;
+        let response: ReAttachResponse = self
+            .retry_http_forever(&url, request, reqwest::Method::POST)
+            .await?;
        tracing::info!(
            "Received re-attach response with {} tenants (node {}, register: {:?})",
            response.tenants.len(),
@@ -273,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                return Err(RetryForeverError::ShuttingDown);
            }

-            let response: ValidateResponse = self.retry_http_forever(&url, request).await?;
+            let response: ValidateResponse = self
+                .retry_http_forever(&url, request, reqwest::Method::POST)
+                .await?;
            for rt in response.tenants {
                result.insert(rt.id, rt.valid);
            }
@@ -292,6 +299,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
+        generation: Generation,
        status: ShardImportStatus,
    ) -> Result<(), RetryForeverError> {
        let url = self
@@ -302,10 +310,12 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
        let request = PutTimelineImportStatusRequest {
            tenant_shard_id,
            timeline_id,
+            generation,
            status,
        };

-        self.retry_http_forever(&url, request).await
+        self.retry_http_forever(&url, request, reqwest::Method::POST)
+            .await
    }

    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
@@ -313,33 +323,22 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
        &self,
        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
-    ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
+        generation: Generation,
+    ) -> Result<ShardImportStatus, RetryForeverError> {
        let url = self
            .base_url
-            .join(format!("timeline_import_status/{}/{}", tenant_shard_id, timeline_id).as_str())
+            .join("timeline_import_status")
            .expect("Failed to build path");

-        Ok(backoff::retry(
-            || async {
-                let response = self.http_client.get(url.clone()).send().await?;
+        let request = TimelineImportStatusRequest {
+            tenant_shard_id,
+            timeline_id,
+            generation,
+        };

-                if let Err(err) = response.error_for_status_ref() {
-                    if matches!(err.status(), Some(reqwest::StatusCode::NOT_FOUND)) {
-                        return Ok(None);
-                    } else {
-                        return Err(err);
-                    }
-                }
-                response.json::<ShardImportStatus>().await.map(Some)
-            },
-            |_| false,
-            3,
-            u32::MAX,
-            "storage controller upcall",
-            &self.cancel,
-        )
-        .await
-        .ok_or(RetryForeverError::ShuttingDown)?
-        .expect("We retry forever, this should never be reached"))
+        let response: ShardImportStatus = self
+            .retry_http_forever(&url, request, reqwest::Method::GET)
+            .await?;
+        Ok(response)
    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -793,6 +793,7 @@ mod test {
            &self,
            _tenant_shard_id: TenantShardId,
            _timeline_id: TimelineId,
+            _generation: Generation,
            _status: pageserver_api::models::ShardImportStatus,
        ) -> Result<(), RetryForeverError> {
            unimplemented!()
@@ -802,7 +803,8 @@ mod test {
            &self,
            _tenant_shard_id: TenantShardId,
            _timeline_id: TimelineId,
-        ) -> Result<Option<ShardImportStatus>, RetryForeverError> {
+            _generation: Generation,
+        ) -> Result<ShardImportStatus, RetryForeverError> {
            unimplemented!()
        }
    }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3199,7 +3199,7 @@ async fn list_aux_files(
            .await?;

    let io_concurrency = IoConcurrency::spawn_from_conf(
-        state.conf,
+        state.conf.get_vectored_concurrent_io,
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
-    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
@@ -331,10 +331,10 @@ async fn page_service_conn_main(
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(
-        conf,
        tenant_manager,
        auth,
        pipelining_config,
+        conf.get_vectored_concurrent_io,
        perf_span_fields,
        connection_ctx,
        cancel.clone(),
@@ -371,7 +371,6 @@ async fn page_service_conn_main(
 }

 struct PageServerHandler {
-    conf: &'static PageServerConf,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -389,6 +388,7 @@ struct PageServerHandler {
    timeline_handles: Option<TimelineHandles>,

    pipelining_config: PageServicePipeliningConfig,
+    get_vectored_concurrent_io: GetVectoredConcurrentIo,

    gate_guard: GateGuard,
 }
@@ -844,17 +844,16 @@ impl BatchedFeMessage {
 impl PageServerHandler {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
-        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
+        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
    ) -> Self {
        PageServerHandler {
-            conf,
            auth,
            claims: None,
            connection_ctx,
@@ -862,6 +861,7 @@ impl PageServerHandler {
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
+            get_vectored_concurrent_io,
            gate_guard,
        }
    }
@@ -1278,7 +1278,7 @@ impl PageServerHandler {
    }

    #[instrument(level = tracing::Level::DEBUG, skip_all)]
-    async fn pagesteam_handle_batched_message<IO>(
+    async fn pagestream_handle_batched_message<IO>(
        &mut self,
        pgb_writer: &mut PostgresBackend<IO>,
        batch: BatchedFeMessage,
@@ -1623,7 +1623,7 @@ impl PageServerHandler {
        }

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.get_vectored_concurrent_io,
            match self.gate_guard.try_clone() {
                Ok(guard) => guard,
                Err(_) => {
@@ -1733,7 +1733,7 @@ impl PageServerHandler {
            };

            let result = self
-                .pagesteam_handle_batched_message(
+                .pagestream_handle_batched_message(
                    pgb_writer,
                    msg,
                    io_concurrency.clone(),
@@ -1909,7 +1909,7 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    self.pagesteam_handle_batched_message(
+                    self.pagestream_handle_batched_message(
                        pgb_writer,
                        batch,
                        io_concurrency.clone(),
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -586,7 +586,7 @@ impl Timeline {
        // scan directory listing (new), merge with the old results
        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -645,7 +645,7 @@ impl Timeline {
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -885,7 +885,7 @@ impl Timeline {
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| PageReconstructError::Cancelled)?,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -8596,8 +8596,10 @@ mod tests {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let io_concurrency =
-            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            tline.conf.get_vectored_concurrent_io,
+            tline.gate.enter().unwrap(),
+        );
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
        let mut res = tline
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -668,7 +668,9 @@ impl From<DownloadError> for UpdateError {

 impl From<std::io::Error> for UpdateError {
    fn from(value: std::io::Error) -> Self {
-        if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
+        if let Some(nix::errno::Errno::ENOSPC) =
+            value.raw_os_error().map(nix::errno::Errno::from_raw)
+        {
            UpdateError::NoSpace
        } else if value
            .get_ref()
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -31,6 +31,7 @@ pub use inmemory_layer::InMemoryLayer;
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
+use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
@@ -43,7 +44,6 @@ use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
-use crate::config::PageServerConf;
 use crate::context::{
    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
@@ -318,11 +318,10 @@ impl IoConcurrency {
    }

    pub(crate) fn spawn_from_conf(
-        conf: &'static PageServerConf,
+        conf: GetVectoredConcurrentIo,
        gate_guard: GateGuard,
    ) -> IoConcurrency {
-        use pageserver_api::config::GetVectoredConcurrentIo;
-        let selected = match conf.get_vectored_concurrent_io {
+        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
        };
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3530,7 +3530,7 @@ impl Timeline {
                };

                let io_concurrency = IoConcurrency::spawn_from_conf(
-                    self_ref.conf,
+                    self_ref.conf.get_vectored_concurrent_io,
                    self_ref
                        .gate
                        .enter()
@@ -5559,7 +5559,7 @@ impl Timeline {
            });

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
    );
    let io_concurrency = IoConcurrency::spawn_from_conf(
-        detached.conf,
+        detached.conf.get_vectored_concurrent_io,
        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
    );
    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -1,6 +1,7 @@
 use std::sync::Arc;

 use anyhow::{Context, bail};
+use importbucket_client::{ControlFile, RemoteStorageWrapper};
 use pageserver_api::models::ShardImportStatus;
 use remote_storage::RemotePath;
 use tokio::task::JoinHandle;
@@ -48,120 +49,49 @@ pub async fn doit(
    let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel);

    let shard_status = storcon_client
-        .get_timeline_import_status(timeline.tenant_shard_id, timeline.timeline_id)
+        .get_timeline_import_status(
+            timeline.tenant_shard_id,
+            timeline.timeline_id,
+            timeline.generation,
+        )
        .await
        .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?;

    info!(?shard_status, "peeking shard status");
    match shard_status {
-        None | Some(ShardImportStatus::InProgress) => {
-            // TODO: checkpoint the progress into the IndexPart instead of restarting
-            // from the beginning.
-
-            //
-            // Wipe the slate clean - the flow does not allow resuming.
-            // We can implement resuming in the future by checkpointing the progress into the IndexPart.
-            //
-            info!("wipe the slate clean");
-            {
-                // TODO: do we need to hold GC lock for this?
-                let mut guard = timeline.layers.write().await;
-                assert!(
-                    guard.layer_map()?.open_layer.is_none(),
-                    "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
-                );
-                let all_layers_keys = guard.all_persistent_layers();
-                let all_layers: Vec<_> = all_layers_keys
-                    .iter()
-                    .map(|key| guard.get_from_key(key))
-                    .collect();
-                let open = guard.open_mut().context("open_mut")?;
-
-                timeline.remote_client.schedule_gc_update(&all_layers)?;
-                open.finish_gc_timeline(&all_layers);
-            }
-
-            //
-            // Wait for pgdata to finish uploading
-            //
-            info!("wait for pgdata to reach status 'done'");
+        ShardImportStatus::InProgress(maybe_progress) => {
            let storage =
                importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;
-            let status_prefix = RemotePath::from_string("status").unwrap();
-            let pgdata_status_key = status_prefix.join("pgdata");
-            loop {
-                let res = async {
-                    let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
-                        .get_json(&pgdata_status_key)
-                        .await
-                        .context("get pgdata status")?;
-                    info!(?pgdata_status, "peeking pgdata status");
-                    if pgdata_status.map(|st| st.done).unwrap_or(false) {
-                        Ok(())
-                    } else {
-                        Err(anyhow::anyhow!("pgdata not done yet"))
-                    }
-                }
-                .await;
-                match res {
-                    Ok(_) => break,
-                    Err(err) => {
-                        info!(?err, "indefinitely waiting for pgdata to finish");
-                        if tokio::time::timeout(
-                            std::time::Duration::from_secs(10),
-                            cancel.cancelled(),
-                        )
-                        .await
-                        .is_ok()
-                        {
-                            bail!("cancelled while waiting for pgdata");
-                        }
-                    }
-                }
-            }

-            //
-            // Do the import
-            //
-            info!("do the import");
-            let control_file = storage.get_control_file().await?;
-            let base_lsn = control_file.base_lsn();
+            let control_file_res = if maybe_progress.is_none() {
+                // Only prepare the import once when there's no progress.
+                prepare_import(timeline, storage.clone(), &cancel).await
+            } else {
+                storage.get_control_file().await
+            };

-            info!("update TimelineMetadata based on LSNs from control file");
-            {
-                let pg_version = control_file.pg_version();
-                let _ctx: &RequestContext = ctx;
-                async move {
-                    // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
-                    // checkpoint record, and prev_record_lsn should point to its beginning.
-                    // We should read the real end of the record from the WAL, but here we
-                    // just fake it.
-                    let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
-                    let prev_record_lsn = base_lsn;
-                    let metadata = TimelineMetadata::new(
-                        disk_consistent_lsn,
-                        Some(prev_record_lsn),
-                        None,     // no ancestor
-                        Lsn(0),   // no ancestor lsn
-                        base_lsn, // latest_gc_cutoff_lsn
-                        base_lsn, // initdb_lsn
-                        pg_version,
+            let control_file = match control_file_res {
+                Ok(cf) => cf,
+                Err(err) => {
+                    return Err(
+                        terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await,
                    );
-
-                    let _start_lsn = disk_consistent_lsn + 1;
-
-                    timeline
-                        .remote_client
-                        .schedule_index_upload_for_full_metadata_update(&metadata)?;
-
-                    timeline.remote_client.wait_completion().await?;
-
-                    anyhow::Ok(())
                }
-            }
-            .await?;
+            };

-            flow::run(timeline.clone(), control_file, storage.clone(), ctx).await?;
+            let res = flow::run(
+                timeline.clone(),
+                control_file,
+                storage.clone(),
+                maybe_progress,
+                ctx,
+            )
+            .await;
+            if let Err(err) = res {
+                return Err(
+                    terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await,
+                );
+            }

            // Communicate that shard is done.
            // Ensure at-least-once delivery of the upcall to storage controller
@@ -175,7 +105,7 @@ pub async fn doit(
                .put_timeline_import_status(
                    timeline.tenant_shard_id,
                    timeline.timeline_id,
-                    // TODO(vlad): What about import errors?
+                    timeline.generation,
                    ShardImportStatus::Done,
                )
                .await
@@ -183,16 +113,151 @@ pub async fn doit(
                    anyhow::anyhow!("Shut down while putting timeline import status")
                })?;
        }
-        Some(ShardImportStatus::Error(err)) => {
+        ShardImportStatus::Error(err) => {
            info!(
                "shard status indicates that the shard is done (error), skipping import {}",
                err
            );
        }
-        Some(ShardImportStatus::Done) => {
+        ShardImportStatus::Done => {
            info!("shard status indicates that the shard is done (success), skipping import");
        }
    }

    Ok(())
 }
+
+async fn prepare_import(
+    timeline: &Arc<Timeline>,
+    storage: RemoteStorageWrapper,
+    cancel: &CancellationToken,
+) -> anyhow::Result<ControlFile> {
+    // Wipe the slate clean before starting the import as a precaution.
+    // This method is only called when there's no recorded checkpoint for the import
+    // in the storage controller.
+    //
+    // Note that this is split-brain safe (two imports for same timeline shards running in
+    // different generations) because we go through the usual deletion path, including deletion queue.
+    info!("wipe the slate clean");
+    {
+        // TODO: do we need to hold GC lock for this?
+        let mut guard = timeline.layers.write().await;
+        assert!(
+            guard.layer_map()?.open_layer.is_none(),
+            "while importing, there should be no in-memory layer" // this just seems like a good place to assert it
+        );
+        let all_layers_keys = guard.all_persistent_layers();
+        let all_layers: Vec<_> = all_layers_keys
+            .iter()
+            .map(|key| guard.get_from_key(key))
+            .collect();
+        let open = guard.open_mut().context("open_mut")?;
+
+        timeline.remote_client.schedule_gc_update(&all_layers)?;
+        open.finish_gc_timeline(&all_layers);
+    }
+
+    //
+    // Wait for pgdata to finish uploading
+    //
+    info!("wait for pgdata to reach status 'done'");
+    let status_prefix = RemotePath::from_string("status").unwrap();
+    let pgdata_status_key = status_prefix.join("pgdata");
+    loop {
+        let res = async {
+            let pgdata_status: Option<importbucket_format::PgdataStatus> = storage
+                .get_json(&pgdata_status_key)
+                .await
+                .context("get pgdata status")?;
+            info!(?pgdata_status, "peeking pgdata status");
+            if pgdata_status.map(|st| st.done).unwrap_or(false) {
+                Ok(())
+            } else {
+                Err(anyhow::anyhow!("pgdata not done yet"))
+            }
+        }
+        .await;
+        match res {
+            Ok(_) => break,
+            Err(err) => {
+                info!(?err, "indefinitely waiting for pgdata to finish");
+                if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled())
+                    .await
+                    .is_ok()
+                {
+                    bail!("cancelled while waiting for pgdata");
+                }
+            }
+        }
+    }
+
+    let control_file = storage.get_control_file().await?;
+    let base_lsn = control_file.base_lsn();
+
+    info!("update TimelineMetadata based on LSNs from control file");
+    {
+        let pg_version = control_file.pg_version();
+        async move {
+            // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the
+            // checkpoint record, and prev_record_lsn should point to its beginning.
+            // We should read the real end of the record from the WAL, but here we
+            // just fake it.
+            let disk_consistent_lsn = Lsn(base_lsn.0 + 8);
+            let prev_record_lsn = base_lsn;
+            let metadata = TimelineMetadata::new(
+                disk_consistent_lsn,
+                Some(prev_record_lsn),
+                None,     // no ancestor
+                Lsn(0),   // no ancestor lsn
+                base_lsn, // latest_gc_cutoff_lsn
+                base_lsn, // initdb_lsn
+                pg_version,
+            );
+
+            let _start_lsn = disk_consistent_lsn + 1;
+
+            timeline
+                .remote_client
+                .schedule_index_upload_for_full_metadata_update(&metadata)?;
+
+            timeline.remote_client.wait_completion().await?;
+
+            anyhow::Ok(())
+        }
+    }
+    .await?;
+
+    Ok(control_file)
+}
+
+async fn terminate_flow_with_error(
+    timeline: &Arc<Timeline>,
+    error: anyhow::Error,
+    storcon_client: &StorageControllerUpcallClient,
+    cancel: &CancellationToken,
+) -> anyhow::Error {
+    // The import task is aborted on tenant shutdown, so in principle, it should
+    // never be cancelled. To be on the safe side, check the cancellation tokens
+    // before marking the import as failed.
+    if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) {
+        let notify_res = storcon_client
+            .put_timeline_import_status(
+                timeline.tenant_shard_id,
+                timeline.timeline_id,
+                timeline.generation,
+                ShardImportStatus::Error(format!("{error:#}")),
+            )
+            .await;
+
+        if let Err(_notify_error) = notify_res {
+            // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries
+            // forever internally, so errors returned by it can only be due to cancellation.
+            info!("failed to notify storcon about permanent import error");
+        }
+
+        // Will be logged by [`Tenant::create_timeline_import_pgdata_task`]
+        error
+    } else {
+        anyhow::anyhow!("Import task cancelled")
+    }
+}
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -29,10 +29,11 @@
 //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest)

 use std::collections::HashSet;
+use std::hash::{Hash, Hasher};
 use std::ops::Range;
 use std::sync::Arc;

-use anyhow::{bail, ensure};
+use anyhow::ensure;
 use bytes::Bytes;
 use futures::stream::FuturesOrdered;
 use itertools::Itertools;
@@ -43,6 +44,7 @@ use pageserver_api::key::{
    slru_segment_size_to_key,
 };
 use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range};
+use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus};
 use pageserver_api::reltag::{RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::relfile_utils::parse_relfilename;
@@ -59,16 +61,36 @@ use super::Timeline;
 use super::importbucket_client::{ControlFile, RemoteStorageWrapper};
 use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
 use crate::pgdatadir_mapping::{
    DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::storage_layer::{ImageLayerWriter, Layer};
+use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer};

 pub async fn run(
    timeline: Arc<Timeline>,
    control_file: ControlFile,
    storage: RemoteStorageWrapper,
+    import_progress: Option<ShardImportProgress>,
+    ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    // Match how we run the import based on the progress version.
+    // If there's no import progress, it means that this is a new import
+    // and we can use whichever version we want.
+    match import_progress {
+        Some(ShardImportProgress::V1(progress)) => {
+            run_v1(timeline, control_file, storage, Some(progress), ctx).await
+        }
+        None => run_v1(timeline, control_file, storage, None, ctx).await,
+    }
+}
+
+async fn run_v1(
+    timeline: Arc<Timeline>,
+    control_file: ControlFile,
+    storage: RemoteStorageWrapper,
+    import_progress: Option<ShardImportProgressV1>,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
    let planner = Planner {
@@ -81,9 +103,31 @@ pub async fn run(
    let import_config = &timeline.conf.timeline_import_config;
    let plan = planner.plan(import_config).await?;

+    // Hash the plan and compare with the hash of the plan we got back from the storage controller.
+    // If the two match, it means that the planning stage had the same output.
+    //
+    // This is not intended to be a cryptographically secure hash.
+    const SEED: u64 = 42;
+    let mut hasher = twox_hash::XxHash64::with_seed(SEED);
+    plan.hash(&mut hasher);
+    let plan_hash = hasher.finish();
+
+    if let Some(progress) = &import_progress {
+        if plan_hash != progress.import_plan_hash {
+            anyhow::bail!("Import plan does not match storcon metadata");
+        }
+
+        // Handle collisions on jobs of unequal length
+        if progress.jobs != plan.jobs.len() {
+            anyhow::bail!("Import plan job length does not match storcon metadata")
+        }
+    }
+
    pausable_failpoint!("import-timeline-pre-execute-pausable");

-    plan.execute(timeline, import_config, ctx).await
+    let start_from_job_idx = import_progress.map(|progress| progress.completed);
+    plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx)
+        .await
 }

 struct Planner {
@@ -93,8 +137,11 @@ struct Planner {
    tasks: Vec<AnyImportTask>,
 }

+#[derive(Hash)]
 struct Plan {
    jobs: Vec<ChunkProcessingJob>,
+    // Included here such that it ends up in the hash for the plan
+    shard: ShardIdentity,
 }

 impl Planner {
@@ -198,7 +245,10 @@ impl Planner {
            pgdata_lsn,
        ));

-        Ok(Plan { jobs })
+        Ok(Plan {
+            jobs,
+            shard: self.shard,
+        })
    }

    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))]
@@ -327,25 +377,45 @@ impl Plan {
    async fn execute(
        self,
        timeline: Arc<Timeline>,
+        start_after_job_idx: Option<usize>,
+        import_plan_hash: u64,
        import_config: &TimelineImportConfig,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel);
+
        let mut work = FuturesOrdered::new();
        let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into()));

        let jobs_in_plan = self.jobs.len();

-        let mut jobs = self.jobs.into_iter().enumerate().peekable();
-        let mut results = Vec::new();
+        let mut jobs = self
+            .jobs
+            .into_iter()
+            .enumerate()
+            .map(|(idx, job)| (idx + 1, job))
+            .filter(|(idx, _job)| {
+                // Filter out any jobs that have been done already
+                if let Some(start_after) = start_after_job_idx {
+                    *idx > start_after
+                } else {
+                    true
+                }
+            })
+            .peekable();
+
+        let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0);
+        let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into();

        // Run import jobs concurrently up to the limit specified by the pageserver configuration.
        // Note that we process completed futures in the oreder of insertion. This will be the
        // building block for resuming imports across pageserver restarts or tenant migrations.
-        while results.len() < jobs_in_plan {
+        while last_completed_job_idx < jobs_in_plan {
            tokio::select! {
                permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => {
                    let permit = permit.expect("never closed");
                    let (job_idx, job) = jobs.next().expect("we peeked");
+
                    let job_timeline = timeline.clone();
                    let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error);

@@ -357,13 +427,35 @@ impl Plan {
                },
                maybe_complete_job_idx = work.next() => {
                    match maybe_complete_job_idx {
-                        Some(Ok((_job_idx, res))) => {
-                            results.push(res);
+                        Some(Ok((job_idx, res))) => {
+                            assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx);
+
+                            res?;
+                            last_completed_job_idx = job_idx;
+
+                            if last_completed_job_idx % checkpoint_every == 0 {
+                                let progress = ShardImportProgressV1 {
+                                    jobs: jobs_in_plan,
+                                    completed: last_completed_job_idx,
+                                    import_plan_hash,
+                                };
+
+                                storcon_client.put_timeline_import_status(
+                                    timeline.tenant_shard_id,
+                                    timeline.timeline_id,
+                                    timeline.generation,
+                                    ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress)))
+                                )
+                                .await
+                                .map_err(|_err| {
+                                    anyhow::anyhow!("Shut down while putting timeline import status")
+                                })?;
+                            }
                        },
                        Some(Err(_)) => {
-                            results.push(Err(anyhow::anyhow!(
-                                "parallel job panicked or cancelled, check pageserver logs"
-                            )));
+                            anyhow::bail!(
+                                "import job panicked or cancelled"
+                            );
                        }
                        None => {}
                    }
@@ -371,17 +463,7 @@ impl Plan {
            }
        }

-        if results.iter().all(|r| r.is_ok()) {
-            Ok(())
-        } else {
-            let mut msg = String::new();
-            for result in results {
-                if let Err(err) = result {
-                    msg.push_str(&format!("{err:?}\n\n"));
-                }
-            }
-            bail!("Some parallel jobs failed:\n\n{msg}");
-        }
+        Ok(())
    }
 }

@@ -553,6 +635,15 @@ struct ImportSingleKeyTask {
    buf: Bytes,
 }

+impl Hash for ImportSingleKeyTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportSingleKeyTask { key, buf } = self;
+
+        key.hash(state);
+        buf.hash(state);
+    }
+}
+
 impl ImportSingleKeyTask {
    fn new(key: Key, buf: Bytes) -> Self {
        ImportSingleKeyTask { key, buf }
@@ -581,6 +672,20 @@ struct ImportRelBlocksTask {
    storage: RemoteStorageWrapper,
 }

+impl Hash for ImportRelBlocksTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportRelBlocksTask {
+            shard_identity: _,
+            key_range,
+            path,
+            storage: _,
+        } = self;
+
+        key_range.hash(state);
+        path.hash(state);
+    }
+}
+
 impl ImportRelBlocksTask {
    fn new(
        shard_identity: ShardIdentity,
@@ -665,6 +770,19 @@ struct ImportSlruBlocksTask {
    storage: RemoteStorageWrapper,
 }

+impl Hash for ImportSlruBlocksTask {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ImportSlruBlocksTask {
+            key_range,
+            path,
+            storage: _,
+        } = self;
+
+        key_range.hash(state);
+        path.hash(state);
+    }
+}
+
 impl ImportSlruBlocksTask {
    fn new(key_range: Range<Key>, path: &RemotePath, storage: RemoteStorageWrapper) -> Self {
        ImportSlruBlocksTask {
@@ -707,6 +825,7 @@ impl ImportTask for ImportSlruBlocksTask {
    }
 }

+#[derive(Hash)]
 enum AnyImportTask {
    SingleKey(ImportSingleKeyTask),
    RelBlocks(ImportRelBlocksTask),
@@ -753,6 +872,7 @@ impl From<ImportSlruBlocksTask> for AnyImportTask {
    }
 }

+#[derive(Hash)]
 struct ChunkProcessingJob {
    range: Range<Key>,
    tasks: Vec<AnyImportTask>,
@@ -790,17 +910,51 @@ impl ChunkProcessingJob {

        let resident_layer = if nimages > 0 {
            let (desc, path) = writer.finish(ctx).await?;
+            // Refuse to overwrite a layer this same generation already wrote; layers from other generations are replaced below.
+            {
+                let guard = timeline.layers.read().await;
+                let existing_layer = guard.try_get_from_key(&desc.key());
+                if let Some(layer) = existing_layer {
+                    if layer.metadata().generation == timeline.generation {
+                        return Err(anyhow::anyhow!(
+                            "Import attempted to rewrite layer file in the same generation: {}",
+                            layer.local_path()
+                        ));
+                    }
+                }
+            }
+
            Layer::finish_creating(timeline.conf, &timeline, desc, &path)?
        } else {
            // dropping the writer cleans up
            return Ok(());
        };

-        // this is sharing the same code as create_image_layers
+        // The same import job might run multiple times because not every job is checkpointed.
+        // Hence, we must support the case where the layer already exists. We cannot be
+        // certain that the existing layer is identical to the new one, so in that case
+        // we replace the old layer with the one we just generated.
+
        let mut guard = timeline.layers.write().await;
-        guard
-            .open_mut()?
-            .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
+
+        let existing_layer = guard
+            .try_get_from_key(&resident_layer.layer_desc().key())
+            .cloned();
+        match existing_layer {
+            Some(existing) => {
+                guard.open_mut()?.rewrite_layers(
+                    &[(existing.clone(), resident_layer.clone())],
+                    &[],
+                    &timeline.metrics,
+                );
+            }
+            None => {
+                guard
+                    .open_mut()?
+                    .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics);
+            }
+        }
+
        crate::tenant::timeline::drop_wlock(guard);

        timeline
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -408,7 +408,7 @@ impl OpenFiles {
 /// error types may be elegible for retry.
 pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
    use nix::errno::Errno::*;
-    match e.raw_os_error().map(nix::errno::from_i32) {
+    match e.raw_os_error().map(nix::errno::Errno::from_raw) {
        Some(EIO) => {
            // Terminate on EIO because we no longer trust the device to store
            // data safely, or to uphold persistence guarantees on fsync.
--- a/pageserver/src/virtual_file/io_engine.rs
+++ b/pageserver/src/virtual_file/io_engine.rs
@@ -124,9 +124,7 @@ pub(super) fn epoll_uring_error_to_std(
 ) -> std::io::Error {
    match e {
        tokio_epoll_uring::Error::Op(e) => e,
-        tokio_epoll_uring::Error::System(system) => {
-            std::io::Error::new(std::io::ErrorKind::Other, system)
-        }
+        tokio_epoll_uring::Error::System(system) => std::io::Error::other(system),
    }
 }

--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg)
 	lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
 }

+void
+lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
+{
+	BufferTag	tag;
+	FileCacheEntry *entry;
+	uint32		hash;
+	/* Mark AVAILABLE pages of this relation fork UNAVAILABLE, for every chunk starting below nblocks. */
+	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
+		return;
+
+	CopyNRelFileInfoToBufTag(tag, rinfo);
+	tag.forkNum = forkNum;
+
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+
+	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
+	if (LFC_ENABLED())	/* checked again now that lfc_lock is held */
+	{
+		for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)	/* one lookup per chunk */
+		{
+			tag.blockNum = blkno;
+			hash = get_hash_value(lfc_hash, &tag);
+			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);	/* lookup only; the entry is kept */
+			if (entry != NULL)
+			{
+				for (int i = 0; i < lfc_blocks_per_chunk; i++)
+				{
+					if (GET_STATE(entry, i) == AVAILABLE)
+					{
+						lfc_ctl->used_pages -= 1;	/* keep the counter in step with the state change */
+						SET_STATE(entry, i, UNAVAILABLE);
+					}
+				}
+			}
+		}
+	}
+	LWLockRelease(lfc_lock);
+}

 /*
 * Check if page is present in the cache.
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -28,6 +28,7 @@ typedef struct FileCacheState
 extern bool lfc_store_prefetch_result;

 /* functions for local file cache */
+extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
 extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
 					   BlockNumber blkno, const void *const *buffers,
 					   BlockNumber nblocks);
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,

 #define InvalidRelFileNumber InvalidOid

-#define SMgrRelGetRelInfo(reln) \
+#define SMgrRelGetRelInfo(reln)				\
 	(reln->smgr_rnode.node)

 #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
 #endif

+#define NRelFileInfoInvalidate(rinfo) do { /* reset all three identifying fields to their invalid values */ \
+		NInfoGetSpcOid(rinfo) = InvalidOid; \
+		NInfoGetDbOid(rinfo) = InvalidOid; \
+		NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
+	} while (0)
+
 #if PG_MAJORVERSION_NUM < 17
 #define ProcNumber BackendId
 #define INVALID_PROC_NUMBER InvalidBackendId
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -108,7 +108,7 @@ typedef enum
 	UNLOGGED_BUILD_NOT_PERMANENT
 } UnloggedBuildPhase;

-static SMgrRelation unlogged_build_rel = NULL;
+static NRelFileInfo unlogged_build_rel_info;
 static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

 static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -912,16 +912,19 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdextend(reln, forkNum, blkno, buffer, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdextend(reln, forkNum, blkno, buffer, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
 			return;

 		default:
@@ -1003,21 +1006,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-			{
-				for (int i = 0; i < nblocks; i++)
-				{
-					lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
-				}
-			}
 			return;

 		default:
@@ -1387,8 +1388,14 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdread(reln, forkNum, blkno, buffer);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1474,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	{
 		case 0:
 			neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
+			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdreadv(reln, forknum, blocknum, buffers, nblocks);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1608,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+#if PG_MAJORVERSION_NUM >= 17
+				mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+#else
+				mdwrite(reln, forknum, blocknum, buffer, skipFsync);
+#endif
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1617,9 +1639,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 			#else
 			mdwrite(reln, forknum, blocknum, buffer, skipFsync);
 			#endif
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1680,14 +1699,16 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
 			mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
-			/* Update LFC in case of unlogged index build */
-			if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
-				lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 			return;
 		default:
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1723,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				return mdnblocks(reln, forknum);
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1792,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			break;

 		case RELPERSISTENCE_PERMANENT:
+			if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
+			{
+				mdtruncate(reln, forknum, old_blocks, nblocks);
+				return;
+			}
 			break;

 		case RELPERSISTENCE_TEMP:
@@ -1930,7 +1960,6 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 */
 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 		neon_log(ERROR, "unlogged relation build is already in progress");
-	Assert(unlogged_build_rel == NULL);

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
@@ -1947,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln)

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
-			unlogged_build_rel = reln;
+			unlogged_build_rel_info = InfoFromSMgrRel(reln);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
 #ifdef DEBUG_COMPARE_LOCAL
 			if (!IsParallelWorker())
@@ -1968,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln)
 		neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
 #endif

-	unlogged_build_rel = reln;
+	unlogged_build_rel_info = InfoFromSMgrRel(reln);
 	unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;

-	/* Make the relation look like it's unlogged */
-	reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
-
 	/*
 	 * Create the local file. In a parallel build, the leader is expected to
 	 * call this first and do it.
@@ -2000,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln)
 static void
 neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 {
-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromSMgrRel(reln)))));
+					RelFileInfoFmt((unlogged_build_rel_info)))));

 	if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
 		return;

 	Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
-	Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 	/*
 	 * In a parallel build, (only) the leader process performs the 2nd
@@ -2018,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
 	 */
 	if (IsParallelWorker())
 	{
-		unlogged_build_rel = NULL;
+		NRelFileInfoInvalidate(unlogged_build_rel_info);
 		unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 	}
 	else
@@ -2039,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln)
 {
 	NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);

-	Assert(unlogged_build_rel == reln);
+	Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));

 	ereport(SmgrTrace,
 			(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
-					RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
+					RelFileInfoFmt(unlogged_build_rel_info))));

 	if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
 	{
@@ -2051,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 		BlockNumber nblocks;

 		Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
-		Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);

 		/*
 		 * Update the last-written LSN cache.
@@ -2072,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln)
 								InfoFromNInfoB(rinfob),
 								MAIN_FORKNUM);

-		/* Make the relation look permanent again */
-		reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
-
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
 		{
@@ -2083,6 +2104,8 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 forknum);

 			forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
+			lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
+
 			mdclose(reln, forknum);
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */
@@ -2093,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 	}
-	unlogged_build_rel = NULL;
+	NRelFileInfoInvalidate(unlogged_build_rel_info);
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }

@@ -2166,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			 * Forget about any build we might have had in progress. The local
 			 * file will be unlinked by smgrDoPendingDeletes()
 			 */
-			unlogged_build_rel = NULL;
+			NRelFileInfoInvalidate(unlogged_build_rel_info);
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 			break;

@@ -2178,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 		case XACT_EVENT_PRE_PREPARE:
 			if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
 			{
-				unlogged_build_rel = NULL;
+				NRelFileInfoInvalidate(unlogged_build_rel_info);
 				unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 				ereport(ERROR,
 						(errcode(ERRCODE_INTERNAL_ERROR),
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.

 [[package]]
 name = "aiohappyeyeballs"
@@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"]

 [[package]]
 name = "flask-cors"
-version = "5.0.0"
-description = "A Flask extension adding a decorator for CORS support"
+version = "6.0.0"
+description = "A Flask extension simplifying CORS support"
 optional = false
-python-versions = "*"
+python-versions = "<4.0,>=3.9"
 groups = ["main"]
 files = [
-    {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
-    {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
+    {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
+    {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
 ]

 [package.dependencies]
-Flask = ">=0.9"
+flask = ">=0.9"
+Werkzeug = ">=0.7"

 [[package]]
 name = "frozenlist"
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -394,6 +394,7 @@ async fn handle_client(
    }
 }

+#[allow(clippy::large_enum_variant)]
 enum Connection {
    Raw(tokio::net::TcpStream),
    Tls(tokio_rustls::client::TlsStream<tokio::net::TcpStream>),
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -43,11 +43,12 @@ project_build_tag!(BUILD_TAG);
 use clap::{Parser, ValueEnum};

 #[derive(Clone, Debug, ValueEnum)]
+#[clap(rename_all = "kebab-case")]
 enum AuthBackendType {
-    #[value(name("cplane-v1"), alias("control-plane"))]
-    ControlPlaneV1,
+    #[clap(alias("cplane-v1"))]
+    ControlPlane,

-    #[value(name("link"), alias("control-redirect"))]
+    #[clap(alias("link"))]
    ConsoleRedirect,

    #[cfg(any(test, feature = "testing"))]
@@ -160,8 +161,11 @@ struct ProxyCliArgs {
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Cancellation channel size (max queue size for redis kv client)
-    #[clap(long, default_value = "1024")]
+    #[clap(long, default_value_t = 1024)]
    cancellation_ch_size: usize,
+    /// Cancellation ops batch size for redis
+    #[clap(long, default_value_t = 8)]
+    cancellation_batch_size: usize,
    /// cache for `allowed_ips` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
@@ -541,7 +545,12 @@ pub async fn run() -> anyhow::Result<()> {
            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+                    handle_cancel_messages(
+                        &mut redis_kv_client,
+                        rx_cancel,
+                        args.cancellation_batch_size,
+                    )
+                    .await?;

                    drop(redis_kv_client);

@@ -707,7 +716,7 @@ fn build_auth_backend(
    args: &ProxyCliArgs,
 ) -> anyhow::Result<Either<&'static auth::Backend<'static, ()>, &'static ConsoleRedirectBackend>> {
    match &args.auth_backend {
-        AuthBackendType::ControlPlaneV1 => {
+        AuthBackendType::ControlPlane => {
            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
            let project_info_cache_config: ProjectInfoCacheOptions =
                args.project_info_cache.parse()?;
@@ -862,7 +871,7 @@ async fn configure_redis(
        ("irsa", _) => match (&args.redis_host, args.redis_port) {
            (Some(host), Some(port)) => Some(
                ConnectionWithCredentialsProvider::new_with_credentials_provider(
-                    host.to_string(),
+                    host.clone(),
                    port,
                    elasticache::CredentialsProvider::new(
                        args.aws_region.clone(),
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
 type IpSubnetKey = IpNet;

 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
-const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
-const BATCH_SIZE: usize = 8;

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -231,12 +229,13 @@ impl CancelReplyOp {
 pub async fn handle_cancel_messages(
    client: &mut RedisKVClient,
    mut rx: mpsc::Receiver<CancelKeyOp>,
+    batch_size: usize,
 ) -> anyhow::Result<()> {
-    let mut batch = Vec::with_capacity(BATCH_SIZE);
-    let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
+    let mut batch = Vec::with_capacity(batch_size);
+    let mut pipeline = Pipeline::with_capacity(batch_size);

    loop {
-        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+        if rx.recv_many(&mut batch, batch_size).await == 0 {
            warn!("shutting down cancellation queue");
            break Ok(());
        }
@@ -367,8 +366,7 @@ impl CancellationHandler {
            return Err(CancelError::InternalError);
        };

-        tx.send_timeout(op, REDIS_SEND_TIMEOUT)
-            .await
+        tx.try_send(op)
            .map_err(|e| {
                tracing::warn!("failed to send GetCancelData for {key}: {e}");
            })
@@ -570,7 +568,7 @@ impl Session {
    }

    // Send the store key op to the cancellation handler and set TTL for the key
-    pub(crate) async fn write_cancel_key(
+    pub(crate) fn write_cancel_key(
        &self,
        cancel_closure: CancelClosure,
    ) -> Result<(), CancelError> {
@@ -596,14 +594,14 @@ impl Session {
            expire: CANCEL_KEY_TTL,
        };

-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
        });
        Ok(())
    }

-    pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
+    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
        let Some(tx) = &self.cancellation_handler.tx else {
            tracing::warn!("cancellation handler is not available");
            return Err(CancelError::InternalError);
@@ -619,7 +617,7 @@ impl Session {
                .guard(RedisMsgKind::HDel),
        };

-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
        });
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -244,9 +244,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -78,7 +78,7 @@ struct RequestContextInner {

 #[derive(Clone, Debug)]
 pub(crate) enum AuthMethod {
-    // aka passwordless, fka link
+    // aka link
    ConsoleRedirect,
    ScramSha256,
    ScramSha256Plus,
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -383,9 +383,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
        }

-        drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
+        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error

        res
    }
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.86.0"
+channel = "1.87.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -22,9 +22,10 @@ use safekeeper::defaults::{
    DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE,
    DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
 };
+use safekeeper::wal_backup::WalBackup;
 use safekeeper::{
    BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
-    WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service,
+    WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service,
 };
 use sd_notify::NotifyState;
 use storage_broker::{DEFAULT_ENDPOINT, Uri};
@@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
        None => None,
    };

-    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
+    let wal_backup = Arc::new(WalBackup::new(&conf).await?);
+
+    let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone()));

    // Register metrics collector for active timelines. It's important to do this
    // after daemonizing, otherwise process collector will be upset.
    let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
    metrics::register_internal(Box::new(timeline_collector))?;

-    wal_backup::init_remote_storage(&conf).await;
-
    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();
--- a/safekeeper/src/copy_timeline.rs
+++ b/safekeeper/src/copy_timeline.rs
@@ -3,6 +3,7 @@ use std::sync::Arc;
 use anyhow::{Result, bail};
 use camino::Utf8PathBuf;
 use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use remote_storage::GenericRemoteStorage;
 use safekeeper_api::membership::Configuration;
 use tokio::fs::OpenOptions;
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
@@ -30,6 +31,7 @@ pub struct Request {
 pub async fn handle_request(
    request: Request,
    global_timelines: Arc<GlobalTimelines>,
+    storage: Arc<GenericRemoteStorage>,
 ) -> Result<()> {
    // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
    //   if LSN will point to the middle of a WAL record, timeline will be in "broken" state
@@ -127,6 +129,7 @@ pub async fn handle_request(
    assert!(first_ondisk_segment >= first_segment);

    copy_s3_segments(
+        &storage,
        wal_seg_size,
        &request.source_ttid,
        &request.destination_ttid,
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -258,6 +258,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo

    let global_timelines = get_global_timelines(&request);
    let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
+    let storage = global_timelines.get_wal_backup().get_storage();

    // To stream the body use wrap_stream which wants Stream of Result<Bytes>,
    // so create the chan and write to it in another task.
@@ -269,6 +270,7 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
        conf.my_id,
        destination,
        tx,
+        storage,
    ));

    let rx_stream = ReceiverStream::new(rx);
@@ -390,12 +392,18 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
    );

    let global_timelines = get_global_timelines(&request);
+    let wal_backup = global_timelines.get_wal_backup();
+    let storage = wal_backup
+        .get_storage()
+        .ok_or(ApiError::BadRequest(anyhow::anyhow!(
+            "Remote Storage is not configured"
+        )))?;

    copy_timeline::handle_request(copy_timeline::Request{
        source_ttid,
        until_lsn: request_data.until_lsn,
        destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
-    }, global_timelines)
+    }, global_timelines, storage)
        .instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
        .await
        .map_err(ApiError::InternalServerError)?;
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -125,12 +125,6 @@ pub struct SafeKeeperConf {
    pub enable_tls_wal_service_api: bool,
 }

-impl SafeKeeperConf {
-    pub fn is_wal_backup_enabled(&self) -> bool {
-        self.remote_storage.is_some() && self.wal_backup_enabled
-    }
-}
-
 impl SafeKeeperConf {
    pub fn dummy() -> Self {
        SafeKeeperConf {
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -9,6 +9,7 @@ use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
 use http_utils::error::ApiError;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
+use remote_storage::GenericRemoteStorage;
 use reqwest::Certificate;
 use safekeeper_api::Term;
 use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus};
@@ -43,6 +44,7 @@ pub async fn stream_snapshot(
    source: NodeId,
    destination: NodeId,
    tx: mpsc::Sender<Result<Bytes>>,
+    storage: Option<Arc<GenericRemoteStorage>>,
 ) {
    match tli.try_wal_residence_guard().await {
        Err(e) => {
@@ -53,10 +55,32 @@ pub async fn stream_snapshot(
        Ok(maybe_resident_tli) => {
            if let Err(e) = match maybe_resident_tli {
                Some(resident_tli) => {
-                    stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone())
-                        .await
+                    stream_snapshot_resident_guts(
+                        resident_tli,
+                        source,
+                        destination,
+                        tx.clone(),
+                        storage,
+                    )
+                    .await
+                }
+                None => {
+                    if let Some(storage) = storage {
+                        stream_snapshot_offloaded_guts(
+                            tli,
+                            source,
+                            destination,
+                            tx.clone(),
+                            &storage,
+                        )
+                        .await
+                    } else {
+                        tx.send(Err(anyhow!("remote storage not configured")))
+                            .await
+                            .ok();
+                        return;
+                    }
                }
-                None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await,
            } {
                // Error type/contents don't matter as they won't can't reach the client
                // (hyper likely doesn't do anything with it), but http stream will be
@@ -123,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts(
    source: NodeId,
    destination: NodeId,
    tx: mpsc::Sender<Result<Bytes>>,
+    storage: &GenericRemoteStorage,
 ) -> Result<()> {
    let mut ar = prepare_tar_stream(tx);

-    tli.snapshot_offloaded(&mut ar, source, destination).await?;
+    tli.snapshot_offloaded(&mut ar, source, destination, storage)
+        .await?;

    ar.finish().await?;

@@ -139,10 +165,13 @@ pub async fn stream_snapshot_resident_guts(
    source: NodeId,
    destination: NodeId,
    tx: mpsc::Sender<Result<Bytes>>,
+    storage: Option<Arc<GenericRemoteStorage>>,
 ) -> Result<()> {
    let mut ar = prepare_tar_stream(tx);

-    let bctx = tli.start_snapshot(&mut ar, source, destination).await?;
+    let bctx = tli
+        .start_snapshot(&mut ar, source, destination, storage)
+        .await?;
    pausable_failpoint!("sk-snapshot-after-list-pausable");

    let tli_dir = tli.get_timeline_dir();
@@ -182,6 +211,7 @@ impl Timeline {
        ar: &mut tokio_tar::Builder<W>,
        source: NodeId,
        destination: NodeId,
+        storage: &GenericRemoteStorage,
    ) -> Result<()> {
        // Take initial copy of control file, then release state lock
        let mut control_file = {
@@ -216,6 +246,7 @@ impl Timeline {
        // can fail if the timeline was un-evicted and modified in the background.
        let remote_timeline_path = &self.remote_path;
        wal_backup::copy_partial_segment(
+            storage,
            &replace.previous.remote_path(remote_timeline_path),
            &replace.current.remote_path(remote_timeline_path),
        )
@@ -262,6 +293,7 @@ impl WalResidentTimeline {
        ar: &mut tokio_tar::Builder<W>,
        source: NodeId,
        destination: NodeId,
+        storage: Option<Arc<GenericRemoteStorage>>,
    ) -> Result<SnapshotContext> {
        let mut shared_state = self.write_shared_state().await;
        let wal_seg_size = shared_state.get_wal_seg_size();
@@ -283,6 +315,7 @@ impl WalResidentTimeline {

            let remote_timeline_path = &self.tli.remote_path;
            wal_backup::copy_partial_segment(
+                &*storage.context("remote storage not configured")?,
                &replace.previous.remote_path(remote_timeline_path),
                &replace.current.remote_path(remote_timeline_path),
            )
--- a/safekeeper/src/test_utils.rs
+++ b/safekeeper/src/test_utils.rs
@@ -18,7 +18,7 @@ use crate::send_wal::EndWatch;
 use crate::state::{TimelinePersistentState, TimelineState};
 use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::remote_timeline_path;
+use crate::wal_backup::{WalBackup, remote_timeline_path};
 use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage};

 /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop.
@@ -101,18 +101,22 @@ impl Env {
        let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?;
        let shared_state = SharedState::new(StateSK::Loaded(safekeeper));

+        let wal_backup = Arc::new(WalBackup::new(&conf).await?);
+
        let timeline = Timeline::new(
            ttid,
            &timeline_dir,
            &remote_path,
            shared_state,
            conf.clone(),
+            wal_backup.clone(),
        );
        timeline.bootstrap(
            &mut timeline.write_shared_state().await,
            &conf,
            Arc::new(TimelinesSet::default()), // ignored for now
            RateLimiter::new(0, 0),
+            wal_backup,
        );
        Ok(timeline)
    }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -35,7 +35,8 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim
 use crate::timeline_guard::ResidenceGuard;
 use crate::timeline_manager::{AtomicStatus, ManagerCtl};
 use crate::timelines_set::TimelinesSet;
-use crate::wal_backup::{self, remote_timeline_path};
+use crate::wal_backup;
+use crate::wal_backup::{WalBackup, remote_timeline_path};
 use crate::wal_backup_partial::PartialRemoteSegment;
 use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
 use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage};
@@ -452,6 +453,8 @@ pub struct Timeline {
    manager_ctl: ManagerCtl,
    conf: Arc<SafeKeeperConf>,

+    pub(crate) wal_backup: Arc<WalBackup>,
+
    remote_deletion: std::sync::Mutex<Option<RemoteDeletionReceiver>>,

    /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
@@ -476,6 +479,7 @@ impl Timeline {
        remote_path: &RemotePath,
        shared_state: SharedState,
        conf: Arc<SafeKeeperConf>,
+        wal_backup: Arc<WalBackup>,
    ) -> Arc<Self> {
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
            watch::channel(shared_state.sk.state().commit_lsn);
@@ -509,6 +513,7 @@ impl Timeline {
            wal_backup_active: AtomicBool::new(false),
            last_removed_segno: AtomicU64::new(0),
            mgr_status: AtomicStatus::new(),
+            wal_backup,
        })
    }

@@ -516,6 +521,7 @@ impl Timeline {
    pub fn load_timeline(
        conf: Arc<SafeKeeperConf>,
        ttid: TenantTimelineId,
+        wal_backup: Arc<WalBackup>,
    ) -> Result<Arc<Timeline>> {
        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

@@ -529,6 +535,7 @@ impl Timeline {
            &remote_path,
            shared_state,
            conf,
+            wal_backup,
        ))
    }

@@ -539,6 +546,7 @@ impl Timeline {
        conf: &SafeKeeperConf,
        broker_active_set: Arc<TimelinesSet>,
        partial_backup_rate_limiter: RateLimiter,
+        wal_backup: Arc<WalBackup>,
    ) {
        let (tx, rx) = self.manager_ctl.bootstrap_manager();

@@ -561,6 +569,7 @@ impl Timeline {
                    tx,
                    rx,
                    partial_backup_rate_limiter,
+                    wal_backup,
                )
                .await
            }
@@ -606,9 +615,10 @@ impl Timeline {
        // it is cancelled, so WAL storage won't be opened again.
        shared_state.sk.close_wal_store();

-        if !only_local && self.conf.is_wal_backup_enabled() {
+        if !only_local {
            self.remote_delete().await?;
        }
+
        let dir_existed = delete_dir(&self.timeline_dir).await?;
        Ok(dir_existed)
    }
@@ -675,11 +685,20 @@ impl Timeline {
        guard: &mut std::sync::MutexGuard<Option<RemoteDeletionReceiver>>,
    ) -> RemoteDeletionReceiver {
        tracing::info!("starting remote deletion");
+        let storage = self.wal_backup.get_storage().clone();
        let (result_tx, result_rx) = tokio::sync::watch::channel(None);
        let ttid = self.ttid;
        tokio::task::spawn(
            async move {
-                let r = wal_backup::delete_timeline(&ttid).await;
+                let r = if let Some(storage) = storage {
+                    wal_backup::delete_timeline(&storage, &ttid).await
+                } else {
+                    tracing::info!(
+                        "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage"
+                    );
+                    Ok(())
+                };
+
                if let Err(e) = &r {
                    // Log error here in case nobody ever listens for our result (e.g. dropped API request)
                    tracing::error!("remote deletion failed: {e}");
@@ -1046,14 +1065,13 @@ impl WalResidentTimeline {

    pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
        let (_, persisted_state) = self.get_state().await;
-        let enable_remote_read = self.conf.is_wal_backup_enabled();

        WalReader::new(
            &self.ttid,
            self.timeline_dir.clone(),
            &persisted_state,
            start_lsn,
-            enable_remote_read,
+            self.wal_backup.clone(),
        )
    }

--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -6,7 +6,7 @@

 use anyhow::Context;
 use camino::Utf8PathBuf;
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::fs::File;
 use tokio::io::{AsyncRead, AsyncWriteExt};
 use tracing::{debug, info, instrument, warn};
@@ -68,6 +68,10 @@ impl Manager {
    #[instrument(name = "evict_timeline", skip_all)]
    pub(crate) async fn evict_timeline(&mut self) -> bool {
        assert!(!self.is_offloaded);
+        let Some(storage) = self.wal_backup.get_storage() else {
+            warn!("no remote storage configured, skipping eviction");
+            return false;
+        };
        let partial_backup_uploaded = match &self.partial_backup_uploaded {
            Some(p) => p.clone(),
            None => {
@@ -87,7 +91,7 @@ impl Manager {
                .inc();
        });

-        if let Err(e) = do_eviction(self, &partial_backup_uploaded).await {
+        if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await {
            warn!("failed to evict timeline: {:?}", e);
            return false;
        }
@@ -102,6 +106,10 @@ impl Manager {
    #[instrument(name = "unevict_timeline", skip_all)]
    pub(crate) async fn unevict_timeline(&mut self) {
        assert!(self.is_offloaded);
+        let Some(storage) = self.wal_backup.get_storage() else {
+            warn!("no remote storage configured, skipping uneviction");
+            return;
+        };
        let partial_backup_uploaded = match &self.partial_backup_uploaded {
            Some(p) => p.clone(),
            None => {
@@ -121,7 +129,7 @@ impl Manager {
                .inc();
        });

-        if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await {
+        if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await {
            warn!("failed to unevict timeline: {:?}", e);
            return;
        }
@@ -137,8 +145,12 @@ impl Manager {
 /// Ensure that content matches the remote partial backup, if local segment exists.
 /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set,
 /// delete the local segment.
-async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
-    compare_local_segment_with_remote(mgr, partial).await?;
+async fn do_eviction(
+    mgr: &mut Manager,
+    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
+    compare_local_segment_with_remote(mgr, partial, storage).await?;

    mgr.tli.switch_to_offloaded(partial).await?;
    // switch manager state as soon as possible
@@ -153,12 +165,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho

 /// Ensure that content matches the remote partial backup, if local segment exists.
 /// Then download segment to local disk and change state in control file and in-memory.
-async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> {
+async fn do_uneviction(
+    mgr: &mut Manager,
+    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
+) -> anyhow::Result<()> {
    // if the local segment is present, validate it
-    compare_local_segment_with_remote(mgr, partial).await?;
+    compare_local_segment_with_remote(mgr, partial, storage).await?;

    // atomically download the partial segment
-    redownload_partial_segment(mgr, partial).await?;
+    redownload_partial_segment(mgr, partial, storage).await?;

    mgr.tli.switch_to_present().await?;
    // switch manager state as soon as possible
@@ -181,6 +197,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) ->
 async fn redownload_partial_segment(
    mgr: &Manager,
    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
    let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp");
    let remote_segfile = remote_segment_path(mgr, partial);
@@ -190,7 +207,7 @@ async fn redownload_partial_segment(
        remote_segfile, tmp_file
    );

-    let mut reader = wal_backup::read_object(&remote_segfile, 0).await?;
+    let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?;
    let mut file = File::create(&tmp_file).await?;

    let actual_len = tokio::io::copy(&mut reader, &mut file).await?;
@@ -234,13 +251,16 @@ async fn redownload_partial_segment(
 async fn compare_local_segment_with_remote(
    mgr: &Manager,
    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
    let local_path = local_segment_path(mgr, partial);

    match File::open(&local_path).await {
-        Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial)
-            .await
-            .context("validation failed"),
+        Ok(mut local_file) => {
+            do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage)
+                .await
+                .context("validation failed")
+        }
        Err(_) => {
            info!(
                "local WAL file {} is not present, skipping validation",
@@ -258,6 +278,7 @@ async fn do_validation(
    file: &mut File,
    wal_seg_size: usize,
    partial: &PartialRemoteSegment,
+    storage: &GenericRemoteStorage,
 ) -> anyhow::Result<()> {
    let local_size = file.metadata().await?.len() as usize;
    if local_size != wal_seg_size {
@@ -270,7 +291,7 @@ async fn do_validation(

    let remote_segfile = remote_segment_path(mgr, partial);
    let mut remote_reader: std::pin::Pin<Box<dyn AsyncRead + Send + Sync>> =
-        wal_backup::read_object(&remote_segfile, 0).await?;
+        wal_backup::read_object(storage, &remote_segfile, 0).await?;

    // remote segment should have bytes exactly up to `flush_lsn`
    let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size);
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -35,7 +35,7 @@ use crate::state::TimelineState;
 use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline};
 use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard};
 use crate::timelines_set::{TimelineSetGuard, TimelinesSet};
-use crate::wal_backup::{self, WalBackupTaskHandle};
+use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle};
 use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment};

 pub(crate) struct StateSnapshot {
@@ -200,6 +200,7 @@ pub(crate) struct Manager {
    pub(crate) conf: SafeKeeperConf,
    pub(crate) wal_seg_size: usize,
    pub(crate) walsenders: Arc<WalSenders>,
+    pub(crate) wal_backup: Arc<WalBackup>,

    // current state
    pub(crate) state_version_rx: tokio::sync::watch::Receiver<usize>,
@@ -238,6 +239,7 @@ pub async fn main_task(
    manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
    mut manager_rx: tokio::sync::mpsc::UnboundedReceiver<ManagerCtlMessage>,
    global_rate_limiter: RateLimiter,
+    wal_backup: Arc<WalBackup>,
 ) {
    tli.set_status(Status::Started);

@@ -256,6 +258,7 @@ pub async fn main_task(
        broker_active_set,
        manager_tx,
        global_rate_limiter,
+        wal_backup,
    )
    .await;

@@ -371,7 +374,7 @@ pub async fn main_task(
    mgr.tli_broker_active.set(false);

    // shutdown background tasks
-    if mgr.conf.is_wal_backup_enabled() {
+    if let Some(storage) = mgr.wal_backup.get_storage() {
        if let Some(backup_task) = mgr.backup_task.take() {
            // If we fell through here, then the timeline is shutting down. This is important
            // because otherwise joining on the wal_backup handle might hang.
@@ -379,7 +382,7 @@ pub async fn main_task(

            backup_task.join().await;
        }
-        wal_backup::update_task(&mut mgr, false, &last_state).await;
+        wal_backup::update_task(&mut mgr, storage, false, &last_state).await;
    }

    if let Some(recovery_task) = &mut mgr.recovery_task {
@@ -415,11 +418,13 @@ impl Manager {
        broker_active_set: Arc<TimelinesSet>,
        manager_tx: tokio::sync::mpsc::UnboundedSender<ManagerCtlMessage>,
        global_rate_limiter: RateLimiter,
+        wal_backup: Arc<WalBackup>,
    ) -> Manager {
        let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await;
        Manager {
            wal_seg_size: tli.get_wal_seg_size().await,
            walsenders: tli.get_walsenders().clone(),
+            wal_backup,
            state_version_rx: tli.get_state_version_rx(),
            num_computes_rx: tli.get_walreceivers().get_num_rx(),
            tli_broker_active: broker_active_set.guard(tli.clone()),
@@ -477,8 +482,8 @@ impl Manager {
        let is_wal_backup_required =
            wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state);

-        if self.conf.is_wal_backup_enabled() {
-            wal_backup::update_task(self, is_wal_backup_required, state).await;
+        if let Some(storage) = self.wal_backup.get_storage() {
+            wal_backup::update_task(self, storage, is_wal_backup_required, state).await;
        }

        // update the state in Arc<Timeline>
@@ -624,9 +629,9 @@ impl Manager {
    /// Spawns partial WAL backup task if needed.
    async fn update_partial_backup(&mut self, state: &StateSnapshot) {
        // check if WAL backup is enabled and should be started
-        if !self.conf.is_wal_backup_enabled() {
+        let Some(storage) = self.wal_backup.get_storage() else {
            return;
-        }
+        };

        if self.partial_backup_task.is_some() {
            // partial backup is already running
@@ -650,6 +655,7 @@ impl Manager {
            self.conf.clone(),
            self.global_rate_limiter.clone(),
            cancel.clone(),
+            storage,
        ));
        self.partial_backup_task = Some((handle, cancel));
    }
@@ -669,6 +675,10 @@ impl Manager {
    /// Reset partial backup state and remove its remote storage data. Since it
    /// might be concurrently uploading something, cancel the task first.
    async fn backup_partial_reset(&mut self) -> anyhow::Result<Vec<String>> {
+        let Some(storage) = self.wal_backup.get_storage() else {
+            anyhow::bail!("remote storage is not enabled");
+        };
+
        info!("resetting partial backup state");
        // Force unevict timeline if it is evicted before erasing partial backup
        // state. The intended use of this function is to drop corrupted remote
@@ -689,7 +699,7 @@ impl Manager {
        }

        let tli = self.wal_resident_timeline()?;
-        let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await;
+        let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await;
        // Reset might fail e.g. when cfile is already reset but s3 removal
        // failed, so set manager state to None beforehand. In any case caller
        // is expected to retry until success.
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -25,6 +25,7 @@ use crate::rate_limit::RateLimiter;
 use crate::state::TimelinePersistentState;
 use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir};
 use crate::timelines_set::TimelinesSet;
+use crate::wal_backup::WalBackup;
 use crate::wal_storage::Storage;
 use crate::{SafeKeeperConf, control_file, wal_storage};

@@ -47,15 +48,24 @@ struct GlobalTimelinesState {
    conf: Arc<SafeKeeperConf>,
    broker_active_set: Arc<TimelinesSet>,
    global_rate_limiter: RateLimiter,
+    wal_backup: Arc<WalBackup>,
 }

 impl GlobalTimelinesState {
    /// Get dependencies for a timeline constructor.
-    fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
+    fn get_dependencies(
+        &self,
+    ) -> (
+        Arc<SafeKeeperConf>,
+        Arc<TimelinesSet>,
+        RateLimiter,
+        Arc<WalBackup>,
+    ) {
        (
            self.conf.clone(),
            self.broker_active_set.clone(),
            self.global_rate_limiter.clone(),
+            self.wal_backup.clone(),
        )
    }

@@ -84,7 +94,7 @@ pub struct GlobalTimelines {

 impl GlobalTimelines {
    /// Create a new instance of the global timelines map.
-    pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
+    pub fn new(conf: Arc<SafeKeeperConf>, wal_backup: Arc<WalBackup>) -> Self {
        Self {
            state: Mutex::new(GlobalTimelinesState {
                timelines: HashMap::new(),
@@ -92,6 +102,7 @@ impl GlobalTimelines {
                conf,
                broker_active_set: Arc::new(TimelinesSet::default()),
                global_rate_limiter: RateLimiter::new(1, 1),
+                wal_backup,
            }),
        }
    }
@@ -147,7 +158,7 @@ impl GlobalTimelines {
    /// just lock and unlock it for each timeline -- this function is called
    /// during init when nothing else is running, so this is fine.
    async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
            let state = self.state.lock().unwrap();
            state.get_dependencies()
        };
@@ -162,7 +173,7 @@ impl GlobalTimelines {
                        TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(conf.clone(), ttid) {
+                        match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) {
                            Ok(tli) => {
                                let mut shared_state = tli.write_shared_state().await;
                                self.state
@@ -175,6 +186,7 @@ impl GlobalTimelines {
                                    &conf,
                                    broker_active_set.clone(),
                                    partial_backup_rate_limiter.clone(),
+                                    wal_backup.clone(),
                                );
                            }
                            // If we can't load a timeline, it's most likely because of a corrupted
@@ -212,6 +224,10 @@ impl GlobalTimelines {
        self.state.lock().unwrap().broker_active_set.clone()
    }

+    pub fn get_wal_backup(&self) -> Arc<WalBackup> {
+        self.state.lock().unwrap().wal_backup.clone()
+    }
+
    /// Create a new timeline with the given id. If the timeline already exists, returns
    /// an existing timeline.
    pub(crate) async fn create(
@@ -222,7 +238,7 @@ impl GlobalTimelines {
        start_lsn: Lsn,
        commit_lsn: Lsn,
    ) -> Result<Arc<Timeline>> {
-        let (conf, _, _) = {
+        let (conf, _, _, _) = {
            let state = self.state.lock().unwrap();
            if let Ok(timeline) = state.get(&ttid) {
                // Timeline already exists, return it.
@@ -267,7 +283,7 @@ impl GlobalTimelines {
        check_tombstone: bool,
    ) -> Result<Arc<Timeline>> {
        // Check for existence and mark that we're creating it.
-        let (conf, broker_active_set, partial_backup_rate_limiter) = {
+        let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = {
            let mut state = self.state.lock().unwrap();
            match state.timelines.get(&ttid) {
                Some(GlobalMapTimeline::CreationInProgress) => {
@@ -296,7 +312,14 @@ impl GlobalTimelines {
        };

        // Do the actual move and reflect the result in the map.
-        match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
+        match GlobalTimelines::install_temp_timeline(
+            ttid,
+            tmp_path,
+            conf.clone(),
+            wal_backup.clone(),
+        )
+        .await
+        {
            Ok(timeline) => {
                let mut timeline_shared_state = timeline.write_shared_state().await;
                let mut state = self.state.lock().unwrap();
@@ -314,6 +337,7 @@ impl GlobalTimelines {
                    &conf,
                    broker_active_set,
                    partial_backup_rate_limiter,
+                    wal_backup,
                );
                drop(timeline_shared_state);
                Ok(timeline)
@@ -336,6 +360,7 @@ impl GlobalTimelines {
        ttid: TenantTimelineId,
        tmp_path: &Utf8PathBuf,
        conf: Arc<SafeKeeperConf>,
+        wal_backup: Arc<WalBackup>,
    ) -> Result<Arc<Timeline>> {
        let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
        let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
@@ -377,7 +402,7 @@ impl GlobalTimelines {
        // Do the move.
        durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?;

-        Timeline::load_timeline(conf, ttid)
+        Timeline::load_timeline(conf, ttid, wal_backup)
    }

    /// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -2,6 +2,7 @@ use std::cmp::min;
 use std::collections::HashSet;
 use std::num::NonZeroU32;
 use std::pin::Pin;
+use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
@@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo;
 use tokio::fs::File;
 use tokio::select;
 use tokio::sync::mpsc::{self, Receiver, Sender};
-use tokio::sync::{OnceCell, watch};
+use tokio::sync::watch;
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required(
 /// Based on peer information determine which safekeeper should offload; if it
 /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task
 /// is running, kill it.
-pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) {
+pub(crate) async fn update_task(
+    mgr: &mut Manager,
+    storage: Arc<GenericRemoteStorage>,
+    need_backup: bool,
+    state: &StateSnapshot,
+) {
    let (offloader, election_dbg_str) =
        determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf);
    let elected_me = Some(mgr.conf.my_id) == offloader;
@@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St
                return;
            };

-            let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx);
+            let async_task = backup_task_main(
+                resident,
+                storage,
+                mgr.conf.backup_parallel_jobs,
+                shutdown_rx,
+            );

            let handle = if mgr.conf.current_thread_runtime {
                tokio::spawn(async_task)
@@ -169,33 +180,31 @@ fn determine_offloader(
    }
 }

-static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::const_new();
-
-// Storage must be configured and initialized when this is called.
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage {
-    REMOTE_STORAGE
-        .get()
-        .expect("failed to get remote storage")
-        .as_ref()
-        .unwrap()
+pub struct WalBackup {
+    storage: Option<Arc<GenericRemoteStorage>>,
 }

-pub async fn init_remote_storage(conf: &SafeKeeperConf) {
-    // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide
-    // dependencies to all tasks instead.
-    REMOTE_STORAGE
-        .get_or_init(|| async {
-            if let Some(conf) = conf.remote_storage.as_ref() {
-                Some(
-                    GenericRemoteStorage::from_config(conf)
-                        .await
-                        .expect("failed to create remote storage"),
-                )
-            } else {
-                None
+impl WalBackup {
+    /// Create a new WalBackup instance.
+    pub async fn new(conf: &SafeKeeperConf) -> Result<Self> {
+        if !conf.wal_backup_enabled {
+            return Ok(Self { storage: None });
+        }
+
+        match conf.remote_storage.as_ref() {
+            Some(config) => {
+                let storage = GenericRemoteStorage::from_config(config).await?;
+                Ok(Self {
+                    storage: Some(Arc::new(storage)),
+                })
            }
-        })
-        .await;
+            None => Ok(Self { storage: None }),
+        }
+    }
+
+    pub fn get_storage(&self) -> Option<Arc<GenericRemoteStorage>> {
+        self.storage.clone()
+    }
 }

 struct WalBackupTask {
@@ -204,12 +213,14 @@ struct WalBackupTask {
    wal_seg_size: usize,
    parallel_jobs: usize,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,
+    storage: Arc<GenericRemoteStorage>,
 }

 /// Offload single timeline.
 #[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))]
 async fn backup_task_main(
    tli: WalResidentTimeline,
+    storage: Arc<GenericRemoteStorage>,
    parallel_jobs: usize,
    mut shutdown_rx: Receiver<()>,
 ) {
@@ -223,6 +234,7 @@ async fn backup_task_main(
        timeline_dir: tli.get_timeline_dir(),
        timeline: tli,
        parallel_jobs,
+        storage,
    };

    // task is spun up only when wal_seg_size is already initialized
@@ -293,6 +305,7 @@ impl WalBackupTask {

            match backup_lsn_range(
                &self.timeline,
+                self.storage.clone(),
                &mut backup_lsn,
                commit_lsn,
                self.wal_seg_size,
@@ -322,6 +335,7 @@ impl WalBackupTask {

 async fn backup_lsn_range(
    timeline: &WalResidentTimeline,
+    storage: Arc<GenericRemoteStorage>,
    backup_lsn: &mut Lsn,
    end_lsn: Lsn,
    wal_seg_size: usize,
@@ -352,7 +366,12 @@ async fn backup_lsn_range(
    loop {
        let added_task = match iter.next() {
            Some(s) => {
-                uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path));
+                uploads.push_back(backup_single_segment(
+                    &storage,
+                    s,
+                    timeline_dir,
+                    remote_timeline_path,
+                ));
                true
            }
            None => false,
@@ -388,6 +407,7 @@ async fn backup_lsn_range(
 }

 async fn backup_single_segment(
+    storage: &GenericRemoteStorage,
    seg: &Segment,
    timeline_dir: &Utf8Path,
    remote_timeline_path: &RemotePath,
@@ -395,7 +415,13 @@ async fn backup_single_segment(
    let segment_file_path = seg.file_path(timeline_dir)?;
    let remote_segment_path = seg.remote_path(remote_timeline_path);

-    let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await;
+    let res = backup_object(
+        storage,
+        &segment_file_path,
+        &remote_segment_path,
+        seg.size(),
+    )
+    .await;
    if res.is_ok() {
        BACKED_UP_SEGMENTS.inc();
    } else {
@@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
 }

 async fn backup_object(
+    storage: &GenericRemoteStorage,
    source_file: &Utf8Path,
    target_file: &RemotePath,
    size: usize,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
    let file = File::open(&source_file)
        .await
        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
@@ -475,12 +500,11 @@ async fn backup_object(
 }

 pub(crate) async fn backup_partial_segment(
+    storage: &GenericRemoteStorage,
    source_file: &Utf8Path,
    target_file: &RemotePath,
    size: usize,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
-
    let file = File::open(&source_file)
        .await
        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;
@@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment(
 }

 pub(crate) async fn copy_partial_segment(
+    storage: &GenericRemoteStorage,
    source: &RemotePath,
    destination: &RemotePath,
 ) -> Result<()> {
-    let storage = get_configured_remote_storage();
    let cancel = CancellationToken::new();

    storage.copy_object(source, destination, &cancel).await
 }

 pub async fn read_object(
+    storage: &GenericRemoteStorage,
    file_path: &RemotePath,
    offset: u64,
 ) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead + Send + Sync>>> {
-    let storage = REMOTE_STORAGE
-        .get()
-        .context("Failed to get remote storage")?
-        .as_ref()
-        .context("No remote storage configured")?;
-
    info!("segment download about to start from remote path {file_path:?} at offset {offset}");

    let cancel = CancellationToken::new();
@@ -547,8 +566,10 @@ pub async fn read_object(

 /// Delete WAL files for the given timeline. Remote storage must be configured
 /// when called.
-pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
-    let storage = get_configured_remote_storage();
+pub async fn delete_timeline(
+    storage: &GenericRemoteStorage,
+    ttid: &TenantTimelineId,
+) -> Result<()> {
    let remote_path = remote_timeline_path(ttid)?;

    // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
@@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
 }

 /// Used by wal_backup_partial.
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> {
+pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> {
    let cancel = CancellationToken::new(); // not really used
-    let storage = get_configured_remote_storage();
    storage.delete_objects(paths, &cancel).await
 }

 /// Copy segments from one timeline to another. Used in copy_timeline.
 pub async fn copy_s3_segments(
+    storage: &GenericRemoteStorage,
    wal_seg_size: usize,
    src_ttid: &TenantTimelineId,
    dst_ttid: &TenantTimelineId,
@@ -634,12 +655,6 @@ pub async fn copy_s3_segments(
 ) -> Result<()> {
    const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;

-    let storage = REMOTE_STORAGE
-        .get()
-        .expect("failed to get remote storage")
-        .as_ref()
-        .unwrap();
-
    let remote_dst_path = remote_timeline_path(dst_ttid)?;

    let cancel = CancellationToken::new();
--- a/safekeeper/src/wal_backup_partial.rs
+++ b/safekeeper/src/wal_backup_partial.rs
@@ -19,9 +19,11 @@
 //! file. Code updates state in the control file before doing any S3 operations.
 //! This way control file stores information about all potentially existing
 //! remote partial segments and can clean them up after uploading a newer version.
+use std::sync::Arc;
+
 use camino::Utf8PathBuf;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
-use remote_storage::RemotePath;
+use remote_storage::{GenericRemoteStorage, RemotePath};
 use safekeeper_api::Term;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -154,12 +156,16 @@ pub struct PartialBackup {
    conf: SafeKeeperConf,
    local_prefix: Utf8PathBuf,
    remote_timeline_path: RemotePath,
-
+    storage: Arc<GenericRemoteStorage>,
    state: State,
 }

 impl PartialBackup {
-    pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup {
+    pub async fn new(
+        tli: WalResidentTimeline,
+        conf: SafeKeeperConf,
+        storage: Arc<GenericRemoteStorage>,
+    ) -> PartialBackup {
        let (_, persistent_state) = tli.get_state().await;
        let wal_seg_size = tli.get_wal_seg_size().await;

@@ -173,6 +179,7 @@ impl PartialBackup {
            conf,
            local_prefix,
            remote_timeline_path,
+            storage,
        }
    }

@@ -240,7 +247,8 @@ impl PartialBackup {
        let remote_path = prepared.remote_path(&self.remote_timeline_path);

        // Upload first `backup_bytes` bytes of the segment to the remote storage.
-        wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?;
+        wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes)
+            .await?;
        PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64);

        // We uploaded the segment, now let's verify that the data is still actual.
@@ -326,7 +334,7 @@ impl PartialBackup {
            let remote_path = self.remote_timeline_path.join(seg);
            objects_to_delete.push(remote_path);
        }
-        wal_backup::delete_objects(&objects_to_delete).await
+        wal_backup::delete_objects(&self.storage, &objects_to_delete).await
    }

    /// Delete all non-Uploaded segments from the remote storage. There should be only one
@@ -424,6 +432,7 @@ pub async fn main_task(
    conf: SafeKeeperConf,
    limiter: RateLimiter,
    cancel: CancellationToken,
+    storage: Arc<GenericRemoteStorage>,
 ) -> Option<PartialRemoteSegment> {
    debug!("started");
    let await_duration = conf.partial_backup_timeout;
@@ -432,7 +441,7 @@ pub async fn main_task(
    let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx();
    let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx();

-    let mut backup = PartialBackup::new(tli, conf).await;
+    let mut backup = PartialBackup::new(tli, conf, storage).await;

    debug!("state: {:?}", backup.state);

--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion};
 use pq_proto::SystemId;
 use remote_storage::RemotePath;
+use std::sync::Arc;
 use tokio::fs::{self, File, OpenOptions, remove_file};
 use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tracing::*;
@@ -32,7 +33,7 @@ use crate::metrics::{
    REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure,
 };
 use crate::state::TimelinePersistentState;
-use crate::wal_backup::{read_object, remote_timeline_path};
+use crate::wal_backup::{WalBackup, read_object, remote_timeline_path};

 pub trait Storage {
    // Last written LSN.
@@ -645,7 +646,7 @@ pub struct WalReader {
    wal_segment: Option<Pin<Box<dyn AsyncRead + Send + Sync>>>,

    // S3 will be used to read WAL if LSN is not available locally
-    enable_remote_read: bool,
+    wal_backup: Arc<WalBackup>,

    // We don't have WAL locally if LSN is less than local_start_lsn
    local_start_lsn: Lsn,
@@ -664,7 +665,7 @@ impl WalReader {
        timeline_dir: Utf8PathBuf,
        state: &TimelinePersistentState,
        start_pos: Lsn,
-        enable_remote_read: bool,
+        wal_backup: Arc<WalBackup>,
    ) -> Result<Self> {
        if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) {
            bail!("state uninitialized, no data to read");
@@ -693,7 +694,7 @@ impl WalReader {
            wal_seg_size: state.server.wal_seg_size as usize,
            pos: start_pos,
            wal_segment: None,
-            enable_remote_read,
+            wal_backup,
            local_start_lsn: state.local_start_lsn,
            timeline_start_lsn: state.timeline_start_lsn,
            pg_version: state.server.pg_version / 10000,
@@ -812,9 +813,9 @@ impl WalReader {
        }

        // Try to open remote file, if remote reads are enabled
-        if self.enable_remote_read {
+        if let Some(storage) = self.wal_backup.get_storage() {
            let remote_wal_file_path = self.remote_path.join(&wal_file_name);
-            return read_object(&remote_wal_file_path, xlogoff as u64).await;
+            return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await;
        }

        bail!("WAL segment is not found")
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -31,7 +31,7 @@ use pageserver_api::models::{
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
-    PutTimelineImportStatusRequest, ReAttachRequest, ValidateRequest,
+    PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest,
 };
 use pageserver_client::{BlockUnblock, mgmt_api};
 use routerify::Middleware;
@@ -160,22 +160,22 @@ async fn handle_validate(req: Request<Body>) -> Result<Response<Body>, ApiError>
 async fn handle_get_timeline_import_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::GenerationsApi)?;

-    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
-    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
-
-    let req = match maybe_forward(req).await {
+    let mut req = match maybe_forward(req).await {
        ForwardOutcome::Forwarded(res) => {
            return res;
        }
        ForwardOutcome::NotForwarded(req) => req,
    };

+    let get_req = json_request::<TimelineImportStatusRequest>(&mut req).await?;
+
    let state = get_state(&req);
+
    json_response(
        StatusCode::OK,
        state
            .service
-            .handle_timeline_shard_import_progress(tenant_shard_id, timeline_id)
+            .handle_timeline_shard_import_progress(get_req)
            .await?,
    )
 }
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -628,11 +628,7 @@ impl Scheduler {
            tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
        }

-        if node.attached_shard_count < expected_attached_shards_per_node {
-            expected_attached_shards_per_node - node.attached_shard_count
-        } else {
-            0
-        }
+        expected_attached_shards_per_node.saturating_sub(node.attached_shard_count)
    }

    pub(crate) fn expected_attached_shard_count(&self) -> usize {
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -47,7 +47,7 @@ use pageserver_api::shard::{
 };
 use pageserver_api::upcall_api::{
    PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
-    ValidateRequest, ValidateResponse, ValidateResponseTenant,
+    TimelineImportStatusRequest, ValidateRequest, ValidateResponse, ValidateResponseTenant,
 };
 use pageserver_client::{BlockUnblock, mgmt_api};
 use reqwest::{Certificate, StatusCode};
@@ -194,6 +194,14 @@ pub(crate) enum LeadershipStatus {
    Candidate,
 }

+enum ShardGenerationValidity {
+    Valid,
+    Mismatched {
+        claimed: Generation,
+        actual: Option<Generation>,
+    },
+}
+
 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
@@ -3909,19 +3917,36 @@ impl Service {

    pub(crate) async fn handle_timeline_shard_import_progress(
        self: &Arc<Self>,
-        tenant_shard_id: TenantShardId,
-        timeline_id: TimelineId,
+        req: TimelineImportStatusRequest,
    ) -> Result<ShardImportStatus, ApiError> {
+        let validity = self
+            .validate_shard_generation(req.tenant_shard_id, req.generation)
+            .await?;
+        match validity {
+            ShardGenerationValidity::Valid => {
+                // fallthrough
+            }
+            ShardGenerationValidity::Mismatched { claimed, actual } => {
+                tracing::info!(
+                    claimed=?claimed.into(),
+                    actual=?actual.and_then(|g| g.into()),
+                    "Rejecting import progress fetch from stale generation"
+                );
+
+                return Err(ApiError::BadRequest(anyhow::anyhow!("Invalid generation")));
+            }
+        }
+
        let maybe_import = self
            .persistence
-            .get_timeline_import(tenant_shard_id.tenant_id, timeline_id)
+            .get_timeline_import(req.tenant_shard_id.tenant_id, req.timeline_id)
            .await?;

        let import = maybe_import.ok_or_else(|| {
            ApiError::NotFound(
                format!(
                    "import for {}/{} not found",
-                    tenant_shard_id.tenant_id, timeline_id
+                    req.tenant_shard_id.tenant_id, req.timeline_id
                )
                .into(),
            )
@@ -3930,11 +3955,11 @@ impl Service {
        import
            .shard_statuses
            .0
-            .get(&tenant_shard_id.to_index())
+            .get(&req.tenant_shard_id.to_index())
            .cloned()
            .ok_or_else(|| {
                ApiError::NotFound(
-                    format!("shard {} not found", tenant_shard_id.shard_slug()).into(),
+                    format!("shard {} not found", req.tenant_shard_id.shard_slug()).into(),
                )
            })
    }
@@ -3943,6 +3968,24 @@ impl Service {
        self: &Arc<Self>,
        req: PutTimelineImportStatusRequest,
    ) -> Result<(), ApiError> {
+        let validity = self
+            .validate_shard_generation(req.tenant_shard_id, req.generation)
+            .await?;
+        match validity {
+            ShardGenerationValidity::Valid => {
+                // fallthrough
+            }
+            ShardGenerationValidity::Mismatched { claimed, actual } => {
+                tracing::info!(
+                    claimed=?claimed.into(),
+                    actual=?actual.and_then(|g| g.into()),
+                    "Rejecting import progress update from stale generation"
+                );
+
+                return Err(ApiError::PreconditionFailed("Invalid generation".into()));
+            }
+        }
+
        let res = self
            .persistence
            .update_timeline_import(req.tenant_shard_id, req.timeline_id, req.status)
@@ -3977,6 +4020,56 @@ impl Service {
        Ok(())
    }

+    /// Check that a provided generation for some tenant shard is the most recent one.
+    ///
+    /// Validate with the in-mem state first, and, if that passes, validate with the
+    /// database state which is authoritative.
+    async fn validate_shard_generation(
+        self: &Arc<Self>,
+        tenant_shard_id: TenantShardId,
+        generation: Generation,
+    ) -> Result<ShardGenerationValidity, ApiError> {
+        {
+            let locked = self.inner.read().unwrap();
+            let tenant_shard =
+                locked
+                    .tenants
+                    .get(&tenant_shard_id)
+                    .ok_or(ApiError::InternalServerError(anyhow::anyhow!(
+                        "{} shard not found",
+                        tenant_shard_id
+                    )))?;
+
+            if tenant_shard.generation != Some(generation) {
+                return Ok(ShardGenerationValidity::Mismatched {
+                    claimed: generation,
+                    actual: tenant_shard.generation,
+                });
+            }
+        }
+
+        let mut db_generations = self
+            .persistence
+            .shard_generations(std::iter::once(&tenant_shard_id))
+            .await?;
+        let (_tid, db_generation) =
+            db_generations
+                .pop()
+                .ok_or(ApiError::InternalServerError(anyhow::anyhow!(
+                    "{} shard not found",
+                    tenant_shard_id
+                )))?;
+
+        if db_generation != Some(generation) {
+            return Ok(ShardGenerationValidity::Mismatched {
+                claimed: generation,
+                actual: db_generation,
+            });
+        }
+
+        Ok(ShardGenerationValidity::Valid)
+    }
+
    /// Finalize the import of a timeline
    ///
    /// This method should be called once all shards have reported that the import is complete.
@@ -3989,7 +4082,7 @@ impl Service {
    /// imports are stored in the database).
    #[instrument(skip_all, fields(
        tenant_id=%import.tenant_id,
-        shard_id=%import.timeline_id,
+        timeline_id=%import.timeline_id,
    ))]
    async fn finalize_timeline_import(
        self: &Arc<Self>,
--- a/storage_controller/src/timeline_import.rs
+++ b/storage_controller/src/timeline_import.rs
@@ -5,7 +5,7 @@ use http_utils::error::ApiError;
 use reqwest::Method;
 use serde::{Deserialize, Serialize};

-use pageserver_api::models::ShardImportStatus;
+use pageserver_api::models::{ShardImportProgress, ShardImportStatus};
 use tokio_util::sync::CancellationToken;
 use utils::{
    id::{TenantId, TimelineId},
@@ -28,7 +28,12 @@ impl ShardImportStatuses {
        ShardImportStatuses(
            shards
                .into_iter()
-                .map(|ts_id| (ts_id, ShardImportStatus::InProgress))
+                .map(|ts_id| {
+                    (
+                        ts_id,
+                        ShardImportStatus::InProgress(None::<ShardImportProgress>),
+                    )
+                })
                .collect(),
        )
    }
--- a/storage_scrubber/src/checks.rs
+++ b/storage_scrubber/src/checks.rs
@@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{
 };
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver_api::shard::ShardIndex;
-use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath};
 use tokio_util::sync::CancellationToken;
 use tracing::{info, warn};
 use utils::generation::Generation;
@@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors(
                                .head_object(&path, &CancellationToken::new())
                                .await;

-                            if let Err(e) = response {
-                                // Object is not present.
-                                let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta());
+                            match response {
+                                Ok(_) => {}
+                                Err(DownloadError::NotFound) => {
+                                    // Object is not present.
+                                    let is_l0 =
+                                        LayerMap::is_l0(layer.key_range(), layer.is_delta());

-                                let msg = format!(
-                                    "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}",
-                                    layer,
-                                    metadata.generation.get_suffix(),
-                                    metadata.shard,
-                                    is_l0,
-                                    e,
-                                );
+                                    let msg = format!(
+                                        "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})",
+                                        layer,
+                                        metadata.generation.get_suffix(),
+                                        metadata.shard,
+                                        is_l0,
+                                    );

-                                if is_l0 || ignore_error {
-                                    result.warnings.push(msg);
-                                } else {
-                                    result.errors.push(msg);
+                                    if is_l0 || ignore_error {
+                                        result.warnings.push(msg);
+                                    } else {
+                                        result.errors.push(msg);
+                                    }
+                                }
+                                Err(e) => {
+                                    tracing::warn!(
+                                        "cannot check if the layer {}{} is present in remote storage (error: {})",
+                                        layer,
+                                        metadata.generation.get_suffix(),
+                                        e,
+                                    );
                                }
                            }
                        }
--- a/test_runner/bin/neon_local_create_deep_l0_stack.py
+++ b/test_runner/bin/neon_local_create_deep_l0_stack.py
@@ -0,0 +1,59 @@
+"""
+Script that creates a stack of L0 deltas, each of which should have 1 Value::Delta per page in `data`,
+in your running neon_local setup.
+
+Use this bash setup to reset your neon_local environment.
+The last line of this bash snippet will run this file here.
+```
+ export NEON_REPO_DIR=$PWD/.neon
+ export NEON_BIN_DIR=$PWD/target/release
+ $NEON_BIN_DIR/neon_local stop
+ rm -rf $NEON_REPO_DIR
+ $NEON_BIN_DIR/neon_local init
+ cat >>  $NEON_REPO_DIR/pageserver_1/pageserver.toml <<"EOF"
+ # customizations
+ virtual_file_io_mode = "direct-rw"
+ page_service_pipelining={mode="pipelined", max_batch_size=32, execution="concurrent-futures"}
+ get_vectored_concurrent_io={mode="sidecar-task"}
+EOF
+ $NEON_BIN_DIR/neon_local start
+
+ psql 'postgresql://localhost:1235/storage_controller' -c 'DELETE FROM tenant_shards'
+ sed 's/.*get_vectored_concurrent_io.*/get_vectored_concurrent_io={mode="sidecar-task"}/' -i $NEON_REPO_DIR/pageserver_1/pageserver.toml
+ $NEON_BIN_DIR/neon_local pageserver restart
+ sleep 2
+ $NEON_BIN_DIR/neon_local tenant create --set-default
+ ./target/debug/neon_local endpoint stop foo
+ rm -rf  $NEON_REPO_DIR/endpoints/foo
+ ./target/debug/neon_local endpoint create foo
+ echo 'full_page_writes=off' >>  $NEON_REPO_DIR/endpoints/foo/postgresql.conf
+ ./target/debug/neon_local endpoint start foo
+
+  pushd test_runner; poetry run python3 -m bin.neon_local_create_deep_l0_stack 10; popd
+```
+"""
+
+import sys
+
+import psycopg2
+from fixtures.common_types import TenantShardId, TimelineId
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.makelayers.l0stack import L0StackShape, make_l0_stack_standalone
+
+ps_http = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None)
+vps_http = PageserverHttpClient(port=1234, is_testing_enabled_or_skip=lambda: None)
+
+tenants = ps_http.tenant_list()
+assert len(tenants) == 1
+tenant_shard_id = TenantShardId.parse(tenants[0]["id"])
+
+timlines = ps_http.timeline_list(tenant_shard_id)
+assert len(timlines) == 1
+timeline_id = TimelineId(timlines[0]["timeline_id"])
+
+connstr = "postgresql://cloud_admin@localhost:55432/postgres"
+conn = psycopg2.connect(connstr)
+
+shape = L0StackShape(logical_table_size_mib=50, delta_stack_height=int(sys.argv[1]))
+
+make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, conn, shape)
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -103,7 +103,7 @@ class AbstractNeonCli:
            else:
                stdout = ""

-            log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
+            log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}")
            raise

        indent = "  "
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1255,6 +1255,12 @@ class NeonEnv:
                "no_sync": True,
                # Look for gaps in WAL received from safekeepeers
                "validate_wal_contiguity": True,
+                # TODO(vlad): make these configurable through the builder
+                "timeline_import_config": {
+                    "import_job_concurrency": 4,
+                    "import_job_soft_size_limit": 512 * 1024,
+                    "import_job_checkpoint_threshold": 4,
+                },
            }

            # Batching (https://github.com/neondatabase/neon/issues/9377):
@@ -1371,7 +1377,11 @@ class NeonEnv:
            force=config.config_init_force,
        )

-    def start(self, timeout_in_seconds: int | None = None):
+    def start(
+        self,
+        timeout_in_seconds: int | None = None,
+        extra_ps_env_vars: dict[str, str] | None = None,
+    ):
        # Storage controller starts first, so that pageserver /re-attach calls don't
        # bounce through retries on startup
        self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
@@ -1390,7 +1400,10 @@ class NeonEnv:
            for pageserver in self.pageservers:
                futs.append(
                    executor.submit(
-                        lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)  # type: ignore[misc]
+                        lambda ps=pageserver: ps.start(  # type: ignore[misc]
+                            extra_env_vars=extra_ps_env_vars or {},
+                            timeout_in_seconds=timeout_in_seconds,
+                        ),
                    )
                )

--- a/test_runner/fixtures/pageserver/makelayers/init.py
+++ b/test_runner/fixtures/pageserver/makelayers/init.py
--- a/test_runner/fixtures/pageserver/makelayers/l0stack.py
+++ b/test_runner/fixtures/pageserver/makelayers/l0stack.py
@@ -0,0 +1,148 @@
+from dataclasses import dataclass
+
+from psycopg2.extensions import connection as PgConnection
+
+from fixtures.common_types import Lsn, TenantShardId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.utils import wait_for_last_record_lsn
+
+
+@dataclass
+class L0StackShape:
+    logical_table_size_mib: int = 50
+    delta_stack_height: int = 20
+
+
+def make_l0_stack(endpoint: Endpoint, shape: L0StackShape):
+    """
+    Creates stack of L0 deltas each of which should have 1 Value::Delta per page in table `data`.
+    """
+    env = endpoint.env
+
+    # TODO: wait for storcon to finish any reconciles before jumping to action here?
+    description = env.storage_controller.tenant_describe(endpoint.tenant_id)
+    shards = description["shards"]
+    assert len(shards) == 1, "does not support sharding"
+    tenant_shard_id = TenantShardId.parse(shards[0]["tenant_shard_id"])
+
+    endpoint.config(["full_page_writes=off"])
+    endpoint.reconfigure()
+
+    ps = env.get_pageserver(shards[0]["node_attached"])
+
+    timeline_id = endpoint.show_timeline_id()
+
+    vps_http = env.storage_controller.pageserver_api()
+    ps_http = ps.http_client()
+    endpoint_conn = endpoint.connect()
+    make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, endpoint_conn, shape)
+
+
+def make_l0_stack_standalone(
+    vps_http: PageserverHttpClient,
+    ps_http: PageserverHttpClient,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+    endpoint_conn: PgConnection,
+    shape: L0StackShape,
+):
+    """
+    See make_l0_stack for details.
+
+    This function is a standalone version of make_l0_stack, usable from non-test code.
+    """
+
+    assert not tenant_shard_id.shard_index.is_sharded, (
+        "the current implementation only supports unsharded tenants"
+    )
+
+    tenant_id = tenant_shard_id.tenant_id
+    conn = endpoint_conn
+    desired_size = shape.logical_table_size_mib * 1024 * 1024
+
+    config = {
+        "gc_period": "0s",  # disable periodic gc
+        "checkpoint_timeout": "10 years",
+        "compaction_period": "1h",  # doesn't matter, but 0 value will kill walredo every 10s
+        "compaction_threshold": 100000,  # we just want L0s
+        "compaction_target_size": 134217728,
+        "checkpoint_distance": 268435456,
+        "image_creation_threshold": 100000,  # we just want L0s
+    }
+
+    vps_http.set_tenant_config(tenant_id, config)
+
+    conn.autocommit = True
+    cur = conn.cursor()
+
+    # Ensure full_page_writes are disabled so that all Value::Delta in
+    # pageserver are !will_init, and therefore a getpage needs to read
+    # the entire delta stack.
+    cur.execute("SHOW full_page_writes")
+    assert cur.fetchall()[0][0] == "off", "full_page_writes should be off"
+
+    # each tuple is 23 (header) + 100 bytes = 123 bytes
+    # page header is 24 bytes
+    # 8k page size
+    # (8k-24bytes) / 123 bytes = 63 tuples per page
+    # set fillfactor to 10 to have 6 tuples per page
+    cur.execute("DROP TABLE IF EXISTS data")
+    cur.execute("CREATE TABLE data(id bigint, row char(92)) with (fillfactor=10)")
+    need_pages = desired_size // 8192
+    need_rows = need_pages * 6
+    log.info(f"Need {need_pages} pages, {need_rows} rows")
+    cur.execute(f"INSERT INTO data SELECT i,'row'||i FROM generate_series(1, {need_rows}) as i")
+    # Raise fillfactor to 100% so that all updates are HOT updates.
+    # We assert they're hot updates by checking fetch_id_to_page_mapping remains the same.
+    cur.execute("ALTER TABLE data SET (fillfactor=100)")
+
+    def settle_and_flush():
+        cur.execute("SELECT pg_current_wal_flush_lsn()")
+        flush_lsn = Lsn(cur.fetchall()[0][0])
+        wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, flush_lsn)
+        ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # create an L0 for the initial data we just inserted
+    settle_and_flush()
+
+    # assert we wrote what we think we wrote
+    cur.execute("""
+        with ntuples_per_page as (
+            select (ctid::text::point)[0]::bigint pageno,count(*) ntuples from data group by pageno
+        )
+        select ntuples, count(*) npages from ntuples_per_page group by ntuples order by ntuples;
+    """)
+    rows = cur.fetchall()
+    log.info(f"initial table layout: {rows}")
+    assert len(rows) == 1
+    assert rows[0][0] == 6, f"expected 6 tuples per page, got {rows[0][0]}"
+    assert rows[0][1] == need_pages, f"expected {need_pages} pages, got {rows[0][1]}"
+
+    def fetch_id_to_page_mapping():
+        cur.execute("""
+            SELECT id,(ctid::text::point)[0]::bigint pageno FROM data ORDER BY id
+        """)
+        return cur.fetchall()
+
+    initial_mapping = fetch_id_to_page_mapping()
+
+    # every iteration updates one tuple in each page
+    delta_stack_height = shape.delta_stack_height
+    for i in range(0, delta_stack_height):
+        log.info(i)
+        cur.execute(f"UPDATE data set row = row||',u' where id % 6 = {i % 6}")
+        log.info(f"modified rows: {cur.rowcount}")
+        assert cur.rowcount == need_pages
+        settle_and_flush()
+        post_update_mapping = fetch_id_to_page_mapping()
+        assert initial_mapping == post_update_mapping, "Postgres should be doing HOT updates"
+
+    # Assert the layer count is what we expect it is
+    layer_map = vps_http.layer_map_info(tenant_id, timeline_id)
+    assert (
+        len(layer_map.delta_l0_layers()) == delta_stack_height + 1 + 1
+    )  # +1 for the initdb layer + 1 for the table creation & fill
+    assert len(layer_map.delta_l0_layers()) == len(layer_map.delta_layers())  # it's all L0s
+    assert len(layer_map.image_layers()) == 0  # no images
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -15,7 +15,8 @@ Some handy pytest flags for local development:
 - `-k` selects a test to run
 - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
 - `--preserve-database-files` to skip cleanup
- `--out-dir` to produce a JSON with the recorded test metrics
+- `--out-dir` to produce a JSON with the recorded test metrics.
+  There is a post-processing tool at `test_runner/performance/out_dir_to_csv.py`.

 # What performance tests do we have and how we run them

--- a/test_runner/performance/out_dir_to_csv.py
+++ b/test_runner/performance/out_dir_to_csv.py
@@ -0,0 +1,57 @@
+# Tool to convert the JSON output from running a perf test with `--out-dir` to a CSV that
+# can be easily pasted into a spreadsheet for quick viz & analysis.
+# Check the `./README.md` in this directory for `--out-dir`.
+#
+# TODO: add the pytest.mark.parametrize to the json and make them columns here
+# https://github.com/neondatabase/neon/issues/11878
+
+import csv
+import json
+import os
+import sys
+
+
+def json_to_csv(json_file):
+    with open(json_file) as f:
+        data = json.load(f)
+
+    # Collect all possible metric names to form headers
+    all_metrics = set()
+    for result in data.get("result", []):
+        for metric in result.get("data", []):
+            all_metrics.add(metric["name"])
+
+    # Sort metrics for consistent output
+    metrics = sorted(list(all_metrics))
+
+    # Create headers
+    headers = ["suit"] + metrics
+
+    # Prepare rows
+    rows = []
+    for result in data.get("result", []):
+        row = {"suit": result["suit"]}
+
+        # Initialize all metrics to empty
+        for metric in metrics:
+            row[metric] = ""
+
+        # Fill in available metrics
+        for item in result.get("data", []):
+            row[item["name"]] = item["value"]
+
+        rows.append(row)
+
+    # Write to stdout as CSV
+    writer = csv.DictWriter(sys.stdout, fieldnames=headers)
+    writer.writeheader()
+    writer.writerows(rows)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print(f"Usage: python {os.path.basename(__file__)} <json_file>")
+        sys.exit(1)
+
+    json_file = sys.argv[1]
+    json_to_csv(json_file)
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -10,7 +10,8 @@ from typing import Any
 import pytest
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
+from fixtures.pageserver.makelayers import l0stack
 from fixtures.utils import humantime_to_ms

 TARGET_RUNTIME = 30
@@ -34,28 +35,18 @@ class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
    mode: str = "pipelined"


-EXECUTION = ["concurrent-futures"]
-BATCHING = ["uniform-lsn", "scattered-lsn"]
-
-NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 32]:
-    for execution in EXECUTION:
-        for batching in BATCHING:
-            NON_BATCHABLE.append(
-                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
-            )
-
-BATCHABLE: list[PageServicePipeliningConfig] = []
+PS_IO_CONCURRENCY = ["sidecar-task"]
+PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = []
 for max_batch_size in [32]:
-    for execution in EXECUTION:
-        for batching in BATCHING:
-            BATCHABLE.append(
+    for execution in ["concurrent-futures"]:
+        for batching in ["scattered-lsn"]:
+            PIPELINING_CONFIGS.append(
                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
            )


@pytest.mark.parametrize(
-    "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
+    "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name",
    [
        # batchable workloads should show throughput and CPU efficiency improvements
        *[
@@ -63,20 +54,23 @@ for max_batch_size in [32]:
                50,
                config,
                TARGET_RUNTIME,
+                ps_io_concurrency,
                100,
                128,
                f"batchable {dataclasses.asdict(config)}",
            )
-            for config in BATCHABLE
+            for config in PIPELINING_CONFIGS
+            for ps_io_concurrency in PS_IO_CONCURRENCY
        ],
    ],
 )
-def test_throughput(
+def test_postgres_seqscan(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    tablesize_mib: int,
    pipelining_config: PageServicePipeliningConfig,
    target_runtime: int,
+    ps_io_concurrency: str,
    effective_io_concurrency: int,
    readhead_buffer_size: int,
    name: str,
@@ -97,6 +91,10 @@ def test_throughput(
    If the compute provides pipeline depth (effective_io_concurrency=100), then
    pipelining configs, especially with max_batch_size>1 should yield dramatic improvements
    in all performance metrics.
+
+    We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes
+    in another table. The `scattered-lsn` batching mode handles this well whereas the
+    initial implementation (`uniform-lsn`) would break the batch.
    """

    #
@@ -114,7 +112,19 @@ def test_throughput(
        }
    )
    # For storing configuration as a metric, insert a fake 0 with labels with actual data
-    params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})})
+    params.update(
+        {
+            "config": (
+                0,
+                {
+                    "labels": {
+                        "pipelining_config": dataclasses.asdict(pipelining_config),
+                        "ps_io_concurrency": ps_io_concurrency,
+                    }
+                },
+            )
+        }
+    )

    log.info("params: %s", params)

@@ -266,7 +276,10 @@ def test_throughput(
        return iters

    env.pageserver.patch_config_toml_nonrecursive(
-        {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
+        {
+            "page_service_pipelining": dataclasses.asdict(pipelining_config),
+            "get_vectored_concurrent_io": {"mode": ps_io_concurrency},
+        }
    )

    # set trace for log analysis below
@@ -318,77 +331,63 @@ def test_throughput(
    )


-PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 32]:
-    for execution in EXECUTION:
-        for batching in BATCHING:
-            PRECISION_CONFIGS.append(
-                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
-            )
-
-
@pytest.mark.parametrize(
-    "pipelining_config,name",
-    [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS],
+    "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name",
+    [
+        (config, ps_io_concurrency, l0_stack_height, queue_depth, f"{dataclasses.asdict(config)}")
+        for config in PIPELINING_CONFIGS
+        for ps_io_concurrency in PS_IO_CONCURRENCY
+        for queue_depth in [1, 2, 32]
+        for l0_stack_height in [0, 20]
+    ],
 )
-def test_latency(
+def test_random_reads(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
    pipelining_config: PageServicePipeliningConfig,
+    ps_io_concurrency: str,
+    l0_stack_height: int,
+    queue_depth: int,
    name: str,
 ):
    """
-    Measure the latency impact of pipelining in an un-batchable workloads.
-
-    An ideal implementation should not increase average or tail latencies for such workloads.
-
-    We don't have support in pagebench to create queue depth yet.
-    => https://github.com/neondatabase/neon/issues/9837
+    Throw pagebench random getpage at latest lsn workload from a single client against pageserver.
    """

    #
    # Setup
    #

+    def build_snapshot_cb(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
+        env = neon_env_builder.init_start()
+        endpoint = env.endpoints.create_start("main")
+        l0stack.make_l0_stack(
+            endpoint,
+            l0stack.L0StackShape(logical_table_size_mib=50, delta_stack_height=l0_stack_height),
+        )
+        return env
+
+    env = neon_env_builder.build_and_use_snapshot(
+        f"test_page_service_batching--test_pagebench-{l0_stack_height}", build_snapshot_cb
+    )
+
    def patch_ps_config(ps_config):
-        if pipelining_config is not None:
-            ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config)
+        ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config)
+        ps_config["get_vectored_concurrent_io"] = {"mode": ps_io_concurrency}

-    neon_env_builder.pageserver_config_override = patch_ps_config
+    env.pageserver.edit_config_toml(patch_ps_config)

-    env = neon_env_builder.init_start()
-    endpoint = env.endpoints.create_start("main")
-    conn = endpoint.connect()
-    cur = conn.cursor()
+    env.start()

-    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
-    cur.execute("SET effective_io_concurrency=1")
-
-    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
-    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
-
-    log.info("Filling the table")
-    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
-    tablesize = 50 * 1024 * 1024
-    npages = tablesize // (8 * 1024)
-    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?
-
-    cur.close()
-    conn.close()
-
-    wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
-
-    endpoint.stop()
+    lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline)
+    ep = env.endpoints.create_start("main", lsn=lsn)
+    data_table_relnode_oid = ep.safe_psql_scalar("SELECT 'data'::regclass::oid")
+    ep.stop_and_destroy()

    for sk in env.safekeepers:
        sk.stop()

-    #
-    # Run single-threaded pagebench (TODO: dedup with other benchmark code)
-    #
-
    env.pageserver.allowed_errors.append(
        # https://github.com/neondatabase/neon/issues/6925
        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
@@ -396,6 +395,8 @@ def test_latency(

    ps_http = env.pageserver.http_client()

+    metrics_before = ps_http.get_metrics()
+
    cmd = [
        str(env.neon_binpath / "pagebench"),
        "get-page-latest-lsn",
@@ -405,6 +406,10 @@ def test_latency(
        env.pageserver.connstr(password=None),
        "--num-clients",
        "1",
+        "--queue-depth",
+        str(queue_depth),
+        "--only-relnode",
+        str(data_table_relnode_oid),
        "--runtime",
        "10s",
    ]
@@ -413,12 +418,22 @@ def test_latency(
    results_path = Path(basepath + ".stdout")
    log.info(f"Benchmark results at: {results_path}")

+    metrics_after = ps_http.get_metrics()
+
    with open(results_path) as f:
        results = json.load(f)
    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")

    total = results["total"]

+    metric = "request_count"
+    zenbenchmark.record(
+        metric,
+        metric_value=total[metric],
+        unit="",
+        report=MetricReport.HIGHER_IS_BETTER,
+    )
+
    metric = "latency_mean"
    zenbenchmark.record(
        metric,
@@ -435,3 +450,17 @@ def test_latency(
            unit="ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
+
+    reads_before = metrics_before.query_one(
+        "pageserver_io_operations_seconds_count", filter={"operation": "read"}
+    )
+    reads_after = metrics_after.query_one(
+        "pageserver_io_operations_seconds_count", filter={"operation": "read"}
+    )
+
+    zenbenchmark.record(
+        "virtual_file_reads",
+        metric_value=reads_after.value - reads_before.value,
+        unit="",
+        report=MetricReport.LOWER_IS_BETTER,
+    )
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -510,7 +510,7 @@ def list_elegible_layers(
        except KeyError:
            # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
            # matches what's on disk.
-            log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
+            log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
            raise

    return list(c for c in candidates if is_visible(c))
@@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    except:
        # On assertion failures, log some details to help with debugging
        heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
-        log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
+        log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
        raise

    # Scrub the remote storage
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
  "v17": [
    "17.5",
-    "e5374b72997b0afc8374137674e873f7a558120a"
+    "8be779fd3ab9e87206da96a7e4842ef1abf04f44"
  ],
  "v16": [
    "16.9",
-    "bb5eee65ac753b5a66d255ec5fb4c0e33180e8fd"
+    "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198"
  ],
  "v15": [
    "15.13",
-    "052df87d338dc30687d0c96f1a4d9b6cb4882b2e"
+    "de7640f55da07512834d5cc40c4b3fb376b5f04f"
  ],
  "v14": [
    "14.18",
-    "ead1e76bdcb71ef87f52f0610bd7333247f75179"
+    "55c0d45abe6467c02084c2192bca117eda6ce1e7"
  ]
 }
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -60,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
-nix = { version = "0.26" }
+nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" }
+nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] }
 nom = { version = "7" }
 num = { version = "0.4" }
 num-bigint = { version = "0.4" }
Author	SHA1	Message	Date
github-actions[bot]	40bb9ff62a	Proxy release 2025-05-20 06:01 UTC	2025-05-20 06:01:25 +00:00
Erik Grinaker	f4150614d0	pageserver: don't pass config to `PageHandler` (#11973 ) ## Problem The gRPC page service API will require decoupling the `PageHandler` from the libpq protocol implementation. As preparation for this, avoid passing in the entire server config to `PageHandler`, and instead explicitly pass in the relevant fields. Touches https://github.com/neondatabase/neon/issues/11728. ## Summary of changes * Change `PageHandler` to take a `GetVectoredConcurrentIo` instead of the entire config. * Change `IoConcurrency::spawn_from_conf` to take a `GetVectoredConcurrentIo`.	2025-05-19 15:47:40 +00:00
Erik Grinaker	38dbc5f67f	pageserver/page_api: add binary Protobuf descriptor (#11968 ) ## Problem A binary Protobuf schema descriptor can be used to expose an API reflection service, which in turn allows convenient usage of e.g. `grpcurl` against the gRPC server. Touches #11728. ## Summary of changes * Generate a binary schema descriptor as `pageserver_page_api::proto::FILE_DESCRIPTOR_SET`. * Opportunistically rename the Protobuf package from `page_service` to `page_api`.	2025-05-19 11:17:45 +00:00
Folke Behrens	3685ad606d	endpoint_storage: Fix metrics test by excluding assertion on macos (#11952 )	2025-05-19 10:56:03 +00:00
Ivan Efremov	76a7d37f7e	proxy: Drop cancellation ops if they don't fit into the queue (#11950 ) Add a redis ops batch size argument for proxy and remove timeouts by using try_send()	2025-05-19 10:10:55 +00:00
Erik Grinaker	cdb6479c8a	pageserver: add gRPC page service schema (#11815 ) ## Problem For the [communicator project](https://github.com/neondatabase/company_projects/issues/352), we want to move to gRPC for the page service protocol. Touches #11728. ## Summary of changes This patch adds an experimental gRPC Protobuf schema for the page service. It is equivalent to the current page service, but with several improvements, e.g.: * Connection multiplexing. * Reduced head-of-line blocking. * Client-side batching. * Explicit tenant shard routing. * GetPage request classification (normal vs. prefetch). * Explicit rate limiting ("slow down" response status). The API is exposed as a new `pageserver/page_api` package. This is separate from the `pageserver_api` package to reduce the dependency footprint for the communicator. The longer-term plan is to also split out e.g. the WAL ingestion service to a separate gRPC package, e.g. `pageserver/wal_api`. Subsequent PRs will: add Rust domain types for the Protobuf types, expose a gRPC server, and implement the page service. Preliminary prototype benchmarks of this gRPC API is within 10% of baseline libpq performance. We'll do further benchmarking and optimization as the implementation lands in `main` and is deployed to staging.	2025-05-19 09:03:06 +00:00
Konstantin Knizhnik	81c557d87e	Unlogged build get smgr (#11954 ) ## Problem See https://github.com/neondatabase/neon/issues/11910 and https://neondb.slack.com/archives/C04DGM6SMTM/p1747314649059129 ## Summary of changes Do not change persistence in `start_unlogged_build` Postgres PRs: https://github.com/neondatabase/postgres/pull/642 https://github.com/neondatabase/postgres/pull/641 https://github.com/neondatabase/postgres/pull/640 https://github.com/neondatabase/postgres/pull/639 --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-05-18 05:02:47 +00:00
Trung Dinh	e963129678	pagesteam_handle_batched_message -> pagestream_handle_batched_message (#11916 ) ## Problem Found a typo in code. ## Summary of changes Co-authored-by: Trung Dinh <tdinh@roblox.com> Co-authored-by: Erik Grinaker <erik@neon.tech>	2025-05-17 22:30:29 +00:00
dependabot[bot]	4f0a9fc569	chore(deps): bump flask-cors from 5.0.0 to 6.0.0 in the pip group across 1 directory (#11960 ) Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2025-05-17 22:06:32 +00:00
Emmanuel Ferdman	81c6a5a796	Migrate to correct logger interface (#11956 ) ## Problem Currently the `logger` library throws annoying deprecation warnings: ```python DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead ``` ## Summary of changes This small PR resolves the annoying deprecation warnings by migrating to `.warning` as suggested. Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>	2025-05-17 21:12:01 +00:00
Konstantin Knizhnik	8e05639dbf	Invalidate LFC after unlogged build (#11951 ) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1747391617951239 LFC is not always properly updated during unlogged build so it can contain stale content. ## Summary of changes Invalidate LFC content at the end of unlogged build Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-05-17 19:06:59 +00:00
Alexander Bayandin	deed46015d	CI(test-images): increase timeout from 20m to 60m (#11955 ) ## Problem For some reason (unknown yet) 20m timeout is not enough for `test-images` job on arm runners. Ref: https://github.com/neondatabase/neon/actions/runs/15075321681/job/42387530399?pr=11953 ## Summary of changes - Increase the timeout from 20m to 1h	2025-05-17 06:34:54 +00:00
Heikki Linnakangas	532d9b646e	Add simple facility for an extendable shared memory area (#11929 ) You still need to provide a max size up-front, but memory is only allocated for the portion that is in use. The module is currently unused, but will be used by the new compute communicator project, in the neon Postgres extension. See https://github.com/neondatabase/neon/issues/11729 --------- Co-authored-by: Erik Grinaker <erik@neon.tech>	2025-05-16 21:22:36 +00:00
Heikki Linnakangas	55f91cf10b	Update 'nix' package (#11948 ) There were some incompatible changes. Most churn was from switching from the now-deprecated fcntl::flock() function to fcntl::Flock::lock(). The new function returns a guard object, while with the old function, the lock was associated directly with the file descriptor. It's good to stay up-to-date in general, but the impetus to do this now is that in https://github.com/neondatabase/neon/pull/11929, I want to use some functions that were added only in the latest version of 'nix', and it's nice to not have to build multiple versions. (Although, different versions of 'nix' are still pulled in as indirect dependencies from other packages)	2025-05-16 14:45:08 +00:00
Folke Behrens	baafcc5d41	proxy: Fix misspelled flag value alias, swap names and aliases (#11949 ) ## Problem There's a misspelled flag value alias that's not really used anywhere. ## Summary of changes Fix the alias and make aliases the official flag values and keep old values as aliases. Also rename enum variant. No need for it to carry the version now.	2025-05-16 14:12:39 +00:00
Evan Fleming	aa22572d8c	safekeeper: refactor static remote storage usage to use Arc (#10179 ) Greetings! Please add `w=1` to github url when viewing diff (specifically `wal_backup.rs`) ## Problem This PR is aimed at addressing the remaining work of #8200. Namely, removing static usage of remote storage in favour of arc. I did not opt to pass `Arc<RemoteStorage>` directly since it is actually `Optional<RemoteStorage>` as it is not necessarily always configured. I wanted to avoid having to pass `Arc<Optional<RemoteStorage>>` everywhere with individual consuming functions likely needing to handle unwrapping. Instead I've added a `WalBackup` struct that holds `Optional<RemoteStorage>` and handles initialization/unwrapping RemoteStorage internally. wal_backup functions now take self and `Arc<WalBackup>` is passed as a dependency through the various consumers that need it. ## Summary of changes - Add `WalBackup` that holds `Optional<RemoteStorage>` and handles initialization and unwrapping - Modify wal_backup functions to take `WalBackup` as self (Add `w=1` to github url when viewing diff here) - Initialize `WalBackup` in safekeeper root - Store `Arc<WalBackup>` in `GlobalTimelineMap` and pass and store in each Timeline as loaded - use `WalBackup` through Timeline as needed ## Refs - task to remove global variables https://github.com/neondatabase/neon/issues/8200 - drive-by fixes https://github.com/neondatabase/neon/issues/11501 by turning the panic reported there into an error `remote storage not configured` --------- Co-authored-by: Christian Schwarz <christian@neon.tech>	2025-05-16 12:41:10 +00:00
Arpad Müller	2d247375b3	Update rust to 1.87.0 (#11938 ) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. The 1.87.0 release marks 10 years of Rust. [Announcement blog post](https://blog.rust-lang.org/2025/05/15/Rust-1.87.0/) Prior update was in #11431	2025-05-16 12:21:24 +00:00
Christian Schwarz	a7ce323949	benchmarking: extend `test_page_service_batching.py` to cover concurrent IO + batching under random reads (#10466 ) This PR commits the benchmarks I ran to qualify concurrent IO before we released it. Changes: - Add `l0stack` fixture; a reusable abstraction for creating a stack of L0 deltas each of which has 1 Value::Delta per page. - Such a stack of L0 deltas is a good and understandable demo for concurrent IO because to reconstruct any page, $layer_stack_height` Values need to be read. Before concurrent IO, the reads were sequential. With concurrent IO, they are executed concurrently. - So, switch `test_latency` to use the l0stack. - Teach `pagebench`, which is used by `test_latency`, to limit itself to the blocks of the relation created by the l0stack abstraction. - Additional parametrization of `test_latency` over dimensions `ps_io_concurrency,l0_stack_height,queue_depth` - Use better names for the tests to reflect what they do, leave interpretation of the (now quite high-dimensional) results to the reader - `test_{throughput => postgres_seqscan}` - `test_{latency => random_reads}` - Cut down on permutations to those we use in production. Runtime is about 2min. Refs - concurrent IO epic https://github.com/neondatabase/neon/issues/9378 - batching task: fixes https://github.com/neondatabase/neon/issues/9837 --------- Co-authored-by: Peter Bendel <peterbendel@neon.tech>	2025-05-15 17:48:13 +00:00
Vlad Lazar	31026d5a3c	pageserver: support import schema evolution (#11935 ) ## Problem Imports don't support schema evolution nicely. If we want to change the stuff we keep in storcon, we'd have to carry the old cruft around. ## Summary of changes Version import progress. Note that the import progress version determines the version of the import job split and execution. This means that we can also use it as a mechanism for deploying new import implementations in the future.	2025-05-15 16:13:15 +00:00
Vlad Lazar	2621ce2daf	pageserver: checkpoint import progress in the storage controller (#11862 ) ## Problem Timeline imports do not have progress checkpointing. Any time that the tenant is shut-down, all progress is lost and the import restarts from the beginning when the tenant is re-attached. ## Summary of changes This PR adds progress checkpointing. ### Preliminaries The unit of work is a `ChunkProcessingJob`. Each `ChunkProcessingJob` deals with the import for a set of key ranges. The job split is done by using an estimation of how many pages each job will produce. The planning stage must be pure: given a fixed set of contents in the import bucket, it will always yield the same plan. This property is enforced by checking that the hash of the plan is identical when resuming from a checkpoint. The storage controller tracks the progress of each shard in the import in the database in the form of the latest job that has completed. ### Flow This is the high level flow for the happy path: 1. On the first run of the import task, the import task queries storcon for the progress and sees that none is recorded. 2. Execute the preparatory stage of the import 3. Import jobs start running concurrently in a `FuturesOrdered`. Every time the checkpointing threshold of jobs has been reached, notify the storage controller. 4. Tenant is detached and re-attached 5. Import task starts up again and gets the latest progress checkpoint from the storage controller in the form of a job index. 6. The plan is computed again and we check that the hash matches with the original plan. 7. Jobs are spawned from where the previous import task left off. Note that we will not report progress after the completion of each job, so some jobs might run twice. Closes https://github.com/neondatabase/neon/issues/11568 Closes https://github.com/neondatabase/neon/issues/11664	2025-05-15 13:18:22 +00:00
Vlad Lazar	a703cd342b	storage_controller: enforce generations in import upcalls (#11900 ) ## Problem Import up-calls did not enforce the usage of the latest generation. The import might have finished in one previous generation, but not in the latest one. Hence, the controller might try to activate a timeline before it is ready. In theory, that would be fine, but it's tricky to reason about. ## Summary of Changes Pageserver provides the current generation in the upcall to the storage controller and the latter validates the generation. If the generation is stale, we return an error which stops progress of the import job. Note that the import job will retry the upcall until the stale location is detached. I'll add some proper tests for this as part of the [checkpointing PR](https://github.com/neondatabase/neon/pull/11862). Closes https://github.com/neondatabase/neon/issues/11884	2025-05-15 10:02:11 +00:00
Alexander Bayandin	42e4cf18c9	CI(neon_extra_builds): fix workflow syntax (#11932 ) ## Problem ``` Error when evaluating 'strategy' for job 'build-pgxn'. neondatabase/neon/.github/workflows/build-macos.yml@7907a9e2bf898f3d22b98d9d4d2c6ffc4d480fc3 (Line: 45, Col: 27): Matrix vector 'postgres-version' does not contain any values ``` See https://github.com/neondatabase/neon/actions/runs/15039594216/job/42268015127?pr=11929 ## Summary of changes - Fix typo: `.chnages` -> `.changes` - Ensure JSON is JSON by moving step output to env variable	2025-05-15 09:53:59 +00:00
Alex Chi Z.	9e5a41a342	fix(scrubber): `remote_storage` error causes layers to be deleted as orphans (#11924 ) ## Problem close https://github.com/neondatabase/neon/issues/11159 ; we get occasional wrong deletions of layer files being used and errors in staging. This patch fixed it. Example errors: ``` Timeline metadata errors: ["index_part.json contains a layer .... (shard 0000) that is not present in remote storage (layer_is_l0: false) with error: Failed to download a remote file: s3 head object\n\nCaused by:\n 0: dispatch failure\n 1: timeout\n 2: error trying to connect: HTTP connect timeout occurred after 3.1s\n ``` This error should not be fired because the file could exist, but we cannot know if it exists due to head request failure. ## Summary of changes Only generate cannot find layer errors when the head_object return type is `NotFound`. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-05-15 07:02:16 +00:00
Konstantin Knizhnik	48b870bc07	Use unlogged build in GIST for storing root page (#11892 ) ## Problem See https://github.com/neondatabase/neon/issues/11891 Newly added assert is fired when root page of GIST index is written to the disk as part of sorted build. ## Summary of changes Wrap writing of root page in unlogged build. https://github.com/neondatabase/postgres/pull/632 https://github.com/neondatabase/postgres/pull/633 https://github.com/neondatabase/postgres/pull/634 --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-05-15 04:45:22 +00:00
Christian Schwarz	32a12783fd	pageserver: batching & concurrent IO: update binary-built-in defaults; reduce CI matrix (#11923 ) Use the current production config for batching & concurrent IO. Remove the permutation testing for unit tests from CI. (The pageserver unit test matrix takes ~10min for debug builds). Drive-by-fix use of `if cfg!(test)` inside crate `pageserver_api`. It is ineffective for early-enabling new defaults for pageserver unit tests only. The reason is that the `test` cfg is only set for the crate under test but not its dependencies. So, `cargo test -p pageserver` will build `pageserver_api` with `cfg!(test) == false`. Resort to checking for feature flag `testing` instead, since all our unit tests are run with `--feature testing`. refs - `scattered-lsn` batching has been implemented and rolled out in all envs, cf https://github.com/neondatabase/neon/issues/10765 - preliminary for https://github.com/neondatabase/neon/pull/10466 - epic https://github.com/neondatabase/neon/issues/9377 - epic https://github.com/neondatabase/neon/issues/9378 - drive-by fix https://neondb.slack.com/archives/C0277TKAJCA/p1746821515504219	2025-05-14 16:30:21 +00:00
a-masterov	68120cfa31	Fix Cloud Extensions Regression (#11907 ) ## Problem The regression test on extensions relied on the admin API to set the default endpoint settings, which is not stable and requires admin privileges. Specifically: - The workflow was using `default_endpoint_settings` to configure necessary PostgreSQL settings like `DateStyle`, `TimeZone`, and `neon.allow_unstable_extensions` - This approach was failing because the API endpoint for setting `default_endpoint_settings` was changed (referenced in a comment as issue #27108) - The admin API requires special privileges. ## Summary of changes We get rid of the admin API dependency and use ALTER DATABASE statements instead: Removed the default_endpoint_settings mechanism: - Removed the default_endpoint_settings input parameter from the neon-project-create action - Removed the API call that was attempting to set these settings at the project level - Completely removed the default_endpoint_settings configuration from the cloud-extensions workflow Added database-level settings: - Created a new `alter_db.sh` script that applies the same settings directly to each test database - Modified all extension test scripts to call this script after database creation	2025-05-14 13:19:53 +00:00