WIP

experiment: find all places that need new rclsn-by-generation facility
Delete all the code that uses the current in-memory & controlfile `remote_consistent_lsn` field. After this removal, searching for `remote_consistent_lsn` highlights the places where the new solution, which tracks rclsn per generation id, needs to be fitted in.
2026-05-16 12:40:36 +00:00 · 2025-02-28 22:10:08 +01:00 · 2025-02-20 01:06:38 +01:00 · 2025-02-20 01:05:38 +01:00 · 2025-02-17 17:24:17 +00:00 · 2025-02-17 16:32:24 +00:00
81 changed files with 2038 additions and 867 deletions
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -348,6 +348,10 @@ jobs:
          rerun_failed: true
          pg_version: ${{ matrix.pg_version }}
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
+          # It attempts to stop the tests gracefully, so that test reports are still generated,
+          # before they are forcibly stopped by the stricter `timeout-minutes` limit.
+          extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -0,0 +1,76 @@
+name: Force Test Upgrading of Extension
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '45 2 * * *' # run once a day, timezone is utc
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+  statuses: write
+  contents: read
+
+jobs:
+  regress:
+    strategy:
+      fail-fast: false
+      matrix:
+        pg-version: [16, 17]
+
+    runs-on: small
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: false
+
+      - name: Get the last compute release tag
+        id: get-last-compute-release-tag
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "/repos/${GITHUB_REPOSITORY}/releases")
+          echo tag=${tag} >> ${GITHUB_OUTPUT}
+
+      - name: Test extension upgrade
+        timeout-minutes: 20
+        env:
+          NEWTAG: latest
+          OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
+          PG_VERSION: ${{ matrix.pg-version }}
+          FORCE_ALL_UPGRADE_TESTS: true
+        run: ./docker-compose/test_extensions_upgrade.sh
+
+      - name: Print logs and clean up
+        if: always()
+        run: |
+          docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
+          docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
+
+      - name: Post to the Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
+          slack-message: |
+            Test upgrading of extensions: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -786,7 +786,7 @@ dependencies = [
 [[package]]
 name = "azure_core"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -815,7 +815,7 @@ dependencies = [
 [[package]]
 name = "azure_identity"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "async-lock",
 "async-trait",
@@ -834,7 +834,7 @@ dependencies = [
 [[package]]
 name = "azure_storage"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "RustyXML",
 "async-lock",
@@ -852,7 +852,7 @@ dependencies = [
 [[package]]
 name = "azure_storage_blobs"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "RustyXML",
 "azure_core",
@@ -872,7 +872,7 @@ dependencies = [
 [[package]]
 name = "azure_svc_blobstorage"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "azure_core",
 "bytes",
@@ -1029,12 +1029,6 @@ dependencies = [
 "generic-array",
 ]

-[[package]]
-name = "boxcar"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
-
 [[package]]
 name = "bstr"
 version = "1.5.0"
@@ -4929,7 +4923,6 @@ dependencies = [
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.13.1",
- "boxcar",
 "bstr",
 "bytes",
 "camino",
@@ -4981,7 +4974,6 @@ dependencies = [
 "postgres-protocol2",
 "postgres_backend",
 "pq_proto",
- "prometheus",
 "rand 0.8.5",
 "rand_distr",
 "rcgen",
@@ -5006,7 +4998,6 @@ dependencies = [
 "smallvec",
 "smol_str",
 "socket2",
- "strum",
 "strum_macros",
 "subtle",
 "thiserror 1.0.69",
@@ -5021,7 +5012,6 @@ dependencies = [
 "tracing",
 "tracing-log",
 "tracing-opentelemetry",
- "tracing-serde",
 "tracing-subscriber",
 "tracing-utils",
 "try-lock",
@@ -6462,6 +6452,7 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "rand 0.8.5",
+ "regex",
 "reqwest",
 "routerify",
 "rustls 0.23.18",
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \
    apt install --no-install-recommends --no-install-suggests -y \
    ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
    $VERSION_INSTALLS \
    && apt clean && rm -rf /var/lib/apt/lists/*

@@ -1464,6 +1464,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control

+#########################################################################################
+#
+# Layer "pg_duckdb-build"
+# compile pg_duckdb extension
+#
+#########################################################################################
+FROM build-deps AS pg_duckdb-src
+WORKDIR /ext-src
+COPY compute/patches/pg_duckdb_v031.patch .
+# pg_duckdb build requires the source dir to be a git repo in order to fetch submodules.
+# The patch allows neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
+# - extension management function duckdb.install_extension()
+# - access to duckdb.extensions table and its sequence
+RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
+    cd pg_duckdb-src && \
+    git submodule update --init --recursive && \
+    patch -p1 < /ext-src/pg_duckdb_v031.patch
+
+FROM pg-build AS pg_duckdb-build
+ARG PG_VERSION
+COPY --from=pg_duckdb-src /ext-src/ /ext-src/
+WORKDIR /ext-src/pg_duckdb-src
+RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control 
+        
 #########################################################################################
 #
 # Layer "pg_repack"
@@ -1577,6 +1602,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/

 #########################################################################################
--- a/compute/patches/pg_duckdb_v031.patch
+++ b/compute/patches/pg_duckdb_v031.patch
@@ -0,0 +1,11 @@
+diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
+index d777d76..af60106 100644
+--- a/sql/pg_duckdb--0.2.0--0.3.0.sql
+++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
+@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
+ GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
+ GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
+ GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -25,10 +25,10 @@
 //! docker push localhost:3030/localregistry/compute-node-v14:latest
 //! ```

-use anyhow::Context;
+use anyhow::{bail, Context};
 use aws_config::BehaviorVersion;
 use camino::{Utf8Path, Utf8PathBuf};
-use clap::Parser;
+use clap::{Parser, Subcommand};
 use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
 use nix::unistd::Pid;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -44,32 +44,59 @@ mod s3_uri;
 const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
 const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);

+#[derive(Subcommand, Debug)]
+enum Command {
+    /// Runs local postgres (neon binary), restores into it, and
+    /// uploads pgdata to s3 to be consumed by pageservers.
+    Pgdata {
+        /// Raw connection string to the source database. Used only in tests;
+        /// the real scenario uses an encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        source_connection_string: Option<String>,
+        /// If specified, will not shut down the local postgres after the import. Used in local testing.
+        #[clap(short, long)]
+        interactive: bool,
+        /// Port to run postgres on. Default is 5432.
+        #[clap(long, default_value_t = 5432)]
+        pg_port: u16,
+
+        /// Number of CPUs in the system. This is used to configure the number of
+        /// parallel worker processes, for index creation.
+        #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
+        num_cpus: Option<usize>,
+
+        /// Amount of RAM in the system, in megabytes. This is used to configure
+        /// shared_buffers and maintenance_work_mem.
+        #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
+        memory_mb: Option<usize>,
+    },
+
+    /// Runs pg_dump-pg_restore from source to destination without running local postgres.
+    DumpRestore {
+        /// Raw connection string to the source database. Used only in tests;
+        /// the real scenario uses an encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        source_connection_string: Option<String>,
+        /// Raw connection string to the destination database. Used only in tests;
+        /// the real scenario uses an encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        destination_connection_string: Option<String>,
+    },
+}
+
 #[derive(clap::Parser)]
 struct Args {
-    #[clap(long)]
+    #[clap(long, env = "NEON_IMPORTER_WORKDIR")]
    working_directory: Utf8PathBuf,
    #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
    s3_prefix: Option<s3_uri::S3Uri>,
-    #[clap(long)]
-    source_connection_string: Option<String>,
-    #[clap(short, long)]
-    interactive: bool,
-    #[clap(long)]
+    #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")]
    pg_bin_dir: Utf8PathBuf,
-    #[clap(long)]
+    #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")]
    pg_lib_dir: Utf8PathBuf,
-    #[clap(long)]
-    pg_port: Option<u16>, // port to run postgres on, 5432 is default

-    /// Number of CPUs in the system. This is used to configure # of
-    /// parallel worker processes, for index creation.
-    #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
-    num_cpus: Option<usize>,
-
-    /// Amount of RAM in the system. This is used to configure shared_buffers
-    /// and maintenance_work_mem.
-    #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
-    memory_mb: Option<usize>,
+    #[clap(subcommand)]
+    command: Command,
 }

 #[serde_with::serde_as]
@@ -78,6 +105,8 @@ struct Spec {
    encryption_secret: EncryptionSecret,
    #[serde_as(as = "serde_with::base64::Base64")]
    source_connstring_ciphertext_base64: Vec<u8>,
+    #[serde_as(as = "Option<serde_with::base64::Base64>")]
+    destination_connstring_ciphertext_base64: Option<Vec<u8>>,
 }

 #[derive(serde::Deserialize)]
@@ -93,192 +122,150 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
    "C.UTF-8"
 };

-#[tokio::main]
-pub(crate) async fn main() -> anyhow::Result<()> {
-    utils::logging::init(
-        utils::logging::LogFormat::Plain,
-        utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
-        utils::logging::Output::Stdout,
-    )?;
-
-    info!("starting");
-
-    let args = Args::parse();
-
-    // Validate arguments
-    if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
-        anyhow::bail!("either s3_prefix or source_connection_string must be specified");
-    }
-    if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
-        anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
-    }
-
-    let working_directory = args.working_directory;
-    let pg_bin_dir = args.pg_bin_dir;
-    let pg_lib_dir = args.pg_lib_dir;
-    let pg_port = args.pg_port.unwrap_or_else(|| {
-        info!("pg_port not specified, using default 5432");
-        5432
-    });
-
-    // Initialize AWS clients only if s3_prefix is specified
-    let (aws_config, kms_client) = if args.s3_prefix.is_some() {
-        let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
-        let kms = aws_sdk_kms::Client::new(&config);
-        (Some(config), Some(kms))
-    } else {
-        (None, None)
-    };
-
-    // Get source connection string either from S3 spec or direct argument
-    let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
-        let spec: Spec = {
-            let spec_key = s3_prefix.append("/spec.json");
-            let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
-            let object = s3_client
-                .get_object()
-                .bucket(&spec_key.bucket)
-                .key(spec_key.key)
-                .send()
-                .await
-                .context("get spec from s3")?
-                .body
-                .collect()
-                .await
-                .context("download spec body")?;
-            serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
-        };
-
-        match spec.encryption_secret {
-            EncryptionSecret::KMS { key_id } => {
-                let mut output = kms_client
-                    .unwrap()
-                    .decrypt()
-                    .key_id(key_id)
-                    .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
-                        spec.source_connstring_ciphertext_base64,
-                    ))
-                    .send()
-                    .await
-                    .context("decrypt source connection string")?;
-                let plaintext = output
-                    .plaintext
-                    .take()
-                    .context("get plaintext source connection string")?;
-                String::from_utf8(plaintext.into_inner())
-                    .context("parse source connection string as utf8")?
-            }
-        }
-    } else {
-        args.source_connection_string.unwrap()
-    };
-
-    match tokio::fs::create_dir(&working_directory).await {
-        Ok(()) => {}
-        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
-            if !is_directory_empty(&working_directory)
-                .await
-                .context("check if working directory is empty")?
-            {
-                anyhow::bail!("working directory is not empty");
-            } else {
-                // ok
-            }
-        }
-        Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
-    }
-
-    let pgdata_dir = working_directory.join("pgdata");
-    tokio::fs::create_dir(&pgdata_dir)
+async fn decode_connstring(
+    kms_client: &aws_sdk_kms::Client,
+    key_id: &String,
+    connstring_ciphertext_base64: Vec<u8>,
+) -> Result<String, anyhow::Error> {
+    let mut output = kms_client
+        .decrypt()
+        .key_id(key_id)
+        .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
+            connstring_ciphertext_base64,
+        ))
+        .send()
        .await
-        .context("create pgdata directory")?;
+        .context("decrypt connection string")?;

-    let pgbin = pg_bin_dir.join("postgres");
-    let pg_version = match get_pg_version(pgbin.as_ref()) {
-        PostgresMajorVersion::V14 => 14,
-        PostgresMajorVersion::V15 => 15,
-        PostgresMajorVersion::V16 => 16,
-        PostgresMajorVersion::V17 => 17,
-    };
-    let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
-    postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
-        superuser,
-        locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
-        pg_version,
-        initdb_bin: pg_bin_dir.join("initdb").as_ref(),
-        library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
-        pgdata: &pgdata_dir,
-    })
-    .await
-    .context("initdb")?;
+    let plaintext = output
+        .plaintext
+        .take()
+        .context("get plaintext connection string")?;

-    // If the caller didn't specify CPU / RAM to use for sizing, default to
-    // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM.
-    let nproc = args.num_cpus.unwrap_or_else(num_cpus::get);
-    let memory_mb = args.memory_mb.unwrap_or(256);
+    String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8")
+}

-    // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
-    // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
-    // available for misc other stuff that PostgreSQL uses memory for.
-    let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
-    let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
+struct PostgresProcess {
+    pgdata_dir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pgbin: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+    postgres_proc: Option<tokio::process::Child>,
+}

-    //
-    // Launch postgres process
-    //
-    let mut postgres_proc = tokio::process::Command::new(pgbin)
-        .arg("-D")
-        .arg(&pgdata_dir)
-        .args(["-p", &format!("{pg_port}")])
-        .args(["-c", "wal_level=minimal"])
-        .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
-        .args(["-c", "max_wal_senders=0"])
-        .args(["-c", "fsync=off"])
-        .args(["-c", "full_page_writes=off"])
-        .args(["-c", "synchronous_commit=off"])
-        .args([
-            "-c",
-            &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
-        ])
-        .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
-        .args(["-c", &format!("max_parallel_workers={nproc}")])
-        .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
-        .args(["-c", &format!("max_worker_processes={nproc}")])
-        .args([
-            "-c",
-            &format!(
-                "effective_io_concurrency={}",
-                if cfg!(target_os = "macos") { 0 } else { 100 }
-            ),
-        ])
-        .env_clear()
-        .env("LD_LIBRARY_PATH", &pg_lib_dir)
-        .env(
-            "ASAN_OPTIONS",
-            std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+impl PostgresProcess {
+    fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self {
+        Self {
+            pgdata_dir,
+            pgbin: pg_bin_dir.join("postgres"),
+            pg_bin_dir,
+            pg_lib_dir,
+            postgres_proc: None,
+        }
+    }
+
+    async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> {
+        tokio::fs::create_dir(&self.pgdata_dir)
+            .await
+            .context("create pgdata directory")?;
+
+        let pg_version = match get_pg_version(self.pgbin.as_ref()) {
+            PostgresMajorVersion::V14 => 14,
+            PostgresMajorVersion::V15 => 15,
+            PostgresMajorVersion::V16 => 16,
+            PostgresMajorVersion::V17 => 17,
+        };
+        postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
+            superuser: initdb_user,
+            locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
+            pg_version,
+            initdb_bin: self.pg_bin_dir.join("initdb").as_ref(),
+            library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
+            pgdata: &self.pgdata_dir,
+        })
+        .await
+        .context("initdb")
+    }
+
+    async fn start(
+        &mut self,
+        initdb_user: &str,
+        port: u16,
+        nproc: usize,
+        memory_mb: usize,
+    ) -> Result<&tokio::process::Child, anyhow::Error> {
+        self.prepare(initdb_user).await?;
+
+        // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
+        // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
+        // available for misc other stuff that PostgreSQL uses memory for.
+        let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
+        let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
+
+        //
+        // Launch postgres process
+        //
+        let mut proc = tokio::process::Command::new(&self.pgbin)
+            .arg("-D")
+            .arg(&self.pgdata_dir)
+            .args(["-p", &format!("{port}")])
+            .args(["-c", "wal_level=minimal"])
+            .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
+            .args(["-c", "max_wal_senders=0"])
+            .args(["-c", "fsync=off"])
+            .args(["-c", "full_page_writes=off"])
+            .args(["-c", "synchronous_commit=off"])
+            .args([
+                "-c",
+                &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
+            ])
+            .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
+            .args(["-c", &format!("max_parallel_workers={nproc}")])
+            .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
+            .args(["-c", &format!("max_worker_processes={nproc}")])
+            .args(["-c", "effective_io_concurrency=100"])
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &self.pg_lib_dir)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            )
+            .stdout(std::process::Stdio::piped())
+            .stderr(std::process::Stdio::piped())
+            .spawn()
+            .context("spawn postgres")?;
+
+        info!("spawned postgres, waiting for it to become ready");
+        tokio::spawn(
+            child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take())
+                .instrument(info_span!("postgres")),
+        );
+
+        self.postgres_proc = Some(proc);
+        Ok(self.postgres_proc.as_ref().unwrap())
+    }
+
+    async fn shutdown(&mut self) -> Result<(), anyhow::Error> {
+        let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap();
+        info!("shutdown postgres");
+        nix::sys::signal::kill(
+            Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")),
+            nix::sys::signal::SIGTERM,
        )
-        .env(
-            "UBSAN_OPTIONS",
-            std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
-        )
-        .stdout(std::process::Stdio::piped())
-        .stderr(std::process::Stdio::piped())
-        .spawn()
-        .context("spawn postgres")?;
-
-    info!("spawned postgres, waiting for it to become ready");
-    tokio::spawn(
-        child_stdio_to_log::relay_process_output(
-            postgres_proc.stdout.take(),
-            postgres_proc.stderr.take(),
-        )
-        .instrument(info_span!("postgres")),
-    );
+        .context("signal postgres to shut down")?;
+        proc.wait()
+            .await
+            .context("wait for postgres to shut down")
+            .map(|_| ())
+    }
+}

+async fn wait_until_ready(connstring: String, create_dbname: String) {
    // Create neondb database in the running postgres
-    let restore_pg_connstring =
-        format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
-
    let start_time = std::time::Instant::now();

    loop {
@@ -289,7 +276,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
            std::process::exit(1);
        }

-        match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
+        match tokio_postgres::connect(
+            &connstring.replace("dbname=neondb", "dbname=postgres"),
+            tokio_postgres::NoTls,
+        )
+        .await
+        {
            Ok((client, connection)) => {
                // Spawn the connection handling task to maintain the connection
                tokio::spawn(async move {
@@ -298,9 +290,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
                    }
                });

-                match client.simple_query("CREATE DATABASE neondb;").await {
+                match client
+                    .simple_query(format!("CREATE DATABASE {create_dbname};").as_str())
+                    .await
+                {
                    Ok(_) => {
-                        info!("created neondb database");
+                        info!("created {} database", create_dbname);
                        break;
                    }
                    Err(e) => {
@@ -324,10 +319,16 @@ pub(crate) async fn main() -> anyhow::Result<()> {
            }
        }
    }
+}

-    let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
-
-    let dumpdir = working_directory.join("dumpdir");
+async fn run_dump_restore(
+    workdir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+    source_connstring: String,
+    destination_connstring: String,
+) -> Result<(), anyhow::Error> {
+    let dumpdir = workdir.join("dumpdir");

    let common_args = [
        // schema mapping (prob suffices to specify them on one side)
@@ -356,7 +357,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
            .arg("--no-sync")
            // POSITIONAL args
            // source db (db name included in connection string)
-            .arg(&source_connection_string)
+            .arg(&source_connstring)
            // how we run it
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir)
@@ -376,19 +377,18 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        let st = pg_dump.wait().await.context("wait for pg_dump")?;
        info!(status=?st, "pg_dump exited");
        if !st.success() {
-            warn!(status=%st, "pg_dump failed, restore will likely fail as well");
+            error!(status=%st, "pg_dump failed, restore will likely fail as well");
+            bail!("pg_dump failed");
        }
    }

-    // TODO: do it in a streaming way, plenty of internal research done on this already
+    // TODO: maybe do it in a streaming way, plenty of internal research done on this already
    // TODO: do the unlogged table trick
-
-    info!("restore from working directory into vanilla postgres");
    {
        let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore"))
            .args(&common_args)
            .arg("-d")
-            .arg(&restore_pg_connstring)
+            .arg(&destination_connstring)
            // POSITIONAL args
            .arg(&dumpdir)
            // how we run it
@@ -411,33 +411,82 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        let st = pg_restore.wait().await.context("wait for pg_restore")?;
        info!(status=?st, "pg_restore exited");
        if !st.success() {
-            warn!(status=%st, "pg_restore failed, restore will likely fail as well");
+            error!(status=%st, "pg_restore failed, restore will likely fail as well");
+            bail!("pg_restore failed");
        }
    }

+    Ok(())
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn cmd_pgdata(
+    kms_client: Option<aws_sdk_kms::Client>,
+    maybe_s3_prefix: Option<s3_uri::S3Uri>,
+    maybe_spec: Option<Spec>,
+    source_connection_string: Option<String>,
+    interactive: bool,
+    pg_port: u16,
+    workdir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+    num_cpus: Option<usize>,
+    memory_mb: Option<usize>,
+) -> Result<(), anyhow::Error> {
+    if maybe_spec.is_none() && source_connection_string.is_none() {
+        bail!("spec must be provided for pgdata command");
+    }
+    if maybe_spec.is_some() && source_connection_string.is_some() {
+        bail!("only one of spec or source_connection_string can be provided");
+    }
+
+    let source_connection_string = if let Some(spec) = maybe_spec {
+        match spec.encryption_secret {
+            EncryptionSecret::KMS { key_id } => {
+                decode_connstring(
+                    kms_client.as_ref().unwrap(),
+                    &key_id,
+                    spec.source_connstring_ciphertext_base64,
+                )
+                .await?
+            }
+        }
+    } else {
+        source_connection_string.unwrap()
+    };
+
+    let superuser = "cloud_admin";
+    let destination_connstring = format!(
+        "host=localhost port={} user={} dbname=neondb",
+        pg_port, superuser
+    );
+
+    let pgdata_dir = workdir.join("pgdata");
+    let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone());
+    let nproc = num_cpus.unwrap_or_else(num_cpus::get);
+    let memory_mb = memory_mb.unwrap_or(256);
+    proc.start(superuser, pg_port, nproc, memory_mb).await?;
+    wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await;
+
+    run_dump_restore(
+        workdir.clone(),
+        pg_bin_dir,
+        pg_lib_dir,
+        source_connection_string,
+        destination_connstring,
+    )
+    .await?;
+
    // If interactive mode, wait for Ctrl+C
-    if args.interactive {
+    if interactive {
        info!("Running in interactive mode. Press Ctrl+C to shut down.");
        tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
    }

-    info!("shutdown postgres");
-    {
-        nix::sys::signal::kill(
-            Pid::from_raw(
-                i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"),
-            ),
-            nix::sys::signal::SIGTERM,
-        )
-        .context("signal postgres to shut down")?;
-        postgres_proc
-            .wait()
-            .await
-            .context("wait for postgres to shut down")?;
-    }
+    proc.shutdown().await?;

    // Only sync if s3_prefix was specified
-    if let Some(s3_prefix) = args.s3_prefix {
+    if let Some(s3_prefix) = maybe_s3_prefix {
        info!("upload pgdata");
        aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
            .await
@@ -445,7 +494,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {

        info!("write status");
        {
-            let status_dir = working_directory.join("status");
+            let status_dir = workdir.join("status");
            std::fs::create_dir(&status_dir).context("create status directory")?;
            let status_file = status_dir.join("pgdata");
            std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
@@ -458,3 +507,153 @@ pub(crate) async fn main() -> anyhow::Result<()> {

    Ok(())
 }
+
+/// Run pg_dump/pg_restore between two databases whose connstrings come from the spec or from the CLI.
+async fn cmd_dumprestore(
+    kms_client: Option<aws_sdk_kms::Client>,
+    maybe_spec: Option<Spec>,
+    source_connection_string: Option<String>,
+    destination_connection_string: Option<String>,
+    workdir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+) -> Result<(), anyhow::Error> {
+    let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec {
+        match spec.encryption_secret {
+            EncryptionSecret::KMS { key_id } => {
+                let source = decode_connstring(
+                    kms_client.as_ref().unwrap(), // main() builds the KMS client whenever it loads a spec
+                    &key_id,
+                    spec.source_connstring_ciphertext_base64,
+                )
+                .await?;
+
+                let dest = if let Some(dest_ciphertext) =
+                    spec.destination_connstring_ciphertext_base64
+                {
+                    decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext)
+                        .await?
+                } else {
+                    bail!("destination connection string must be provided in spec for dump_restore command");
+                };
+
+                (source, dest)
+            }
+        }
+    } else {
+        let Some(source) = source_connection_string else {
+            bail!("source connection string must be provided for dump_restore command");
+        };
+        let Some(dest) = destination_connection_string else {
+            bail!("destination connection string must be provided for dump_restore command");
+        };
+        (source, dest)
+    };
+
+    run_dump_restore(
+        workdir,
+        pg_bin_dir,
+        pg_lib_dir,
+        source_connstring,
+        destination_connstring,
+    )
+    .await
+}
+
+#[tokio::main]
+pub(crate) async fn main() -> anyhow::Result<()> {
+    utils::logging::init(
+        utils::logging::LogFormat::Json,
+        utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        utils::logging::Output::Stdout,
+    )?;
+
+    info!("starting");
+
+    let args = Args::parse();
+
+    // Initialize AWS clients only if s3_prefix is specified: the spec lives in S3 and only spec handling uses KMS
+    let (aws_config, kms_client) = if args.s3_prefix.is_some() {
+        let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
+        let kms = aws_sdk_kms::Client::new(&config);
+        (Some(config), Some(kms))
+    } else {
+        (None, None)
+    };
+
+    let spec: Option<Spec> = if let Some(s3_prefix) = &args.s3_prefix {
+        let spec_key = s3_prefix.append("/spec.json");
+        let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
+        let object = s3_client
+            .get_object()
+            .bucket(&spec_key.bucket)
+            .key(spec_key.key)
+            .send()
+            .await
+            .context("get spec from s3")?
+            .body
+            .collect()
+            .await
+            .context("download spec body")?;
+        serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
+    } else {
+        None
+    };
+
+    match tokio::fs::create_dir(&args.working_directory).await {
+        Ok(()) => {}
+        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+            if !is_directory_empty(&args.working_directory)
+                .await
+                .context("check if working directory is empty")?
+            {
+                bail!("working directory is not empty");
+            } else {
+                // ok: an already-existing but empty working directory is accepted
+            }
+        }
+        Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
+    }
+
+    match args.command {
+        Command::Pgdata {
+            source_connection_string,
+            interactive,
+            pg_port,
+            num_cpus,
+            memory_mb,
+        } => {
+            cmd_pgdata(
+                kms_client,
+                args.s3_prefix,
+                spec,
+                source_connection_string,
+                interactive,
+                pg_port,
+                args.working_directory,
+                args.pg_bin_dir,
+                args.pg_lib_dir,
+                num_cpus,
+                memory_mb,
+            )
+            .await?;
+        }
+        Command::DumpRestore {
+            source_connection_string,
+            destination_connection_string,
+        } => {
+            cmd_dumprestore(
+                kms_client,
+                spec,
+                source_connection_string,
+                destination_connection_string,
+                args.working_directory,
+                args.pg_bin_dir,
+                args.pg_lib_dir,
+            )
+            .await?;
+        }
+    }
+
+    Ok(())
+}
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -11,6 +11,7 @@ if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEW
  exit 1
 fi
 export PG_VERSION=${PG_VERSION:-16}
+export PG_TEST_VERSION=${PG_VERSION}
 function wait_for_ready {
  TIME=0
  while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
@@ -59,8 +60,12 @@ docker compose cp  ext-src neon-test-extensions:/
 docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
 docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
 create_extensions "${EXTNAMES}"
-query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
-exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
+if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
+  exts="${EXTNAMES}"
+else
+  query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
+  exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
+fi
 if [ -z "${exts}" ]; then
  echo "No extensions were upgraded"
 else
@@ -88,7 +93,10 @@ else
      exit 1
    fi
    docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
-    docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh
+    if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
+      docker  compose exec neon-test-extensions  cat /ext-src/${EXTDIR}/regression.diffs
+      exit 1
+    fi
    docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update"
    docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
  done
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -351,7 +351,7 @@ pub struct TenantConfigToml {

    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
-    pub rel_size_v2_enabled: Option<bool>,
+    pub rel_size_v2_enabled: bool,

    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
@@ -633,7 +633,7 @@ impl Default for TenantConfigToml {
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
            timeline_offloading: true,
            wal_receiver_protocol_override: None,
-            rel_size_v2_enabled: None,
+            rel_size_v2_enabled: false,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,10 +1,12 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::Bytes;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::Oid;
 use postgres_ffi::RepOriginId;
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
+use utils::const_assert;

 use crate::reltag::{BlockNumber, RelTag, SlruKind};

@@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62;
 /// The key prefix of ReplOrigin keys.
 pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;

+/// The key prefix of db directory keys.
+pub const DB_DIR_KEY_PREFIX: u8 = 0x64;
+
+/// The key prefix of rel directory keys.
+pub const REL_DIR_KEY_PREFIX: u8 = 0x65;
+
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+pub enum RelDirExists {
+    Exists,
+    Removed,
+}
+
+#[derive(Debug)]
+pub struct DecodeError;
+
+impl fmt::Display for DecodeError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "invalid marker")
+    }
+}
+
+impl std::error::Error for DecodeError {}
+
+impl RelDirExists {
+    /// The value of the rel directory keys that indicates the existence of a relation.
+    const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r");
+
+    pub fn encode(&self) -> Bytes {
+        match self {
+            Self::Exists => Self::REL_EXISTS_MARKER.clone(),
+            Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(),
+        }
+    }
+
+    pub fn decode_option(data: Option<impl AsRef<[u8]>>) -> Result<Self, DecodeError> {
+        match data {
+            Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists),
+            // Any other marker is invalid (this includes an empty buffer, which `decode` maps to Removed)
+            Some(_) => Err(DecodeError),
+            None => Ok(Self::Removed),
+        }
+    }
+
+    pub fn decode(data: impl AsRef<[u8]>) -> Result<Self, DecodeError> {
+        let data = data.as_ref();
+        if data == Self::REL_EXISTS_MARKER {
+            Ok(Self::Exists)
+        } else if data == SPARSE_TOMBSTONE_MARKER {
+            Ok(Self::Removed)
+        } else {
+            Err(DecodeError)
+        }
+    }
+}
+
+/// A tombstone in the sparse keyspace, which is an empty buffer.
+pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b"");
+
 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -110,6 +170,24 @@ impl Key {
        }
    }

+    pub fn rel_dir_sparse_key_range() -> Range<Self> {
+        Key {
+            field1: REL_DIR_KEY_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..Key {
+            field1: REL_DIR_KEY_PREFIX + 1,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }
+    }
+
    /// This function checks more extensively what keys we can take on the write path.
    /// If a key beginning with 00 does not have a global/default tablespace OID, it
    /// will be rejected on the write path.
@@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

+#[inline(always)]
+pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key {
+    Key {
+        field1: REL_DIR_KEY_PREFIX,
+        field2: spcnode,
+        field3: dbnode,
+        field4: relnode,
+        field5: forknum,
+        field6: 1,
+    }
+}
+
+pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    Key {
+        field1: REL_DIR_KEY_PREFIX,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: REL_DIR_KEY_PREFIX,
+        field2: spcnode,
+        field3: dbnode,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    } // the end bound is exclusive; excluding that last key is fine b/c we only use field6 == 1
+}
+
 #[inline(always)]
 pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
@@ -734,9 +842,9 @@ impl Key {
        self.field1 == RELATION_SIZE_PREFIX
    }

-    pub fn sparse_non_inherited_keyspace() -> Range<Key> {
+    pub const fn sparse_non_inherited_keyspace() -> Range<Key> {
        // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
-        debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
+        const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX);
        Key {
            field1: AUX_KEY_PREFIX,
            field2: 0,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1144,6 +1144,7 @@ pub struct TimelineInfo {
    /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
    /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
    /// as it is easier to reason about.
+    #[serde(default)]
    pub applied_gc_cutoff_lsn: Lsn,

    /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval.
@@ -1152,6 +1153,7 @@ pub struct TimelineInfo {
    ///
    /// Note that holders of valid LSN leases may be able to create branches and read pages earlier
    /// than this LSN, but new leases may not be taken out earlier than this LSN.
+    #[serde(default)]
    pub min_readable_lsn: Lsn,

    pub disk_consistent_lsn: Lsn,
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -9,6 +9,8 @@ use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::io::ErrorKind;
 use std::net::SocketAddr;
+use std::os::fd::AsRawFd;
+use std::os::fd::RawFd;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{ready, Poll};
@@ -268,6 +270,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
 }

 pub struct PostgresBackend<IO> {
+    pub socket_fd: RawFd,
    framed: MaybeWriteOnly<IO>,

    pub state: ProtoState,
@@ -293,9 +296,11 @@ impl PostgresBackend<tokio::net::TcpStream> {
        tls_config: Option<Arc<rustls::ServerConfig>>,
    ) -> io::Result<Self> {
        let peer_addr = socket.peer_addr()?;
+        let socket_fd = socket.as_raw_fd();
        let stream = MaybeTlsStream::Unencrypted(socket);

        Ok(Self {
+            socket_fd,
            framed: MaybeWriteOnly::Full(Framed::new(stream)),
            state: ProtoState::Initialization,
            auth_type,
@@ -307,6 +312,7 @@ impl PostgresBackend<tokio::net::TcpStream> {

 impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    pub fn new_from_io(
+        socket_fd: RawFd,
        socket: IO,
        peer_addr: SocketAddr,
        auth_type: AuthType,
@@ -315,6 +321,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let stream = MaybeTlsStream::Unencrypted(socket);

        Ok(Self {
+            socket_fd,
            framed: MaybeWriteOnly::Full(Framed::new(stream)),
            state: ProtoState::Initialization,
            auth_type,
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream;
 use crate::types::{Oid, ToSql, Type};

 use crate::{
-    prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
-    SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
+    query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
+    SimpleQueryMessage, Statement, Transaction, TransactionBuilder,
 };
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
@@ -54,18 +54,18 @@ impl Responses {
 }

 /// A cache of type info and prepared statements for fetching type info
-/// (corresponding to the queries in the [prepare] module).
+/// (corresponding to the queries in the [crate::prepare] module).
 #[derive(Default)]
 struct CachedTypeInfo {
    /// A statement for basic information for a type from its
-    /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
+    /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
    /// fallback).
    typeinfo: Option<Statement>,
    /// A statement for getting information for a composite type from its OID.
-    /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY).
+    /// Corresponds to [TYPEINFO_COMPOSITE_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
    typeinfo_composite: Option<Statement>,
    /// A statement for getting information for a composite type from its OID.
-    /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or
+    /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
    /// its fallback).
    typeinfo_enum: Option<Statement>,

@@ -190,26 +190,6 @@ impl Client {
        &self.inner
    }

-    /// Creates a new prepared statement.
-    ///
-    /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
-    /// which are set when executed. Prepared statements can only be used with the connection that created them.
-    pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
-        self.prepare_typed(query, &[]).await
-    }
-
-    /// Like `prepare`, but allows the types of query parameters to be explicitly specified.
-    ///
-    /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
-    /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
-    pub async fn prepare_typed(
-        &self,
-        query: &str,
-        parameter_types: &[Type],
-    ) -> Result<Statement, Error> {
-        prepare::prepare(&self.inner, query, parameter_types).await
-    }
-
    /// Executes a statement, returning a vector of the resulting rows.
    ///
    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
@@ -222,14 +202,11 @@ impl Client {
    /// # Panics
    ///
    /// Panics if the number of parameters provided does not match the number expected.
-    pub async fn query<T>(
+    pub async fn query(
        &self,
-        statement: &T,
+        statement: Statement,
        params: &[&(dyn ToSql + Sync)],
-    ) -> Result<Vec<Row>, Error>
-    where
-        T: ?Sized + ToStatement,
-    {
+    ) -> Result<Vec<Row>, Error> {
        self.query_raw(statement, slice_iter(params))
            .await?
            .try_collect()
@@ -250,13 +227,15 @@ impl Client {
    /// Panics if the number of parameters provided does not match the number expected.
    ///
    /// [`query`]: #method.query
-    pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
+    pub async fn query_raw<'a, I>(
+        &self,
+        statement: Statement,
+        params: I,
+    ) -> Result<RowStream, Error>
    where
-        T: ?Sized + ToStatement,
        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
        I::IntoIter: ExactSizeIterator,
    {
-        let statement = statement.__convert().into_statement(self).await?;
        query::query(&self.inner, statement, params).await
    }

@@ -271,55 +250,6 @@ impl Client {
        query::query_txt(&self.inner, statement, params).await
    }

-    /// Executes a statement, returning the number of rows modified.
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    pub async fn execute<T>(
-        &self,
-        statement: &T,
-        params: &[&(dyn ToSql + Sync)],
-    ) -> Result<u64, Error>
-    where
-        T: ?Sized + ToStatement,
-    {
-        self.execute_raw(statement, slice_iter(params)).await
-    }
-
-    /// The maximally flexible version of [`execute`].
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    ///
-    /// [`execute`]: #method.execute
-    pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
-    where
-        T: ?Sized + ToStatement,
-        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        let statement = statement.__convert().into_statement(self).await?;
-        query::execute(self.inner(), statement, params).await
-    }
-
    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
    ///
    /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -1,7 +1,8 @@
+#![allow(async_fn_in_trait)]
+
 use crate::query::RowStream;
 use crate::types::Type;
 use crate::{Client, Error, Transaction};
-use async_trait::async_trait;
 use postgres_protocol2::Oid;

 mod private {
@@ -11,7 +12,6 @@ mod private {
 /// A trait allowing abstraction over connections and transactions.
 ///
 /// This trait is "sealed", and cannot be implemented outside of this crate.
-#[async_trait]
 pub trait GenericClient: private::Sealed {
    /// Like `Client::query_raw_txt`.
    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed {

 impl private::Sealed for Client {}

-#[async_trait]
 impl GenericClient for Client {
    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
@@ -39,14 +38,12 @@ impl GenericClient for Client {

    /// Query for type information
    async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
-        self.get_type(oid).await
+        crate::prepare::get_type(self.inner(), oid).await
    }
 }

 impl private::Sealed for Transaction<'_> {}

-#[async_trait]
-#[allow(clippy::needless_lifetimes)]
 impl GenericClient for Transaction<'_> {
    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow};
 pub use crate::simple_query::SimpleQueryStream;
 pub use crate::statement::{Column, Statement};
 pub use crate::tls::NoTls;
-pub use crate::to_statement::ToStatement;
 pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
 use crate::types::ToSql;
@@ -65,7 +64,6 @@ pub mod row;
 mod simple_query;
 mod statement;
 pub mod tls;
-mod to_statement;
 mod transaction;
 mod transaction_builder;
 pub mod types;
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -1,7 +1,6 @@
 use crate::client::InnerClient;
 use crate::codec::FrontendMessage;
 use crate::connection::RequestMessages;
-use crate::error::SqlState;
 use crate::types::{Field, Kind, Oid, Type};
 use crate::{query, slice_iter};
 use crate::{Column, Error, Statement};
@@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use std::future::Future;
 use std::pin::Pin;
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;

 pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
 WHERE t.oid = $1
 ";

-// Range types weren't added until Postgres 9.2, so pg_range may not exist
-const TYPEINFO_FALLBACK_QUERY: &str = "\
-SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid
-FROM pg_catalog.pg_type t
-INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
-WHERE t.oid = $1
-";
-
 const TYPEINFO_ENUM_QUERY: &str = "\
 SELECT enumlabel
 FROM pg_catalog.pg_enum
@@ -39,14 +29,6 @@ WHERE enumtypid = $1
 ORDER BY enumsortorder
 ";

-// Postgres 9.0 didn't have enumsortorder
-const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\
-SELECT enumlabel
-FROM pg_catalog.pg_enum
-WHERE enumtypid = $1
-ORDER BY oid
-";
-
 pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
 SELECT attname, atttypid
 FROM pg_catalog.pg_attribute
@@ -56,15 +38,13 @@ AND attnum > 0
 ORDER BY attnum
 ";

-static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
-
 pub async fn prepare(
    client: &Arc<InnerClient>,
+    name: &'static str,
    query: &str,
    types: &[Type],
 ) -> Result<Statement, Error> {
-    let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst));
-    let buf = encode(client, &name, query, types)?;
+    let buf = encode(client, name, query, types)?;
    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    match responses.next().await? {
@@ -105,10 +85,11 @@ pub async fn prepare(

 fn prepare_rec<'a>(
    client: &'a Arc<InnerClient>,
+    name: &'static str,
    query: &'a str,
    types: &'a [Type],
 ) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
-    Box::pin(prepare(client, query, types))
+    Box::pin(prepare(client, name, query, types))
 }

 fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
@@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Erro
        return Ok(stmt);
    }

-    let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
-        Ok(stmt) => stmt,
-        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
-            prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
-        }
-        Err(e) => return Err(e),
-    };
+    let typeinfo = "neon_proxy_typeinfo";
+    let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;

    client.set_typeinfo(&stmt);
    Ok(stmt)
@@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement,
        return Ok(stmt);
    }

-    let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
-        Ok(stmt) => stmt,
-        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
-            prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
-        }
-        Err(e) => return Err(e),
-    };
+    let typeinfo = "neon_proxy_typeinfo_enum";
+    let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;

    client.set_typeinfo_enum(&stmt);
    Ok(stmt)
@@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<State
        return Ok(stmt);
    }

-    let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
+    let typeinfo = "neon_proxy_typeinfo_composite";
+    let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;

    client.set_typeinfo_composite(&stmt);
    Ok(stmt)
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -157,49 +157,6 @@ where
    })
 }

-pub async fn execute<'a, I>(
-    client: &InnerClient,
-    statement: Statement,
-    params: I,
-) -> Result<u64, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let buf = if log_enabled!(Level::Debug) {
-        let params = params.into_iter().collect::<Vec<_>>();
-        debug!(
-            "executing statement {} with parameters: {:?}",
-            statement.name(),
-            BorrowToSqlParamsDebug(params.as_slice()),
-        );
-        encode(client, &statement, params)?
-    } else {
-        encode(client, &statement, params)?
-    };
-    let mut responses = start(client, buf).await?;
-
-    let mut rows = 0;
-    loop {
-        match responses.next().await? {
-            Message::DataRow(_) => {}
-            Message::CommandComplete(body) => {
-                rows = body
-                    .tag()
-                    .map_err(Error::parse)?
-                    .rsplit(' ')
-                    .next()
-                    .unwrap()
-                    .parse()
-                    .unwrap_or(0);
-            }
-            Message::EmptyQueryResponse => rows = 0,
-            Message::ReadyForQuery(_) => return Ok(rows),
-            _ => return Err(Error::unexpected_message()),
-        }
-    }
-}
-
 async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

--- a/libs/proxy/tokio-postgres2/src/statement.rs
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -13,7 +13,7 @@ use std::{

 struct StatementInner {
    client: Weak<InnerClient>,
-    name: String,
+    name: &'static str,
    params: Vec<Type>,
    columns: Vec<Column>,
 }
@@ -22,7 +22,7 @@ impl Drop for StatementInner {
    fn drop(&mut self) {
        if let Some(client) = self.client.upgrade() {
            let buf = client.with_buf(|buf| {
-                frontend::close(b'S', &self.name, buf).unwrap();
+                frontend::close(b'S', self.name, buf).unwrap();
                frontend::sync(buf);
                buf.split().freeze()
            });
@@ -40,7 +40,7 @@ pub struct Statement(Arc<StatementInner>);
 impl Statement {
    pub(crate) fn new(
        inner: &Arc<InnerClient>,
-        name: String,
+        name: &'static str,
        params: Vec<Type>,
        columns: Vec<Column>,
    ) -> Statement {
@@ -55,14 +55,14 @@ impl Statement {
    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
        Statement(Arc::new(StatementInner {
            client: Weak::new(),
-            name: String::new(),
+            name: "<anonymous>",
            params,
            columns,
        }))
    }

    pub(crate) fn name(&self) -> &str {
-        &self.0.name
+        self.0.name
    }

    /// Returns the expected types of the statement's parameters.
--- a/libs/proxy/tokio-postgres2/src/to_statement.rs
+++ b/libs/proxy/tokio-postgres2/src/to_statement.rs
@@ -1,57 +0,0 @@
-use crate::to_statement::private::{Sealed, ToStatementType};
-use crate::Statement;
-
-mod private {
-    use crate::{Client, Error, Statement};
-
-    pub trait Sealed {}
-
-    pub enum ToStatementType<'a> {
-        Statement(&'a Statement),
-        Query(&'a str),
-    }
-
-    impl ToStatementType<'_> {
-        pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
-            match self {
-                ToStatementType::Statement(s) => Ok(s.clone()),
-                ToStatementType::Query(s) => client.prepare(s).await,
-            }
-        }
-    }
-}
-
-/// A trait abstracting over prepared and unprepared statements.
-///
-/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
-/// was prepared previously.
-///
-/// This trait is "sealed" and cannot be implemented by anything outside this crate.
-pub trait ToStatement: Sealed {
-    #[doc(hidden)]
-    fn __convert(&self) -> ToStatementType<'_>;
-}
-
-impl ToStatement for Statement {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Statement(self)
-    }
-}
-
-impl Sealed for Statement {}
-
-impl ToStatement for str {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Query(self)
-    }
-}
-
-impl Sealed for str {}
-
-impl ToStatement for String {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Query(self)
-    }
-}
-
-impl Sealed for String {}
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -202,7 +202,6 @@ pub struct TimelineStatus {
    pub commit_lsn: Lsn,
    pub backup_lsn: Lsn,
    pub peer_horizon_lsn: Lsn,
-    pub remote_consistent_lsn: Lsn,
    pub peers: Vec<PeerInfo>,
    pub walsenders: Vec<WalSenderState>,
    pub walreceivers: Vec<WalReceiverState>,
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -28,7 +28,7 @@ inferno.workspace = true
 fail.workspace = true
 futures = { workspace = true }
 jsonwebtoken.workspace = true
-nix.workspace = true
+nix = {workspace = true, features = [ "ioctl" ] }
 once_cell.workspace = true
 pin-project-lite.workspace = true
 regex.workspace = true
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -93,6 +93,9 @@ pub mod try_rcu;

 pub mod guard_arc_swap;

+#[cfg(target_os = "linux")]
+pub mod linux_socket_ioctl;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/linux_socket_ioctl.rs
+++ b/libs/utils/src/linux_socket_ioctl.rs
@@ -0,0 +1,35 @@
+//! Linux-specific socket ioctls.
+//!
+//! <https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27>
+
+use std::{
+    io,
+    mem::MaybeUninit,
+    os::{fd::RawFd, raw::c_int},
+};
+
+use nix::libc::{FIONREAD, TIOCOUTQ};
+
+/// Runs the ioctl `cmd` on `socket_fd`, returning the `c_int` written through the out-pointer.
+unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> {
+    let mut value: MaybeUninit<c_int> = MaybeUninit::uninit();
+    // SAFETY: `value` is only read via `assume_init` after the ioctl returned 0.
+    match nix::libc::ioctl(socket_fd, cmd, value.as_mut_ptr()) {
+        0 => Ok(value.assume_init()),
+        _ => Err(io::Error::last_os_error()),
+    }
+}
+
+/// # Safety
+///
+/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> {
+    do_ioctl(socket_fd, FIONREAD)
+}
+
+/// # Safety
+///
+/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> {
+    do_ioctl(socket_fd, TIOCOUTQ)
+}
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -5,7 +5,7 @@ use pq_proto::{read_cstr, PG_EPOCH};
 use serde::{Deserialize, Serialize};
 use tracing::{trace, warn};

-use crate::lsn::Lsn;
+use crate::{generation::Generation, lsn::Lsn};

 /// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
 ///
@@ -32,6 +32,12 @@ pub struct PageserverFeedback {
    pub replytime: SystemTime,
    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
    pub shard_number: u32,
+    /// The shard's pageserver-side generation number.
+    /// Used to track `remote_consistent_lsn` by generation which is required
+    /// to determine whether
+    /// - WAL offers still need to be sent
+    /// - in future: whether WAL can be evicted and/or pruned
+    pub generation: Generation,
 }

 impl PageserverFeedback {
@@ -43,6 +49,7 @@ impl PageserverFeedback {
            disk_consistent_lsn: Lsn::INVALID,
            replytime: *PG_EPOCH,
            shard_number: 0,
+            generation: Generation::none(),
        }
    }

@@ -101,6 +108,8 @@ impl PageserverFeedback {
            buf.put_u32(self.shard_number);
        }

+        todo!("ps_generation");
+
        buf[buf_ptr] = nkeys;
    }

@@ -147,6 +156,9 @@ impl PageserverFeedback {
                    assert_eq!(len, 4);
                    rf.shard_number = buf.get_u32();
                }
+                b"ps_generation" => {
+                    todo!();
+                }
                _ => {
                    let len = buf.get_i32();
                    warn!(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::Key;
+use pageserver_api::key::{rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::{Instant, SystemTime};
@@ -501,13 +501,9 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(
-                        src,
-                        blknum,
-                        Version::Lsn(self.lsn),
-                        self.ctx,
-                        self.io_concurrency.clone(),
-                    )
+                    // TODO: investigate using get_vectored for the entire startblk..endblk range.
+                    // But this code path is not on the critical path for most basebackups (?).
+                    .get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?;
                segment_data.extend_from_slice(&img[..]);
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
+use std::os::fd::RawFd;
 use std::pin::Pin;
 use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
@@ -129,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
        &["tenant_id", "shard_id", "timeline_id"],
        // Low resolution to reduce cardinality.
-        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
+        vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
    )
    .expect("failed to define a metric")
 });
@@ -1439,27 +1440,66 @@ impl Drop for SmgrOpTimer {
 }

 impl SmgrOpFlushInProgress {
-    pub(crate) async fn measure<Fut, O>(self, mut started_at: Instant, mut fut: Fut) -> O
+    /// The caller must guarantee that `socket_fd` outlives this function.
+    pub(crate) async fn measure<Fut, O>(
+        self,
+        started_at: Instant,
+        mut fut: Fut,
+        socket_fd: RawFd,
+    ) -> O
    where
        Fut: std::future::Future<Output = O>,
    {
        let mut fut = std::pin::pin!(fut);

-        // Whenever observe_guard gets called, or dropped,
-        // it adds the time elapsed since its last call to metrics.
-        // Last call is tracked in `now`.
+        let mut logged = false;
+        let mut last_counter_increment_at = started_at;
        let mut observe_guard = scopeguard::guard(
-            || {
+            |is_timeout| {
                let now = Instant::now();
-                let elapsed = now - started_at;
-                self.global_micros
-                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                self.per_timeline_micros
-                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                started_at = now;
+
+                // Increment counter
+                {
+                    let elapsed_since_last_observe = now - last_counter_increment_at;
+                    self.global_micros
+                        .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
+                    self.per_timeline_micros
+                        .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
+                    last_counter_increment_at = now;
+                }
+
+                // Log on every timeout, and on completion only if a timeout was logged earlier.
+                if is_timeout || logged {
+                    logged = true;
+                    let elapsed_total = now - started_at;
+                    let msg = if is_timeout {
+                        "slow flush ongoing"
+                    } else {
+                        "slow flush completed or cancelled"
+                    };
+
+                    let (inq, outq) = {
+                        // SAFETY: caller guarantees that `socket_fd` outlives this function.
+                        #[cfg(target_os = "linux")]
+                        unsafe {
+                            (
+                                utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
+                                utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
+                            )
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            _ = socket_fd; // appease unused lint on macOS
+                            (-1, -1)
+                        }
+                    };
+
+                    let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
+                    tracing::info!(elapsed_total_secs, inq, outq, msg);
+                }
            },
            |mut observe| {
-                observe();
+                observe(false);
            },
        );

@@ -1467,7 +1507,7 @@ impl SmgrOpFlushInProgress {
            match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
                Ok(v) => return v,
                Err(_timeout) => {
-                    (*observe_guard)();
+                    (*observe_guard)(true);
                }
            }
        }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -73,6 +73,7 @@ use pageserver_api::models::PageTraceEvent;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
+use std::os::fd::AsRawFd;

 /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
 /// is not yet in state [`TenantState::Active`].
@@ -236,7 +237,7 @@ pub async fn libpq_listener_main(

 type ConnectionHandlerResult = anyhow::Result<()>;

-#[instrument(skip_all, fields(peer_addr))]
+#[instrument(skip_all, fields(peer_addr, application_name))]
 #[allow(clippy::too_many_arguments)]
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
@@ -257,6 +258,8 @@ async fn page_service_conn_main(
        .set_nodelay(true)
        .context("could not set TCP_NODELAY")?;

+    let socket_fd = socket.as_raw_fd();
+
    let peer_addr = socket.peer_addr().context("get peer address")?;
    tracing::Span::current().record("peer_addr", field::display(peer_addr));

@@ -305,7 +308,7 @@ async fn page_service_conn_main(
        cancel.clone(),
        gate_guard,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;

    match pgbackend.run(&mut conn_handler, &cancel).await {
        Ok(()) => {
@@ -1286,12 +1289,15 @@ impl PageServerHandler {
            ))?;

            // what we want to do
+            let socket_fd = pgb_writer.socket_fd;
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
            let flush_fut = match flushing_timer {
-                Some(flushing_timer) => {
-                    futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut))
-                }
+                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+                    Instant::now(),
+                    flush_fut,
+                    socket_fd,
+                )),
                None => futures::future::Either::Right(flush_fut),
            };
            // do it while respecting cancellation
@@ -2457,9 +2463,16 @@ where
    fn startup(
        &mut self,
        _pgb: &mut PostgresBackend<IO>,
-        _sm: &FeStartupPacket,
+        sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        fail::fail_point!("ps::connection-start::startup-packet");
+
+        if let FeStartupPacket::StartupMessage { params, .. } = sm {
+            if let Some(app_name) = params.get("application_name") {
+                Span::current().record("application_name", field::display(app_name));
+            }
+        };
+
        Ok(())
    }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -23,13 +23,14 @@ use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
-use pageserver_api::key::Key;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
-    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
-    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range,
+    slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key,
+    twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY,
+    CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
+use pageserver_api::key::{rel_tag_sparse_key, Key};
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -490,12 +491,33 @@ impl Timeline {
        if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
            return Ok(false);
        }
-        // fetch directory listing
+
+        // Read path: first read the new reldir keyspace. Early return if the relation exists.
+        // Otherwise, read the old reldir keyspace.
+        // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
+
+        if self.get_rel_size_v2_enabled() {
+            // fetch directory listing (new)
+            let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
+            let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
+                .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            let exists_v2 = buf == RelDirExists::Exists;
+            // Fast path: if the relation exists in the new format, return true.
+            // TODO: we should have a verification mode that checks both keyspaces
+            // to ensure the relation only exists in one of them.
+            if exists_v2 {
+                return Ok(true);
+            }
+        }
+
+        // fetch directory listing (old)
+
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
        let buf = version.get(self, key, ctx).await?;

        let dir = RelDirectory::des(&buf)?;
-        Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
+        let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum));
+        Ok(exists_v1)
    }

    /// Get a list of all existing relations in given tablespace and database.
@@ -513,12 +535,12 @@ impl Timeline {
        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<HashSet<RelTag>, PageReconstructError> {
-        // fetch directory listing
+        // fetch directory listing (old)
        let key = rel_dir_to_key(spcnode, dbnode);
        let buf = version.get(self, key, ctx).await?;

        let dir = RelDirectory::des(&buf)?;
-        let rels: HashSet<RelTag> =
+        let rels_v1: HashSet<RelTag> =
            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
                spcnode,
                dbnode,
@@ -526,6 +548,46 @@ impl Timeline {
                forknum: *forknum,
            }));

+        if !self.get_rel_size_v2_enabled() {
+            return Ok(rels_v1);
+        }
+
+        // scan directory listing (new), merge with the old results
+        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+        let results = self
+            .scan(
+                KeySpace::single(key_range),
+                version.get_lsn(),
+                ctx,
+                io_concurrency,
+            )
+            .await?;
+        let mut rels = rels_v1;
+        for (key, val) in results {
+            let val = RelDirExists::decode(&val?)
+                .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            assert_eq!(key.field6, 1);
+            assert_eq!(key.field2, spcnode);
+            assert_eq!(key.field3, dbnode);
+            let tag = RelTag {
+                spcnode,
+                dbnode,
+                relnode: key.field4,
+                forknum: key.field5,
+            };
+            if val == RelDirExists::Removed {
+                debug_assert!(!rels.contains(&tag), "removed reltag in v2");
+                continue;
+            }
+            let did_not_contain = rels.insert(tag);
+            debug_assert!(did_not_contain, "duplicate reltag in v2");
+        }
        Ok(rels)
    }

@@ -1144,7 +1206,11 @@ impl Timeline {

        let dense_keyspace = result.to_keyspace();
        let sparse_keyspace = SparseKeySpace(KeySpace {
-            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+            ranges: vec![
+                Key::metadata_aux_key_range(),
+                repl_origin_key_range(),
+                Key::rel_dir_sparse_key_range(),
+            ],
        });

        if cfg!(debug_assertions) {
@@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> {

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
-    pending_directory_entries: Vec<(DirectoryKind, usize)>,
+    pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,

    /// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
    pending_metadata_bytes: usize,
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MetricsUpdate {
+    /// Set the metrics to this value
+    Set(u64),
+    /// Increment the metrics by this value
+    Add(u64),
+    /// Decrement the metrics by this value
+    Sub(u64),
+}
+
 impl DatadirModification<'_> {
    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
@@ -1359,7 +1435,8 @@ impl DatadirModification<'_> {
        let buf = DbDirectory::ser(&DbDirectory {
            dbdirs: HashMap::new(),
        })?;
-        self.pending_directory_entries.push((DirectoryKind::Db, 0));
+        self.pending_directory_entries
+            .push((DirectoryKind::Db, MetricsUpdate::Set(0)));
        self.put(DBDIR_KEY, Value::Image(buf.into()));

        let buf = if self.tline.pg_version >= 17 {
@@ -1372,7 +1449,7 @@ impl DatadirModification<'_> {
            })
        }?;
        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, 0));
+            .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0)));
        self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));

        let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
@@ -1382,17 +1459,23 @@ impl DatadirModification<'_> {
        // harmless but they'd just be dropped on later compaction.
        if self.tline.tenant_shard_id.is_shard_zero() {
            self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
-            self.pending_directory_entries
-                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.pending_directory_entries.push((
+                DirectoryKind::SlruSegment(SlruKind::Clog),
+                MetricsUpdate::Set(0),
+            ));
            self.put(
                slru_dir_to_key(SlruKind::MultiXactMembers),
                empty_dir.clone(),
            );
-            self.pending_directory_entries
-                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.pending_directory_entries.push((
+                DirectoryKind::SlruSegment(SlruKind::MultiXactMembers),
+                MetricsUpdate::Set(0),
+            ));
            self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
-            self.pending_directory_entries
-                .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
+            self.pending_directory_entries.push((
+                DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets),
+                MetricsUpdate::Set(0),
+            ));
        }

        Ok(())
@@ -1658,10 +1741,16 @@ impl DatadirModification<'_> {
        }
        if r.is_none() {
            // Create RelDirectory
+            // TODO: if we have fully migrated to v2, no need to create this directory
            let buf = RelDirectory::ser(&RelDirectory {
                rels: HashSet::new(),
            })?;
-            self.pending_directory_entries.push((DirectoryKind::Rel, 0));
+            self.pending_directory_entries
+                .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
+            if self.tline.get_rel_size_v2_enabled() {
+                self.pending_directory_entries
+                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
+            }
            self.put(
                rel_dir_to_key(spcnode, dbnode),
                Value::Image(Bytes::from(buf)),
@@ -1685,8 +1774,10 @@ impl DatadirModification<'_> {
            if !dir.xids.insert(xid) {
                anyhow::bail!("twophase file for xid {} already exists", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
            let xid = xid as u32;
@@ -1694,8 +1785,10 @@ impl DatadirModification<'_> {
            if !dir.xids.insert(xid) {
                anyhow::bail!("twophase file for xid {} already exists", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
        };
        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -1744,8 +1837,10 @@ impl DatadirModification<'_> {
        let mut dir = DbDirectory::des(&buf)?;
        if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
            let buf = DbDirectory::ser(&dir)?;
-            self.pending_directory_entries
-                .push((DirectoryKind::Db, dir.dbdirs.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::Db,
+                MetricsUpdate::Set(dir.dbdirs.len() as u64),
+            ));
            self.put(DBDIR_KEY, Value::Image(buf.into()));
        } else {
            warn!(
@@ -1778,39 +1873,85 @@ impl DatadirModification<'_> {
        // tablespace.  Create the reldir entry for it if so.
        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
            .context("deserialize db")?;
-        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let mut rel_dir =
+
+        let dbdir_exists =
            if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                // Didn't exist. Update dbdir
                e.insert(false);
                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
-                self.pending_directory_entries
-                    .push((DirectoryKind::Db, dbdir.dbdirs.len()));
+                self.pending_directory_entries.push((
+                    DirectoryKind::Db,
+                    MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
+                ));
                self.put(DBDIR_KEY, Value::Image(buf.into()));
-
-                // and create the RelDirectory
-                RelDirectory::default()
+                false
            } else {
-                // reldir already exists, fetch it
-                RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                    .context("deserialize db")?
+                true
            };

+        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
+        let mut rel_dir = if !dbdir_exists {
+            // Create the RelDirectory
+            RelDirectory::default()
+        } else {
+            // reldir already exists, fetch it
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
+                .context("deserialize db")?
+        };
+
        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
            return Err(RelationError::AlreadyExists);
        }

-        self.pending_directory_entries
-            .push((DirectoryKind::Rel, rel_dir.rels.len()));
-
-        self.put(
-            rel_dir_key,
-            Value::Image(Bytes::from(
-                RelDirectory::ser(&rel_dir).context("serialize")?,
-            )),
-        );
-
+        if self.tline.get_rel_size_v2_enabled() {
+            let sparse_rel_dir_key =
+                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
+            // check if the rel_dir_key exists in v2
+            let val = self
+                .sparse_get(sparse_rel_dir_key, ctx)
+                .await
+                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = RelDirExists::decode_option(val)
+                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            if val == RelDirExists::Exists {
+                return Err(RelationError::AlreadyExists);
+            }
+            self.put(
+                sparse_rel_dir_key,
+                Value::Image(RelDirExists::Exists.encode()),
+            );
+            if !dbdir_exists {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
+                self.pending_directory_entries
+                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
+                // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
+                // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
+                // will be key not found errors if we don't create an empty one for rel_size_v2.
+                self.put(
+                    rel_dir_key,
+                    Value::Image(Bytes::from(
+                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
+                    )),
+                );
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
+        } else {
+            if !dbdir_exists {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
+            self.put(
+                rel_dir_key,
+                Value::Image(Bytes::from(
+                    RelDirectory::ser(&rel_dir).context("serialize")?,
+                )),
+            );
+        }
        // Put size
        let size_key = rel_size_to_key(rel);
        let buf = nblocks.to_le_bytes();
@@ -1896,9 +2037,34 @@ impl DatadirModification<'_> {

            let mut dirty = false;
            for rel_tag in rel_tags {
-                if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                    self.pending_directory_entries
+                        .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                    dirty = true;
+                    true
+                } else if self.tline.get_rel_size_v2_enabled() {
+                    // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
+                    // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
+                    // logic).
+                    let key =
+                        rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
+                    let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
+                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                    if val == RelDirExists::Exists {
+                        self.pending_directory_entries
+                            .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
+                        // put tombstone
+                        self.put(key, Value::Image(RelDirExists::Removed.encode()));
+                        // no need to set dirty to true
+                        true
+                    } else {
+                        false
+                    }
+                } else {
+                    false
+                };

+                if found {
                    // update logical size
                    let size_key = rel_size_to_key(rel_tag);
                    let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1914,8 +2080,6 @@ impl DatadirModification<'_> {

            if dirty {
                self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-                self.pending_directory_entries
-                    .push((DirectoryKind::Rel, dir.rels.len()));
            }
        }

@@ -1939,8 +2103,10 @@ impl DatadirModification<'_> {
        if !dir.segments.insert(segno) {
            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
        }
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
+        self.pending_directory_entries.push((
+            DirectoryKind::SlruSegment(kind),
+            MetricsUpdate::Set(dir.segments.len() as u64),
+        ));
        self.put(
            dir_key,
            Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1987,8 +2153,10 @@ impl DatadirModification<'_> {
        if !dir.segments.remove(&segno) {
            warn!("slru segment {:?}/{} does not exist", kind, segno);
        }
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
+        self.pending_directory_entries.push((
+            DirectoryKind::SlruSegment(kind),
+            MetricsUpdate::Set(dir.segments.len() as u64),
+        ));
        self.put(
            dir_key,
            Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -2020,8 +2188,10 @@ impl DatadirModification<'_> {
            if !dir.xids.remove(&xid) {
                warn!("twophase file for xid {} does not exist", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
            let xid: u32 = u32::try_from(xid)?;
@@ -2030,8 +2200,10 @@ impl DatadirModification<'_> {
            if !dir.xids.remove(&xid) {
                warn!("twophase file for xid {} does not exist", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
        };
        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -2147,7 +2319,7 @@ impl DatadirModification<'_> {
        }

        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
-            writer.update_directory_entries_count(kind, count as u64);
+            writer.update_directory_entries_count(kind, count);
        }

        Ok(())
@@ -2233,7 +2405,7 @@ impl DatadirModification<'_> {
        }

        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
-            writer.update_directory_entries_count(kind, count as u64);
+            writer.update_directory_entries_count(kind, count);
        }

        self.pending_metadata_bytes = 0;
@@ -2297,6 +2469,22 @@ impl DatadirModification<'_> {
        self.tline.get(key, lsn, ctx).await
    }

+    /// Get a key from the sparse keyspace. Automatically converts the missing key error
+    /// and the empty value into None.
+    async fn sparse_get(
+        &self,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, PageReconstructError> {
+        let val = self.get(key, ctx).await;
+        match val {
+            Ok(val) if val.is_empty() => Ok(None),
+            Ok(val) => Ok(Some(val)),
+            Err(PageReconstructError::MissingKey(_)) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
    fn put(&mut self, key: Key, val: Value) {
        if Self::is_data_key(&key) {
            self.put_data(key.to_compact(), val)
@@ -2379,6 +2567,23 @@ impl Version<'_> {
        }
    }

+    /// Get a key from the sparse keyspace. Automatically converts the missing key error
+    /// and the empty value into None.
+    async fn sparse_get(
+        &self,
+        timeline: &Timeline,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, PageReconstructError> {
+        let val = self.get(timeline, key, ctx).await;
+        match val {
+            Ok(val) if val.is_empty() => Ok(None),
+            Ok(val) => Ok(Some(val)),
+            Err(PageReconstructError::MissingKey(_)) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
    fn get_lsn(&self) -> Lsn {
        match self {
            Version::Lsn(lsn) => *lsn,
@@ -2438,6 +2643,7 @@ pub(crate) enum DirectoryKind {
    Rel,
    AuxFiles,
    SlruSegment(SlruKind),
+    RelV2,
 }

 impl DirectoryKind {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3924,6 +3924,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    pub fn get_rel_size_v2_enabled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .rel_size_v2_enabled
+            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
+    }
+
    pub fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -5640,7 +5647,7 @@ pub(crate) mod harness {
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                timeline_offloading: Some(tenant_conf.timeline_offloading),
                wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
-                rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
+                rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled),
                gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
                gc_compaction_initial_threshold_kb: Some(
                    tenant_conf.gc_compaction_initial_threshold_kb,
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -485,7 +485,9 @@ impl TenantConfOpt {
            wal_receiver_protocol_override: self
                .wal_receiver_protocol_override
                .or(global_conf.wal_receiver_protocol_override),
-            rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
+            rel_size_v2_enabled: self
+                .rel_size_v2_enabled
+                .unwrap_or(global_conf.rel_size_v2_enabled),
            gc_compaction_enabled: self
                .gc_compaction_enabled
                .unwrap_or(global_conf.gc_compaction_enabled),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -117,7 +117,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
-use crate::pgdatadir_mapping::CalculateLogicalSizeError;
+use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate};
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;
@@ -327,6 +327,7 @@ pub struct Timeline {
    // in `crate::page_service` writes these metrics.
    pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,

+    directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM],
    directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],

    /// Ensures layers aren't frozen by checkpointer between
@@ -2355,6 +2356,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .rel_size_v2_enabled
+            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
+    }
+
    fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2664,6 +2673,7 @@ impl Timeline {
                ),

                directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
+                directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)),

                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

@@ -3430,8 +3440,42 @@ impl Timeline {
        }
    }

-    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
-        self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
+        // TODO: these directory metrics are not correct -- there can be multiple reldirs in the system,
+        // one for each database, but we only store one value, and therefore each pgdirmodification
+        // would overwrite the previous value if they modify different databases.
+
+        match count {
+            MetricsUpdate::Set(count) => {
+                self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+                self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed);
+            }
+            MetricsUpdate::Add(count) => {
+                // TODO: these operations are not atomic; but we only have one writer to the metrics, so
+                // it's fine.
+                if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
+                    // The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub
+                    // the value reliably.
+                    self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed);
+                }
+                // Otherwise, ignore this update
+            }
+            MetricsUpdate::Sub(count) => {
+                // TODO: these operations are not atomic; but we only have one writer to the metrics, so
+                // it's fine.
+                if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
+                    // The metrics has been initialized with `MetricsUpdate::Set` before.
+                    // The operation could overflow so we need to normalize the value.
+                    let prev_val =
+                        self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed);
+                    let res = prev_val.saturating_sub(count);
+                    self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed);
+                }
+                // Otherwise, ignore this update
+            }
+        };
+
+        // TODO: remove this, there's no place in the code that updates this aux metric.
        let aux_metric =
            self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);

@@ -3649,7 +3693,9 @@ impl Timeline {
            // space. If that's not the case, we had at least one key encounter a gap in the image layer
            // and stop the search as a result of that.
            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
-            // Do not fire missing key error for sparse keys.
+            // Do not fire missing key error and end early for sparse keys. Note that we have already removed
+            // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
+            // figuring out what is the inherited key range and doing a fine-grained pruning.
            removed.remove_overlapping_with(&KeySpace {
                ranges: vec![SPARSE_RANGE],
            });
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -7,7 +7,9 @@ use super::Timeline;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
 use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
-use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
+use crate::tenant::{
+    DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
+};

 #[derive(thiserror::Error, Debug)]
 pub(crate) enum OffloadError {
@@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline(
    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");

-    let (timeline, guard) = make_timeline_delete_guard(
+    let delete_guard_res = make_timeline_delete_guard(
        tenant,
        timeline.timeline_id,
        TimelineDeleteGuardKind::Offload,
-    )
-    .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
+    );
+    if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res {
+        let is_archived = timeline.is_archived();
+        if is_archived == Some(true) {
+            tracing::error!("timeline is archived but has non-archived children: {children:?}");
+            return Err(OffloadError::NotArchived);
+        }
+        tracing::info!(
+            ?is_archived,
+            "timeline is not archived and has unarchived children"
+        );
+        return Err(OffloadError::NotArchived);
+    };
+    let (timeline, guard) =
+        delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
        tracing::error!("timeline already offloaded, but given timeline object");
--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -18,6 +18,8 @@
 #include "neon_utils.h"

 static int	extension_server_port = 0;
+static int	extension_server_request_timeout = 60;
+static int	extension_server_connect_timeout = 60;

 static download_extension_file_hook_type prev_download_extension_file_hook = NULL;

@@ -34,19 +36,18 @@ static download_extension_file_hook_type prev_download_extension_file_hook = NUL
 static bool
 neon_download_extension_file_http(const char *filename, bool is_library)
 {
-	static CURL	   *handle = NULL;
-
 	CURLcode	res;
-	char	   *compute_ctl_url;
 	bool		ret = false;
+	CURL	   *handle = NULL;
+	char	   *compute_ctl_url;

-	if (handle == NULL)
-	{
-		handle = alloc_curl_handle();
+	handle = alloc_curl_handle();

-		curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
-		curl_easy_setopt(handle, CURLOPT_TIMEOUT, 60L /* seconds */ );
-	}
+	curl_easy_setopt(handle, CURLOPT_CUSTOMREQUEST, "POST");
+	if (extension_server_request_timeout > 0)
+		curl_easy_setopt(handle, CURLOPT_TIMEOUT, (long)extension_server_request_timeout /* seconds */ );
+	if (extension_server_connect_timeout > 0)
+		curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, (long)extension_server_connect_timeout /* seconds */ );

 	compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
 							   extension_server_port, filename, is_library ? "?is_library=true" : "");
@@ -57,6 +58,8 @@ neon_download_extension_file_http(const char *filename, bool is_library)

 	/* Perform the request, res will get the return code */
 	res = curl_easy_perform(handle);
+	curl_easy_cleanup(handle);
+
 	/* Check for errors */
 	if (res == CURLE_OK)
 	{
@@ -88,6 +91,24 @@ pg_init_extension_server()
 							0,	/* no flags required */
 							NULL, NULL, NULL);

+	DefineCustomIntVariable("neon.extension_server_request_timeout",
+							"timeout for fetching extensions in seconds",
+							NULL,
+							&extension_server_request_timeout,
+							60, 0, INT_MAX,
+							PGC_SUSET,
+							GUC_UNIT_S,
+							NULL, NULL, NULL);
+
+	DefineCustomIntVariable("neon.extension_server_connect_timeout",
+							"timeout for connecting to the extension server in seconds",
+							NULL,
+							&extension_server_connect_timeout,
+							60, 0, INT_MAX,
+							PGC_SUSET,
+							GUC_UNIT_S,
+							NULL, NULL, NULL);
+
 	/* set download_extension_file_hook */
 	prev_download_extension_file_hook = download_extension_file_hook;
 	download_extension_file_hook = neon_download_extension_file_http;
--- a/pgxn/neon/hll.c
+++ b/pgxn/neon/hll.c
@@ -122,8 +122,8 @@ addSHLL(HyperLogLogState *cState, uint32 hash)
 	index = hash >> HLL_C_BITS;

 	/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
-	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS);
-
+	count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS) - 1;
+	Assert(count <= HLL_C_BITS);
 	cState->regs[index][count] = now;
 }

@@ -136,7 +136,7 @@ getMaximum(const TimestampTz* reg, TimestampTz since)
 	{
 		if (reg[i] >= since)
 		{
-			max = i;
+			max = i + 1;
 		}
 	}

--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -378,8 +378,9 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	{
 	case PS_Disconnected:
 	{
-		const char *keywords[3];
-		const char *values[3];
+		const char *keywords[4];
+		const char *values[4];
+		char pid_str[16];
 		int			n_pgsql_params;
 		TimestampTz	now;
 		int64		us_since_last_attempt;
@@ -424,14 +425,30 @@ pageserver_connect(shardno_t shard_no, int elevel)
 		 * can override the password from the env variable. Seems useful, although
 		 * we don't currently use that capability anywhere.
 		 */
-		keywords[0] = "dbname";
-		values[0] = connstr;
-		n_pgsql_params = 1;
+		n_pgsql_params = 0;
+
+		/*
+		 * Pageserver logs include this in the connection's tracing span.
+		 * This allows for easier log correlation between compute and pageserver.
+		 */
+		keywords[n_pgsql_params] = "application_name";
+		{
+			int ret = snprintf(pid_str, sizeof(pid_str), "%d", MyProcPid);
+			if (ret < 0 || ret >= (int)(sizeof(pid_str)))
+				elog(FATAL, "stack-allocated buffer too small to hold pid");
+		}
+		/* lifetime: PQconnectStartParams strdups internally */
+		values[n_pgsql_params] = (const char*) pid_str;
+		n_pgsql_params++;
+
+		keywords[n_pgsql_params] = "dbname";
+		values[n_pgsql_params] = connstr;
+		n_pgsql_params++;

 		if (neon_auth_token)
 		{
-			keywords[1] = "password";
-			values[1] = neon_auth_token;
+			keywords[n_pgsql_params] = "password";
+			values[n_pgsql_params] = neon_auth_token;
 			n_pgsql_params++;
 		}

--- a/poetry.lock
+++ b/poetry.lock
@@ -412,6 +412,7 @@ files = [

 [package.dependencies]
 botocore-stubs = "*"
+mypy-boto3-kms = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"kms\""}
 mypy-boto3-s3 = {version = ">=1.26.0,<1.27.0", optional = true, markers = "extra == \"s3\""}
 types-s3transfer = "*"
 typing-extensions = ">=4.1.0"
@@ -2022,6 +2023,18 @@ install-types = ["pip"]
 mypyc = ["setuptools (>=50)"]
 reports = ["lxml"]

+[[package]]
+name = "mypy-boto3-kms"
+version = "1.26.147"
+description = "Type annotations for boto3.KMS 1.26.147 service generated with mypy-boto3-builder 7.14.5"
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "mypy-boto3-kms-1.26.147.tar.gz", hash = "sha256:816a4d1bb0585e1b9620a3f96c1d69a06f53b7b5621858579dd77c60dbb5fa5c"},
+    {file = "mypy_boto3_kms-1.26.147-py3-none-any.whl", hash = "sha256:493f0db674a25c88769f5cb8ab8ac00d3dda5dfc903d5cda34c990ee64689f79"},
+]
+
 [[package]]
 name = "mypy-boto3-s3"
 version = "1.26.0.post1"
@@ -2758,18 +2771,18 @@ pytest = ">=5,<8"

 [[package]]
 name = "pytest-timeout"
-version = "2.1.0"
+version = "2.3.1"
 description = "pytest plugin to abort hanging tests"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 groups = ["main"]
 files = [
-    {file = "pytest-timeout-2.1.0.tar.gz", hash = "sha256:c07ca07404c612f8abbe22294b23c368e2e5104b521c1790195561f37e1ac3d9"},
-    {file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"},
+    {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"},
+    {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"},
 ]

 [package.dependencies]
-pytest = ">=5.0.0"
+pytest = ">=7.0.0"

 [[package]]
 name = "pytest-xdist"
@@ -3807,4 +3820,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "4dc3165fe22c0e0f7a030ea0f8a680ae2ff74561d8658c393abbe9112caaf5d7"
+content-hash = "00ddc42c32e235b6171845fc066dcab078282ed832cd464d5e8a0afa959dd04a"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -19,7 +19,6 @@ aws-config.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
 base64.workspace = true
-boxcar = "0.2.8"
 bstr.workspace = true
 bytes = { workspace = true, features = ["serde"] }
 camino.workspace = true
@@ -63,7 +62,6 @@ postgres_backend.workspace = true
 postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" }
 postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" }
 pq_proto.workspace = true
-prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
@@ -81,7 +79,6 @@ sha2 = { workspace = true, features = ["asm", "oid"] }
 smol_str.workspace = true
 smallvec.workspace = true
 socket2.workspace = true
-strum.workspace = true
 strum_macros.workspace = true
 subtle.workspace = true
 thiserror.workspace = true
@@ -95,7 +92,6 @@ tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 tracing.workspace = true
 tracing-log.workspace = true
-tracing-serde.workspace = true
 tracing-opentelemetry.workspace = true
 try-lock.workspace = true
 typed-json.workspace = true
--- a/proxy/src/auth/backend/console_redirect.rs
+++ b/proxy/src/auth/backend/console_redirect.rs
@@ -140,9 +140,8 @@ async fn authenticate(
    let (psql_session_id, waiter) = loop {
        let psql_session_id = new_psql_session_id();

-        match control_plane::mgmt::get_waiter(&psql_session_id) {
-            Ok(waiter) => break (psql_session_id, waiter),
-            Err(_e) => continue,
+        if let Ok(waiter) = control_plane::mgmt::get_waiter(&psql_session_id) {
+            break (psql_session_id, waiter);
        }
    };

--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -220,11 +220,11 @@ async fn fetch_jwks(
 }

 impl JwkCacheEntryLock {
-    async fn acquire_permit<'a>(self: &'a Arc<Self>) -> JwkRenewalPermit<'a> {
+    async fn acquire_permit(self: &Arc<Self>) -> JwkRenewalPermit<'_> {
        JwkRenewalPermit::acquire_permit(self).await
    }

-    fn try_acquire_permit<'a>(self: &'a Arc<Self>) -> Option<JwkRenewalPermit<'a>> {
+    fn try_acquire_permit(self: &Arc<Self>) -> Option<JwkRenewalPermit<'_>> {
        JwkRenewalPermit::try_acquire_permit(self)
    }

@@ -393,7 +393,7 @@ impl JwkCacheEntryLock {
                verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?;
            }
            key => return Err(JwtError::UnsupportedKeyType(key.into())),
-        };
+        }

        tracing::debug!(?payload, "JWT signature valid with claims");

@@ -510,7 +510,7 @@ fn verify_rsa_signature(
            key.verify(data, &sig)?;
        }
        _ => return Err(JwtError::InvalidRsaSigningAlgorithm),
-    };
+    }

    Ok(())
 }
--- a/proxy/src/binary/local_proxy.rs
+++ b/proxy/src/binary/local_proxy.rs
@@ -4,6 +4,20 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;

+use anyhow::{bail, ensure, Context};
+use camino::{Utf8Path, Utf8PathBuf};
+use clap::Parser;
+use compute_api::spec::LocalProxySpec;
+use futures::future::Either;
+use thiserror::Error;
+use tokio::net::TcpListener;
+use tokio::sync::Notify;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error, info, warn};
+use utils::sentry_init::init_sentry;
+use utils::{pid_file, project_build_tag, project_git_version};
+
 use crate::auth::backend::jwt::JwkCache;
 use crate::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP};
 use crate::auth::{self};
@@ -25,24 +39,10 @@ use crate::serverless::{self, GlobalConnPoolOptions};
 use crate::tls::client_config::compute_client_config_with_root_certs;
 use crate::types::RoleName;
 use crate::url::ApiUrl;
-use anyhow::{bail, ensure, Context};
-use camino::{Utf8Path, Utf8PathBuf};
-use compute_api::spec::LocalProxySpec;
-use futures::future::Either;

 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);

-use clap::Parser;
-use thiserror::Error;
-use tokio::net::TcpListener;
-use tokio::sync::Notify;
-use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, warn};
-use utils::sentry_init::init_sentry;
-use utils::{pid_file, project_build_tag, project_git_version};
-
 /// Neon proxy/router
 #[derive(Parser)]
 #[command(version = GIT_VERSION, about)]
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -5,12 +5,6 @@
 /// the outside. Similar to an ingress controller for HTTPS.
 use std::{net::SocketAddr, sync::Arc};

-use crate::context::RequestContext;
-use crate::metrics::{Metrics, ThreadPoolMetrics};
-use crate::protocol2::ConnectionInfo;
-use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
-use crate::stream::{PqStream, Stream};
-use crate::tls::TlsServerEndPoint;
 use anyhow::{anyhow, bail, ensure, Context};
 use clap::Arg;
 use futures::future::Either;
@@ -25,6 +19,13 @@ use tracing::{error, info, Instrument};
 use utils::project_git_version;
 use utils::sentry_init::init_sentry;

+use crate::context::RequestContext;
+use crate::metrics::{Metrics, ThreadPoolMetrics};
+use crate::protocol2::ConnectionInfo;
+use crate::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource};
+use crate::stream::{PqStream, Stream};
+use crate::tls::TlsServerEndPoint;
+
 project_git_version!(GIT_VERSION);

 fn cli() -> clap::Command {
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -3,6 +3,16 @@ use std::pin::pin;
 use std::sync::Arc;
 use std::time::Duration;

+use anyhow::bail;
+use futures::future::Either;
+use remote_storage::RemoteStorageConfig;
+use tokio::net::TcpListener;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use tracing::{info, warn, Instrument};
+use utils::sentry_init::init_sentry;
+use utils::{project_build_tag, project_git_version};
+
 use crate::auth::backend::jwt::JwkCache;
 use crate::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned};
 use crate::cancellation::{handle_cancel_messages, CancellationHandler};
@@ -24,15 +34,6 @@ use crate::serverless::cancel_set::CancelSet;
 use crate::serverless::GlobalConnPoolOptions;
 use crate::tls::client_config::compute_client_config_with_root_certs;
 use crate::{auth, control_plane, http, serverless, usage_metrics};
-use anyhow::bail;
-use futures::future::Either;
-use remote_storage::RemoteStorageConfig;
-use tokio::net::TcpListener;
-use tokio::task::JoinSet;
-use tokio_util::sync::CancellationToken;
-use tracing::{info, warn, Instrument};
-use utils::sentry_init::init_sentry;
-use utils::{project_build_tag, project_git_version};

 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
@@ -303,7 +304,7 @@ pub async fn run() -> anyhow::Result<()> {
    match auth_backend {
        Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"),
        Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"),
-    };
+    }
    info!("Using region: {}", args.aws_region);

    // TODO: untangle the config args
@@ -803,9 +804,10 @@ fn build_auth_backend(
 mod tests {
    use std::time::Duration;

-    use crate::rate_limiter::RateBucketInfo;
    use clap::Parser;

+    use crate::rate_limiter::RateBucketInfo;
+
    #[test]
    fn parse_endpoint_rps_limit() {
        let config = super::ProxyCliArgs::parse_from([
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -242,7 +242,7 @@ impl EndpointsCache {
                            });
                            tracing::error!("error parsing value {value:?}: {err:?}");
                        }
-                    };
+                    }
                }
                if total.is_power_of_two() {
                    tracing::debug!("endpoints read {}", total);
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -137,8 +137,8 @@ impl ConnCfg {
            match k {
                // Only set `user` if it's not present in the config.
                // Console redirect auth flow takes username from the console's response.
-                "user" if self.user_is_set() => continue,
-                "database" if self.db_is_set() => continue,
+                "user" if self.user_is_set() => {}
+                "database" if self.db_is_set() => {}
                "options" => {
                    if let Some(options) = filtered_options(v) {
                        self.set_param(k, &options);
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -82,7 +82,7 @@ pub async fn task_main(
                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
                    return;
                }
-            };
+            }

            let ctx = RequestContext::new(
                session_id,
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -19,8 +19,7 @@ use crate::cache::{Cached, TimedLru};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo};
-use crate::intern::AccountIdInt;
-use crate::intern::ProjectIdInt;
+use crate::intern::{AccountIdInt, ProjectIdInt};
 use crate::types::{EndpointCacheKey, EndpointId};
 use crate::{compute, scram};

--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -7,9 +7,8 @@ use chrono::{DateTime, Utc};
 use opentelemetry::trace::TraceContextExt;
 use scopeguard::defer;
 use serde::ser::{SerializeMap, Serializer};
-use tracing::span;
 use tracing::subscriber::Interest;
-use tracing::{callsite, Event, Metadata, Span, Subscriber};
+use tracing::{callsite, span, Event, Metadata, Span, Subscriber};
 use tracing_opentelemetry::OpenTelemetrySpanExt;
 use tracing_subscriber::filter::{EnvFilter, LevelFilter};
 use tracing_subscriber::fmt::format::{Format, Full};
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -119,7 +119,7 @@ pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
        // if no more bytes available then exit
        if bytes_read == 0 {
            return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
-        };
+        }

        // check if we have enough bytes to continue
        if let Some(header) = buf.try_get::<ProxyProtocolV2Header>() {
@@ -169,7 +169,7 @@ fn process_proxy_payload(
                header.version_and_command
            ),
        )),
-    };
+    }

    let size_err =
        "invalid proxy protocol length. payload not large enough to fit requested IP addresses";
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -198,7 +198,7 @@ where

                warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT);
            }
-        };
+        }

        let wait_duration = retry_after(num_retries, compute.retry);
        num_retries += 1;
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -118,7 +118,7 @@ pub async fn task_main(
                    error!("per-client task finished with an error: failed to set socket option: {e:#}");
                    return;
                }
-            };
+            }

            let ctx = RequestContext::new(
                session_id,
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -169,7 +169,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                        });
                        tracing::error!("broken message: {e}");
                    }
-                };
+                }
                return Ok(());
            }
            Ok(msg) => msg,
@@ -180,7 +180,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
                match serde_json::from_str::<NotificationHeader>(&payload) {
                    Ok(header) => tracing::error!(topic = header.topic, "broken message: {e}"),
                    Err(_) => tracing::error!("broken message: {e}"),
-                };
+                }
                return Ok(());
            }
        };
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -372,7 +372,7 @@ impl PoolingBackend {
            debug!("setting up backend session state");

            // initiates the auth session
-            if let Err(e) = client.execute("select auth.init()", &[]).await {
+            if let Err(e) = client.batch_execute("select auth.init();").await {
                discard.discard();
                return Err(e.into());
            }
@@ -651,7 +651,7 @@ async fn connect_http2(
                    e,
                )));
            }
-        };
+        }
    };

    let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new())
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -23,7 +23,6 @@ use indexmap::IndexMap;
 use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
 use parking_lot::RwLock;
 use postgres_client::tls::NoTlsStream;
-use postgres_client::types::ToSql;
 use postgres_client::AsyncMessage;
 use serde_json::value::RawValue;
 use tokio::net::TcpStream;
@@ -281,13 +280,9 @@ impl ClientInnerCommon<postgres_client::Client> {
            let token = resign_jwt(&local_data.key, payload, local_data.jti)?;

            // initiates the auth session
-            self.inner.batch_execute("discard all").await?;
-            self.inner
-                .execute(
-                    "select auth.jwt_session_init($1)",
-                    &[&&*token as &(dyn ToSql + Sync)],
-                )
-                .await?;
+            // this is safe from query injection as the JWT format is free of any escape characters.
+            let query = format!("discard all; select auth.jwt_session_init('{token}')");
+            self.inner.batch_execute(&query).await?;

            let pid = self.inner.get_process_id();
            info!(pid, jti = local_data.jti, "user session state init");
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,12 +17,12 @@ Jinja2 = "^3.1.5"
 types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.20241019"
 boto3 = "^1.34.11"
-boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
+boto3-stubs = {extras = ["s3", "kms"], version = "^1.26.16"}
 moto = {extras = ["server"], version = "^5.0.6"}
 backoff = "^2.2.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
-pytest-timeout = "^2.1.0"
+pytest-timeout = "^2.3.1"
 Werkzeug = "^3.0.6"
 pytest-order = "^1.1.0"
 allure-pytest = "^2.13.2"
--- a/pytest.ini
+++ b/pytest.ini
@@ -11,7 +11,7 @@ markers =
 testpaths =
    test_runner
 minversion = 6.0
-log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
+log_format = %(asctime)s.%(msecs)03d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
 log_date_format = %Y-%m-%d %H:%M:%S
 log_cli = true
 timeout = 300
--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -266,7 +266,6 @@ components:
        - flush_lsn
        - commit_lsn
        - backup_lsn
-        - remote_consistent_lsn
        - peer_horizon_lsn
        - safekeeper_connstr
      properties:
@@ -279,8 +278,6 @@ components:
          type: string
        backup_lsn:
          type: string
-        remote_consistent_lsn:
-          type: string
        peer_horizon_lsn:
          type: string
        safekeeper_connstr:
@@ -325,8 +322,6 @@ components:
          type: string
        peer_horizon_lsn:
          type: string
-        remote_consistent_lsn:
-          type: string

    AcceptorStateStatus:
      type: object
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -199,7 +199,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        commit_lsn: inmem.commit_lsn,
        backup_lsn: inmem.backup_lsn,
        peer_horizon_lsn: inmem.peer_horizon_lsn,
-        remote_consistent_lsn: inmem.remote_consistent_lsn,
        peers: tli.get_peers(conf).await,
        walsenders: tli.get_walsenders().get_all_public(),
        walreceivers: tli.get_walreceivers().get_all(),
@@ -456,7 +455,6 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
        last_log_term: sk_info.last_log_term.unwrap_or(0),
        flush_lsn: sk_info.flush_lsn.0,
        commit_lsn: sk_info.commit_lsn.0,
-        remote_consistent_lsn: sk_info.remote_consistent_lsn.0,
        peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
        safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
        http_connstr: sk_info.http_connstr.unwrap_or_else(|| "".to_owned()),
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -471,7 +471,6 @@ pub struct TimelineCollector {
    flush_lsn: GenericGaugeVec<AtomicU64>,
    epoch_start_lsn: GenericGaugeVec<AtomicU64>,
    peer_horizon_lsn: GenericGaugeVec<AtomicU64>,
-    remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
    ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
    feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
    ps_feedback_count: GenericGaugeVec<AtomicU64>,
@@ -543,16 +542,6 @@ impl TimelineCollector {
        .unwrap();
        descs.extend(peer_horizon_lsn.desc().into_iter().cloned());

-        let remote_consistent_lsn = GenericGaugeVec::new(
-            Opts::new(
-                "safekeeper_remote_consistent_lsn",
-                "LSN which is persisted to the remote storage in pageserver",
-            ),
-            &["tenant_id", "timeline_id"],
-        )
-        .unwrap();
-        descs.extend(remote_consistent_lsn.desc().into_iter().cloned());
-
        let ps_last_received_lsn = GenericGaugeVec::new(
            Opts::new(
                "safekeeper_ps_last_received_lsn",
@@ -698,7 +687,6 @@ impl TimelineCollector {
            flush_lsn,
            epoch_start_lsn,
            peer_horizon_lsn,
-            remote_consistent_lsn,
            ps_last_received_lsn,
            feedback_last_time_seconds,
            ps_feedback_count,
@@ -732,7 +720,6 @@ impl Collector for TimelineCollector {
        self.flush_lsn.reset();
        self.epoch_start_lsn.reset();
        self.peer_horizon_lsn.reset();
-        self.remote_consistent_lsn.reset();
        self.ps_last_received_lsn.reset();
        self.feedback_last_time_seconds.reset();
        self.ps_feedback_count.reset();
@@ -786,9 +773,6 @@ impl Collector for TimelineCollector {
            self.peer_horizon_lsn
                .with_label_values(labels)
                .set(tli.mem_state.peer_horizon_lsn.into());
-            self.remote_consistent_lsn
-                .with_label_values(labels)
-                .set(tli.mem_state.remote_consistent_lsn.into());
            self.timeline_active
                .with_label_values(labels)
                .set(tli.timeline_is_active as u64);
@@ -849,7 +833,6 @@ impl Collector for TimelineCollector {
        mfs.extend(self.flush_lsn.collect());
        mfs.extend(self.epoch_start_lsn.collect());
        mfs.extend(self.peer_horizon_lsn.collect());
-        mfs.extend(self.remote_consistent_lsn.collect());
        mfs.extend(self.ps_last_received_lsn.collect());
        mfs.extend(self.feedback_last_time_seconds.collect());
        mfs.extend(self.ps_feedback_count.collect());
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -308,10 +308,8 @@ impl WalResidentTimeline {
        // removed further than `backup_lsn`. Since we're holding shared_state
        // lock and setting `wal_removal_on_hold` later, it guarantees that WAL
        // won't be removed until we're done.
-        let from_lsn = min(
-            shared_state.sk.state().remote_consistent_lsn,
-            shared_state.sk.state().backup_lsn,
-        );
+        // TODO: do we still need this snapshot code path?
+        let from_lsn = shared_state.sk.state().backup_lsn;
        if from_lsn == Lsn::INVALID {
            // this is possible if snapshot is called before handling first
            // elected message
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -5,7 +5,7 @@ use crate::timeline_manager::StateSnapshot;
 /// Get oldest LSN we still need to keep.
 ///
 /// We hold WAL till it is consumed by
-/// 1) pageserver (remote_consistent_lsn)
+/// 1) pageserver (min_remote_consistent_lsn)
 /// 2) s3 offloading.
 /// 3) Additionally we must store WAL since last local commit_lsn because
 ///    that's where we start looking for last WAL record on start.
@@ -17,7 +17,7 @@ use crate::timeline_manager::StateSnapshot;
 pub(crate) fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option<Lsn>) -> Lsn {
    use std::cmp::min;

-    let mut horizon_lsn = state.cfile_remote_consistent_lsn;
+    let mut horizon_lsn = state.min_remote_consistent_lsn;
    // we don't want to remove WAL that is not yet offloaded to s3
    horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn);
    // Min by local commit_lsn to be able to begin reading WAL from somewhere on
--- a/safekeeper/src/send_interpreted_wal.rs
+++ b/safekeeper/src/send_interpreted_wal.rs
@@ -560,19 +560,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
                // Send a periodic keep alive when the connection has been idle for a while.
                // Since we've been idle, also check if we can stop streaming.
                _ = keepalive_ticker.tick() => {
-                    if let Some(remote_consistent_lsn) = self.wal_sender_guard
-                        .walsenders()
-                        .get_ws_remote_consistent_lsn(self.wal_sender_guard.id())
-                    {
-                        if self.tli.should_walsender_stop(remote_consistent_lsn).await {
-                            // Stop streaming if the receivers are caught up and
-                            // there's no active compute. This causes the loop in
-                            // [`crate::send_interpreted_wal::InterpretedWalSender::run`]
-                            // to exit and terminate the WAL stream.
-                            break;
-                        }
-                    }
-
                    self.pgb
                        .write_message(&BeMessage::KeepAlive(WalSndKeepAlive {
                            wal_end: self.end_watch_view.get().0,
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -251,17 +251,6 @@ impl WalSenders {
        shared.update_reply_feedback();
    }

-    /// Get remote_consistent_lsn reported by the pageserver. Returns None if
-    /// client is not pageserver.
-    pub fn get_ws_remote_consistent_lsn(self: &Arc<WalSenders>, id: WalSenderId) -> Option<Lsn> {
-        let shared = self.mutex.lock();
-        let slot = shared.get_slot(id);
-        match slot.get_feedback() {
-            ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn),
-            _ => None,
-        }
-    }
-
    /// Unregister walsender.
    fn unregister(self: &Arc<WalSenders>, id: WalSenderId) {
        let mut shared = self.mutex.lock();
@@ -890,28 +879,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                return Ok(());
            }

-            // Timed out waiting for WAL, check for termination and send KA.
-            // Check for termination only if we are streaming up to commit_lsn
-            // (to pageserver).
-            if let EndWatch::Commit(_) = self.end_watch {
-                if let Some(remote_consistent_lsn) = self
-                    .ws_guard
-                    .walsenders
-                    .get_ws_remote_consistent_lsn(self.ws_guard.id)
-                {
-                    if self.tli.should_walsender_stop(remote_consistent_lsn).await {
-                        // Terminate if there is nothing more to send.
-                        // Note that "ending streaming" part of the string is used by
-                        // pageserver to identify WalReceiverError::SuccessfulCompletion,
-                        // do not change this string without updating pageserver.
-                        return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
-                        "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
-                        self.appname, self.start_pos,
-                    )));
-                    }
-                }
-            }
-
            let msg = BeMessage::KeepAlive(WalSndKeepAlive {
                wal_end: self.end_pos.0,
                timestamp: get_current_timestamp(),
@@ -1020,7 +987,10 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
                    .walsenders
                    .record_ps_feedback(self.ws_guard.id, &ps_feedback);
                self.tli
-                    .update_remote_consistent_lsn(ps_feedback.remote_consistent_lsn)
+                    .process_remote_consistent_lsn_update(
+                        ps_feedback.generation,
+                        ps_feedback.remote_consistent_lsn,
+                    )
                    .await;
                // in principle new remote_consistent_lsn could allow to
                // deactivate the timeline, but we check that regularly through
--- a/safekeeper/src/state.rs
+++ b/safekeeper/src/state.rs
@@ -61,10 +61,9 @@ pub struct TimelinePersistentState {
    /// walproposer proto called 'truncate_lsn'. Updates are currently drived
    /// only by walproposer.
    pub peer_horizon_lsn: Lsn,
-    /// LSN of the oldest known checkpoint made by pageserver and successfully
-    /// pushed to s3. We don't remove WAL beyond it. Persisted only for
-    /// informational purposes, we receive it from pageserver (or broker).
-    pub remote_consistent_lsn: Lsn,
+    /// Obsolete; nowadays we track remote_consistent_lsn by generation number
+    /// in a separate cache with relaxed persistency requirements.
+    remote_consistent_lsn: Lsn,
    /// Holds names of partial segments uploaded to remote storage. Used to
    /// clean up old objects without leaving garbage in remote storage.
    pub partial_backup: wal_backup_partial::State,
@@ -171,7 +170,6 @@ pub struct TimelineMemState {
    pub commit_lsn: Lsn,
    pub backup_lsn: Lsn,
    pub peer_horizon_lsn: Lsn,
-    pub remote_consistent_lsn: Lsn,
    #[serde(with = "hex")]
    pub proposer_uuid: PgUuid,
 }
@@ -198,7 +196,6 @@ where
                commit_lsn: state.commit_lsn,
                backup_lsn: state.backup_lsn,
                peer_horizon_lsn: state.peer_horizon_lsn,
-                remote_consistent_lsn: state.remote_consistent_lsn,
                proposer_uuid: state.proposer_uuid,
            },
            pers: state,
@@ -213,7 +210,6 @@ where
        s.commit_lsn = self.inmem.commit_lsn;
        s.backup_lsn = self.inmem.backup_lsn;
        s.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
-        s.remote_consistent_lsn = self.inmem.remote_consistent_lsn;
        s.proposer_uuid = self.inmem.proposer_uuid;
        s
    }
@@ -230,7 +226,6 @@ where
        self.inmem.commit_lsn = s.commit_lsn;
        self.inmem.backup_lsn = s.backup_lsn;
        self.inmem.peer_horizon_lsn = s.peer_horizon_lsn;
-        self.inmem.remote_consistent_lsn = s.remote_consistent_lsn;
        self.inmem.proposer_uuid = s.proposer_uuid;
        Ok(())
    }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -11,6 +11,7 @@ use safekeeper_api::models::{
 use safekeeper_api::Term;
 use tokio::fs::{self};
 use tokio_util::sync::CancellationToken;
+use utils::generation::Generation;
 use utils::id::TenantId;
 use utils::sync::gate::Gate;

@@ -214,7 +215,7 @@ impl StateSK {
            StateSK::Empty => unreachable!(),
        }

-        // update everything else, including remote_consistent_lsn and backup_lsn
+        // update everything else, including backup_lsn
        let mut sync_control_file = false;
        let state = self.state_mut();
        let wal_seg_size = state.server.wal_seg_size as u64;
@@ -222,13 +223,6 @@ impl StateSK {
        state.inmem.backup_lsn = max(Lsn(sk_info.backup_lsn), state.inmem.backup_lsn);
        sync_control_file |= state.backup_lsn + wal_seg_size < state.inmem.backup_lsn;

-        state.inmem.remote_consistent_lsn = max(
-            Lsn(sk_info.remote_consistent_lsn),
-            state.inmem.remote_consistent_lsn,
-        );
-        sync_control_file |=
-            state.remote_consistent_lsn + wal_seg_size < state.inmem.remote_consistent_lsn;
-
        state.inmem.peer_horizon_lsn =
            max(Lsn(sk_info.peer_horizon_lsn), state.inmem.peer_horizon_lsn);
        sync_control_file |= state.peer_horizon_lsn + wal_seg_size < state.inmem.peer_horizon_lsn;
@@ -364,7 +358,6 @@ impl SharedState {
            flush_lsn: self.sk.flush_lsn().0,
            // note: this value is not flushed to control file yet and can be lost
            commit_lsn: self.sk.state().inmem.commit_lsn.0,
-            remote_consistent_lsn: self.sk.state().inmem.remote_consistent_lsn.0,
            peer_horizon_lsn: self.sk.state().inmem.peer_horizon_lsn.0,
            safekeeper_connstr: conf
                .advertise_pg_addr
@@ -880,6 +873,16 @@ impl Timeline {
    pub async fn backup_partial_reset(self: &Arc<Self>) -> Result<Vec<String>> {
        self.manager_ctl.backup_partial_reset().await
    }
+
+    pub async fn process_remote_consistent_lsn_update(
+        &self,
+        generation: Generation,
+        candidate: Lsn,
+    ) {
+        // TODO: still update controlfile state for backwards compatibility
+
+        todo!("implement & use the remote_persistent_lsn cache")
+    }
 }

 /// This is a guard that allows to read/write disk timeline state.
@@ -904,23 +907,6 @@ impl Deref for WalResidentTimeline {
 }

 impl WalResidentTimeline {
-    /// Returns true if walsender should stop sending WAL to pageserver. We
-    /// terminate it if remote_consistent_lsn reached commit_lsn and there is no
-    /// computes. While there might be nothing to stream already, we learn about
-    /// remote_consistent_lsn update through replication feedback, and we want
-    /// to stop pushing to the broker if pageserver is fully caughtup.
-    pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
-        if self.is_cancelled() {
-            return true;
-        }
-        let shared_state = self.read_shared_state().await;
-        if self.walreceivers.get_num() == 0 {
-            return shared_state.sk.state().inmem.commit_lsn == Lsn(0) || // no data at all yet
-            reported_remote_consistent_lsn >= shared_state.sk.state().inmem.commit_lsn;
-        }
-        false
-    }
-
    /// Ensure that current term is t, erroring otherwise, and lock the state.
    pub async fn acquire_term(&self, t: Term) -> Result<ReadGuardSharedState> {
        let ss = self.read_shared_state().await;
@@ -972,15 +958,6 @@ impl WalResidentTimeline {
    pub fn get_timeline_dir(&self) -> Utf8PathBuf {
        self.timeline_dir.clone()
    }
-
-    /// Update in memory remote consistent lsn.
-    pub async fn update_remote_consistent_lsn(&self, candidate: Lsn) {
-        let mut shared_state = self.write_shared_state().await;
-        shared_state.sk.state_mut().inmem.remote_consistent_lsn = max(
-            shared_state.sk.state().inmem.remote_consistent_lsn,
-            candidate,
-        );
-    }
 }

 /// This struct contains methods that are used by timeline manager task.
--- a/safekeeper/src/timeline_manager.rs
+++ b/safekeeper/src/timeline_manager.rs
@@ -47,11 +47,10 @@ pub(crate) struct StateSnapshot {
    // inmem values
    pub(crate) commit_lsn: Lsn,
    pub(crate) backup_lsn: Lsn,
-    pub(crate) remote_consistent_lsn: Lsn,
+    pub(crate) min_remote_consistent_lsn: Lsn,

    // persistent control file values
    pub(crate) cfile_commit_lsn: Lsn,
-    pub(crate) cfile_remote_consistent_lsn: Lsn,
    pub(crate) cfile_backup_lsn: Lsn,

    // latest state
@@ -60,7 +59,7 @@ pub(crate) struct StateSnapshot {

    // misc
    pub(crate) cfile_last_persist_at: std::time::Instant,
-    pub(crate) inmem_flush_pending: bool,
+    pub(crate) cfile_inmem_flush_pending: bool,
    pub(crate) wal_removal_on_hold: bool,
    pub(crate) peers: Vec<PeerInfo>,
 }
@@ -72,24 +71,23 @@ impl StateSnapshot {
        Self {
            commit_lsn: state.inmem.commit_lsn,
            backup_lsn: state.inmem.backup_lsn,
-            remote_consistent_lsn: state.inmem.remote_consistent_lsn,
+            min_remote_consistent_lsn: todo!(""),
            cfile_commit_lsn: state.commit_lsn,
-            cfile_remote_consistent_lsn: state.remote_consistent_lsn,
            cfile_backup_lsn: state.backup_lsn,
            flush_lsn: read_guard.sk.flush_lsn(),
            last_log_term: read_guard.sk.last_log_term(),
            cfile_last_persist_at: state.pers.last_persist_at(),
-            inmem_flush_pending: Self::has_unflushed_inmem_state(state),
+            cfile_inmem_flush_pending: Self::has_unflushed_cfile_inmem_state(state),
            wal_removal_on_hold: read_guard.wal_removal_on_hold,
            peers: read_guard.get_peers(heartbeat_timeout),
        }
    }

-    fn has_unflushed_inmem_state(state: &TimelineState<FileStorage>) -> bool {
+    fn has_unflushed_cfile_inmem_state(state: &TimelineState<FileStorage>) -> bool {
        state.inmem.commit_lsn > state.commit_lsn
            || state.inmem.backup_lsn > state.backup_lsn
            || state.inmem.peer_horizon_lsn > state.peer_horizon_lsn
-            || state.inmem.remote_consistent_lsn > state.remote_consistent_lsn
+        // NB: remote_consistent_lsn is stored separately from the control file
    }
 }

@@ -503,14 +501,14 @@ impl Manager {
    ) {
        let is_active = is_wal_backup_required
            || num_computes > 0
-            || state.remote_consistent_lsn < state.commit_lsn;
+            || state.min_remote_consistent_lsn < state.commit_lsn;

        // update the broker timeline set
        if self.tli_broker_active.set(is_active) {
            // write log if state has changed
            info!(
-                "timeline active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                is_active, state.remote_consistent_lsn, state.commit_lsn,
+                "timeline active={} now, min_remote_consistent_lsn={}, commit_lsn={}",
+                is_active, state.min_remote_consistent_lsn, state.commit_lsn,
            );

            MANAGER_ACTIVE_CHANGES.inc();
@@ -528,7 +526,7 @@ impl Manager {
        state: &StateSnapshot,
        next_event: &mut Option<Instant>,
    ) {
-        if !state.inmem_flush_pending {
+        if !state.cfile_inmem_flush_pending {
            return;
        }

--- a/safekeeper/src/wal_service.rs
+++ b/safekeeper/src/wal_service.rs
@@ -13,6 +13,8 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{auth::Scope, measured_stream::MeasuredStream};

+use std::os::fd::AsRawFd;
+
 use crate::metrics::TrafficMetrics;
 use crate::SafeKeeperConf;
 use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines};
@@ -62,6 +64,7 @@ async fn handle_socket(
    global_timelines: Arc<GlobalTimelines>,
 ) -> Result<(), QueryError> {
    socket.set_nodelay(true)?;
+    let socket_fd = socket.as_raw_fd();
    let peer_addr = socket.peer_addr()?;

    // Set timeout on reading from the socket. It prevents hanged up connection
@@ -107,7 +110,7 @@ async fn handle_socket(
        auth_pair,
        global_timelines,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
    // libpq protocol between safekeeper and walproposer / pageserver
    // We don't use shutdown.
    pgbackend
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -38,8 +38,6 @@ message SafekeeperTimelineInfo {
    uint64 commit_lsn = 5;
    // LSN up to which safekeeper has backed WAL.
    uint64 backup_lsn = 6;
-    // LSN of last checkpoint uploaded by pageserver.
-    uint64 remote_consistent_lsn = 7;
    uint64 peer_horizon_lsn = 8;
    uint64 local_start_lsn = 9;
    uint64 standby_horizon = 14;
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -760,7 +760,6 @@ mod tests {
            flush_lsn: 1,
            commit_lsn: 2,
            backup_lsn: 3,
-            remote_consistent_lsn: 4,
            peer_horizon_lsn: 5,
            safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
            http_connstr: "neon-1-sk-1.local:7677".to_owned(),
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -34,6 +34,7 @@ reqwest = { workspace = true, features = ["stream"] }
 routerify.workspace = true
 safekeeper_api.workspace = true
 safekeeper_client.workspace = true
+regex.workspace = true
 rustls-native-certs.workspace = true
 serde.workspace = true
 serde_json.workspace = true
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -516,6 +516,17 @@ async fn handle_tenant_timeline_block_unblock_gc(
    json_response(StatusCode::OK, ())
 }

+// For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters
+// and tenant/timeline IDs.  Since we are proxying to arbitrary paths, we don't have routing templates to
+// compare to, so we can just filter out our well known ID format with regexes.
+fn path_without_ids(path: &str) -> String {
+    static ID_REGEX: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
+    ID_REGEX
+        .get_or_init(|| regex::Regex::new(r"([0-9a-fA-F]{32}(-[0-9]{4})?|\?.*)").unwrap())
+        .replace_all(path, "")
+        .to_string()
+}
+
 async fn handle_tenant_timeline_passthrough(
    service: Arc<Service>,
    req: Request<Body>,
@@ -551,10 +562,7 @@ async fn handle_tenant_timeline_passthrough(
        .metrics_group
        .storage_controller_passthrough_request_latency;

-    // This is a bit awkward. We remove the param from the request
-    // and join the words by '_' to get a label for the request.
-    let just_path = path.replace(&tenant_shard_str, "");
-    let path_label = just_path
+    let path_label = path_without_ids(&path)
        .split('/')
        .filter(|token| !token.is_empty())
        .collect::<Vec<_>>()
@@ -2089,3 +2097,16 @@ pub fn make_router(
            )
        })
 }
+
+#[cfg(test)]
+mod test {
+
+    use super::path_without_ids;
+
+    #[test]
+    fn test_path_without_ids() {
+        assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/");
+        assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788"), "/v1/tenant//timeline/");
+        assert_eq!(path_without_ids("/v1/tenant/1a2b3344556677881122334455667788-0108/timeline/AA223344556677881122334455667788?parameter=foo"), "/v1/tenant//timeline/");
+    }
+}
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -12,7 +12,8 @@ use storage_controller::persistence::Persistence;
 use storage_controller::service::chaos_injector::ChaosInjector;
 use storage_controller::service::{
    Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT,
-    MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
+    MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT,
+    PRIORITY_RECONCILER_CONCURRENCY_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
 };
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
@@ -75,10 +76,14 @@ struct Cli {
    #[arg(long)]
    split_threshold: Option<u64>,

-    /// Maximum number of reconcilers that may run in parallel
+    /// Maximum number of normal-priority reconcilers that may run in parallel
    #[arg(long)]
    reconciler_concurrency: Option<usize>,

+    /// Maximum number of high-priority reconcilers that may run in parallel
+    #[arg(long)]
+    priority_reconciler_concurrency: Option<usize>,
+
    /// How long to wait for the initial database connection to be available.
    #[arg(long, default_value = "5s")]
    db_connect_timeout: humantime::Duration,
@@ -289,6 +294,9 @@ async fn async_main() -> anyhow::Result<()> {
        reconciler_concurrency: args
            .reconciler_concurrency
            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
+        priority_reconciler_concurrency: args
+            .priority_reconciler_concurrency
+            .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT),
        split_threshold: args.split_threshold,
        neon_local_repo_dir: args.neon_local_repo_dir,
        max_secondary_lag_bytes: args.max_secondary_lag_bytes,
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -91,9 +91,10 @@ pub(crate) struct ReconcilerConfigBuilder {
 }

 impl ReconcilerConfigBuilder {
-    pub(crate) fn new() -> Self {
+    /// Priority is special: you must pick one thoughtfully, do not just use 'normal' as the default
+    pub(crate) fn new(priority: ReconcilerPriority) -> Self {
        Self {
-            config: ReconcilerConfig::default(),
+            config: ReconcilerConfig::new(priority),
        }
    }

@@ -129,8 +130,18 @@ impl ReconcilerConfigBuilder {
    }
 }

-#[derive(Default, Debug, Copy, Clone)]
+// Higher priorities are used for user-facing tasks, so that a long backlog of housekeeping work (e.g. reconciling on startup, rescheduling
+// things on node changes) does not starve user-facing tasks.
+#[derive(Debug, Copy, Clone)]
+pub(crate) enum ReconcilerPriority {
+    Normal,
+    High,
+}
+
+#[derive(Debug, Copy, Clone)]
 pub(crate) struct ReconcilerConfig {
+    pub(crate) priority: ReconcilerPriority,
+
    // During live migration give up on warming-up the secondary
    // after this timeout.
    secondary_warmup_timeout: Option<Duration>,
@@ -145,6 +156,18 @@ pub(crate) struct ReconcilerConfig {
 }

 impl ReconcilerConfig {
+    /// Configs are always constructed with an explicit priority, to force callers to think about whether
+    /// the operation they're scheduling is high-priority or not. Normal priority is not a safe default, because
+    /// scheduling something user-facing at normal priority can result in it getting starved out by background work.
+    pub(crate) fn new(priority: ReconcilerPriority) -> Self {
+        Self {
+            priority,
+            secondary_warmup_timeout: None,
+            secondary_download_request_timeout: None,
+            tenant_creation_hint: false,
+        }
+    }
+
    pub(crate) fn get_secondary_warmup_timeout(&self) -> Duration {
        const SECONDARY_WARMUP_TIMEOUT_DEFAULT: Duration = Duration::from_secs(300);
        self.secondary_warmup_timeout
@@ -164,7 +187,9 @@ impl ReconcilerConfig {

 impl From<&MigrationConfig> for ReconcilerConfig {
    fn from(value: &MigrationConfig) -> Self {
-        let mut builder = ReconcilerConfigBuilder::new();
+        // Run reconciler at high priority because MigrationConfig comes from human requests that should
+        // be presumed urgent.
+        let mut builder = ReconcilerConfigBuilder::new(ReconcilerPriority::High);

        if let Some(timeout) = value.secondary_warmup_timeout {
            builder = builder.secondary_warmup_timeout(timeout)
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -30,7 +30,10 @@ use crate::{
        AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence,
        ShardGenerationState, TenantFilter,
    },
-    reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder},
+    reconciler::{
+        ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder,
+        ReconcilerPriority,
+    },
    safekeeper::Safekeeper,
    scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode},
    tenant_shard::{
@@ -79,7 +82,7 @@ use pageserver_api::{
    },
 };
 use pageserver_client::{mgmt_api, BlockUnblock};
-use tokio::sync::mpsc::error::TrySendError;
+use tokio::sync::{mpsc::error::TrySendError, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use utils::{
    completion::Barrier,
@@ -195,6 +198,7 @@ pub(crate) enum LeadershipStatus {
 }

 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
+pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;

 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -366,9 +370,12 @@ pub struct Config {
    /// and/or upon handling the re-attach request from a node.
    pub max_warming_up_interval: Duration,

-    /// How many Reconcilers may be spawned concurrently
+    /// How many normal-priority Reconcilers may be spawned concurrently
    pub reconciler_concurrency: usize,

+    /// How many high-priority Reconcilers may be spawned concurrently
+    pub priority_reconciler_concurrency: usize,
+
    /// How large must a shard grow in bytes before we split it?
    /// None disables auto-splitting.
    pub split_threshold: Option<u64>,
@@ -436,9 +443,14 @@ pub struct Service {
    // that transition it to/from Active.
    node_op_locks: IdLockMap<NodeId, NodeOperations>,

-    // Limit how many Reconcilers we will spawn concurrently
+    // Limit how many Reconcilers we will spawn concurrently for normal-priority tasks such as background reconciliations
+    // and reconciliation on startup.
    reconciler_concurrency: Arc<tokio::sync::Semaphore>,

+    // Limit how many Reconcilers we will spawn concurrently for high-priority tasks such as tenant/timeline CRUD, which
+    // a human user might be waiting for.
+    priority_reconciler_concurrency: Arc<tokio::sync::Semaphore>,
+
    /// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
    /// Send into this queue to promptly attempt to reconcile this shard next time units are available.
    ///
@@ -1263,12 +1275,15 @@ impl Service {
        }

        // Maybe some other work can proceed now that this job finished.
+        //
+        // Only bother with this if we have some semaphore units available in the normal-priority semaphore (these
+        // reconciles are scheduled at [`ReconcilerPriority::Normal`]).
        if self.reconciler_concurrency.available_permits() > 0 {
            while let Ok(tenant_shard_id) = locked.delayed_reconcile_rx.try_recv() {
                let (nodes, tenants, _scheduler) = locked.parts_mut();
                if let Some(shard) = tenants.get_mut(&tenant_shard_id) {
                    shard.delayed_reconcile = false;
-                    self.maybe_reconcile_shard(shard, nodes);
+                    self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal);
                }

                if self.reconciler_concurrency.available_permits() == 0 {
@@ -1565,6 +1580,9 @@ impl Service {
            reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new(
                config.reconciler_concurrency,
            )),
+            priority_reconciler_concurrency: Arc::new(tokio::sync::Semaphore::new(
+                config.priority_reconciler_concurrency,
+            )),
            delayed_reconcile_tx,
            abort_tx,
            startup_complete: startup_complete.clone(),
@@ -2337,7 +2355,7 @@ impl Service {
        let waiters = {
            let mut locked = self.inner.write().unwrap();
            let (nodes, tenants, _scheduler) = locked.parts_mut();
-            let config = ReconcilerConfigBuilder::new()
+            let config = ReconcilerConfigBuilder::new(ReconcilerPriority::High)
                .tenant_creation_hint(true)
                .build();
            tenants
@@ -2812,7 +2830,8 @@ impl Service {

                        shard.schedule(scheduler, &mut schedule_context)?;

-                        let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
+                        let maybe_waiter =
+                            self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High);
                        if let Some(waiter) = maybe_waiter {
                            waiters.push(waiter);
                        }
@@ -2933,7 +2952,9 @@ impl Service {
            let (nodes, tenants, _scheduler) = locked.parts_mut();
            for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
                shard.config = config.clone();
-                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                if let Some(waiter) =
+                    self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High)
+                {
                    waiters.push(waiter);
                }
            }
@@ -3215,7 +3236,9 @@ impl Service {
                debug_assert!(shard.intent.get_attached().is_none());
                debug_assert!(shard.intent.get_secondary().is_empty());

-                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                if let Some(waiter) =
+                    self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High)
+                {
                    detach_waiters.push(waiter);
                }
            }
@@ -3367,7 +3390,7 @@ impl Service {

            // In case scheduling is being switched back on, try it now.
            shard.schedule(scheduler, &mut schedule_context).ok();
-            self.maybe_reconcile_shard(shard, nodes);
+            self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High);
        }

        Ok(())
@@ -4416,7 +4439,7 @@ impl Service {
                    tracing::warn!("Failed to schedule {tenant_shard_id} during shard abort: {e}")
                }

-                self.maybe_reconcile_shard(shard, nodes);
+                self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High);
            }

            // We don't expect any new_shard_count shards to exist here, but drop them just in case
@@ -4582,7 +4605,11 @@ impl Service {
                        tracing::warn!("Failed to schedule child shard {child}: {e}");
                    }
                    // In the background, attach secondary locations for the new shards
-                    if let Some(waiter) = self.maybe_reconcile_shard(&mut child_state, nodes) {
+                    if let Some(waiter) = self.maybe_reconcile_shard(
+                        &mut child_state,
+                        nodes,
+                        ReconcilerPriority::High,
+                    ) {
                        waiters.push(waiter);
                    }

@@ -4947,7 +4974,9 @@ impl Service {
                shard.intent.clear_secondary(scheduler);

                // Run Reconciler to execute detach fo secondary locations.
-                if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) {
+                if let Some(waiter) =
+                    self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High)
+                {
                    waiters.push(waiter);
                }
            }
@@ -5215,7 +5244,7 @@ impl Service {

            let reconciler_config = match migrate_req.migration_config {
                Some(cfg) => (&cfg).into(),
-                None => ReconcilerConfig::default(),
+                None => ReconcilerConfig::new(ReconcilerPriority::High),
            };

            self.maybe_configured_reconcile_shard(shard, nodes, reconciler_config)
@@ -5281,7 +5310,7 @@ impl Service {
                );
            }

-            self.maybe_reconcile_shard(shard, nodes)
+            self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::High)
        };

        if let Some(waiter) = waiter {
@@ -5693,7 +5722,7 @@ impl Service {
                            )
                        }

-                        self.maybe_reconcile_shard(shard, nodes);
+                        self.maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal);
                    }

                    // Here we remove an existing observed location for the node we're removing, and it will
@@ -6062,7 +6091,14 @@ impl Service {
                                    tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id);
                                }
                                Ok(()) => {
-                                    if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() {
+                                    if self
+                                        .maybe_reconcile_shard(
+                                            tenant_shard,
+                                            nodes,
+                                            ReconcilerPriority::Normal,
+                                        )
+                                        .is_some()
+                                    {
                                        tenants_affected += 1;
                                    };
                                }
@@ -6093,7 +6129,11 @@ impl Service {

                    if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) {
                        if observed_loc.conf.is_none() {
-                            self.maybe_reconcile_shard(tenant_shard, nodes);
+                            self.maybe_reconcile_shard(
+                                tenant_shard,
+                                nodes,
+                                ReconcilerPriority::Normal,
+                            );
                        }
                    }
                }
@@ -6457,8 +6497,36 @@ impl Service {
        &self,
        shard: &mut TenantShard,
        nodes: &Arc<HashMap<NodeId, Node>>,
+        priority: ReconcilerPriority,
    ) -> Option<ReconcilerWaiter> {
-        self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::default())
+        self.maybe_configured_reconcile_shard(shard, nodes, ReconcilerConfig::new(priority))
+    }
+
+    /// Before constructing a Reconciler, acquire semaphore units from the appropriate concurrency limit (depends on priority)
+    fn get_reconciler_units(
+        &self,
+        priority: ReconcilerPriority,
+    ) -> Result<ReconcileUnits, TryAcquireError> {
+        let units = match priority {
+            ReconcilerPriority::Normal => self.reconciler_concurrency.clone().try_acquire_owned(),
+            ReconcilerPriority::High => {
+                match self
+                    .priority_reconciler_concurrency
+                    .clone()
+                    .try_acquire_owned()
+                {
+                    Ok(u) => Ok(u),
+                    Err(TryAcquireError::NoPermits) => {
+                        // If the high priority semaphore is exhausted, then high priority tasks may steal units from
+                        // the normal priority semaphore.
+                        self.reconciler_concurrency.clone().try_acquire_owned()
+                    }
+                    Err(e) => Err(e),
+                }
+            }
+        };
+
+        units.map(ReconcileUnits::new)
    }

    /// Wrap [`TenantShard`] reconciliation methods with acquisition of [`Gate`] and [`ReconcileUnits`],
@@ -6478,8 +6546,8 @@ impl Service {
            }
        };

-        let units = match self.reconciler_concurrency.clone().try_acquire_owned() {
-            Ok(u) => ReconcileUnits::new(u),
+        let units = match self.get_reconciler_units(reconciler_config.priority) {
+            Ok(u) => u,
            Err(_) => {
                tracing::info!(tenant_id=%shard.tenant_shard_id.tenant_id, shard_id=%shard.tenant_shard_id.shard_slug(),
                    "Concurrency limited: enqueued for reconcile later");
@@ -6572,7 +6640,10 @@ impl Service {

            // Eventual consistency: if an earlier reconcile job failed, and the shard is still
            // dirty, spawn another rone
-            if self.maybe_reconcile_shard(shard, &pageservers).is_some() {
+            if self
+                .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
+                .is_some()
+            {
                reconciles_spawned += 1;
            } else if shard.delayed_reconcile {
                // Shard wanted to reconcile but for some reason couldn't.
@@ -6658,7 +6729,10 @@ impl Service {
            tracing::info!(tenant_shard_id=%tenant_shard_id, "Applying optimization: {optimization:?}");
            if shard.apply_optimization(scheduler, optimization) {
                optimizations_applied += 1;
-                if self.maybe_reconcile_shard(shard, nodes).is_some() {
+                if self
+                    .maybe_reconcile_shard(shard, nodes, ReconcilerPriority::Normal)
+                    .is_some()
+                {
                    reconciles_spawned += 1;
                }
            }
@@ -7208,7 +7282,7 @@ impl Service {
        // to not stall the operation when a cold secondary is encountered.
        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
-        let reconciler_config = ReconcilerConfigBuilder::new()
+        let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal)
            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
            .build();
@@ -7541,7 +7615,7 @@ impl Service {
    ) -> Result<(), OperationError> {
        const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20);
        const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5);
-        let reconciler_config = ReconcilerConfigBuilder::new()
+        let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal)
            .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT)
            .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT)
            .build();
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -88,7 +88,11 @@ impl ChaosInjector {

        shard.intent.demote_attached(scheduler, old_location);
        shard.intent.promote_attached(scheduler, new_location);
-        self.service.maybe_reconcile_shard(shard, nodes);
+        self.service.maybe_reconcile_shard(
+            shard,
+            nodes,
+            crate::reconciler::ReconcilerPriority::Normal,
+        );
    }

    async fn inject_chaos(&mut self) {
--- a/test_runner/fixtures/fast_import.py
+++ b/test_runner/fixtures/fast_import.py
@@ -4,8 +4,10 @@ import subprocess
 import tempfile
 from collections.abc import Iterator
 from pathlib import Path
+from typing import cast

 import pytest
+from _pytest.config import Config

 from fixtures.log_helper import log
 from fixtures.neon_cli import AbstractNeonCli
@@ -23,6 +25,7 @@ class FastImport(AbstractNeonCli):
        pg_distrib_dir: Path,
        pg_version: PgVersion,
        workdir: Path,
+        cleanup: bool = True,
    ):
        if extra_env is None:
            env_vars = {}
@@ -47,12 +50,43 @@ class FastImport(AbstractNeonCli):
        if not workdir.exists():
            raise Exception(f"Working directory '{workdir}' does not exist")
        self.workdir = workdir
+        self.cleanup = cleanup
+
+    def run_pgdata(
+        self,
+        s3prefix: str | None = None,
+        pg_port: int | None = None,
+        source_connection_string: str | None = None,
+        interactive: bool = False,
+    ):
+        return self.run(
+            "pgdata",
+            s3prefix=s3prefix,
+            pg_port=pg_port,
+            source_connection_string=source_connection_string,
+            interactive=interactive,
+        )
+
+    def run_dump_restore(
+        self,
+        s3prefix: str | None = None,
+        source_connection_string: str | None = None,
+        destination_connection_string: str | None = None,
+    ):
+        return self.run(
+            "dump-restore",
+            s3prefix=s3prefix,
+            source_connection_string=source_connection_string,
+            destination_connection_string=destination_connection_string,
+        )

    def run(
        self,
-        pg_port: int,
-        source_connection_string: str | None = None,
+        command: str,
        s3prefix: str | None = None,
+        pg_port: int | None = None,
+        source_connection_string: str | None = None,
+        destination_connection_string: str | None = None,
        interactive: bool = False,
    ) -> subprocess.CompletedProcess[str]:
        if self.cmd is not None:
@@ -60,13 +94,17 @@ class FastImport(AbstractNeonCli):
        args = [
            f"--pg-bin-dir={self.pg_bin}",
            f"--pg-lib-dir={self.pg_lib}",
-            f"--pg-port={pg_port}",
            f"--working-directory={self.workdir}",
        ]
-        if source_connection_string is not None:
-            args.append(f"--source-connection-string={source_connection_string}")
        if s3prefix is not None:
            args.append(f"--s3-prefix={s3prefix}")
+        args.append(command)
+        if pg_port is not None:
+            args.append(f"--pg-port={pg_port}")
+        if source_connection_string is not None:
+            args.append(f"--source-connection-string={source_connection_string}")
+        if destination_connection_string is not None:
+            args.append(f"--destination-connection-string={destination_connection_string}")
        if interactive:
            args.append("--interactive")

@@ -77,7 +115,7 @@ class FastImport(AbstractNeonCli):
        return self

    def __exit__(self, *args):
-        if self.workdir.exists():
+        if self.workdir.exists() and self.cleanup:
            shutil.rmtree(self.workdir)


@@ -87,9 +125,17 @@ def fast_import(
    test_output_dir: Path,
    neon_binpath: Path,
    pg_distrib_dir: Path,
+    pytestconfig: Config,
 ) -> Iterator[FastImport]:
-    workdir = Path(tempfile.mkdtemp())
-    with FastImport(None, neon_binpath, pg_distrib_dir, pg_version, workdir) as fi:
+    workdir = Path(tempfile.mkdtemp(dir=test_output_dir, prefix="fast_import_"))
+    with FastImport(
+        None,
+        neon_binpath,
+        pg_distrib_dir,
+        pg_version,
+        workdir,
+        cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")),
+    ) as fi:
        yield fi

        if fi.cmd is None:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -27,6 +27,7 @@ from urllib.parse import quote, urlparse

 import asyncpg
 import backoff
+import boto3
 import httpx
 import psycopg2
 import psycopg2.sql
@@ -37,6 +38,8 @@ from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
 from jwcrypto import jwk
+from mypy_boto3_kms import KMSClient
+from mypy_boto3_s3 import S3Client

 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -199,6 +202,30 @@ def mock_s3_server(port_distributor: PortDistributor) -> Iterator[MockS3Server]:
    mock_s3_server.kill()


+@pytest.fixture(scope="session")
+def mock_kms(mock_s3_server: MockS3Server) -> Iterator[KMSClient]:
+    yield boto3.client(
+        "kms",
+        endpoint_url=mock_s3_server.endpoint(),
+        region_name=mock_s3_server.region(),
+        aws_access_key_id=mock_s3_server.access_key(),
+        aws_secret_access_key=mock_s3_server.secret_key(),
+        aws_session_token=mock_s3_server.session_token(),
+    )
+
+
+@pytest.fixture(scope="session")
+def mock_s3_client(mock_s3_server: MockS3Server) -> Iterator[S3Client]:
+    yield boto3.client(
+        "s3",
+        endpoint_url=mock_s3_server.endpoint(),
+        region_name=mock_s3_server.region(),
+        aws_access_key_id=mock_s3_server.access_key(),
+        aws_secret_access_key=mock_s3_server.secret_key(),
+        aws_session_token=mock_s3_server.session_token(),
+    )
+
+
 class PgProtocol:
    """Reusable connection logic"""

--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -34,7 +34,6 @@ class SafekeeperTimelineStatus:
    timeline_start_lsn: Lsn
    backup_lsn: Lsn
    peer_horizon_lsn: Lsn
-    remote_consistent_lsn: Lsn
    walreceivers: list[Walreceiver]


@@ -205,7 +204,6 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
            timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
            backup_lsn=Lsn(resj["backup_lsn"]),
            peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
-            remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
            walreceivers=walreceivers,
        )

--- a/test_runner/regress/test_import_pgdata.py
+++ b/test_runner/regress/test_import_pgdata.py
@@ -1,7 +1,9 @@
+import base64
 import json
 import re
 import time
 from enum import Enum
+from pathlib import Path

 import psycopg2
 import psycopg2.errors
@@ -14,8 +16,12 @@ from fixtures.pageserver.http import (
    ImportPgdataIdemptencyKey,
    PageserverApiException,
 )
+from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
-from fixtures.remote_storage import RemoteStorageKind
+from fixtures.remote_storage import MockS3Server, RemoteStorageKind
+from mypy_boto3_kms import KMSClient
+from mypy_boto3_kms.type_defs import EncryptResponseTypeDef
+from mypy_boto3_s3 import S3Client
 from pytest_httpserver import HTTPServer
 from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response
@@ -103,13 +109,15 @@ def test_pgdata_import_smoke(
    while True:
        relblock_size = vanilla_pg.safe_psql_scalar("select pg_relation_size('t')")
        log.info(
-            f"relblock size: {relblock_size/8192} pages (target: {target_relblock_size//8192}) pages"
+            f"relblock size: {relblock_size / 8192} pages (target: {target_relblock_size // 8192}) pages"
        )
        if relblock_size >= target_relblock_size:
            break
        addrows = int((target_relblock_size - relblock_size) // 8192)
        assert addrows >= 1, "forward progress"
-        vanilla_pg.safe_psql(f"insert into t select generate_series({nrows+1}, {nrows + addrows})")
+        vanilla_pg.safe_psql(
+            f"insert into t select generate_series({nrows + 1}, {nrows + addrows})"
+        )
        nrows += addrows
    expect_nrows = nrows
    expect_sum = (
@@ -332,6 +340,224 @@ def test_pgdata_import_smoke(
        br_initdb_endpoint.safe_psql("select * from othertable")


+def test_fast_import_with_pageserver_ingest(
+    test_output_dir,
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    fast_import: FastImport,
+    pg_distrib_dir: Path,
+    pg_version: PgVersion,
+    mock_s3_server: MockS3Server,
+    mock_kms: KMSClient,
+    mock_s3_client: S3Client,
+    neon_env_builder: NeonEnvBuilder,
+    make_httpserver: HTTPServer,
+):
+    # Prepare KMS and S3
+    key_response = mock_kms.create_key(
+        Description="Test key",
+        KeyUsage="ENCRYPT_DECRYPT",
+        Origin="AWS_KMS",
+    )
+    key_id = key_response["KeyMetadata"]["KeyId"]
+
+    def encrypt(x: str) -> EncryptResponseTypeDef:
+        return mock_kms.encrypt(KeyId=key_id, Plaintext=x)
+
+    # Start source postgres and ingest data
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);")
+
+    # Setup pageserver and fake cplane for import progress
+    def handler(request: Request) -> Response:
+        log.info(f"control plane request: {request.json}")
+        return Response(json.dumps({}), status=200)
+
+    cplane_mgmt_api_server = make_httpserver
+    cplane_mgmt_api_server.expect_request(re.compile(".*")).respond_with_handler(handler)
+
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
+    env = neon_env_builder.init_start()
+
+    env.pageserver.patch_config_toml_nonrecursive(
+        {
+            "import_pgdata_upcall_api": f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/path/to/mgmt/api",
+            # because import_pgdata code uses this endpoint, not the one in common remote storage config
+            # TODO: maybe use common remote_storage config in pageserver?
+            "import_pgdata_aws_endpoint_url": env.s3_mock_server.endpoint(),
+        }
+    )
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Encrypt connstrings and put spec into S3
+    source_connstring_encrypted = encrypt(vanilla_pg.connstr())
+    spec = {
+        "encryption_secret": {"KMS": {"key_id": key_id}},
+        "source_connstring_ciphertext_base64": base64.b64encode(
+            source_connstring_encrypted["CiphertextBlob"]
+        ).decode("utf-8"),
+        "project_id": "someproject",
+        "branch_id": "somebranch",
+    }
+
+    bucket = "test-bucket"
+    key_prefix = "test-prefix"
+    mock_s3_client.create_bucket(Bucket=bucket)
+    mock_s3_client.put_object(Bucket=bucket, Key=f"{key_prefix}/spec.json", Body=json.dumps(spec))
+
+    # Create timeline with import_pgdata
+    tenant_id = TenantId.generate()
+    env.storage_controller.tenant_create(tenant_id)
+
+    timeline_id = TimelineId.generate()
+    log.info("starting import")
+    start = time.monotonic()
+
+    idempotency = ImportPgdataIdemptencyKey.random()
+    log.info(f"idempotency key {idempotency}")
+    # TODO: teach neon_local CLI about the idempotency & 429 error so we can run inside the loop
+    # and check for 429
+
+    import_branch_name = "imported"
+    env.storage_controller.timeline_create(
+        tenant_id,
+        {
+            "new_timeline_id": str(timeline_id),
+            "import_pgdata": {
+                "idempotency_key": str(idempotency),
+                "location": {
+                    "AwsS3": {
+                        "region": env.s3_mock_server.region(),
+                        "bucket": bucket,
+                        "key": key_prefix,
+                    }
+                },
+            },
+        },
+    )
+    env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id)
+
+    # Run fast_import
+    if fast_import.extra_env is None:
+        fast_import.extra_env = {}
+    fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key()
+    fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key()
+    fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token()
+    fast_import.extra_env["AWS_REGION"] = mock_s3_server.region()
+    fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint()
+    fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug"
+    pg_port = port_distributor.get_port()
+    fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}")
+    vanilla_pg.stop()
+
+    def validate_vanilla_equivalence(ep):
+        res = ep.safe_psql("SELECT count(*), sum(a) FROM foo;", dbname="neondb")
+        assert res[0] == (10, 55), f"got result: {res}"
+
+    # Sanity check that data in pgdata is expected:
+    pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version)
+    with VanillaPostgres(
+        fast_import.workdir / "pgdata", pgbin, pg_port, False
+    ) as new_pgdata_vanilla_pg:
+        new_pgdata_vanilla_pg.start()
+
+        # database name and user are hardcoded in fast_import binary, and they are different from normal vanilla postgres
+        conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb")
+        validate_vanilla_equivalence(conn)
+
+    # Poll timeline state on each shard's pageserver; for Active shards, verify the S3 status file
+    while True:
+        locations = env.storage_controller.locate(tenant_id)
+        active_count = 0
+        for location in locations:
+            shard_id = TenantShardId.parse(location["shard_id"])
+            ps = env.get_pageserver(location["node_id"])
+            try:
+                detail = ps.http_client().timeline_detail(shard_id, timeline_id)
+                log.info(f"timeline {tenant_id}/{timeline_id} detail: {detail}")
+                state = detail["state"]
+                log.info(f"shard {shard_id} state: {state}")
+                if state == "Active":
+                    active_count += 1
+            except PageserverApiException as e:
+                if e.status_code == 404:
+                    log.info("not found, import is in progress")
+                    continue
+                elif e.status_code == 429:
+                    log.info("import is in progress")
+                    continue
+                else:
+                    raise
+
+            if state == "Active":
+                key = f"{key_prefix}/status/shard-{shard_id.shard_index}"
+                shard_status_file_contents = (
+                    mock_s3_client.get_object(Bucket=bucket, Key=key)["Body"].read().decode("utf-8")
+                )
+                shard_status = json.loads(shard_status_file_contents)
+                assert shard_status["done"] is True
+
+        if active_count == len(locations):
+            log.info("all shards are active")
+            break
+        time.sleep(0.5)
+
+    import_duration = time.monotonic() - start
+    log.info(f"import complete; duration={import_duration:.2f}s")
+
+    ep = env.endpoints.create_start(branch_name=import_branch_name, tenant_id=tenant_id)
+
+    # check that data is there
+    validate_vanilla_equivalence(ep)
+
+    # check that we can do basic ops
+
+    ep.safe_psql("create table othertable(values text)", dbname="neondb")
+    rw_lsn = Lsn(ep.safe_psql_scalar("select pg_current_wal_flush_lsn()"))
+    ep.stop()
+
+    # ... at the tip
+    _ = env.create_branch(
+        new_branch_name="br-tip",
+        ancestor_branch_name=import_branch_name,
+        tenant_id=tenant_id,
+        ancestor_start_lsn=rw_lsn,
+    )
+    br_tip_endpoint = env.endpoints.create_start(
+        branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id
+    )
+    validate_vanilla_equivalence(br_tip_endpoint)
+    br_tip_endpoint.safe_psql("select * from othertable", dbname="neondb")
+    br_tip_endpoint.stop()
+
+    # ... at the initdb lsn
+    locations = env.storage_controller.locate(tenant_id)
+    [shard_zero] = [
+        loc for loc in locations if TenantShardId.parse(loc["shard_id"]).shard_number == 0
+    ]
+    shard_zero_ps = env.get_pageserver(shard_zero["node_id"])
+    shard_zero_timeline_info = shard_zero_ps.http_client().timeline_detail(
+        shard_zero["shard_id"], timeline_id
+    )
+    initdb_lsn = Lsn(shard_zero_timeline_info["initdb_lsn"])
+    _ = env.create_branch(
+        new_branch_name="br-initdb",
+        ancestor_branch_name=import_branch_name,
+        tenant_id=tenant_id,
+        ancestor_start_lsn=initdb_lsn,
+    )
+    br_initdb_endpoint = env.endpoints.create_start(
+        branch_name="br-initdb", endpoint_id="br-initdb-ro", tenant_id=tenant_id
+    )
+    validate_vanilla_equivalence(br_initdb_endpoint)
+    with pytest.raises(psycopg2.errors.UndefinedTable):
+        br_initdb_endpoint.safe_psql("select * from othertable", dbname="neondb")
+    br_initdb_endpoint.stop()
+
+    env.pageserver.stop(immediate=True)
+
+
 def test_fast_import_binary(
    test_output_dir,
    vanilla_pg: VanillaPostgres,
@@ -342,7 +568,7 @@ def test_fast_import_binary(
    vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);")

    pg_port = port_distributor.get_port()
-    fast_import.run(pg_port, vanilla_pg.connstr())
+    fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr())
    vanilla_pg.stop()

    pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version)
@@ -358,6 +584,118 @@ def test_fast_import_binary(
        assert res[0][0] == 10


+def test_fast_import_restore_to_connstring(
+    test_output_dir,
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    fast_import: FastImport,
+    pg_distrib_dir: Path,
+    pg_version: PgVersion,
+):
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);")
+
+    pgdatadir = test_output_dir / "destination-pgdata"
+    pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
+    port = port_distributor.get_port()
+    with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg:
+        destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"])
+        destination_vanilla_pg.start()
+
+        # Create a non-superuser role and a database it owns; the restore below connects as it
+        destination_vanilla_pg.safe_psql("""
+            CREATE ROLE testrole WITH
+                LOGIN
+                PASSWORD 'testpassword'
+                NOSUPERUSER
+                NOCREATEDB
+                NOCREATEROLE;
+        """)
+        destination_vanilla_pg.safe_psql("CREATE DATABASE testdb OWNER testrole;")
+
+        destination_connstring = destination_vanilla_pg.connstr(
+            dbname="testdb", user="testrole", password="testpassword"
+        )
+        fast_import.run_dump_restore(
+            source_connection_string=vanilla_pg.connstr(),
+            destination_connection_string=destination_connstring,
+        )
+        vanilla_pg.stop()
+        conn = PgProtocol(dsn=destination_connstring)
+        res = conn.safe_psql("SELECT count(*) FROM foo;")
+        log.info(f"Result: {res}")
+        assert res[0][0] == 10
+
+
+def test_fast_import_restore_to_connstring_from_s3_spec(
+    test_output_dir,
+    vanilla_pg: VanillaPostgres,
+    port_distributor: PortDistributor,
+    fast_import: FastImport,
+    pg_distrib_dir: Path,
+    pg_version: PgVersion,
+    mock_s3_server: MockS3Server,
+    mock_kms: KMSClient,
+    mock_s3_client: S3Client,
+):
+    # Create the KMS key used to encrypt the connstrings (the S3 bucket is created further down)
+    key_response = mock_kms.create_key(
+        Description="Test key",
+        KeyUsage="ENCRYPT_DECRYPT",
+        Origin="AWS_KMS",
+    )
+    key_id = key_response["KeyMetadata"]["KeyId"]
+
+    def encrypt(x: str) -> EncryptResponseTypeDef:
+        return mock_kms.encrypt(KeyId=key_id, Plaintext=x)
+
+    # Start source postgres and load ten rows into table foo
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("CREATE TABLE foo (a int); INSERT INTO foo SELECT generate_series(1, 10);")
+
+    # Start target postgres
+    pgdatadir = test_output_dir / "destination-pgdata"
+    pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
+    port = port_distributor.get_port()
+    with VanillaPostgres(pgdatadir, pg_bin, port) as destination_vanilla_pg:
+        destination_vanilla_pg.configure(["shared_preload_libraries='neon_rmgr'"])
+        destination_vanilla_pg.start()
+
+        # Encrypt both connstrings with the KMS key and upload the spec to S3 as spec.json
+        source_connstring_encrypted = encrypt(vanilla_pg.connstr())
+        destination_connstring_encrypted = encrypt(destination_vanilla_pg.connstr())
+        spec = {
+            "encryption_secret": {"KMS": {"key_id": key_id}},
+            "source_connstring_ciphertext_base64": base64.b64encode(
+                source_connstring_encrypted["CiphertextBlob"]
+            ).decode("utf-8"),
+            "destination_connstring_ciphertext_base64": base64.b64encode(
+                destination_connstring_encrypted["CiphertextBlob"]
+            ).decode("utf-8"),
+        }
+
+        mock_s3_client.create_bucket(Bucket="test-bucket")
+        mock_s3_client.put_object(
+            Bucket="test-bucket", Key="test-prefix/spec.json", Body=json.dumps(spec)
+        )
+
+        # Run fast_import with AWS credentials/region/endpoint taken from the mock S3 server
+        if fast_import.extra_env is None:
+            fast_import.extra_env = {}
+        fast_import.extra_env["AWS_ACCESS_KEY_ID"] = mock_s3_server.access_key()
+        fast_import.extra_env["AWS_SECRET_ACCESS_KEY"] = mock_s3_server.secret_key()
+        fast_import.extra_env["AWS_SESSION_TOKEN"] = mock_s3_server.session_token()
+        fast_import.extra_env["AWS_REGION"] = mock_s3_server.region()
+        fast_import.extra_env["AWS_ENDPOINT_URL"] = mock_s3_server.endpoint()
+        fast_import.extra_env["RUST_LOG"] = "aws_config=debug,aws_sdk_kms=debug"
+        fast_import.run_dump_restore(s3prefix="s3://test-bucket/test-prefix")
+        vanilla_pg.stop()
+
+        res = destination_vanilla_pg.safe_psql("SELECT count(*) FROM foo;")
+        log.info(f"Result: {res}")
+        assert res[0][0] == 10
+
+
 # TODO: Maybe test with pageserver?
 # 1. run whole neon env
 # 2. create timeline with some s3 path???
--- a/test_runner/regress/test_lfc_resize.py
+++ b/test_runner/regress/test_lfc_resize.py
@@ -72,6 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):

    thread.join()

+    # Fill LFC: seqscan should fetch the whole table in cache.
+    # It is needed for further correct evaluation of LFC file size
+    # (a sparse chunk of LFC takes less than 1 MB on disk).
+    cur.execute("select sum(abalance) from pgbench_accounts")
+
    # Before shrinking the cache, check that it really is large now
    (lfc_file_size, lfc_file_blocks) = get_lfc_size()
    assert int(lfc_file_blocks) > 128 * 1024
--- a/test_runner/regress/test_relations.py
+++ b/test_runner/regress/test_relations.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+)
+
+
+def test_pageserver_reldir_v2(
+    neon_env_builder: NeonEnvBuilder,
+):
+    env = neon_env_builder.init_start(
+        initial_tenant_conf={
+            "rel_size_v2_enabled": "false",
+        }
+    )
+
+    endpoint = env.endpoints.create_start("main")
+    # Create two relations while rel_size_v2_enabled is still false (v1)
+    endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)")
+    endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)")
+
+    # Switch to v2 by updating the tenant config through the pageserver HTTP API
+    env.pageserver.http_client().update_tenant_config(
+        env.initial_tenant,
+        {
+            "rel_size_v2_enabled": True,
+        },
+    )
+
+    # Check if both relations are still accessible
+    endpoint.safe_psql("SELECT * FROM foo1")
+    endpoint.safe_psql("SELECT * FROM foo2")
+
+    # Restart the endpoint
+    endpoint.stop()
+    endpoint.start()
+
+    # Check if both relations are still accessible again after restart
+    endpoint.safe_psql("SELECT * FROM foo1")
+    endpoint.safe_psql("SELECT * FROM foo2")
+
+    # Create a relation in v2
+    endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)")
+    # Delete a relation that was created in v1
+    endpoint.safe_psql("DROP TABLE foo1")
+
+    # Check that the remaining relations (foo2, foo3) are accessible
+    endpoint.safe_psql("SELECT * FROM foo2")
+    endpoint.safe_psql("SELECT * FROM foo3")
+
+    # Restart the endpoint
+    endpoint.stop()
+    # This will acquire a basebackup, which lists all relations.
+    endpoint.start()
+
+    # Re-dropping foo1 must not fail; foo2 and foo3 must still be accessible
+    endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
+    endpoint.safe_psql("SELECT * FROM foo2")
+    endpoint.safe_psql("SELECT * FROM foo3")
+
+    endpoint.safe_psql("DROP TABLE foo3")
+    endpoint.stop()
+    endpoint.start()
+
+    # Re-dropping foo1 and foo3 must not fail; foo2 must still be accessible
+    endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
+    endpoint.safe_psql("SELECT * FROM foo2")
+    endpoint.safe_psql("DROP TABLE IF EXISTS foo3")
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -481,7 +481,8 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder):
    counts = timeline_detail["directory_entries_counts"]
    assert counts
    log.info(f"directory counts: {counts}")
-    assert counts[2] > COUNT_AT_LEAST_EXPECTED
+    # We need to add up reldir v1 + v2 counts
+    assert counts[2] + counts[7] > COUNT_AT_LEAST_EXPECTED


 def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv):
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -1445,6 +1445,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):

    # roughly fills one segment
    endpoint.safe_psql("insert into t select generate_series(1,250000), 'payload'")
+    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])

    endpoint.stop()  # stop compute

@@ -1473,7 +1474,15 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
        "flush_lsn to get aligned",
    )

-    cmp_sk_wal([sk1, sk2], tenant_id, timeline_id)
+    sk1_digest = sk1.http_client().timeline_digest(
+        tenant_id, timeline_id, sk1.get_timeline_start_lsn(tenant_id, timeline_id), lsn
+    )
+    # Query sk2's own HTTP API (not sk1 again) so the assert really compares two safekeepers.
+    sk2_digest = sk2.http_client().timeline_digest(
+        tenant_id, timeline_id, sk2.get_timeline_start_lsn(tenant_id, timeline_id), lsn
+    )
+
+    assert sk1_digest == sk2_digest

    # stop one of safekeepers which weren't recovering and insert a bit more to check we can commit
    env.safekeepers[2].stop()