Fix lack of GetWALInsertionTimeLine in PG14

Make mypy happy
Make ruff happy
2026-05-17 13:10:38 +00:00 · 2025-05-30 14:26:44 +03:00 · 2025-05-30 14:12:41 +03:00 · 2025-05-30 07:58:29 +03:00 · 2025-05-29 20:34:50 +02:00 · 2025-05-29 18:17:48 +03:00
105 changed files with 3853 additions and 11229 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -314,7 +314,8 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
+          # test_pageserver_max_throughput_getpage_at_latest_lsn.py runs in a separate workflow (periodic_pagebench.yml) because it needs snapshots
+          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
          pg_version: v16
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
--- a/.github/workflows/cloud-regress.yml
+++ b/.github/workflows/cloud-regress.yml
@@ -33,10 +33,9 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        pg-version: [16]
+        pg-version: [16, 17]

-    #runs-on: us-east-2
-    runs-on: small
+    runs-on: us-east-2
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
@@ -60,13 +59,6 @@ jobs:
        run: |
          cd "vendor/postgres-v${PG_VERSION}"
          patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"
-          patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}_ha_495.patch"
-          cd src/test/regress/data
-          #mv onek.data onek.data.tmp
-          #mv tenk.data tenk.data.tmp
-          #awk 'BEGIN {OFS="\t";n=1000} {for(i=0;i<n;i++){ print $1+i*1000, $2+i*1000, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16 }}' <onek.data.tmp > onek.data
-          #awk 'BEGIN {OFS="\t";n=10} {for(i=0;i<n;i++){ print $1+i*10000, $2+i*10000, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16 }}' <tenk.data.tmp > tenk.data
-          

      - name: Generate a random password
        id: pwgen
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
+name: Periodic pagebench performance test on unit-perf hetzner runner

 on:
  schedule:
@@ -8,7 +8,7 @@ on:
    #        │   │ ┌───────────── day of the month (1 - 31)
    #        │   │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #        │   │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron: '0 */3 * * *' # Runs every 3 hours
+    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
@@ -16,6 +16,11 @@ on:
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
+      recreate_snapshots:
+        type: boolean
+        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
+        required: false
+        default: false

 defaults:
  run:
@@ -29,13 +34,13 @@ permissions:
  contents: read

 jobs:
-  trigger_bench_on_ec2_machine_in_eu_central_1:
+  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, small ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
@@ -44,10 +49,13 @@ jobs:
      options: --init
    timeout-minutes: 360  # Set the timeout to 6 hours
    env:
-      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
-      AWS_DEFAULT_REGION : "eu-central-1"
-      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
+      DEFAULT_PG_VERSION: 16
+      BUILD_TYPE: release
+      RUST_BACKTRACE: 1
+      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
+      S3_BUCKET: neon-github-public-dev
+      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
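-    # we don't need the neon source code because we run everything remotely
-    # however we still need the local github actions to run the allure step below
+    # the neon source code is checked out further below, at the commit hash under test;
+    # that checkout also provides the local github actions used by the allure steps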
@@ -56,99 +64,194 @@ jobs:
      with:
        egress-policy: audit

-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
+      id: set-env
+      shell: bash -euxo pipefail {0}
+      run: |
+        {
+          echo "NEON_DIR=${RUNNER_TEMP}/neon"
+          echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
+          echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
+          echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
+          echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
+          echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
+          echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
+          echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
+          echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
+        } >> "$GITHUB_ENV"

-    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
-      run: curl https://ifconfig.me
+        echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

-    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
-      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+    - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
      with:
        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
-        role-duration-seconds: 3600
-
-    - name: Start EC2 instance and wait for the instance to boot up
-      run: |
-        aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
-        sleep 60 # sleep some time to allow cloudinit and our API server to start up
-
-    - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
-      run: |
-        public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
-        echo "Public IP of the EC2 instance: $public_ip"
-        echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
-
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)
    - name: Determine commit hash
+      id: commit_hash
+      shell: bash -euxo pipefail {0}
      env:
        INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
      run: |
-        if [ -z "$INPUT_COMMIT_HASH" ]; then
-          echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+        if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
+          COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
+          echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
+          echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
          echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
        else
-          echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          COMMIT_HASH="${INPUT_COMMIT_HASH}"
+          echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
+          echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
          echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
        fi
+    - name: Checkout the neon repository at given commit hash
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        ref: ${{ steps.commit_hash.outputs.commit_hash }}

-    - name: Start Bench with run_id
+    # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
+    # example artifact
+    # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
+    - name: Determine artifact S3_KEY for given commit hash and download and extract artifact
+      id: artifact_prefix
+      shell: bash -euxo pipefail {0}
+      env:
+        ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
+        COMMIT_HASH: ${{ env.COMMIT_HASH }}
+        COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H 'Content-Type: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
+        attempt=0
+        max_attempts=24 # 5 minutes * 24 = 2 hours

-    - name: Poll Test Status
-      id: poll_step
-      run: |
-        status=""
-        while [[ "$status" != "failure" && "$status" != "success" ]]; do
-          response=$(curl -k -X 'GET' \
-          "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-          -H 'accept: application/json' \
-          -H "Authorization: Bearer $API_KEY")
-          echo "Response: $response"
-          set +x
-          status=$(echo $response | jq -r '.status')
-          echo "Test status: $status"
-          if [[ "$status" == "failure" ]]; then
-            echo "Test failed"
-            exit 1 # Fail the job step if status is failure
-          elif [[ "$status" == "success" || "$status" == "null" ]]; then
+        while [[ $attempt -lt $max_attempts ]]; do
+          # the following command will fail until the artifacts are available ...
+          S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
+            | jq -r '.Contents[]?.Key' \
+            | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
+            | sort --version-sort \
+            | tail -1) || true # ... thus ignore errors from the command
+          if [[ -n "${S3_KEY}" ]]; then
+            echo "Artifact found: $S3_KEY"
+            echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
            break
-          elif [[ "$status" == "too_many_runs" ]]; then
-            echo "Too many runs already running"
-            echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
-            exit 1
          fi
-
-          sleep 60 # Poll every 60 seconds
+          
+          # Increment attempt counter and sleep for 5 minutes
+          attempt=$((attempt + 1))
+          echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
+          sleep 300 # Sleep for 5 minutes
        done

-    - name: Retrieve Test Logs
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        curl -k -X 'GET' \
-        "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-        -H 'accept: application/gzip' \
-        -H "Authorization: Bearer $API_KEY" \
-        --output "test_log_${GITHUB_RUN_ID}.gz"
+        if [[ -z "${S3_KEY}" ]]; then
+          echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours" >&2
+          exit 1 # fail this step right away; otherwise later steps run with nothing extracted into NEON_DIR
+        fi
+        mkdir -p "$(dirname "${ARCHIVE}")" # ARCHIVE is the download target set in this step's env
+        time aws s3 cp --only-show-errors "s3://${S3_BUCKET}/${S3_KEY}" "${ARCHIVE}"
+        mkdir -p "${NEON_DIR}"
+        time tar -xf "${ARCHIVE}" -C "${NEON_DIR}"
+        rm -f "${ARCHIVE}" # the archive is no longer needed once it has been extracted

-    - name: Unzip Test Log and Print it into this job's log
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+    - name: Download snapshots from S3
+      if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
+      id: download_snapshots
+      shell: bash -euxo pipefail {0}
      run: |
-        gzip -d "test_log_${GITHUB_RUN_ID}.gz"
-        cat "test_log_${GITHUB_RUN_ID}"
+        # Download the snapshots from S3
+        mkdir -p ${TEST_OUTPUT}
+        mkdir -p $BACKUP_DIR
+        cd $BACKUP_DIR
+        mkdir parts
+        cd parts
+        PART=$(aws s3api list-objects-v2 --bucket $S3_BUCKET --prefix performance/pagebench/ \
+          | jq -r '.Contents[]?.Key' \
+          | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
+          | sort \
+          | tail -1) || true # with -e/pipefail a no-match grep would abort here; let the check below report it
+        echo "Latest PART: $PART"
+        if [[ -z "$PART" ]]; then
+          echo "ERROR: No matching S3 key found" >&2
+          exit 1
+        fi
+        S3_KEY=$(dirname $PART)
+        time aws s3 cp --only-show-errors --recursive s3://${S3_BUCKET}/$S3_KEY/ .
+        cd $TEST_OUTPUT
+        time cat $BACKUP_DIR/parts/* | zstdcat | tar --extract --preserve-permissions
+        rm -rf ${BACKUP_DIR}
+
+    - name: Cache poetry deps
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -euxo pipefail {0}
+      run: ./scripts/pysync
+
+    # we need high number of open files for pagebench
+    - name: show ulimits
+      shell: bash -euxo pipefail {0}
+      run: |
+        ulimit -a
+
+    - name: Run pagebench testcase
+      shell: bash -euxo pipefail {0}
+      env:
+        CI: false  # need to override this env variable set by github to enforce using snapshots
+      run: |
+        export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
+        # report the commit hash of the neon repository in the revision of the test results
+        export GITHUB_SHA=${COMMIT_HASH}
+        rm -rf ${PERF_REPORT_DIR}
+        rm -rf ${ALLURE_RESULTS_DIR}
+        mkdir -p ${PERF_REPORT_DIR}
+        mkdir -p ${ALLURE_RESULTS_DIR}
+        PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
+        EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
+        # run only two selected tests
+        # environment set by parent:
+        # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
+        ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
+        ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}
+
+    - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results
+      shell: bash -euxo pipefail {0}
+      run: |
+        export REPORT_FROM="$PERF_REPORT_DIR"
+        export GITHUB_SHA=${COMMIT_HASH}
+        time ./scripts/generate_and_push_perf_report.sh
+
+    - name: Upload test results
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-store
+      with:
+        report-dir:  ${{ steps.set-env.outputs.allure_results_dir }}
+        unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
+        aws-oidc-role-arn:  ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Allure report
+      id: create-allure-report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

+    - name: Upload snapshots
+      if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
+      id: upload_snapshots
+      shell: bash -euxo pipefail {0}
+      run: |
+        mkdir -p $BACKUP_DIR
+        cd $TEST_OUTPUT
+        tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
+        cd $BACKUP_DIR
+        mkdir parts
+        split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
+        SNAPSHOT_DATE=$(date +%F)  # YYYY-MM-DD
+        cd parts
+        time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/
+
    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
@@ -157,26 +260,22 @@ jobs:
        slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
+        
    - name: Cleanup Test Resources
      if: always()
+      shell: bash -euxo pipefail {0}
+      env:
+        ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d ''
+        # Cleanup the test resources
+        if [[ -d "${BACKUP_DIR}" ]]; then
+          rm -rf ${BACKUP_DIR}
+        fi
+        if [[ -d "${TEST_OUTPUT}" ]]; then
+          rm -rf ${TEST_OUTPUT}
+        fi
+        if [[ -d "${NEON_DIR}" ]]; then
+          rm -rf ${NEON_DIR}
+        fi
+        rm -rf $(dirname $ARCHIVE)

-    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
-        role-duration-seconds: 3600
-
-    - name: Stop EC2 instance and wait for the instance to be stopped
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1276,7 +1276,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "jsonwebtoken",
 "regex",
 "remote_storage",
@@ -1308,7 +1308,7 @@ dependencies = [
 "flate2",
 "futures",
 "http 1.1.0",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
@@ -2597,7 +2597,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 0.2.9",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "slab",
 "tokio",
 "tokio-util",
@@ -2616,7 +2616,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 1.1.0",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "slab",
 "tokio",
 "tokio-util",
@@ -2863,14 +2863,14 @@ dependencies = [
 "pprof",
 "regex",
 "routerify",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pemfile 2.1.1",
 "serde",
 "serde_json",
 "serde_path_to_error",
 "thiserror 1.0.69",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-util",
 "tracing",
@@ -3200,12 +3200,12 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "2.0.1"
+version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
+checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
 dependencies = [
 "equivalent",
- "hashbrown 0.14.5",
+ "hashbrown 0.15.2",
 "serde",
 ]

@@ -3228,7 +3228,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
 dependencies = [
 "ahash",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "is-terminal",
 "itoa",
 "log",
@@ -3251,7 +3251,7 @@ dependencies = [
 "crossbeam-utils",
 "dashmap 6.1.0",
 "env_logger",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "itoa",
 "log",
 "num-format",
@@ -3898,6 +3898,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num"
 version = "0.4.1"
@@ -4102,7 +4112,7 @@ dependencies = [
 "opentelemetry-http",
 "opentelemetry-proto",
 "opentelemetry_sdk",
- "prost 0.13.3",
+ "prost 0.13.5",
 "reqwest",
 "thiserror 1.0.69",
 ]
@@ -4115,8 +4125,8 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
 dependencies = [
 "opentelemetry",
 "opentelemetry_sdk",
- "prost 0.13.3",
- "tonic",
+ "prost 0.13.5",
+ "tonic 0.12.3",
 ]

 [[package]]
@@ -4182,6 +4192,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "p256"
 version = "0.11.1"
@@ -4286,6 +4302,7 @@ dependencies = [
 "enumset",
 "fail",
 "futures",
+ "hashlink",
 "hex",
 "hex-literal",
 "http-utils",
@@ -4322,7 +4339,7 @@ dependencies = [
 "reqwest",
 "rpds",
 "rstest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "scopeguard",
 "send-future",
 "serde",
@@ -4341,7 +4358,7 @@ dependencies = [
 "tokio-epoll-uring",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -4434,6 +4451,16 @@ dependencies = [
 "workspace_hack",
 ]

+[[package]]
+name = "pageserver_page_api"
+version = "0.1.0"
+dependencies = [
+ "prost 0.13.5",
+ "tonic 0.13.1",
+ "tonic-build",
+ "workspace_hack",
+]
+
 [[package]]
 name = "papaya"
 version = "0.2.1"
@@ -4810,14 +4837,14 @@ dependencies = [
 "bytes",
 "once_cell",
 "pq_proto",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror 1.0.69",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-util",
 "tracing",
 ]
@@ -4924,7 +4951,7 @@ dependencies = [
 "inferno 0.12.0",
 "num",
 "paste",
- "prost 0.13.3",
+ "prost 0.13.5",
 ]

 [[package]]
@@ -5029,12 +5056,12 @@ dependencies = [

 [[package]]
 name = "prost"
-version = "0.13.3"
+version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
+checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
 dependencies = [
 "bytes",
- "prost-derive 0.13.3",
+ "prost-derive 0.13.5",
 ]

 [[package]]
@@ -5072,7 +5099,7 @@ dependencies = [
 "once_cell",
 "petgraph",
 "prettyplease",
- "prost 0.13.3",
+ "prost 0.13.5",
 "prost-types 0.13.3",
 "regex",
 "syn 2.0.100",
@@ -5094,9 +5121,9 @@ dependencies = [

 [[package]]
 name = "prost-derive"
-version = "0.13.3"
+version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
+checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
 dependencies = [
 "anyhow",
 "itertools 0.12.1",
@@ -5120,7 +5147,7 @@ version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
 dependencies = [
- "prost 0.13.3",
+ "prost 0.13.5",
 ]

 [[package]]
@@ -5168,7 +5195,7 @@ dependencies = [
 "hyper 0.14.30",
 "hyper 1.4.1",
 "hyper-util",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "ipnet",
 "itertools 0.10.5",
 "itoa",
@@ -5202,7 +5229,7 @@ dependencies = [
 "rsa",
 "rstest",
 "rustc-hash 1.1.0",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "rustls-pemfile 2.1.1",
 "scopeguard",
@@ -5221,13 +5248,14 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-postgres2",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-tungstenite 0.21.0",
 "tokio-util",
 "tracing",
 "tracing-log",
 "tracing-opentelemetry",
 "tracing-subscriber",
+ "tracing-test",
 "tracing-utils",
 "try-lock",
 "typed-json",
@@ -5444,13 +5472,13 @@ dependencies = [
 "num-bigint",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "ryu",
 "sha1_smol",
 "socket2",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-util",
 "url",
 ]
@@ -5898,15 +5926,15 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.23.18"
+version = "0.23.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f"
+checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321"
 dependencies = [
 "log",
 "once_cell",
 "ring",
 "rustls-pki-types",
- "rustls-webpki 0.102.8",
+ "rustls-webpki 0.103.3",
 "subtle",
 "zeroize",
 ]
@@ -5995,6 +6023,17 @@ dependencies = [
 "untrusted",
 ]

+[[package]]
+name = "rustls-webpki"
+version = "0.103.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
 [[package]]
 name = "rustversion"
 version = "1.0.12"
@@ -6046,7 +6085,7 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "safekeeper_api",
 "safekeeper_client",
 "scopeguard",
@@ -6063,7 +6102,7 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -6235,7 +6274,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -6664,11 +6703,11 @@ dependencies = [
 "metrics",
 "once_cell",
 "parking_lot 0.12.1",
- "prost 0.13.3",
- "rustls 0.23.18",
+ "prost 0.13.5",
+ "rustls 0.23.27",
 "tokio",
- "tokio-rustls 0.26.0",
- "tonic",
+ "tokio-rustls 0.26.2",
+ "tonic 0.13.1",
 "tonic-build",
 "tracing",
 "utils",
@@ -6710,7 +6749,7 @@ dependencies = [
 "regex",
 "reqwest",
 "routerify",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "safekeeper_api",
 "safekeeper_client",
@@ -6725,7 +6764,7 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-util",
 "tracing",
 "utils",
@@ -6763,7 +6802,7 @@ dependencies = [
 "postgres_ffi",
 "remote_storage",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "serde",
 "serde_json",
@@ -7297,10 +7336,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
 dependencies = [
 "ring",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "tokio",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "x509-certificate",
 ]

@@ -7344,12 +7383,11 @@ dependencies = [

 [[package]]
 name = "tokio-rustls"
-version = "0.26.0"
+version = "0.26.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
+checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
 dependencies = [
- "rustls 0.23.18",
- "rustls-pki-types",
+ "rustls 0.23.27",
 "tokio",
 ]

@@ -7447,7 +7485,7 @@ version = "0.22.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
 dependencies = [
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "serde",
 "serde_spanned",
 "toml_datetime",
@@ -7459,6 +7497,27 @@ name = "tonic"
 version = "0.12.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52"
+dependencies = [
+ "async-trait",
+ "base64 0.22.1",
+ "bytes",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "percent-encoding",
+ "pin-project",
+ "prost 0.13.5",
+ "tokio-stream",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -7471,13 +7530,12 @@ dependencies = [
 "hyper-util",
 "percent-encoding",
 "pin-project",
- "prost 0.13.3",
+ "prost 0.13.5",
 "rustls-native-certs 0.8.0",
- "rustls-pemfile 2.1.1",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
- "tower 0.4.13",
+ "tower 0.5.2",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7485,9 +7543,9 @@ dependencies = [

 [[package]]
 name = "tonic-build"
-version = "0.12.3"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
+checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847"
 dependencies = [
 "prettyplease",
 "proc-macro2",
@@ -7505,16 +7563,11 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
 "futures-core",
 "futures-util",
- "indexmap 1.9.3",
 "pin-project",
 "pin-project-lite",
- "rand 0.8.5",
- "slab",
 "tokio",
- "tokio-util",
 "tower-layer",
 "tower-service",
- "tracing",
 ]

 [[package]]
@@ -7525,9 +7578,12 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
 dependencies = [
 "futures-core",
 "futures-util",
+ "indexmap 2.9.0",
 "pin-project-lite",
+ "slab",
 "sync_wrapper 1.0.1",
 "tokio",
+ "tokio-util",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7678,6 +7734,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -7691,6 +7748,27 @@ dependencies = [
 "tracing-serde",
 ]

+[[package]]
+name = "tracing-test"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
+dependencies = [
+ "tracing-core",
+ "tracing-subscriber",
+ "tracing-test-macro",
+]
+
+[[package]]
+name = "tracing-test-macro"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
+dependencies = [
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "tracing-utils"
 version = "0.1.0"
@@ -7833,7 +7911,7 @@ dependencies = [
 "base64 0.22.1",
 "log",
 "once_cell",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pki-types",
 "url",
 "webpki-roots",
@@ -8028,7 +8106,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "pprof",
- "prost 0.13.3",
+ "prost 0.13.5",
 "remote_storage",
 "serde",
 "serde_json",
@@ -8484,8 +8562,7 @@ dependencies = [
 "hyper 0.14.30",
 "hyper 1.4.1",
 "hyper-util",
- "indexmap 1.9.3",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "itertools 0.12.1",
 "lazy_static",
 "libc",
@@ -8507,16 +8584,16 @@ dependencies = [
 "percent-encoding",
 "prettyplease",
 "proc-macro2",
- "prost 0.13.3",
+ "prost 0.13.5",
 "quote",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pki-types",
- "rustls-webpki 0.102.8",
+ "rustls-webpki 0.103.3",
 "scopeguard",
 "sec1 0.7.3",
 "serde",
@@ -8534,15 +8611,15 @@ dependencies = [
 "time",
 "time-macros",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-util",
 "toml_edit",
- "tonic",
- "tower 0.4.13",
+ "tower 0.5.2",
 "tracing",
 "tracing-core",
 "tracing-log",
+ "tracing-subscriber",
 "url",
 "uuid",
 "zeroize",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ members = [
    "pageserver/ctl",
    "pageserver/client",
    "pageserver/pagebench",
+    "pageserver/page_api",
    "proxy",
    "safekeeper",
    "safekeeper/client",
@@ -148,7 +149,7 @@ pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
-prost = "0.13"
+prost = "0.13.5"
 rand = "0.8"
 redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
@@ -198,7 +199,7 @@ tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
-tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
+tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "prost", "tls-ring", "tls-native-roots"] }
 tower = { version = "0.5.2", default-features = false }
 tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }

@@ -252,6 +253,7 @@ pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
+pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
@@ -278,7 +280,7 @@ criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
-tonic-build = "0.12"
+tonic-build = "0.13.1"

 [patch.crates-io]

--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -155,7 +155,7 @@ RUN set -e \

 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.17.0
+ENV SQL_EXPORTER_VERSION=0.17.3
 RUN curl -fsSL \
    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
    --output sql_exporter.tar.gz \
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -582,6 +582,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control

+#########################################################################################
+#
+# Layer "online_advisor-build"
+# compile online_advisor extension
+#
+#########################################################################################
+FROM build-deps AS online_advisor-src
+ARG PG_VERSION
+
+# online_advisor supports all Postgres versions starting from PG14, but prior to PG17 it has to be included in shared_preload_libraries
+# last release 1.0 - May 15, 2025
+WORKDIR /ext-src
+RUN case "${PG_VERSION:?}" in \
+    "v17") \
+        ;; \
+    *) \
+        echo "skipping the version of online_advisor for $PG_VERSION" && exit 0 \
+        ;; \
+    esac && \
+    wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
+    echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
+    mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
+
+FROM pg-build AS online_advisor-build
+COPY --from=online_advisor-src /ext-src/ /ext-src/
+WORKDIR /ext-src/
+RUN if [ -d online_advisor-src ]; then \
+        cd online_advisor-src && \
+        make -j $(getconf _NPROCESSORS_ONLN) install && \
+        echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
+    fi
+
 #########################################################################################
 #
 # Layer "pg_hashids-build"
@@ -1648,6 +1680,7 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1751,17 +1784,17 @@ ARG TARGETARCH
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
        postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
        pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
-        sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
+        sql_exporter_sha256='9a41127a493e8bfebfe692bf78c7ed2872a58a3f961ee534d1b0da9ae584aaab';\
    else\
        postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
        pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
-        sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
+        sql_exporter_sha256='530e6afc77c043497ed965532c4c9dfa873bc2a4f0b3047fad367715c0081d6a';\
    fi\
    && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
     | tar xzf - --strip-components=1 -C.\
    && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
     | tar xzf - --strip-components=1 -C.\
-    && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\
+    && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.3/sql_exporter-0.17.3.linux-${TARGETARCH}.tar.gz\
     | tar xzf - --strip-components=1 -C.\
    && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\
    && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
@@ -1823,6 +1856,7 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
 COPY --from=pg_graphql-src /ext-src/ /ext-src/
 #COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
 COPY --from=hypopg-src /ext-src/ /ext-src/
+COPY --from=online_advisor-src /ext-src/ /ext-src/
 COPY --from=pg_hashids-src /ext-src/ /ext-src/
 COPY --from=rum-src /ext-src/ /ext-src/
 COPY --from=pgtap-src /ext-src/ /ext-src/
--- a/compute/patches/cloud_regress_pg16_ha_495.patch
+++ b/compute/patches/cloud_regress_pg16_ha_495.patch
--- a/compute/patches/cloud_regress_pg17_ha.patch
+++ b/compute/patches/cloud_regress_pg17_ha.patch
--- a/compute/patches/cloud_regress_pg17_ha_plus.patch
+++ b/compute/patches/cloud_regress_pg17_ha_plus.patch
@@ -1,129 +0,0 @@
-diff --git a/src/test/regress/sql/box.sql b/src/test/regress/sql/box.sql
-index 249636c76c3..540c2b54dda 100644
--- a/src/test/regress/sql/box.sql
-+++ b/src/test/regress/sql/box.sql
-@@ -196,7 +196,7 @@ CREATE TABLE quad_box_tbl (id int, b box);
- 
- INSERT INTO quad_box_tbl
-   SELECT (x - 1) * 100 + y, box(point(x * 10, y * 10), point(x * 10 + 5, y * 10 + 5))
-  FROM generate_series(1, 95 * 100) x,
-+  FROM generate_series(1, 100) x,
-        generate_series(1, 95 * 100) y;
- 
- -- insert repeating data to test allTheSame
-diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
-index 3ca8a2d6090..a8e40f906c4 100644
--- a/src/test/regress/sql/partition_join.sql
-+++ b/src/test/regress/sql/partition_join.sql
-@@ -533,7 +533,7 @@ create temp table prtx2_3 partition of prtx2 for values from (21) to (31);
- insert into prtx1 select 1 + i%30, i, i
-   from generate_series(1, 95 * 1000) i;
- insert into prtx2 select 1 + i%30, i, i
-  from generate_series(1, 95 * 500) i, generate_series(1, 95 * 10) j;
-+  from generate_series(1, 500) i, generate_series(1, 95 * 10) j;
- create index on prtx2 (b);
- create index on prtx2 (c);
- analyze prtx1;
-diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql
-index 82ac39d5dc8..bef0a891ade 100644
--- a/src/test/regress/sql/partition_prune.sql
-+++ b/src/test/regress/sql/partition_prune.sql
-@@ -1274,9 +1274,9 @@ select
-   case c when 0 then null else 3 end,
-   case d when 0 then null else 4 end
- from
-  generate_series(0, 95 * 1) a,
-  generate_series(0, 95 * 1) b,
-  generate_series(0, 95 * 1) c,
-+  generate_series(0, 1) a,
-+  generate_series(0, 1) b,
-+  generate_series(0, 1) c,
-   generate_series(0, 95 * 1) d;
- 
- -- Ensure partition pruning works correctly for each combination of IS NULL
-diff --git a/src/test/regress/sql/polygon.sql b/src/test/regress/sql/polygon.sql
-index d39a2b4e8f8..2d862985510 100644
--- a/src/test/regress/sql/polygon.sql
-+++ b/src/test/regress/sql/polygon.sql
-@@ -42,7 +42,7 @@ CREATE TABLE quad_poly_tbl (id int, p polygon);
- 
- INSERT INTO quad_poly_tbl
- 	SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10))
-	FROM generate_series(1, 95 * 100) x,
-+	FROM generate_series(1, 100) x,
- 		 generate_series(1, 95 * 100) y;
- 
- INSERT INTO quad_poly_tbl
-diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
-index b51d6c405c2..4138418c7a6 100644
--- a/src/test/regress/sql/rangetypes.sql
-+++ b/src/test/regress/sql/rangetypes.sql
-@@ -314,13 +314,13 @@ select count(*) from test_range_gist where ir -|- int4multirange(int4range(100,2
- create table test_range_spgist(ir int4range);
- create index test_range_spgist_idx on test_range_spgist using spgist (ir);
- 
-insert into test_range_spgist select int4range(g, g+10) from generate_series(1, 95 * 2000) g;
-insert into test_range_spgist select 'empty'::int4range from generate_series(1, 95 * 500) g;
-insert into test_range_spgist select int4range(g, g+10000) from generate_series(1, 95 * 1000) g;
-insert into test_range_spgist select 'empty'::int4range from generate_series(1, 95 * 500) g;
-insert into test_range_spgist select int4range(NULL,g*10,'(]') from generate_series(1, 95 * 100) g;
-insert into test_range_spgist select int4range(g*10,NULL,'(]') from generate_series(1, 95 * 100) g;
-insert into test_range_spgist select int4range(g, g+10) from generate_series(1, 95 * 2000) g;
-+insert into test_range_spgist select int4range(g, g+10) from generate_series(1, 0.1 * 95 * 2000) g;
-+insert into test_range_spgist select 'empty'::int4range from generate_series(1, 0.1 * 95 * 500) g;
-+insert into test_range_spgist select int4range(g, g+10000) from generate_series(1, 0.1 * 95 * 1000) g;
-+insert into test_range_spgist select 'empty'::int4range from generate_series(1, 0.1 * 95 * 500) g;
-+insert into test_range_spgist select int4range(NULL,g*10,'(]') from generate_series(1, 0.1 * 95 * 100) g;
-+insert into test_range_spgist select int4range(g*10,NULL,'(]') from generate_series(1, 0.1 * 95 * 100) g;
-+insert into test_range_spgist select int4range(g, g+10) from generate_series(1, 0.1 * 95 * 2000) g;
- 
- -- first, verify non-indexed results
- SET enable_seqscan    = t;
-diff --git a/src/test/regress/sql/spgist.sql b/src/test/regress/sql/spgist.sql
-index 0c4f24e1d49..61e53375539 100644
--- a/src/test/regress/sql/spgist.sql
-+++ b/src/test/regress/sql/spgist.sql
-@@ -16,9 +16,9 @@ vacuum spgist_point_tbl;
- 
- -- Insert more data, to make the index a few levels deep.
- insert into spgist_point_tbl (id, p)
-select g,      point(g*10, g*10) from generate_series(1, 95 * 10000) g;
-+select g,      point(g*10, g*10) from generate_series(1, 0.1 * 95 * 10000) g;
- insert into spgist_point_tbl (id, p)
-select g+100000, point(g*10+1, g*10+1) from generate_series(1, 95 * 10000) g;
-+select g+100000, point(g*10+1, g*10+1) from generate_series(1, 0.1 * 95 * 10000) g;
- 
- -- To test vacuum, delete some entries from all over the index.
- delete from spgist_point_tbl where id % 2 = 1;
-@@ -37,8 +37,8 @@ vacuum spgist_point_tbl;
- create table spgist_box_tbl(id serial, b box);
- insert into spgist_box_tbl(b)
- select box(point(i,j),point(i+s,j+s))
-  from generate_series(1, 95 * 100,5) i,
-       generate_series(1, 95 * 100,5) j,
-+  from generate_series(1,100,5) i,
-+       generate_series(1,100,5) j,
-        generate_series(1, 95 * 10) s;
- create index spgist_box_idx on spgist_box_tbl using spgist (b);
- 
-@@ -86,6 +86,6 @@ create unlogged table spgist_unlogged_tbl(id serial, b box);
- create index spgist_unlogged_idx on spgist_unlogged_tbl using spgist (b);
- insert into spgist_unlogged_tbl(b)
- select box(point(i,j))
-  from generate_series(1, 95 * 100,5) i,
-+  from generate_series(1,100,5) i,
-        generate_series(1, 95 * 10,5) j;
- -- leave this table around, to help in testing dump/restore
-diff --git a/src/test/regress/sql/tuplesort.sql b/src/test/regress/sql/tuplesort.sql
-index fa762f26ac7..7a1fd619eba 100644
--- a/src/test/regress/sql/tuplesort.sql
-+++ b/src/test/regress/sql/tuplesort.sql
-@@ -276,7 +276,7 @@ ROLLBACK;
- CREATE TEMP TABLE test_mark_restore(col1 int, col2 int, col12 int);
- -- need a few duplicates for mark/restore to matter
- INSERT INTO test_mark_restore(col1, col2, col12)
-   SELECT a.i, b.i, a.i * b.i FROM generate_series(1, 95 * 500) a(i), generate_series(1, 95 * 5) b(i);
-+   SELECT a.i, b.i, a.i * b.i FROM generate_series(1, 500) a(i), generate_series(1, 95 * 5) b(i);
- 
- BEGIN;
- 
--- a/compute/patches/cloud_regress_pg17_ha_plus2.patch
+++ b/compute/patches/cloud_regress_pg17_ha_plus2.patch
@@ -1,593 +0,0 @@
-diff --git a/src/test/regress/sql/box.sql b/src/test/regress/sql/box.sql
-index 249636c76c3..540c2b54dda 100644
--- a/src/test/regress/sql/box.sql
-+++ b/src/test/regress/sql/box.sql
-@@ -196,7 +196,7 @@ CREATE TABLE quad_box_tbl (id int, b box);
- 
- INSERT INTO quad_box_tbl
-   SELECT (x - 1) * 100 + y, box(point(x * 10, y * 10), point(x * 10 + 5, y * 10 + 5))
-  FROM generate_series(1, 95 * 100) x,
-+  FROM generate_series(1, 100) x,
-        generate_series(1, 95 * 100) y;
- 
- -- insert repeating data to test allTheSame
-diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql
-index 39d3cd7821a..86efbb72609 100644
--- a/src/test/regress/sql/brin.sql
-+++ b/src/test/regress/sql/brin.sql
-@@ -476,7 +476,7 @@ CREATE TABLE brintest_3 (a text, b text, c text, d text);
- 
- -- long random strings (~2000 chars each, so ~6kB for min/max on two
- -- columns) to trigger toasting
-WITH rand_value AS (SELECT string_agg(fipshash(i::text),'') AS val FROM generate_series(1, 95 * 60) s(i))
-+WITH rand_value AS (SELECT string_agg(fipshash(i::text),'') AS val FROM generate_series(1,60) s(i))
- INSERT INTO brintest_3
- SELECT val, val, val, val FROM rand_value;
- 
-@@ -495,7 +495,7 @@ VACUUM brintest_3;
- -- retry insert with a different random-looking (but deterministic) value
- -- the value is different, and so should replace either min or max in the
- -- brin summary
-WITH rand_value AS (SELECT string_agg(fipshash((-i)::text),'') AS val FROM generate_series(1, 95 * 60) s(i))
-+WITH rand_value AS (SELECT string_agg(fipshash((-i)::text),'') AS val FROM generate_series(1,60) s(i))
- INSERT INTO brintest_3
- SELECT val, val, val, val FROM rand_value;
- 
-diff --git a/src/test/regress/sql/brin_multi.sql b/src/test/regress/sql/brin_multi.sql
-index b7f7a9e8803..b1a109fe07f 100644
--- a/src/test/regress/sql/brin_multi.sql
-+++ b/src/test/regress/sql/brin_multi.sql
-@@ -612,7 +612,7 @@ CREATE TABLE brin_date_test(a DATE);
- INSERT INTO brin_date_test SELECT '4713-01-01 BC'::date + i FROM generate_series(1, 95 * 30) s(i);
- 
- -- insert values close to date minimum
-INSERT INTO brin_date_test SELECT '5874897-12-01'::date + i FROM generate_series(1, 95 * 30) s(i);
-+INSERT INTO brin_date_test SELECT '5874897-12-01'::date + i FROM generate_series(1, 30) s(i);
- 
- CREATE INDEX ON brin_date_test USING brin (a date_minmax_multi_ops) WITH (pages_per_range=1);
- 
-diff --git a/src/test/regress/sql/btree_index.sql b/src/test/regress/sql/btree_index.sql
-index d0d86db1667..88a752264a0 100644
--- a/src/test/regress/sql/btree_index.sql
-+++ b/src/test/regress/sql/btree_index.sql
-@@ -267,7 +267,7 @@ VACUUM delete_test_table;
- --
- -- The vacuum above should've turned the leaf page into a fast root. We just
- -- need to insert some rows to cause the fast root page to split.
-INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1, 95 * 1000) i;
-+INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,1000) i;
- 
- -- Test unsupported btree opclass parameters
- create index on btree_tall_tbl (id int4_ops(foo=1));
-diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql
-index 13006372064..1fd4cbfa7ef 100644
--- a/src/test/regress/sql/create_table.sql
-+++ b/src/test/regress/sql/create_table.sql
-@@ -47,7 +47,7 @@ DEALLOCATE select1;
- -- (temporarily hide query, to avoid the long CREATE TABLE stmt)
- \set ECHO none
- SELECT 'CREATE TABLE extra_wide_table(firstc text, '|| array_to_string(array_agg('c'||i||' bool'),',')||', lastc text);'
-FROM generate_series(1, 95 * 1100) g(i)
-+FROM generate_series(1, 1100) g(i)
- \gexec
- \set ECHO all
- INSERT INTO extra_wide_table(firstc, lastc) VALUES('first col', 'last col');
-@@ -74,7 +74,7 @@ CREATE TABLE default_expr_agg (a int DEFAULT (avg(1)));
- -- invalid use of subquery
- CREATE TABLE default_expr_agg (a int DEFAULT (select 1));
- -- invalid use of set-returning function
-CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1, 95 * 3)));
-+CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3)));
- 
- -- Verify that subtransaction rollback restores rd_createSubid.
- BEGIN;
-@@ -359,7 +359,7 @@ CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted
- CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted
-   FOR VALUES FROM ((select 1)) TO ('2019-01-01');
- CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted
-  FOR VALUES FROM (generate_series(1, 95 * 3)) TO ('2019-01-01');
-+  FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01');
- 
- -- trying to specify list for range partitioned table
- CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a');
-diff --git a/src/test/regress/sql/fast_default.sql b/src/test/regress/sql/fast_default.sql
-index 28fefad6fe6..7d7060820e4 100644
--- a/src/test/regress/sql/fast_default.sql
-+++ b/src/test/regress/sql/fast_default.sql
-@@ -318,7 +318,7 @@ CREATE TABLE T (pk INT NOT NULL PRIMARY KEY);
- 
- SELECT set('t');
- 
-INSERT INTO T SELECT * FROM generate_series(1, 95 * 10) a;
-+INSERT INTO T SELECT * FROM generate_series(1, 10) a;
- 
- ALTER TABLE T ADD COLUMN c_bigint BIGINT NOT NULL DEFAULT -1;
- 
-@@ -326,7 +326,7 @@ INSERT INTO T SELECT b, b - 10 FROM generate_series(11, 20) a(b);
- 
- ALTER TABLE T ADD COLUMN c_text TEXT DEFAULT 'hello';
- 
-INSERT INTO T SELECT b, b - 10, (b + 10)::text FROM generate_series(21, 30) a(b);
-+INSERT INTO T SELECT b, b - 10, (b + 10)::text FROM generate_series(21, 95 * 30) a(b);
- 
- -- WHERE clause
- SELECT c_bigint, c_text FROM T WHERE c_bigint = -1 LIMIT 1;
-diff --git a/src/test/regress/sql/hash_index.sql b/src/test/regress/sql/hash_index.sql
-index fcd5f91a39f..6ac90c57730 100644
--- a/src/test/regress/sql/hash_index.sql
-+++ b/src/test/regress/sql/hash_index.sql
-@@ -220,7 +220,7 @@ SELECT h.seqno AS f20000
- CREATE TABLE hash_split_heap (keycol INT);
- INSERT INTO hash_split_heap SELECT 1 FROM generate_series(1, 95 * 500) a;
- CREATE INDEX hash_split_index on hash_split_heap USING HASH (keycol);
-INSERT INTO hash_split_heap SELECT 1 FROM generate_series(1, 95 * 5000) a;
-+INSERT INTO hash_split_heap SELECT 1 FROM generate_series(1, POW(95, 0.5) * 5000) a;
- 
- -- Let's do a backward scan.
- BEGIN;
-@@ -236,7 +236,7 @@ END;
- 
- -- DELETE, INSERT, VACUUM.
- DELETE FROM hash_split_heap WHERE keycol = 1;
-INSERT INTO hash_split_heap SELECT a/2 FROM generate_series(1, 95 * 25000) a;
-+INSERT INTO hash_split_heap SELECT a/2 FROM generate_series(1, POW(95, 0.5) * 25000) a;
- 
- VACUUM hash_split_heap;
- 
-diff --git a/src/test/regress/sql/horology.sql b/src/test/regress/sql/horology.sql
-index 3920a9528ae..d6ce372d799 100644
--- a/src/test/regress/sql/horology.sql
-+++ b/src/test/regress/sql/horology.sql
-@@ -551,14 +551,14 @@ SELECT to_timestamp('2011-12-18 11:38 +01:xyz', 'YYYY-MM-DD HH12:MI OF');  -- er
- SELECT to_timestamp('2018-11-02 12:34:56.025', 'YYYY-MM-DD HH24:MI:SS.MS');
- 
- SELECT i, to_timestamp('2018-11-02 12:34:56', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-SELECT i, to_timestamp('2018-11-02 12:34:56.1', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-SELECT i, to_timestamp('2018-11-02 12:34:56.12', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-SELECT i, to_timestamp('2018-11-02 12:34:56.123', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-SELECT i, to_timestamp('2018-11-02 12:34:56.1234', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-SELECT i, to_timestamp('2018-11-02 12:34:56.12345', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-+SELECT i, to_timestamp('2018-11-02 12:34:56.1', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
-+SELECT i, to_timestamp('2018-11-02 12:34:56.12', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
-+SELECT i, to_timestamp('2018-11-02 12:34:56.123', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
-+SELECT i, to_timestamp('2018-11-02 12:34:56.1234', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
-+SELECT i, to_timestamp('2018-11-02 12:34:56.12345', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 6) i;
- SELECT i, to_timestamp('2018-11-02 12:34:56.123456', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
- SELECT i, to_timestamp('2018-11-02 12:34:56.123456789', 'YYYY-MM-DD HH24:MI:SS.FF' || i) FROM generate_series(1, 95 * 6) i;
-SELECT i, to_timestamp('20181102123456123456', 'YYYYMMDDHH24MISSFF' || i) FROM generate_series(1, 95 * 6) i;
-+SELECT i, to_timestamp('20181102123456123456', 'YYYYMMDDHH24MISSFF' || i) FROM generate_series(1, 6) i;
- 
- SELECT to_date('1 4 1902', 'Q MM YYYY');  -- Q is ignored
- SELECT to_date('3 4 21 01', 'W MM CC YY');
-diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
-index 96c19fa5297..276f6d25c67 100644
--- a/src/test/regress/sql/inherit.sql
-+++ b/src/test/regress/sql/inherit.sql
-@@ -742,7 +742,7 @@ create table inhcld1(f2 name, f1 int primary key);
- create table inhcld2(f1 int primary key, f2 name);
- alter table inhpar attach partition inhcld1 for values from (1) to (5);
- alter table inhpar attach partition inhcld2 for values from (5) to (100);
-insert into inhpar select x, x::text from generate_series(1, 95 * 10) x;
-+insert into inhpar select x, x::text from generate_series(1,10) x;
- 
- explain (verbose, costs off)
- update inhpar i set (f1, f2) = (select i.f1, i.f2 || '-' from int4_tbl limit 1);
-diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
-index c9fdd126d15..bbbda3d6237 100644
--- a/src/test/regress/sql/insert.sql
-+++ b/src/test/regress/sql/insert.sql
-@@ -320,8 +320,8 @@ create table part_ee_ff3_2 partition of part_ee_ff3 for values from (25) to (30)
- 
- truncate list_parted;
- insert into list_parted values ('aa'), ('cc');
-insert into list_parted select 'Ff', s.a from generate_series(1, 95 * 29) s(a);
-insert into list_parted select 'gg', s.a from generate_series(1, 95 * 9) s(a);
-+insert into list_parted select 'Ff', s.a from generate_series(1, 29) s(a);
-+insert into list_parted select 'gg', s.a from generate_series(1, 9) s(a);
- insert into list_parted (b) values (1);
- select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_parted group by 1, 2 order by 1;
- 
-diff --git a/src/test/regress/sql/join_hash.sql b/src/test/regress/sql/join_hash.sql
-index 47abc031c0f..34c4d8c1312 100644
--- a/src/test/regress/sql/join_hash.sql
-+++ b/src/test/regress/sql/join_hash.sql
-@@ -310,9 +310,9 @@ rollback to settings;
- -- Exercise rescans.  We'll turn off parallel_leader_participation so
- -- that we can check that instrumentation comes back correctly.
- 
-create table join_foo as select generate_series(1, 95 * 3) as id, 'xxxxx'::text as t;
-+create table join_foo as select generate_series(1, POW(95, 0.5) * 3) as id, 'xxxxx'::text as t;
- alter table join_foo set (parallel_workers = 0);
-create table join_bar as select generate_series(1, 95 * 10000) as id, 'xxxxx'::text as t;
-+create table join_bar as select generate_series(1, POW(95, 0.5) * 10000) as id, 'xxxxx'::text as t;
- alter table join_bar set (parallel_workers = 2);
- 
- -- multi-batch with rescan, parallel-oblivious
-diff --git a/src/test/regress/sql/merge.sql b/src/test/regress/sql/merge.sql
-index b60271d9400..7d89c85179f 100644
--- a/src/test/regress/sql/merge.sql
-+++ b/src/test/regress/sql/merge.sql
-@@ -1457,7 +1457,7 @@ CREATE TABLE pa_source (sid integer, delta float)
- -- insert many rows to the source table
- INSERT INTO pa_source SELECT id, id * 10  FROM generate_series(1, 95 * 14) AS id;
- -- insert a few rows in the target table (odd numbered tid)
-INSERT INTO pa_target SELECT '2017-01-31', id, id * 100, 'initial' FROM generate_series(1, 95 * 9,3) AS id;
-+INSERT INTO pa_target SELECT '2017-01-31', id, id * 100, 'initial' FROM generate_series(1,9,3) AS id;
- INSERT INTO pa_target SELECT '2017-02-28', id, id * 100, 'initial' FROM generate_series(2,9,3) AS id;
- 
- -- try simple MERGE
-diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
-index 53a9b26d4c4..0c48dd2be78 100644
--- a/src/test/regress/sql/partition_join.sql
-+++ b/src/test/regress/sql/partition_join.sql
-@@ -13,7 +13,7 @@ CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a);
- CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250);
- CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600);
- CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500);
-INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 95 * 599) i WHERE i % 2 = 0;
-+INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0,599) i WHERE i % 2 = 0;
- CREATE INDEX iprt1_p1_a on prt1_p1(a);
- CREATE INDEX iprt1_p2_a on prt1_p2(a);
- CREATE INDEX iprt1_p3_a on prt1_p3(a);
-@@ -23,7 +23,7 @@ CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b);
- CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250);
- CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500);
- CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600);
-INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 95 * 599) i WHERE i % 3 = 0;
-+INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0,599) i WHERE i % 3 = 0;
- CREATE INDEX iprt2_p1_b on prt2_p1(b);
- CREATE INDEX iprt2_p2_b on prt2_p2(b);
- CREATE INDEX iprt2_p3_b on prt2_p3(b);
-@@ -149,7 +149,7 @@ CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2));
- CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250);
- CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500);
- CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600);
-INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i;
- CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2));
- CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2));
- CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2));
-@@ -159,7 +159,7 @@ CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2));
- CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250);
- CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500);
- CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600);
-INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 95 * 599, 3) i;
-+INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i;
- ANALYZE prt2_e;
- 
- EXPLAIN (COSTS OFF)
-@@ -248,14 +248,14 @@ CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2));
- CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250);
- CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500);
- CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600);
-INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i;
- ANALYZE prt1_m;
- 
- CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b);
- CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250);
- CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500);
- CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600);
-INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 95 * 599, 3) i;
-+INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i;
- ANALYZE prt2_m;
- 
- EXPLAIN (COSTS OFF)
-@@ -269,14 +269,14 @@ CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c);
- CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010');
- CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009');
- CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011');
-INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i;
- ANALYZE plt1;
- 
- CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c);
- CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010');
- CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009');
- CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011');
-INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 95 * 599, 3) i;
-+INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i;
- ANALYZE plt2;
- 
- --
-@@ -286,7 +286,7 @@ CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A'));
- CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010');
- CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009');
- CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011');
-INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i;
- ANALYZE plt1_e;
- 
- -- test partition matching with N-way join
-@@ -371,7 +371,7 @@ CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003');
- CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b);
- CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13);
- CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25);
-INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i;
- ANALYZE prt1_l;
- 
- CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b);
-@@ -382,7 +382,7 @@ CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003');
- CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a);
- CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13);
- CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25);
-INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 95 * 599, 3) i;
-+INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i;
- ANALYZE prt2_l;
- 
- -- inner join, qual covering only top-level partitions
-@@ -453,27 +453,27 @@ WHERE EXISTS (
- CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c);
- CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250');
- CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500');
-INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 95 * 499, 2) i;
-+INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i;
- ANALYZE prt1_n;
- 
- CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c);
- CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007');
- CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011');
-INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i;
- ANALYZE prt2_n;
- 
- CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c);
- CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007');
- CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010');
- CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011');
-INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i;
- ANALYZE prt3_n;
- 
- CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a);
- CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300);
- CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500);
- CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600);
-INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 95 * 599, 2) i;
-+INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i;
- ANALYZE prt4_n;
- 
- -- partitionwise join can not be applied if the partition ranges differ
-@@ -533,7 +533,7 @@ create temp table prtx2_3 partition of prtx2 for values from (21) to (31);
- insert into prtx1 select 1 + i%30, i, i
-   from generate_series(1, 95 * 1000) i;
- insert into prtx2 select 1 + i%30, i, i
-  from generate_series(1, 95 * 500) i, generate_series(1, 95 * 10) j;
-+  from generate_series(1, 500) i, generate_series(1, 95 * 10) j;
- create index on prtx2 (b);
- create index on prtx2 (c);
- analyze prtx1;
-@@ -1202,7 +1202,7 @@ CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
- CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
- 
- -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 95 * 1999));
-+INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
- ANALYZE fract_t;
- 
- -- verify plan; nested index only scans
-diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql
-index 82ac39d5dc8..6a0c7a3666d 100644
--- a/src/test/regress/sql/partition_prune.sql
-+++ b/src/test/regress/sql/partition_prune.sql
-@@ -512,7 +512,7 @@ create table list_part2 partition of list_part for values in (2);
- create table list_part3 partition of list_part for values in (3);
- create table list_part4 partition of list_part for values in (4);
- 
-insert into list_part select generate_series(1, 95 * 4);
-+insert into list_part select generate_series(1, 4);
- 
- begin;
- 
-@@ -940,7 +940,7 @@ create table ma_test (a int, b int) partition by range (a);
- create table ma_test_p1 partition of ma_test for values from (0) to (10);
- create table ma_test_p2 partition of ma_test for values from (10) to (20);
- create table ma_test_p3 partition of ma_test for values from (20) to (30);
-insert into ma_test select x,x from generate_series(0, 95 * 29) t(x);
-+insert into ma_test select x,x from generate_series(0,29) t(x);
- create index on ma_test (b);
- 
- analyze ma_test;
-@@ -1263,7 +1263,7 @@ create table hp_prefix_test (a int, b int, c int, d int)
- 
- -- create 8 partitions
- select 'create table hp_prefix_test_p' || x::text || ' partition of hp_prefix_test for values with (modulus 8, remainder ' || x::text || ');'
-from generate_series(0, 95 * 7) x;
-+from generate_series(0, 7) x;
- \gexec
- 
- -- insert 16 rows, one row for each test to perform.
-@@ -1274,9 +1274,9 @@ select
-   case c when 0 then null else 3 end,
-   case d when 0 then null else 4 end
- from
-  generate_series(0, 95 * 1) a,
-  generate_series(0, 95 * 1) b,
-  generate_series(0, 95 * 1) c,
-+  generate_series(0, 1) a,
-+  generate_series(0, 1) b,
-+  generate_series(0, 1) c,
-   generate_series(0, 95 * 1) d;
- 
- -- Ensure partition pruning works correctly for each combination of IS NULL
-diff --git a/src/test/regress/sql/plpgsql.sql b/src/test/regress/sql/plpgsql.sql
-index d18cc331561..435d3d718e1 100644
--- a/src/test/regress/sql/plpgsql.sql
-+++ b/src/test/regress/sql/plpgsql.sql
-@@ -4581,12 +4581,12 @@ CREATE TRIGGER transition_table_level2_ri_child_upd_trigger
- 
- -- create initial test data
- INSERT INTO transition_table_level1 (level1_no)
-  SELECT generate_series(1, 95 * 200);
-+  SELECT generate_series(1,200);
- ANALYZE transition_table_level1;
- 
- INSERT INTO transition_table_level2 (level2_no, parent_no)
-   SELECT level2_no, level2_no / 50 + 1 AS parent_no
-    FROM generate_series(1, 95 * 9999) level2_no;
-+    FROM generate_series(1,9999) level2_no;
- ANALYZE transition_table_level2;
- 
- INSERT INTO transition_table_status (level, node_no, status)
-diff --git a/src/test/regress/sql/polygon.sql b/src/test/regress/sql/polygon.sql
-index d39a2b4e8f8..2d862985510 100644
--- a/src/test/regress/sql/polygon.sql
-+++ b/src/test/regress/sql/polygon.sql
-@@ -42,7 +42,7 @@ CREATE TABLE quad_poly_tbl (id int, p polygon);
- 
- INSERT INTO quad_poly_tbl
- 	SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10))
-	FROM generate_series(1, 95 * 100) x,
-+	FROM generate_series(1, 100) x,
- 		 generate_series(1, 95 * 100) y;
- 
- INSERT INTO quad_poly_tbl
-diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql
-index 12c40039b18..e08b0aee00e 100644
--- a/src/test/regress/sql/psql.sql
-+++ b/src/test/regress/sql/psql.sql
-@@ -187,7 +187,7 @@ select 'drop table gexec_test', 'select ''2000-01-01''::date as party_over'
- prepare q as select array_to_string(array_agg(repeat('x',2*n)),E'\n') as "ab
- 
- c", array_to_string(array_agg(repeat('y',20-2*n)),E'\n') as "a
-bc" from generate_series(1, 95 * 10) as n(n) group by n>1 order by n>1;
-+bc" from generate_series(1,10) as n(n) group by n>1 order by n>1;
- 
- \pset linestyle ascii
- 
-@@ -304,7 +304,7 @@ execute q;
- deallocate q;
- 
- -- test single-line header and data
-prepare q as select repeat('x',2*n) as "0123456789abcdef", repeat('y',20-2*n) as "0123456789" from generate_series(1, 95 * 10) as n;
-+prepare q as select repeat('x',2*n) as "0123456789abcdef", repeat('y',20-2*n) as "0123456789" from generate_series(1,10) as n;
- 
- \pset linestyle ascii
- 
-@@ -1220,7 +1220,7 @@ create table child_10_20 partition of parent_tab
-   for values from (10) to (20);
- create table child_20_30 partition of parent_tab
-   for values from (20) to (30);
-insert into parent_tab values (generate_series(0, 95 * 29));
-+insert into parent_tab values (generate_series(0,29));
- create table child_30_40 partition of parent_tab
- for values from (30) to (40)
-   partition by range(id);
-diff --git a/src/test/regress/sql/rangetypes.sql b/src/test/regress/sql/rangetypes.sql
-index b51d6c405c2..a2d50d7bb43 100644
--- a/src/test/regress/sql/rangetypes.sql
-+++ b/src/test/regress/sql/rangetypes.sql
-@@ -314,13 +314,13 @@ select count(*) from test_range_gist where ir -|- int4multirange(int4range(100,2
- create table test_range_spgist(ir int4range);
- create index test_range_spgist_idx on test_range_spgist using spgist (ir);
- 
-insert into test_range_spgist select int4range(g, g+10) from generate_series(1, 95 * 2000) g;
-insert into test_range_spgist select 'empty'::int4range from generate_series(1, 95 * 500) g;
-insert into test_range_spgist select int4range(g, g+10000) from generate_series(1, 95 * 1000) g;
-insert into test_range_spgist select 'empty'::int4range from generate_series(1, 95 * 500) g;
-insert into test_range_spgist select int4range(NULL,g*10,'(]') from generate_series(1, 95 * 100) g;
-insert into test_range_spgist select int4range(g*10,NULL,'(]') from generate_series(1, 95 * 100) g;
-insert into test_range_spgist select int4range(g, g+10) from generate_series(1, 95 * 2000) g;
-+insert into test_range_spgist select int4range(g, g+10) from generate_series(1, POW(95, 0.5)::int * 2000) g;
-+insert into test_range_spgist select 'empty'::int4range from generate_series(1, POW(95, 0.5)::int * 500) g;
-+insert into test_range_spgist select int4range(g, g+10000) from generate_series(1, POW(95, 0.5)::int * 1000) g;
-+insert into test_range_spgist select 'empty'::int4range from generate_series(1, POW(95, 0.5)::int * 500) g;
-+insert into test_range_spgist select int4range(NULL,g*10,'(]') from generate_series(1, POW(95, 0.5)::int * 100) g;
-+insert into test_range_spgist select int4range(g*10,NULL,'(]') from generate_series(1, POW(95, 0.5)::int * 100) g;
-+insert into test_range_spgist select int4range(g, g+10) from generate_series(1, POW(95, 0.5)::int * 2000) g;
- 
- -- first, verify non-indexed results
- SET enable_seqscan    = t;
-diff --git a/src/test/regress/sql/spgist.sql b/src/test/regress/sql/spgist.sql
-index 0c4f24e1d49..ed9f7c45411 100644
--- a/src/test/regress/sql/spgist.sql
-+++ b/src/test/regress/sql/spgist.sql
-@@ -16,9 +16,9 @@ vacuum spgist_point_tbl;
- 
- -- Insert more data, to make the index a few levels deep.
- insert into spgist_point_tbl (id, p)
-select g,      point(g*10, g*10) from generate_series(1, 95 * 10000) g;
-+select g,      point(g*10, g*10) from generate_series(1, POW(95, 0.5) * 10000) g;
- insert into spgist_point_tbl (id, p)
-select g+100000, point(g*10+1, g*10+1) from generate_series(1, 95 * 10000) g;
-+select g+100000, point(g*10+1, g*10+1) from generate_series(1, POW(95, 0.5) * 10000) g;
- 
- -- To test vacuum, delete some entries from all over the index.
- delete from spgist_point_tbl where id % 2 = 1;
-@@ -37,8 +37,8 @@ vacuum spgist_point_tbl;
- create table spgist_box_tbl(id serial, b box);
- insert into spgist_box_tbl(b)
- select box(point(i,j),point(i+s,j+s))
-  from generate_series(1, 95 * 100,5) i,
-       generate_series(1, 95 * 100,5) j,
-+  from generate_series(1,100,5) i,
-+       generate_series(1,100,5) j,
-        generate_series(1, 95 * 10) s;
- create index spgist_box_idx on spgist_box_tbl using spgist (b);
- 
-@@ -86,6 +86,6 @@ create unlogged table spgist_unlogged_tbl(id serial, b box);
- create index spgist_unlogged_idx on spgist_unlogged_tbl using spgist (b);
- insert into spgist_unlogged_tbl(b)
- select box(point(i,j))
-  from generate_series(1, 95 * 100,5) i,
-+  from generate_series(1,100,5) i,
-        generate_series(1, 95 * 10,5) j;
- -- leave this table around, to help in testing dump/restore
-diff --git a/src/test/regress/sql/tuplesort.sql b/src/test/regress/sql/tuplesort.sql
-index 133491a0d70..0642902ad53 100644
--- a/src/test/regress/sql/tuplesort.sql
-+++ b/src/test/regress/sql/tuplesort.sql
-@@ -19,7 +19,7 @@ INSERT INTO abbrev_abort_uuids (abort_increasing, abort_decreasing, noabort_incr
-         ('00000000-0000-0000-0000-'||to_char(20000 - g.i, '000000000000FM'))::uuid abort_decreasing,
-         (to_char(g.i % 10009, '00000000FM')||'-0000-0000-0000-'||to_char(g.i, '000000000000FM'))::uuid noabort_increasing,
-         (to_char(((20000 - g.i) % 10009), '00000000FM')||'-0000-0000-0000-'||to_char(20000 - g.i, '000000000000FM'))::uuid noabort_decreasing
-    FROM generate_series(0, 95 * 20000, 1) g(i);
-+    FROM generate_series(0, 20000, 1) g(i);
- 
- -- and a few NULLs
- INSERT INTO abbrev_abort_uuids(id) VALUES(0);
-@@ -276,7 +276,7 @@ ROLLBACK;
- CREATE TEMP TABLE test_mark_restore(col1 int, col2 int, col12 int);
- -- need a few duplicates for mark/restore to matter
- INSERT INTO test_mark_restore(col1, col2, col12)
-   SELECT a.i, b.i, a.i * b.i FROM generate_series(1, 95 * 500) a(i), generate_series(1, 95 * 5) b(i);
-+   SELECT a.i, b.i, a.i * b.i FROM generate_series(1, 500) a(i), generate_series(1, 95 * 5) b(i);
- 
- BEGIN;
- 
-diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql
-index e4ad5c274fe..e1894d2d9cc 100644
--- a/src/test/regress/sql/updatable_views.sql
-+++ b/src/test/regress/sql/updatable_views.sql
-@@ -494,7 +494,7 @@ MERGE INTO rw_view2 t
- SELECT * FROM base_tbl ORDER BY a;
- 
- MERGE INTO rw_view2 t
-  USING (SELECT x, 'r'||x FROM generate_series(0, 95 * 2) x) AS s(a,b) ON t.a = s.a
-+  USING (SELECT x, 'r'||x FROM generate_series(0,2) x) AS s(a,b) ON t.a = s.a
-   WHEN MATCHED THEN UPDATE SET b = s.b
-   WHEN NOT MATCHED AND s.a > 0 THEN INSERT VALUES (s.a, s.b)
-   WHEN NOT MATCHED BY SOURCE THEN UPDATE SET b = 'Not matched by source'
-@@ -519,7 +519,7 @@ MERGE INTO rw_view2 t
-   WHEN MATCHED THEN UPDATE SET b = s.b
-   WHEN NOT MATCHED AND s.a > 0 THEN INSERT VALUES (s.a, s.b); -- should fail
- MERGE INTO rw_view2 t
-  USING (SELECT x, 'R'||x FROM generate_series(0, 95 * 3) x) AS s(a,b) ON t.a = s.a
-+  USING (SELECT x, 'R'||x FROM generate_series(0,3) x) AS s(a,b) ON t.a = s.a
-   WHEN MATCHED THEN UPDATE SET b = s.b
-   WHEN NOT MATCHED AND s.a > 0 THEN INSERT VALUES (s.a, s.b); -- ok
- 
-diff --git a/src/test/regress/sql/vacuum.sql b/src/test/regress/sql/vacuum.sql
-index 6a2f5815ab2..a63cf5cd12c 100644
--- a/src/test/regress/sql/vacuum.sql
-+++ b/src/test/regress/sql/vacuum.sql
-@@ -156,7 +156,7 @@ CREATE TABLE no_index_cleanup (i INT PRIMARY KEY, t TEXT);
- -- Use uncompressed data stored in toast.
- CREATE INDEX no_index_cleanup_idx ON no_index_cleanup(t);
- ALTER TABLE no_index_cleanup ALTER COLUMN t SET STORAGE EXTERNAL;
-INSERT INTO no_index_cleanup(i, t) VALUES (generate_series(1, 95 * 30),
-+INSERT INTO no_index_cleanup(i, t) VALUES (generate_series(1,30),
-     repeat('1234567890',269));
- -- index cleanup option is ignored if VACUUM FULL
- VACUUM (INDEX_CLEANUP TRUE, FULL TRUE) no_index_cleanup;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -213,8 +213,10 @@ impl Escaping for PgIdent {

        // Find the first suitable tag that is not present in the string.
        // Postgres' max role/DB name length is 63 bytes, so even in the
-        // worst case it won't take long.
-        while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
+        // worst case it won't take long. Outer tag is always `tag + "x"`,
+        // so if `tag` is not present in the string, `outer_tag` is not
+        // present in the string either.
+        while self.contains(&tag.to_string()) {
            tag += "x";
            outer_tag = tag.clone() + "x";
        }
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -27,6 +27,40 @@ fn get_rsyslog_pid() -> Option<String> {
    }
 }

+/// Poll for the rsyslogd pid with exponential backoff (2 ms initial sleep,
+/// doubling on every retry); give up once `MAX_WAIT` has elapsed without one.
+fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
+    const MAX_WAIT: Duration = Duration::from_secs(5);
+    const INITIAL_SLEEP: Duration = Duration::from_millis(2);
+
+    let mut sleep_duration = INITIAL_SLEEP;
+    let start = std::time::Instant::now();
+    let mut attempts = 0;
+
+    loop {
+        attempts += 1;
+        if let Some(pid) = get_rsyslog_pid() {
+            return Ok(pid);
+        }
+        if start.elapsed() >= MAX_WAIT {
+            break;
+        }
+        info!(
+            "rsyslogd is not running, attempt {}. Sleeping for {} ms",
+            attempts,
+            sleep_duration.as_millis()
+        );
+        std::thread::sleep(sleep_duration);
+        sleep_duration *= 2;
+    }
+
+    Err(anyhow::anyhow!(
+        "rsyslogd is not running after waiting for {} seconds and {} attempts",
+        start.elapsed().as_secs(),
+        attempts
+    ))
+}
+
 // Restart rsyslogd to apply the new configuration.
 // This is necessary, because there is no other way to reload the rsyslog configuration.
 //
@@ -36,14 +70,14 @@ fn get_rsyslog_pid() -> Option<String> {
 // TODO: test it properly
 //
 fn restart_rsyslog() -> Result<()> {
-    let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
-    info!("rsyslogd is running with pid: {}, restart it", old_pid);
-
    // kill it to restart
    let _ = Command::new("pkill")
        .arg("rsyslogd")
        .output()
-        .context("Failed to stop rsyslogd")?;
+        .context("Failed to restart rsyslogd")?;
+
+    // wait until an rsyslogd pid is visible again (NOTE(review): could be the old, still-exiting process — confirm)
+    wait_for_rsyslog_pid()?;

    Ok(())
 }
@@ -131,15 +165,11 @@ pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result
        return Ok(());
    }

-    // When new config is empty we can simply remove the configuration file.
+    // Nothing to configure
    if new_config.is_empty() {
-        info!("removing rsyslog config file: {}", POSTGRES_LOGS_CONF_PATH);
-        match std::fs::remove_file(POSTGRES_LOGS_CONF_PATH) {
-            Ok(_) => {}
-            Err(err) if err.kind() == ErrorKind::NotFound => {}
-            Err(err) => return Err(err.into()),
-        }
-        restart_rsyslog()?;
+        // When the configuration is removed, PostgreSQL will stop sending data
+        // to the files watched by rsyslog, so restarting rsyslog is more effort
+        // than just ignoring this change.
        return Ok(());
    }

--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -71,6 +71,14 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
            ("name$$$", ("$x$name$$$$x$", "xx")),
            ("name$$$$", ("$x$name$$$$$x$", "xx")),
            ("name$x$", ("$xx$name$x$$xx$", "xxx")),
+            ("x", ("$xx$x$xx$", "xxx")),
+            ("xx", ("$xxx$xx$xxx$", "xxxx")),
+            ("$x", ("$xx$$x$xx$", "xxx")),
+            ("x$", ("$xx$x$$xx$", "xxx")),
+            ("$x$", ("$xx$$x$$xx$", "xxx")),
+            ("xx$", ("$xxx$xx$$xxx$", "xxxx")),
+            ("$xx", ("$xxx$$xx$xxx$", "xxxx")),
+            ("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
        ];

        for (input, expected) in test_cases {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -616,19 +616,17 @@ impl Endpoint {

    /// Map safekeepers ids to the actual connection strings.
    fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
-        let mut safekeeper_connstrings = Vec::new();
-        if self.mode == ComputeMode::Primary {
-            for sk_id in sk_ids {
-                let sk = self
-                    .env
+        sk_ids
+            .into_iter()
+            .map(|node_id| {
+                self.env
                    .safekeepers
                    .iter()
-                    .find(|node| node.id == sk_id)
-                    .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
-                safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
-            }
-        }
-        Ok(safekeeper_connstrings)
+                    .find(|node| node.id == node_id)
+                    .map(|node| format!("127.0.0.1:{}", node.get_compute_port()))
+                    .ok_or_else(|| anyhow!("safekeeper {node_id} does not exist"))
+            })
+            .collect::<Result<Vec<String>>>()
    }

    /// Generate a JWT with the correct claims.
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -546,6 +546,16 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("Falied to parse 'sampling_ratio'")?,
+            relsize_snapshot_cache_capacity: settings
+                .remove("relsize_snapshot_cache_capacity")
+                .map(|x| x.parse::<usize>())
+                .transpose()
+                .context("Failed to parse 'relsize_snapshot_cache_capacity' as integer")?,
+            basebackup_cache_enabled: settings
+                .remove("basebackup_cache_enabled")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'basebackup_cache_enabled' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        if var(REAL_S3_ENV).is_ok() {
            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
        }
+
+        #[cfg(target_os = "linux")]
        assert!(body.contains("process_threads"));
    }

--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -183,6 +183,8 @@ pub struct ConfigToml {
    pub enable_tls_page_service_api: bool,
    pub dev_mode: bool,
    pub timeline_import_config: TimelineImportConfig,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -235,7 +237,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
    ScatteredLsn,
 }

-#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
@@ -308,6 +310,26 @@ pub struct TimelineImportConfig {
    pub import_job_checkpoint_threshold: NonZeroUsize,
 }

+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(default)]
+pub struct BasebackupCacheConfig {
+    #[serde(with = "humantime_serde")]
+    pub cleanup_period: Duration,
+    // FIXME: Support max_size_bytes.
+    // pub max_size_bytes: usize,
+    pub max_size_entries: i64,
+}
+
+impl Default for BasebackupCacheConfig {
+    fn default() -> Self {
+        Self {
+            cleanup_period: Duration::from_secs(60),
+            // max_size_bytes: 1024 * 1024 * 1024, // 1 GiB
+            max_size_entries: 1000,
+        }
+    }
+}
+
 pub mod statvfs {
    pub mod mock {
        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -491,6 +513,14 @@ pub struct TenantConfigToml {
    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
    /// that will get perf sampling for the tenant.
    pub sampling_ratio: Option<Ratio>,
+
+    /// Capacity of relsize snapshot cache (used by replicas).
+    pub relsize_snapshot_cache_capacity: usize,
+
+    /// Enable preparing basebackup on XLOG_CHECKPOINT_SHUTDOWN and using it in basebackup requests.
+    // FIXME: Remove skip_serializing_if when the feature is stable.
+    #[serde(skip_serializing_if = "std::ops::Not::not")]
+    pub basebackup_cache_enabled: bool,
 }

 pub mod defaults {
@@ -664,6 +694,7 @@ impl Default for ConfigToml {
                import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
                import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
            },
+            basebackup_cache_config: None,
        }
    }
 }
@@ -730,6 +761,7 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
    pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
    pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
+    pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
 }

 impl Default for TenantConfigToml {
@@ -787,6 +819,8 @@ impl Default for TenantConfigToml {
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
            sampling_ratio: None,
+            relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
+            basebackup_cache_enabled: false,
        }
    }
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -630,6 +630,10 @@ pub struct TenantConfigPatch {
    pub gc_compaction_ratio_percent: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub sampling_ratio: FieldPatch<Option<Ratio>>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub basebackup_cache_enabled: FieldPatch<bool>,
 }

 /// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -759,6 +763,12 @@ pub struct TenantConfig {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub sampling_ratio: Option<Option<Ratio>>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub relsize_snapshot_cache_capacity: Option<usize>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub basebackup_cache_enabled: Option<bool>,
 }

 impl TenantConfig {
@@ -804,6 +814,8 @@ impl TenantConfig {
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
            mut sampling_ratio,
+            mut relsize_snapshot_cache_capacity,
+            mut basebackup_cache_enabled,
        } = self;

        patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -905,6 +917,12 @@ impl TenantConfig {
            .gc_compaction_ratio_percent
            .apply(&mut gc_compaction_ratio_percent);
        patch.sampling_ratio.apply(&mut sampling_ratio);
+        patch
+            .relsize_snapshot_cache_capacity
+            .apply(&mut relsize_snapshot_cache_capacity);
+        patch
+            .basebackup_cache_enabled
+            .apply(&mut basebackup_cache_enabled);

        Ok(Self {
            checkpoint_distance,
@@ -944,6 +962,8 @@ impl TenantConfig {
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
            sampling_ratio,
+            relsize_snapshot_cache_capacity,
+            basebackup_cache_enabled,
        })
    }

@@ -1052,6 +1072,12 @@ impl TenantConfig {
                .gc_compaction_ratio_percent
                .unwrap_or(global_conf.gc_compaction_ratio_percent),
            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
+            relsize_snapshot_cache_capacity: self
+                .relsize_snapshot_cache_capacity
+                .unwrap_or(global_conf.relsize_snapshot_cache_capacity),
+            basebackup_cache_enabled: self
+                .basebackup_cache_enabled
+                .unwrap_or(global_conf.basebackup_cache_enabled),
        }
    }
 }
--- a/libs/proxy/postgres-protocol2/src/message/frontend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs
@@ -25,6 +25,7 @@ where
    Ok(())
 }

+#[derive(Debug)]
 pub enum BindError {
    Conversion(Box<dyn Error + marker::Sync + Send>),
    Serialization(io::Error),
@@ -288,6 +289,12 @@ pub fn sync(buf: &mut BytesMut) {
    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
 }

+#[inline]
+pub fn flush(buf: &mut BytesMut) {
+    buf.put_u8(b'H');
+    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+}
+
 #[inline]
 pub fn terminate(buf: &mut BytesMut) {
    buf.put_u8(b'X');
--- a/libs/proxy/postgres-types2/src/lib.rs
+++ b/libs/proxy/postgres-types2/src/lib.rs
@@ -9,7 +9,6 @@ use std::error::Error;
 use std::fmt;
 use std::sync::Arc;

-use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 #[doc(inline)]
 pub use postgres_protocol2::Oid;
@@ -27,41 +26,6 @@ macro_rules! accepts {
    )
 }

-/// Generates an implementation of `ToSql::to_sql_checked`.
-///
-/// All `ToSql` implementations should use this macro.
-macro_rules! to_sql_checked {
-    () => {
-        fn to_sql_checked(
-            &self,
-            ty: &$crate::Type,
-            out: &mut $crate::private::BytesMut,
-        ) -> ::std::result::Result<
-            $crate::IsNull,
-            Box<dyn ::std::error::Error + ::std::marker::Sync + ::std::marker::Send>,
-        > {
-            $crate::__to_sql_checked(self, ty, out)
-        }
-    };
-}
-
-// WARNING: this function is not considered part of this crate's public API.
-// It is subject to change at any time.
-#[doc(hidden)]
-pub fn __to_sql_checked<T>(
-    v: &T,
-    ty: &Type,
-    out: &mut BytesMut,
-) -> Result<IsNull, Box<dyn Error + Sync + Send>>
-where
-    T: ToSql,
-{
-    if !T::accepts(ty) {
-        return Err(Box::new(WrongType::new::<T>(ty.clone())));
-    }
-    v.to_sql(ty, out)
-}
-
 // mod pg_lsn;
 #[doc(hidden)]
 pub mod private;
@@ -142,7 +106,7 @@ pub enum Kind {
    /// An array type along with the type of its elements.
    Array(Type),
    /// A range type along with the type of its elements.
-    Range(Type),
+    Range(Oid),
    /// A multirange type along with the type of its elements.
    Multirange(Type),
    /// A domain type along with its underlying type.
@@ -377,43 +341,6 @@ pub enum IsNull {
    No,
 }

-/// A trait for types that can be converted into Postgres values.
-pub trait ToSql: fmt::Debug {
-    /// Converts the value of `self` into the binary format of the specified
-    /// Postgres `Type`, appending it to `out`.
-    ///
-    /// The caller of this method is responsible for ensuring that this type
-    /// is compatible with the Postgres `Type`.
-    ///
-    /// The return value indicates if this value should be represented as
-    /// `NULL`. If this is the case, implementations **must not** write
-    /// anything to `out`.
-    fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>>
-    where
-        Self: Sized;
-
-    /// Determines if a value of this type can be converted to the specified
-    /// Postgres `Type`.
-    fn accepts(ty: &Type) -> bool
-    where
-        Self: Sized;
-
-    /// An adaptor method used internally by Rust-Postgres.
-    ///
-    /// *All* implementations of this method should be generated by the
-    /// `to_sql_checked!()` macro.
-    fn to_sql_checked(
-        &self,
-        ty: &Type,
-        out: &mut BytesMut,
-    ) -> Result<IsNull, Box<dyn Error + Sync + Send>>;
-
-    /// Specify the encode format
-    fn encode_format(&self, _ty: &Type) -> Format {
-        Format::Binary
-    }
-}
-
 /// Supported Postgres message format types
 ///
 /// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8`
@@ -424,52 +351,3 @@ pub enum Format {
    /// Compact, typed binary format
    Binary,
 }
-
-impl ToSql for &str {
-    fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>> {
-        match *ty {
-            ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w),
-            ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w),
-            ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w),
-            _ => types::text_to_sql(self, w),
-        }
-        Ok(IsNull::No)
-    }
-
-    fn accepts(ty: &Type) -> bool {
-        match *ty {
-            Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true,
-            ref ty
-                if (ty.name() == "citext"
-                    || ty.name() == "ltree"
-                    || ty.name() == "lquery"
-                    || ty.name() == "ltxtquery") =>
-            {
-                true
-            }
-            _ => false,
-        }
-    }
-
-    to_sql_checked!();
-}
-
-macro_rules! simple_to {
-    ($t:ty, $f:ident, $($expected:ident),+) => {
-        impl ToSql for $t {
-            fn to_sql(&self,
-                      _: &Type,
-                      w: &mut BytesMut)
-                      -> Result<IsNull, Box<dyn Error + Sync + Send>> {
-                types::$f(*self, w);
-                Ok(IsNull::No)
-            }
-
-            accepts!($($expected),+);
-
-            to_sql_checked!();
-        }
-    }
-}
-
-simple_to!(u32, oid_to_sql, OID);
--- a/libs/proxy/postgres-types2/src/type_gen.rs
+++ b/libs/proxy/postgres-types2/src/type_gen.rs
@@ -393,7 +393,7 @@ impl Inner {
        }
    }

-    pub fn oid(&self) -> Oid {
+    pub const fn const_oid(&self) -> Oid {
        match *self {
            Inner::Bool => 16,
            Inner::Bytea => 17,
@@ -580,7 +580,14 @@ impl Inner {
            Inner::TstzmultiRangeArray => 6153,
            Inner::DatemultiRangeArray => 6155,
            Inner::Int8multiRangeArray => 6157,
+            Inner::Other(_) => u32::MAX,
+        }
+    }
+
+    pub fn oid(&self) -> Oid {
+        match *self {
            Inner::Other(ref u) => u.oid,
+            _ => self.const_oid(),
        }
    }

@@ -727,17 +734,17 @@ impl Inner {
            Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)),
            Inner::AnyRange => &Kind::Pseudo,
            Inner::EventTrigger => &Kind::Pseudo,
-            Inner::Int4Range => &Kind::Range(Type(Inner::Int4)),
+            Inner::Int4Range => &const { Kind::Range(Inner::Int4.const_oid()) },
            Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)),
-            Inner::NumRange => &Kind::Range(Type(Inner::Numeric)),
+            Inner::NumRange => &const { Kind::Range(Inner::Numeric.const_oid()) },
            Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)),
-            Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)),
+            Inner::TsRange => &const { Kind::Range(Inner::Timestamp.const_oid()) },
            Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)),
-            Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)),
+            Inner::TstzRange => &const { Kind::Range(Inner::Timestamptz.const_oid()) },
            Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)),
-            Inner::DateRange => &Kind::Range(Type(Inner::Date)),
+            Inner::DateRange => &const { Kind::Range(Inner::Date.const_oid()) },
            Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)),
-            Inner::Int8Range => &Kind::Range(Type(Inner::Int8)),
+            Inner::Int8Range => &const { Kind::Range(Inner::Int8.const_oid()) },
            Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)),
            Inner::Jsonpath => &Kind::Simple,
            Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)),
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -1,14 +1,12 @@
 use std::collections::HashMap;
 use std::fmt;
 use std::net::IpAddr;
-use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::time::Duration;

 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{TryStreamExt, future, ready};
-use parking_lot::Mutex;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use serde::{Deserialize, Serialize};
@@ -16,29 +14,52 @@ use tokio::sync::mpsc;

 use crate::codec::{BackendMessages, FrontendMessage};
 use crate::config::{Host, SslMode};
-use crate::connection::{Request, RequestMessages};
 use crate::query::RowStream;
 use crate::simple_query::SimpleQueryStream;
 use crate::types::{Oid, Type};
 use crate::{
-    CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction,
-    TransactionBuilder, query, simple_query,
+    CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Transaction, TransactionBuilder,
+    query, simple_query,
 };

 pub struct Responses {
+    /// new messages from conn
    receiver: mpsc::Receiver<BackendMessages>,
+    /// current batch of messages
    cur: BackendMessages,
+    /// number of total queries sent.
+    waiting: usize,
+    /// number of ReadyForQuery messages received.
+    received: usize,
 }

 impl Responses {
    pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
        loop {
-            match self.cur.next().map_err(Error::parse)? {
-                Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))),
-                Some(message) => return Poll::Ready(Ok(message)),
-                None => {}
+            // get the next saved message
+            if let Some(message) = self.cur.next().map_err(Error::parse)? {
+                let received = self.received;
+
+                // increase the query head if this is the last message.
+                if let Message::ReadyForQuery(_) = message {
+                    self.received += 1;
+                }
+
+                // check if the client has skipped this query.
+                if received + 1 < self.waiting {
+                    // grab the next message.
+                    continue;
+                }
+
+                // convenience: turn the error message into a proper error.
+                let res = match message {
+                    Message::ErrorResponse(body) => Err(Error::db(body)),
+                    message => Ok(message),
+                };
+                return Poll::Ready(res);
            }

+            // get the next batch of messages.
            match ready!(self.receiver.poll_recv(cx)) {
                Some(messages) => self.cur = messages,
                None => return Poll::Ready(Err(Error::closed())),
@@ -55,44 +76,87 @@ impl Responses {
 /// (corresponding to the queries in the [crate::prepare] module).
 #[derive(Default)]
 pub(crate) struct CachedTypeInfo {
-    /// A statement for basic information for a type from its
-    /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
-    /// fallback).
-    pub(crate) typeinfo: Option<Statement>,
-
    /// Cache of types already looked up.
    pub(crate) types: HashMap<Oid, Type>,
 }

 pub struct InnerClient {
-    sender: mpsc::UnboundedSender<Request>,
+    sender: mpsc::UnboundedSender<FrontendMessage>,
+    responses: Responses,

    /// A buffer to use when writing out postgres commands.
-    buffer: Mutex<BytesMut>,
+    buffer: BytesMut,
 }

 impl InnerClient {
-    pub fn send(&self, messages: RequestMessages) -> Result<Responses, Error> {
-        let (sender, receiver) = mpsc::channel(1);
-        let request = Request { messages, sender };
-        self.sender.send(request).map_err(|_| Error::closed())?;
-
-        Ok(Responses {
-            receiver,
-            cur: BackendMessages::empty(),
-        })
+    pub fn start(&mut self) -> Result<PartialQuery, Error> {
+        self.responses.waiting += 1;
+        Ok(PartialQuery(Some(self)))
    }

-    /// Call the given function with a buffer to be used when writing out
-    /// postgres commands.
-    pub fn with_buf<F, R>(&self, f: F) -> R
+    // pub fn send_with_sync<F>(&mut self, f: F) -> Result<&mut Responses, Error>
+    // where
+    //     F: FnOnce(&mut BytesMut) -> Result<(), Error>,
+    // {
+    //     self.start()?.send_with_sync(f)
+    // }
+
+    pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> {
+        self.buffer.clear();
+        // simple queries do not need sync.
+        frontend::query(query, &mut self.buffer).map_err(Error::encode)?;
+        let buf = self.buffer.split().freeze();
+        // count the query only once it is encoded: on an encode error nothing is sent, so no
+        // ReadyForQuery will arrive and `waiting` must not run ahead of `received`.
+        self.responses.waiting += 1;
+        self.send_message(FrontendMessage::Raw(buf))
+    }
+
+    fn send_message(&mut self, messages: FrontendMessage) -> Result<&mut Responses, Error> {
+        self.sender.send(messages).map_err(|_| Error::closed())?;
+        Ok(&mut self.responses)
+    }
+}
+
+pub struct PartialQuery<'a>(Option<&'a mut InnerClient>);
+
+impl Drop for PartialQuery<'_> {
+    fn drop(&mut self) {
+        if let Some(client) = self.0.take() {
+            client.buffer.clear();
+            frontend::sync(&mut client.buffer);
+            let buf = client.buffer.split().freeze();
+            let _ = client.send_message(FrontendMessage::Raw(buf));
+        }
+    }
+}
+
+impl<'a> PartialQuery<'a> {
+    pub fn send_with_flush<F>(&mut self, f: F) -> Result<&mut Responses, Error>
    where
-        F: FnOnce(&mut BytesMut) -> R,
+        F: FnOnce(&mut BytesMut) -> Result<(), Error>,
    {
-        let mut buffer = self.buffer.lock();
-        let r = f(&mut buffer);
-        buffer.clear();
-        r
+        let client = self.0.as_deref_mut().unwrap();
+
+        client.buffer.clear();
+        f(&mut client.buffer)?;
+        frontend::flush(&mut client.buffer);
+        let buf = client.buffer.split().freeze();
+        client.send_message(FrontendMessage::Raw(buf))
+    }
+
+    pub fn send_with_sync<F>(mut self, f: F) -> Result<&'a mut Responses, Error>
+    where
+        F: FnOnce(&mut BytesMut) -> Result<(), Error>,
+    {
+        let client = self.0.as_deref_mut().unwrap();
+
+        client.buffer.clear();
+        f(&mut client.buffer)?;
+        frontend::sync(&mut client.buffer);
+        let buf = client.buffer.split().freeze();
+        let _ = client.send_message(FrontendMessage::Raw(buf));
+
+        Ok(&mut self.0.take().unwrap().responses)
    }
 }

@@ -109,7 +173,7 @@ pub struct SocketConfig {
 /// The client is one half of what is returned when a connection is established. Users interact with the database
 /// through this client object.
 pub struct Client {
-    inner: Arc<InnerClient>,
+    inner: InnerClient,
    cached_typeinfo: CachedTypeInfo,

    socket_config: SocketConfig,
@@ -120,17 +184,24 @@ pub struct Client {

 impl Client {
    pub(crate) fn new(
-        sender: mpsc::UnboundedSender<Request>,
+        sender: mpsc::UnboundedSender<FrontendMessage>,
+        receiver: mpsc::Receiver<BackendMessages>,
        socket_config: SocketConfig,
        ssl_mode: SslMode,
        process_id: i32,
        secret_key: i32,
    ) -> Client {
        Client {
-            inner: Arc::new(InnerClient {
+            inner: InnerClient {
                sender,
+                responses: Responses {
+                    receiver,
+                    cur: BackendMessages::empty(),
+                    waiting: 0,
+                    received: 0,
+                },
                buffer: Default::default(),
-            }),
+            },
            cached_typeinfo: Default::default(),

            socket_config,
@@ -145,19 +216,29 @@ impl Client {
        self.process_id
    }

-    pub(crate) fn inner(&self) -> &Arc<InnerClient> {
-        &self.inner
+    pub(crate) fn inner_mut(&mut self) -> &mut InnerClient {
+        &mut self.inner
    }

    /// Pass text directly to the Postgres backend to allow it to sort out typing itself and
    /// to save a roundtrip
-    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    pub async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
        I::IntoIter: ExactSizeIterator,
    {
-        query::query_txt(&self.inner, statement, params).await
+        query::query_txt(
+            &mut self.inner,
+            &mut self.cached_typeinfo,
+            statement,
+            params,
+        )
+        .await
    }

    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
@@ -173,12 +254,15 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn simple_query(&self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
+    pub async fn simple_query(&mut self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
        self.simple_query_raw(query).await?.try_collect().await
    }

-    pub(crate) async fn simple_query_raw(&self, query: &str) -> Result<SimpleQueryStream, Error> {
-        simple_query::simple_query(self.inner(), query).await
+    pub(crate) async fn simple_query_raw(
+        &mut self,
+        query: &str,
+    ) -> Result<SimpleQueryStream, Error> {
+        simple_query::simple_query(self.inner_mut(), query).await
    }

    /// Executes a sequence of SQL statements using the simple query protocol.
@@ -191,15 +275,11 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn batch_execute(&self, query: &str) -> Result<ReadyForQueryStatus, Error> {
-        simple_query::batch_execute(self.inner(), query).await
+    pub async fn batch_execute(&mut self, query: &str) -> Result<ReadyForQueryStatus, Error> {
+        simple_query::batch_execute(self.inner_mut(), query).await
    }

    pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
-        // clear the prepared statements that are about to be nuked from the postgres session
-
-        self.cached_typeinfo.typeinfo = None;
-
        self.batch_execute("discard all").await
    }

@@ -208,7 +288,7 @@ impl Client {
    /// The transaction will roll back by default - use the `commit` method to commit it.
    pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
        struct RollbackIfNotDone<'me> {
-            client: &'me Client,
+            client: &'me mut Client,
            done: bool,
        }

@@ -218,14 +298,7 @@ impl Client {
                    return;
                }

-                let buf = self.client.inner().with_buf(|buf| {
-                    frontend::query("ROLLBACK", buf).unwrap();
-                    buf.split().freeze()
-                });
-                let _ = self
-                    .client
-                    .inner()
-                    .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+                let _ = self.client.inner.send_simple_query("ROLLBACK");
            }
        }

@@ -239,7 +312,7 @@ impl Client {
                client: self,
                done: false,
            };
-            self.batch_execute("BEGIN").await?;
+            cleaner.client.batch_execute("BEGIN").await?;
            cleaner.done = true;
        }

@@ -265,11 +338,6 @@ impl Client {
        }
    }

-    /// Query for type information
-    pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result<Type, Error> {
-        crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await
-    }
-
    /// Determines if the connection to the server has already closed.
    ///
    /// In that case, all future queries will fail.
--- a/libs/proxy/tokio-postgres2/src/codec.rs
+++ b/libs/proxy/tokio-postgres2/src/codec.rs
@@ -1,21 +1,16 @@
 use std::io;

-use bytes::{Buf, Bytes, BytesMut};
+use bytes::{Bytes, BytesMut};
 use fallible_iterator::FallibleIterator;
 use postgres_protocol2::message::backend;
-use postgres_protocol2::message::frontend::CopyData;
 use tokio_util::codec::{Decoder, Encoder};

 pub enum FrontendMessage {
    Raw(Bytes),
-    CopyData(CopyData<Box<dyn Buf + Send>>),
 }

 pub enum BackendMessage {
-    Normal {
-        messages: BackendMessages,
-        request_complete: bool,
-    },
+    Normal { messages: BackendMessages },
    Async(backend::Message),
 }

@@ -44,7 +39,6 @@ impl Encoder<FrontendMessage> for PostgresCodec {
    fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
        match item {
            FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf),
-            FrontendMessage::CopyData(data) => data.write(dst),
        }

        Ok(())
@@ -57,7 +51,6 @@ impl Decoder for PostgresCodec {

    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<BackendMessage>, io::Error> {
        let mut idx = 0;
-        let mut request_complete = false;

        while let Some(header) = backend::Header::parse(&src[idx..])? {
            let len = header.len() as usize + 1;
@@ -82,7 +75,6 @@ impl Decoder for PostgresCodec {
            idx += len;

            if header.tag() == backend::READY_FOR_QUERY_TAG {
-                request_complete = true;
                break;
            }
        }
@@ -92,7 +84,6 @@ impl Decoder for PostgresCodec {
        } else {
            Ok(Some(BackendMessage::Normal {
                messages: BackendMessages(src.split_to(idx)),
-                request_complete,
            }))
        }
    }
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -59,9 +59,11 @@ where
        connect_timeout: config.connect_timeout,
    };

-    let (sender, receiver) = mpsc::unbounded_channel();
+    let (client_tx, conn_rx) = mpsc::unbounded_channel();
+    let (conn_tx, client_rx) = mpsc::channel(4);
    let client = Client::new(
-        sender,
+        client_tx,
+        client_rx,
        socket_config,
        config.ssl_mode,
        process_id,
@@ -74,7 +76,7 @@ where
        .map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
        .collect();

-    let connection = Connection::new(stream, delayed, parameters, receiver);
+    let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx);

    Ok((client, connection))
 }
--- a/libs/proxy/tokio-postgres2/src/connection.rs
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -4,7 +4,6 @@ use std::pin::Pin;
 use std::task::{Context, Poll};

 use bytes::BytesMut;
-use fallible_iterator::FallibleIterator;
 use futures_util::{Sink, Stream, ready};
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
@@ -19,30 +18,12 @@ use crate::error::DbError;
 use crate::maybe_tls_stream::MaybeTlsStream;
 use crate::{AsyncMessage, Error, Notification};

-pub enum RequestMessages {
-    Single(FrontendMessage),
-}
-
-pub struct Request {
-    pub messages: RequestMessages,
-    pub sender: mpsc::Sender<BackendMessages>,
-}
-
-pub struct Response {
-    sender: PollSender<BackendMessages>,
-}
-
 #[derive(PartialEq, Debug)]
 enum State {
    Active,
    Closing,
 }

-enum WriteReady {
-    Terminating,
-    WaitingOnRead,
-}
-
 /// A connection to a PostgreSQL database.
 ///
 /// This is one half of what is returned when a new connection is established. It performs the actual IO with the
@@ -56,9 +37,11 @@ pub struct Connection<S, T> {
    pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
    /// HACK: we need this in the Neon Proxy to forward params.
    pub parameters: HashMap<String, String>,
-    receiver: mpsc::UnboundedReceiver<Request>,
+
+    sender: PollSender<BackendMessages>,
+    receiver: mpsc::UnboundedReceiver<FrontendMessage>,
+
    pending_responses: VecDeque<BackendMessage>,
-    responses: VecDeque<Response>,
    state: State,
 }

@@ -71,14 +54,15 @@ where
        stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
        pending_responses: VecDeque<BackendMessage>,
        parameters: HashMap<String, String>,
-        receiver: mpsc::UnboundedReceiver<Request>,
+        sender: mpsc::Sender<BackendMessages>,
+        receiver: mpsc::UnboundedReceiver<FrontendMessage>,
    ) -> Connection<S, T> {
        Connection {
            stream,
            parameters,
+            sender: PollSender::new(sender),
            receiver,
            pending_responses,
-            responses: VecDeque::new(),
            state: State::Active,
        }
    }
@@ -110,7 +94,7 @@ where
                }
            };

-            let (mut messages, request_complete) = match message {
+            let messages = match message {
                BackendMessage::Async(Message::NoticeResponse(body)) => {
                    let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?;
                    return Poll::Ready(Ok(AsyncMessage::Notice(error)));
@@ -131,41 +115,19 @@ where
                    continue;
                }
                BackendMessage::Async(_) => unreachable!(),
-                BackendMessage::Normal {
-                    messages,
-                    request_complete,
-                } => (messages, request_complete),
+                BackendMessage::Normal { messages } => messages,
            };

-            let mut response = match self.responses.pop_front() {
-                Some(response) => response,
-                None => match messages.next().map_err(Error::parse)? {
-                    Some(Message::ErrorResponse(error)) => {
-                        return Poll::Ready(Err(Error::db(error)));
-                    }
-                    _ => return Poll::Ready(Err(Error::unexpected_message())),
-                },
-            };
-
-            match response.sender.poll_reserve(cx) {
+            match self.sender.poll_reserve(cx) {
                Poll::Ready(Ok(())) => {
-                    let _ = response.sender.send_item(messages);
-                    if !request_complete {
-                        self.responses.push_front(response);
-                    }
+                    let _ = self.sender.send_item(messages);
                }
                Poll::Ready(Err(_)) => {
-                    // we need to keep paging through the rest of the messages even if the receiver's hung up
-                    if !request_complete {
-                        self.responses.push_front(response);
-                    }
+                    return Poll::Ready(Err(Error::closed()));
                }
                Poll::Pending => {
-                    self.responses.push_front(response);
-                    self.pending_responses.push_back(BackendMessage::Normal {
-                        messages,
-                        request_complete,
-                    });
+                    self.pending_responses
+                        .push_back(BackendMessage::Normal { messages });
                    trace!("poll_read: waiting on sender");
                    return Poll::Pending;
                }
@@ -174,7 +136,7 @@ where
    }

    /// Fetch the next client request and enqueue the response sender.
-    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<RequestMessages>> {
+    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<FrontendMessage>> {
        if self.receiver.is_closed() {
            return Poll::Ready(None);
        }
@@ -182,10 +144,7 @@ where
        match self.receiver.poll_recv(cx) {
            Poll::Ready(Some(request)) => {
                trace!("polled new request");
-                self.responses.push_back(Response {
-                    sender: PollSender::new(request.sender),
-                });
-                Poll::Ready(Some(request.messages))
+                Poll::Ready(Some(request))
            }
            Poll::Ready(None) => Poll::Ready(None),
            Poll::Pending => Poll::Pending,
@@ -194,7 +153,7 @@ where

    /// Process client requests and write them to the postgres connection, flushing if necessary.
    /// client -> postgres
-    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<WriteReady, Error>> {
+    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
        loop {
            if Pin::new(&mut self.stream)
                .poll_ready(cx)
@@ -209,14 +168,14 @@ where

            match self.poll_request(cx) {
                // send the message to postgres
-                Poll::Ready(Some(RequestMessages::Single(request))) => {
+                Poll::Ready(Some(request)) => {
                    Pin::new(&mut self.stream)
                        .start_send(request)
                        .map_err(Error::io)?;
                }
                // No more messages from the client, and no more responses to wait for.
                // Send a terminate message to postgres
-                Poll::Ready(None) if self.responses.is_empty() => {
+                Poll::Ready(None) => {
                    trace!("poll_write: at eof, terminating");
                    let mut request = BytesMut::new();
                    frontend::terminate(&mut request);
@@ -228,16 +187,7 @@ where

                    trace!("poll_write: sent eof, closing");
                    trace!("poll_write: done");
-                    return Poll::Ready(Ok(WriteReady::Terminating));
-                }
-                // No more messages from the client, but there are still some responses to wait for.
-                Poll::Ready(None) => {
-                    trace!(
-                        "poll_write: at eof, pending responses {}",
-                        self.responses.len()
-                    );
-                    ready!(self.poll_flush(cx))?;
-                    return Poll::Ready(Ok(WriteReady::WaitingOnRead));
+                    return Poll::Ready(Ok(()));
                }
                // Still waiting for a message from the client.
                Poll::Pending => {
@@ -298,7 +248,7 @@ where
            // if the state is still active, try read from and write to postgres.
            let message = self.poll_read(cx)?;
            let closing = self.poll_write(cx)?;
-            if let Poll::Ready(WriteReady::Terminating) = closing {
+            if let Poll::Ready(()) = closing {
                self.state = State::Closing;
            }

--- a/libs/proxy/tokio-postgres2/src/error/mod.rs
+++ b/libs/proxy/tokio-postgres2/src/error/mod.rs
@@ -86,6 +86,27 @@ pub struct DbError {
 }

 impl DbError {
+    pub fn new_test_error(code: SqlState, message: String) -> Self {
+        DbError {
+            severity: "ERROR".to_string(),
+            parsed_severity: Some(Severity::Error),
+            code,
+            message,
+            detail: None,
+            hint: None,
+            position: None,
+            where_: None,
+            schema: None,
+            table: None,
+            column: None,
+            datatype: None,
+            constraint: None,
+            file: None,
+            line: None,
+            routine: None,
+        }
+    }
+
    pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
        let mut severity = None;
        let mut parsed_severity = None;
--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -1,9 +1,6 @@
 #![allow(async_fn_in_trait)]

-use postgres_protocol2::Oid;
-
 use crate::query::RowStream;
-use crate::types::Type;
 use crate::{Client, Error, Transaction};

 mod private {
@@ -15,20 +12,17 @@ mod private {
 /// This trait is "sealed", and cannot be implemented outside of this crate.
 pub trait GenericClient: private::Sealed {
    /// Like `Client::query_raw_txt`.
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
        I::IntoIter: ExactSizeIterator + Sync + Send;
-
-    /// Query for type information
-    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
 }

 impl private::Sealed for Client {}

 impl GenericClient for Client {
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -36,17 +30,12 @@ impl GenericClient for Client {
    {
        self.query_raw_txt(statement, params).await
    }
-
-    /// Query for type information
-    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
-        self.get_type_inner(oid).await
-    }
 }

 impl private::Sealed for Transaction<'_> {}

 impl GenericClient for Transaction<'_> {
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -54,9 +43,4 @@ impl GenericClient for Transaction<'_> {
    {
        self.query_raw_txt(statement, params).await
    }
-
-    /// Query for type information
-    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
-        self.client_mut().get_type(oid).await
-    }
 }
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -18,7 +18,6 @@ pub use crate::statement::{Column, Statement};
 pub use crate::tls::NoTls;
 pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
-use crate::types::ToSql;

 /// After executing a query, the connection will be in one of these states
 #[derive(Clone, Copy, Debug, PartialEq)]
@@ -120,9 +119,3 @@ pub enum SimpleQueryMessage {
    /// The number of rows modified or selected is returned.
    CommandComplete(u64),
 }
-
-fn slice_iter<'a>(
-    s: &'a [&'a (dyn ToSql + Sync)],
-) -> impl ExactSizeIterator<Item = &'a (dyn ToSql + Sync)> + 'a {
-    s.iter().map(|s| *s as _)
-}
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -1,19 +1,14 @@
-use std::future::Future;
-use std::pin::Pin;
-use std::sync::Arc;
-
-use bytes::Bytes;
+use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
-use futures_util::{TryStreamExt, pin_mut};
-use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::IsNull;
+use postgres_protocol2::message::backend::{Message, RowDescriptionBody};
 use postgres_protocol2::message::frontend;
-use tracing::debug;
+use postgres_protocol2::types::oid_to_sql;
+use postgres_types2::Format;

-use crate::client::{CachedTypeInfo, InnerClient};
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
+use crate::client::{CachedTypeInfo, PartialQuery, Responses};
 use crate::types::{Kind, Oid, Type};
-use crate::{Column, Error, Statement, query, slice_iter};
+use crate::{Column, Error, Row, Statement};

 pub(crate) const TYPEINFO_QUERY: &str = "\
 SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
@@ -23,22 +18,51 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
 WHERE t.oid = $1
 ";

+/// Guard that makes sure the named prepared statement gets closed, even on an early return.
+struct CloseStmt<'a, 'b> {
+    client: Option<&'a mut PartialQuery<'b>>,
+    name: &'static str,
+}
+
+impl<'a> CloseStmt<'a, '_> {
+    fn close(mut self) -> Result<&'a mut Responses, Error> {
+        let client = self.client.take().unwrap();
+        client.send_with_flush(|buf| {
+            frontend::close(b'S', self.name, buf).map_err(Error::encode)?;
+            Ok(())
+        })
+    }
+}
+
+impl Drop for CloseStmt<'_, '_> {
+    fn drop(&mut self) {
+        if let Some(client) = self.client.take() {
+            let _ = client.send_with_flush(|buf| {
+                frontend::close(b'S', self.name, buf).map_err(Error::encode)?;
+                Ok(())
+            });
+        }
+    }
+}
+
 async fn prepare_typecheck(
-    client: &Arc<InnerClient>,
+    client: &mut PartialQuery<'_>,
    name: &'static str,
    query: &str,
-    types: &[Type],
 ) -> Result<Statement, Error> {
-    let buf = encode(client, name, query, types)?;
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+    let responses = client.send_with_flush(|buf| {
+        frontend::parse(name, query, [], buf).map_err(Error::encode)?;
+        frontend::describe(b'S', name, buf).map_err(Error::encode)?;
+        Ok(())
+    })?;

    match responses.next().await? {
        Message::ParseComplete => {}
        _ => return Err(Error::unexpected_message()),
    }

-    let parameter_description = match responses.next().await? {
-        Message::ParameterDescription(body) => body,
+    match responses.next().await? {
+        Message::ParameterDescription(_) => {}
        _ => return Err(Error::unexpected_message()),
    };

@@ -48,13 +72,6 @@ async fn prepare_typecheck(
        _ => return Err(Error::unexpected_message()),
    };

-    let mut parameters = vec![];
-    let mut it = parameter_description.parameters();
-    while let Some(oid) = it.next().map_err(Error::parse)? {
-        let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?;
-        parameters.push(type_);
-    }
-
    let mut columns = vec![];
    if let Some(row_description) = row_description {
        let mut it = row_description.fields();
@@ -65,98 +82,168 @@ async fn prepare_typecheck(
        }
    }

-    Ok(Statement::new(client, name, parameters, columns))
+    Ok(Statement::new(name, columns))
 }

-fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
-    if types.is_empty() {
-        debug!("preparing query {}: {}", name, query);
-    } else {
-        debug!("preparing query {} with types {:?}: {}", name, types, query);
-    }
-
-    client.with_buf(|buf| {
-        frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?;
-        frontend::describe(b'S', name, buf).map_err(Error::encode)?;
-        frontend::sync(buf);
-        Ok(buf.split().freeze())
-    })
-}
-
-pub async fn get_type(
-    client: &Arc<InnerClient>,
-    typecache: &mut CachedTypeInfo,
-    oid: Oid,
-) -> Result<Type, Error> {
+fn try_from_cache(typecache: &CachedTypeInfo, oid: Oid) -> Option<Type> {
    if let Some(type_) = Type::from_oid(oid) {
-        return Ok(type_);
+        return Some(type_);
    }

    if let Some(type_) = typecache.types.get(&oid) {
-        return Ok(type_.clone());
+        return Some(type_.clone());
    };

-    let stmt = typeinfo_statement(client, typecache).await?;
+    None
+}

-    let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
-    pin_mut!(rows);
+pub async fn parse_row_description(
+    client: &mut PartialQuery<'_>,
+    typecache: &mut CachedTypeInfo,
+    row_description: Option<RowDescriptionBody>,
+) -> Result<Vec<Column>, Error> {
+    let mut columns = vec![];

-    let row = match rows.try_next().await? {
-        Some(row) => row,
-        None => return Err(Error::unexpected_message()),
+    if let Some(row_description) = row_description {
+        let mut it = row_description.fields();
+        while let Some(field) = it.next().map_err(Error::parse)? {
+            let type_ = try_from_cache(typecache, field.type_oid()).unwrap_or(Type::UNKNOWN);
+            let column = Column::new(field.name().to_string(), type_, field);
+            columns.push(column);
+        }
+    }
+
+    let all_known = columns.iter().all(|c| c.type_ != Type::UNKNOWN);
+    if all_known {
+        // all known, return early.
+        return Ok(columns);
+    }
+
+    let typeinfo = "neon_proxy_typeinfo";
+
+    // make sure to close the typeinfo statement before exiting.
+    let mut guard = CloseStmt {
+        name: typeinfo,
+        client: None,
+    };
+    let client = guard.client.insert(client);
+
+    // get the typeinfo statement.
+    let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY).await?;
+
+    for column in &mut columns {
+        column.type_ = get_type(client, typecache, &stmt, column.type_oid()).await?;
+    }
+
+    // close explicitly; this disarms the guard so Drop won't send a second Close.
+    let responses = guard.close()?;
+
+    match responses.next().await? {
+        Message::CloseComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    Ok(columns)
+}
+
+async fn get_type(
+    client: &mut PartialQuery<'_>,
+    typecache: &mut CachedTypeInfo,
+    stmt: &Statement,
+    mut oid: Oid,
+) -> Result<Type, Error> {
+    let mut stack = vec![];
+    let mut type_ = loop {
+        if let Some(type_) = try_from_cache(typecache, oid) {
+            break type_;
+        }
+
+        let row = exec(client, stmt, oid).await?;
+        if stack.len() > 8 {
+            return Err(Error::unexpected_message());
+        }
+
+        let name: String = row.try_get(0)?;
+        let type_: i8 = row.try_get(1)?;
+        let elem_oid: Oid = row.try_get(2)?;
+        let rngsubtype: Option<Oid> = row.try_get(3)?;
+        let basetype: Oid = row.try_get(4)?;
+        let schema: String = row.try_get(5)?;
+        let relid: Oid = row.try_get(6)?;
+
+        let kind = if type_ == b'e' as i8 {
+            Kind::Enum
+        } else if type_ == b'p' as i8 {
+            Kind::Pseudo
+        } else if basetype != 0 {
+            Kind::Domain(basetype)
+        } else if elem_oid != 0 {
+            stack.push((name, oid, schema));
+            oid = elem_oid;
+            continue;
+        } else if relid != 0 {
+            Kind::Composite(relid)
+        } else if let Some(rngsubtype) = rngsubtype {
+            Kind::Range(rngsubtype)
+        } else {
+            Kind::Simple
+        };
+
+        let type_ = Type::new(name, oid, kind, schema);
+        typecache.types.insert(oid, type_.clone());
+        break type_;
    };

-    let name: String = row.try_get(0)?;
-    let type_: i8 = row.try_get(1)?;
-    let elem_oid: Oid = row.try_get(2)?;
-    let rngsubtype: Option<Oid> = row.try_get(3)?;
-    let basetype: Oid = row.try_get(4)?;
-    let schema: String = row.try_get(5)?;
-    let relid: Oid = row.try_get(6)?;
-
-    let kind = if type_ == b'e' as i8 {
-        Kind::Enum
-    } else if type_ == b'p' as i8 {
-        Kind::Pseudo
-    } else if basetype != 0 {
-        Kind::Domain(basetype)
-    } else if elem_oid != 0 {
-        let type_ = get_type_rec(client, typecache, elem_oid).await?;
-        Kind::Array(type_)
-    } else if relid != 0 {
-        Kind::Composite(relid)
-    } else if let Some(rngsubtype) = rngsubtype {
-        let type_ = get_type_rec(client, typecache, rngsubtype).await?;
-        Kind::Range(type_)
-    } else {
-        Kind::Simple
-    };
-
-    let type_ = Type::new(name, oid, kind, schema);
-    typecache.types.insert(oid, type_.clone());
+    while let Some((name, oid, schema)) = stack.pop() {
+        type_ = Type::new(name, oid, Kind::Array(type_), schema);
+        typecache.types.insert(oid, type_.clone());
+    }

    Ok(type_)
 }

-fn get_type_rec<'a>(
-    client: &'a Arc<InnerClient>,
-    typecache: &'a mut CachedTypeInfo,
-    oid: Oid,
-) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
-    Box::pin(get_type(client, typecache, oid))
-}
+/// Binds `param` to the typeinfo statement and executes it, expecting exactly one row.
+async fn exec(
+    client: &mut PartialQuery<'_>,
+    statement: &Statement,
+    param: Oid,
+) -> Result<Row, Error> {
+    let responses = client.send_with_flush(|buf| {
+        encode_bind(statement, param, "", buf);
+        frontend::execute("", 0, buf).map_err(Error::encode)?;
+        Ok(())
+    })?;

-async fn typeinfo_statement(
-    client: &Arc<InnerClient>,
-    typecache: &mut CachedTypeInfo,
-) -> Result<Statement, Error> {
-    if let Some(stmt) = &typecache.typeinfo {
-        return Ok(stmt.clone());
+    match responses.next().await? {
+        Message::BindComplete => {}
+        _ => return Err(Error::unexpected_message()),
    }

-    let typeinfo = "neon_proxy_typeinfo";
-    let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
+    let row = match responses.next().await? {
+        Message::DataRow(body) => Row::new(statement.clone(), body, Format::Binary)?,
+        _ => return Err(Error::unexpected_message()),
+    };

-    typecache.typeinfo = Some(stmt.clone());
-    Ok(stmt)
+    match responses.next().await? {
+        Message::CommandComplete(_) => {}
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    Ok(row)
+}
+
+fn encode_bind(statement: &Statement, param: Oid, portal: &str, buf: &mut BytesMut) {
+    frontend::bind(
+        portal,
+        statement.name(),
+        [Format::Binary as i16],
+        [param],
+        |param, buf| {
+            oid_to_sql(param, buf);
+            Ok(IsNull::No)
+        },
+        [Format::Binary as i16],
+        buf,
+    )
+    .unwrap();
 }
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -1,76 +1,43 @@
-use std::fmt;
-use std::marker::PhantomPinned;
 use std::pin::Pin;
-use std::sync::Arc;
 use std::task::{Context, Poll};

-use bytes::{BufMut, Bytes, BytesMut};
-use fallible_iterator::FallibleIterator;
+use bytes::BufMut;
 use futures_util::{Stream, ready};
-use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
-use postgres_types2::{Format, ToSql, Type};
-use tracing::debug;
+use postgres_types2::Format;

-use crate::client::{InnerClient, Responses};
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
-use crate::types::IsNull;
-use crate::{Column, Error, ReadyForQueryStatus, Row, Statement};
+use crate::client::{CachedTypeInfo, InnerClient, Responses};
+use crate::{Error, ReadyForQueryStatus, Row, Statement};

-struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]);
-
-impl fmt::Debug for BorrowToSqlParamsDebug<'_> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_list().entries(self.0.iter()).finish()
-    }
-}
-
-pub async fn query<'a, I>(
-    client: &InnerClient,
-    statement: Statement,
-    params: I,
-) -> Result<RowStream, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let buf = if tracing::enabled!(tracing::Level::DEBUG) {
-        let params = params.into_iter().collect::<Vec<_>>();
-        debug!(
-            "executing statement {} with parameters: {:?}",
-            statement.name(),
-            BorrowToSqlParamsDebug(params.as_slice()),
-        );
-        encode(client, &statement, params)?
-    } else {
-        encode(client, &statement, params)?
-    };
-    let responses = start(client, buf).await?;
-    Ok(RowStream {
-        statement,
-        responses,
-        command_tag: None,
-        status: ReadyForQueryStatus::Unknown,
-        output_format: Format::Binary,
-        _p: PhantomPinned,
-    })
-}
-
-pub async fn query_txt<S, I>(
-    client: &Arc<InnerClient>,
+pub async fn query_txt<'a, S, I>(
+    client: &'a mut InnerClient,
+    typecache: &mut CachedTypeInfo,
    query: &str,
    params: I,
-) -> Result<RowStream, Error>
+) -> Result<RowStream<'a>, Error>
 where
    S: AsRef<str>,
    I: IntoIterator<Item = Option<S>>,
    I::IntoIter: ExactSizeIterator,
 {
    let params = params.into_iter();
+    let mut client = client.start()?;

-    let buf = client.with_buf(|buf| {
+    // Flow:
+    // 1. Parse the query
+    // 2. Inspect the row description for OIDs
+    // 3. If there are any OIDs we don't already know about, perform the typeinfo routine
+    // 4. Execute the query
+    // 5. Sync.
+    //
+    // The typeinfo routine:
+    // 1. Parse the typeinfo query
+    // 2. Execute the query on each OID
+    // 3. If the result does not match an OID we know, repeat 2.
+
+    // parse the query and get type info
+    let responses = client.send_with_flush(|buf| {
        frontend::parse(
            "",                 // unnamed prepared statement
            query,              // query to parse
@@ -79,7 +46,30 @@ where
        )
        .map_err(Error::encode)?;
        frontend::describe(b'S', "", buf).map_err(Error::encode)?;
-        // Bind, pass params as text, retrieve as binary
+        Ok(())
+    })?;
+
+    match responses.next().await? {
+        Message::ParseComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    match responses.next().await? {
+        Message::ParameterDescription(_) => {}
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let row_description = match responses.next().await? {
+        Message::RowDescription(body) => Some(body),
+        Message::NoData => None,
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let columns =
+        crate::prepare::parse_row_description(&mut client, typecache, row_description).await?;
+
+    let responses = client.send_with_sync(|buf| {
+        // Bind, pass params as text, retrieve as text
        match frontend::bind(
            "",                 // empty string selects the unnamed portal
            "",                 // unnamed prepared statement
@@ -102,173 +92,55 @@ where

        // Execute
        frontend::execute("", 0, buf).map_err(Error::encode)?;
-        // Sync
-        frontend::sync(buf);

-        Ok(buf.split().freeze())
+        Ok(())
    })?;

-    // now read the responses
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
-
-    match responses.next().await? {
-        Message::ParseComplete => {}
-        _ => return Err(Error::unexpected_message()),
-    }
-
-    let parameter_description = match responses.next().await? {
-        Message::ParameterDescription(body) => body,
-        _ => return Err(Error::unexpected_message()),
-    };
-
-    let row_description = match responses.next().await? {
-        Message::RowDescription(body) => Some(body),
-        Message::NoData => None,
-        _ => return Err(Error::unexpected_message()),
-    };
-
    match responses.next().await? {
        Message::BindComplete => {}
        _ => return Err(Error::unexpected_message()),
    }

-    let mut parameters = vec![];
-    let mut it = parameter_description.parameters();
-    while let Some(oid) = it.next().map_err(Error::parse)? {
-        let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN);
-        parameters.push(type_);
-    }
-
-    let mut columns = vec![];
-    if let Some(row_description) = row_description {
-        let mut it = row_description.fields();
-        while let Some(field) = it.next().map_err(Error::parse)? {
-            let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN);
-            let column = Column::new(field.name().to_string(), type_, field);
-            columns.push(column);
-        }
-    }
-
    Ok(RowStream {
-        statement: Statement::new_anonymous(parameters, columns),
        responses,
+        statement: Statement::new("", columns),
        command_tag: None,
        status: ReadyForQueryStatus::Unknown,
        output_format: Format::Text,
-        _p: PhantomPinned,
    })
 }

-async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
-
-    match responses.next().await? {
-        Message::BindComplete => {}
-        _ => return Err(Error::unexpected_message()),
-    }
-
-    Ok(responses)
+/// A stream of table rows.
+pub struct RowStream<'a> {
+    responses: &'a mut Responses,
+    output_format: Format,
+    pub statement: Statement,
+    pub command_tag: Option<String>,
+    pub status: ReadyForQueryStatus,
 }

-pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result<Bytes, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    client.with_buf(|buf| {
-        encode_bind(statement, params, "", buf)?;
-        frontend::execute("", 0, buf).map_err(Error::encode)?;
-        frontend::sync(buf);
-        Ok(buf.split().freeze())
-    })
-}
-
-pub fn encode_bind<'a, I>(
-    statement: &Statement,
-    params: I,
-    portal: &str,
-    buf: &mut BytesMut,
-) -> Result<(), Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let param_types = statement.params();
-    let params = params.into_iter();
-
-    assert!(
-        param_types.len() == params.len(),
-        "expected {} parameters but got {}",
-        param_types.len(),
-        params.len()
-    );
-
-    let (param_formats, params): (Vec<_>, Vec<_>) = params
-        .zip(param_types.iter())
-        .map(|(p, ty)| (p.encode_format(ty) as i16, p))
-        .unzip();
-
-    let params = params.into_iter();
-
-    let mut error_idx = 0;
-    let r = frontend::bind(
-        portal,
-        statement.name(),
-        param_formats,
-        params.zip(param_types).enumerate(),
-        |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) {
-            Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No),
-            Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes),
-            Err(e) => {
-                error_idx = idx;
-                Err(e)
-            }
-        },
-        Some(1),
-        buf,
-    );
-    match r {
-        Ok(()) => Ok(()),
-        Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)),
-        Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)),
-    }
-}
-
-pin_project! {
-    /// A stream of table rows.
-    pub struct RowStream {
-        statement: Statement,
-        responses: Responses,
-        command_tag: Option<String>,
-        output_format: Format,
-        status: ReadyForQueryStatus,
-        #[pin]
-        _p: PhantomPinned,
-    }
-}
-
-impl Stream for RowStream {
+impl Stream for RowStream<'_> {
    type Item = Result<Row, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        let this = self.project();
+        let this = self.get_mut();
        loop {
            match ready!(this.responses.poll_next(cx)?) {
                Message::DataRow(body) => {
                    return Poll::Ready(Some(Ok(Row::new(
                        this.statement.clone(),
                        body,
-                        *this.output_format,
+                        this.output_format,
                    )?)));
                }
                Message::EmptyQueryResponse | Message::PortalSuspended => {}
                Message::CommandComplete(body) => {
                    if let Ok(tag) = body.tag() {
-                        *this.command_tag = Some(tag.to_string());
+                        this.command_tag = Some(tag.to_string());
                    }
                }
                Message::ReadyForQuery(status) => {
-                    *this.status = status.into();
+                    this.status = status.into();
                    return Poll::Ready(None);
                }
                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
@@ -276,24 +148,3 @@ impl Stream for RowStream {
        }
    }
 }
-
-impl RowStream {
-    /// Returns information about the columns of data in the row.
-    pub fn columns(&self) -> &[Column] {
-        self.statement.columns()
-    }
-
-    /// Returns the command tag of this query.
-    ///
-    /// This is only available after the stream has been exhausted.
-    pub fn command_tag(&self) -> Option<String> {
-        self.command_tag.clone()
-    }
-
-    /// Returns if the connection is ready for querying, with the status of the connection.
-    ///
-    /// This might be available only after the stream has been exhausted.
-    pub fn ready_status(&self) -> ReadyForQueryStatus {
-        self.status
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/simple_query.rs
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -1,19 +1,14 @@
-use std::marker::PhantomPinned;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};

-use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
 use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
-use postgres_protocol2::message::frontend;
 use tracing::debug;

 use crate::client::{InnerClient, Responses};
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
 use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow};

 /// Information about a column of a single query row.
@@ -33,28 +28,28 @@ impl SimpleColumn {
    }
 }

-pub async fn simple_query(client: &InnerClient, query: &str) -> Result<SimpleQueryStream, Error> {
+pub async fn simple_query<'a>(
+    client: &'a mut InnerClient,
+    query: &str,
+) -> Result<SimpleQueryStream<'a>, Error> {
    debug!("executing simple query: {}", query);

-    let buf = encode(client, query)?;
-    let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+    let responses = client.send_simple_query(query)?;

    Ok(SimpleQueryStream {
        responses,
        columns: None,
        status: ReadyForQueryStatus::Unknown,
-        _p: PhantomPinned,
    })
 }

 pub async fn batch_execute(
-    client: &InnerClient,
+    client: &mut InnerClient,
    query: &str,
 ) -> Result<ReadyForQueryStatus, Error> {
    debug!("executing statement batch: {}", query);

-    let buf = encode(client, query)?;
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+    let responses = client.send_simple_query(query)?;

    loop {
        match responses.next().await? {
@@ -68,25 +63,16 @@ pub async fn batch_execute(
    }
 }

-pub(crate) fn encode(client: &InnerClient, query: &str) -> Result<Bytes, Error> {
-    client.with_buf(|buf| {
-        frontend::query(query, buf).map_err(Error::encode)?;
-        Ok(buf.split().freeze())
-    })
-}
-
 pin_project! {
    /// A stream of simple query results.
-    pub struct SimpleQueryStream {
-        responses: Responses,
+    pub struct SimpleQueryStream<'a> {
+        responses: &'a mut Responses,
        columns: Option<Arc<[SimpleColumn]>>,
        status: ReadyForQueryStatus,
-        #[pin]
-        _p: PhantomPinned,
    }
 }

-impl SimpleQueryStream {
+impl SimpleQueryStream<'_> {
    /// Returns if the connection is ready for querying, with the status of the connection.
    ///
    /// This might be available only after the stream has been exhausted.
@@ -95,7 +81,7 @@ impl SimpleQueryStream {
    }
 }

-impl Stream for SimpleQueryStream {
+impl Stream for SimpleQueryStream<'_> {
    type Item = Result<SimpleQueryMessage, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
--- a/libs/proxy/tokio-postgres2/src/statement.rs
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -1,35 +1,15 @@
 use std::fmt;
-use std::sync::{Arc, Weak};
+use std::sync::Arc;

+use crate::types::Type;
 use postgres_protocol2::Oid;
 use postgres_protocol2::message::backend::Field;
-use postgres_protocol2::message::frontend;
-
-use crate::client::InnerClient;
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
-use crate::types::Type;

 struct StatementInner {
-    client: Weak<InnerClient>,
    name: &'static str,
-    params: Vec<Type>,
    columns: Vec<Column>,
 }

-impl Drop for StatementInner {
-    fn drop(&mut self) {
-        if let Some(client) = self.client.upgrade() {
-            let buf = client.with_buf(|buf| {
-                frontend::close(b'S', self.name, buf).unwrap();
-                frontend::sync(buf);
-                buf.split().freeze()
-            });
-            let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
-        }
-    }
-}
-
 /// A prepared statement.
 ///
 /// Prepared statements can only be used with the connection that created them.
@@ -37,38 +17,14 @@ impl Drop for StatementInner {
 pub struct Statement(Arc<StatementInner>);

 impl Statement {
-    pub(crate) fn new(
-        inner: &Arc<InnerClient>,
-        name: &'static str,
-        params: Vec<Type>,
-        columns: Vec<Column>,
-    ) -> Statement {
-        Statement(Arc::new(StatementInner {
-            client: Arc::downgrade(inner),
-            name,
-            params,
-            columns,
-        }))
-    }
-
-    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
-        Statement(Arc::new(StatementInner {
-            client: Weak::new(),
-            name: "<anonymous>",
-            params,
-            columns,
-        }))
+    pub(crate) fn new(name: &'static str, columns: Vec<Column>) -> Statement {
+        Statement(Arc::new(StatementInner { name, columns }))
    }

    pub(crate) fn name(&self) -> &str {
        self.0.name
    }

-    /// Returns the expected types of the statement's parameters.
-    pub fn params(&self) -> &[Type] {
-        &self.0.params
-    }
-
    /// Returns information about the columns returned when the statement is queried.
    pub fn columns(&self) -> &[Column] {
        &self.0.columns
@@ -78,7 +34,7 @@ impl Statement {
 /// Information about a column of a query.
 pub struct Column {
    name: String,
-    type_: Type,
+    pub(crate) type_: Type,

    // raw fields from RowDescription
    table_oid: Oid,
--- a/libs/proxy/tokio-postgres2/src/transaction.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -1,7 +1,3 @@
-use postgres_protocol2::message::frontend;
-
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
 use crate::query::RowStream;
 use crate::{CancelToken, Client, Error, ReadyForQueryStatus};

@@ -20,14 +16,7 @@ impl Drop for Transaction<'_> {
            return;
        }

-        let buf = self.client.inner().with_buf(|buf| {
-            frontend::query("ROLLBACK", buf).unwrap();
-            buf.split().freeze()
-        });
-        let _ = self
-            .client
-            .inner()
-            .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+        let _ = self.client.inner_mut().send_simple_query("ROLLBACK");
    }
 }

@@ -54,7 +43,11 @@ impl<'a> Transaction<'a> {
    }

    /// Like `Client::query_raw_txt`.
-    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    pub async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
        shard_ps_feedback: [empty_feedback; 128],
        num_shards: 0,
+        replica_promote: false,
        min_ps_feedback: empty_feedback,
    }
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -30,6 +30,7 @@ crc32c.workspace = true
 either.workspace = true
 fail.workspace = true
 futures.workspace = true
+hashlink.workspace = true
 hex.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "pageserver_page_api"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+prost.workspace = true
+tonic.workspace = true
+workspace_hack.workspace = true
+
+[build-dependencies]
+tonic-build.workspace = true
--- a/pageserver/page_api/build.rs
+++ b/pageserver/page_api/build.rs
@@ -0,0 +1,13 @@
+use std::env;
+use std::path::PathBuf;
+
+/// Generates Rust code from .proto Protobuf schemas, along with a binary file
+/// descriptor set for Protobuf schema reflection.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
+    tonic_build::configure()
+        .bytes(["."])
+        .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
+        .compile_protos(&["proto/page_service.proto"], &["proto"])
+        .map_err(|err| err.into())
+}
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -0,0 +1,233 @@
+// Page service, presented by pageservers for computes.
+//
+// This is the compute read path. It primarily serves page versions at given
+// LSNs, but also base backups, SLRU segments, and relation metadata.
+//
+// EXPERIMENTAL: this is still under development and subject to change.
+//
+// Request metadata headers:
+// - authorization: JWT token ("Bearer <token>"), if auth is enabled
+// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
+// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
+// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
+//
+// The service can be accessed via e.g. grpcurl:
+//
+//    ```
+//    grpcurl \
+//      -plaintext \
+//      -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
+//      -H "neon-shard-id: 0b10" \
+//      -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
+//      -H "authorization: Bearer $JWT" \
+//      -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
+//      localhost:51051 page_api.PageService/CheckRelExists
+//    ```
+//
+// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
+// However, this will require reconnecting when changing modes.
+//
+// TODO: write implementation guidance on
+// - Health checks
+// - Tracing, OpenTelemetry
+// - Compression
+
+syntax = "proto3";
+package page_api;
+
+service PageService {
+  // Returns whether a relation exists.
+  rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
+
+  // Fetches a base backup.
+  rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
+
+  // Returns the total size of a database, as # of bytes.
+  rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
+
+  // Fetches pages.
+  //
+  // This is implemented as a bidirectional streaming RPC for performance. Unary
+  // requests incur costs for e.g. HTTP/2 stream setup, header parsing,
+  // authentication, and so on -- with streaming, we only pay these costs during
+  // the initial stream setup. This ~doubles throughput in benchmarks. Other
+  // RPCs use regular unary requests, since they are not as frequent and
+  // performance-critical, and this simplifies implementation.
+  //
+  // NB: a status response (e.g. errors) will terminate the stream. The stream
+  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
+  // Most errors are therefore sent as GetPageResponse.status instead.
+  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
+
+  // Returns the size of a relation, as # of blocks.
+  rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
+
+  // Fetches an SLRU segment.
+  rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
+}
+
+// The LSN a request should read at.
+message ReadLsn {
+  // The request's read LSN. Required.
+  uint64 request_lsn = 1;
+  // If given, the caller guarantees that the page has not been modified since
+  // this LSN. Must be smaller than or equal to request_lsn. This allows the
+  // Pageserver to serve an old page without waiting for the request LSN to
+  // arrive. Valid for all request types.
+  //
+  // It is undefined behaviour to make a request such that the page was, in
+  // fact, modified between request_lsn and not_modified_since_lsn. The
+  // Pageserver might detect it and return an error, or it might return the old
+  // page version or the new page version. Setting not_modified_since_lsn equal
+  // to request_lsn is always safe, but can lead to unnecessary waiting.
+  uint64 not_modified_since_lsn = 2;
+}
+
+// A relation identifier.
+message RelTag {
+  uint32 spc_oid = 1;
+  uint32 db_oid = 2;
+  uint32 rel_number = 3;
+  uint32 fork_number = 4;
+}
+
+// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
+// other shards will error.
+message CheckRelExistsRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message CheckRelExistsResponse {
+  bool exists = 1;
+}
+
+// Requests a base backup at a given LSN.
+message GetBaseBackupRequest {
+  // The LSN to fetch a base backup at.
+  ReadLsn read_lsn = 1;
+  // If true, logical replication slots will not be created.
+  bool replica = 2;
+}
+
+// Base backup response chunk, returned as an ordered stream.
+message GetBaseBackupResponseChunk {
+  // A basebackup data chunk. The size is undefined, but bounded by the 4 MB
+  // gRPC message size limit.
+  bytes chunk = 1;
+}
+
+// Requests the size of a database, as # of bytes. Only valid on shard 0, other
+// shards will error.
+message GetDbSizeRequest {
+  ReadLsn read_lsn = 1;
+  uint32 db_oid = 2;
+}
+
+message GetDbSizeResponse {
+  uint64 num_bytes = 1;
+}
+
+// Requests one or more pages.
+message GetPageRequest {
+  // A request ID. Will be included in the response. Should be unique for
+  // in-flight requests on the stream.
+  uint64 request_id = 1;
+  // The request class.
+  GetPageClass request_class = 2;
+  // The LSN to read at.
+  ReadLsn read_lsn = 3;
+  // The relation to read from.
+  RelTag rel = 4;
+  // Page numbers to read. Must belong to the remote shard.
+  //
+  // Multiple pages will be executed as a single batch by the Pageserver,
+  // amortizing layer access costs and parallelizing them. This may increase the
+  // latency of any individual request, but improves the overall latency and
+  // throughput of the batch as a whole.
+  //
+  // TODO: this causes an allocation in the common single-block case. The sender
+  // can use a SmallVec to stack-allocate it, but Prost will always deserialize
+  // into a heap-allocated Vec. Consider optimizing this.
+  //
+  // TODO: we might be able to avoid a sort or something if we mandate that these
+  // are always in order. But we can't currently rely on this on the server, because
+  // of compatibility with the libpq protocol handler.
+  repeated uint32 block_number = 5;
+}
+
+// A GetPageRequest class. Primarily intended for observability, but may also be
+// used for prioritization in the future.
+enum GetPageClass {
+  // Unknown class. For forwards compatibility: used when the client sends a
+  // class that the server doesn't know about.
+  GET_PAGE_CLASS_UNKNOWN = 0;
+  // A normal request. This is the default.
+  GET_PAGE_CLASS_NORMAL = 1;
+  // A prefetch request. NB: can only be classified on pg < 18.
+  GET_PAGE_CLASS_PREFETCH = 2;
+  // A background request (e.g. vacuum).
+  GET_PAGE_CLASS_BACKGROUND = 3;
+}
+
+// A GetPage response.
+//
+// A batch response will contain all of the requested pages. We could eagerly
+// emit individual pages as soon as they are ready, but on a readv() Postgres
+// holds buffer pool locks on all pages in the batch and we'll only return once
+// the entire batch is ready, so no one can make use of the individual pages.
+message GetPageResponse {
+  // The original request's ID.
+  uint64 request_id = 1;
+  // The response status code.
+  GetPageStatus status = 2;
+  // A string describing the status, if any.
+  string reason = 3;
+  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  repeated bytes page_image = 4;
+}
+
+// A GetPageResponse status code. Since we use a bidirectional stream, we don't
+// want to send errors as gRPC statuses, since this would terminate the stream.
+enum GetPageStatus {
+  // Unknown status. For forwards compatibility: used when the server sends a
+  // status code that the client doesn't know about.
+  GET_PAGE_STATUS_UNKNOWN = 0;
+  // The request was successful.
+  GET_PAGE_STATUS_OK = 1;
+  // The page did not exist. The tenant/timeline/shard has already been
+  // validated during stream setup.
+  GET_PAGE_STATUS_NOT_FOUND = 2;
+  // The request was invalid.
+  GET_PAGE_STATUS_INVALID = 3;
+  // The tenant is rate limited. Slow down and retry later.
+  GET_PAGE_STATUS_SLOW_DOWN = 4;
+  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
+  // layer download. This could free up the server task to process other
+  // requests while the layer download is in progress.
+}
+
+// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
+// shard 0, other shards will error.
+message GetRelSizeRequest {
+  ReadLsn read_lsn = 1;
+  RelTag rel = 2;
+}
+
+message GetRelSizeResponse {
+  uint32 num_blocks = 1;
+}
+
+// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+message GetSlruSegmentRequest {
+  ReadLsn read_lsn = 1;
+  uint32 kind = 2;
+  uint32 segno = 3;
+}
+
+// Returns an SLRU segment.
+//
+// These are up to 32 pages (256 KB), so we can send them as a single response.
+message GetSlruSegmentResponse {
+  bytes segment = 1;
+}
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -0,0 +1,19 @@
+//! This crate provides the Pageserver's page API. It contains:
+//!
+//! * proto/page_service.proto: the Protobuf schema for the page API.
+//! * proto: auto-generated Protobuf types for gRPC.
+//!
+//! This crate is used by both the client and the server. Try to keep it slim.
+
+// Code generated by protobuf.
+pub mod proto {
+    // Pulls in the Rust code generated for the `page_api` Protobuf package.
+    tonic::include_proto!("page_api");
+
+    /// File descriptor set for Protobuf schema reflection. This allows using
+    /// e.g. grpcurl with the API.
+    pub const FILE_DESCRIPTOR_SET: &[u8] =
+        tonic::include_file_descriptor_set!("page_api_descriptor");
+
+    // Re-export the generated client and server types from the module root.
+    pub use page_service_client::PageServiceClient;
+    pub use page_service_server::{PageService, PageServiceServer};
+}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -144,7 +144,7 @@ where
        replica,
        ctx,
        io_concurrency: IoConcurrency::spawn_from_conf(
-            timeline.conf,
+            timeline.conf.get_vectored_concurrent_io,
            timeline
                .gate
                .enter()
@@ -343,7 +343,7 @@ where
            // Gather non-relational files from object storage pages.
            let slru_partitions = self
                .timeline
-                .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
+                .get_slru_keyspace(Version::at(self.lsn), self.ctx)
                .await?
                .partition(
                    self.timeline.get_shard_identity(),
@@ -378,7 +378,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -517,7 +517,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
+            .get_rel_size(src, Version::at(self.lsn), self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -577,7 +577,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                .get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                .await?;

            if img.len()
@@ -631,7 +631,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
+                    .list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/basebackup_cache.rs
+++ b/pageserver/src/basebackup_cache.rs
@@ -0,0 +1,518 @@
+use std::{collections::HashMap, sync::Arc};
+
+use async_compression::tokio::write::GzipEncoder;
+use camino::{Utf8Path, Utf8PathBuf};
+use metrics::core::{AtomicU64, GenericCounter};
+use pageserver_api::{config::BasebackupCacheConfig, models::TenantState};
+use tokio::{
+    io::{AsyncWriteExt, BufWriter},
+    sync::mpsc::{UnboundedReceiver, UnboundedSender},
+};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    id::{TenantId, TenantTimelineId, TimelineId},
+    lsn::Lsn,
+    shard::TenantShardId,
+};
+
+use crate::{
+    basebackup::send_basebackup_tarball,
+    context::{DownloadBehavior, RequestContext},
+    metrics::{BASEBACKUP_CACHE_ENTRIES, BASEBACKUP_CACHE_PREPARE, BASEBACKUP_CACHE_READ},
+    task_mgr::TaskKind,
+    tenant::{
+        Timeline,
+        mgr::{TenantManager, TenantSlot},
+    },
+};
+
+/// A request to prepare a basebackup for one timeline at a given LSN.
+///
+/// Requests travel over the `BasebackupPrepareSender` channel and are handled
+/// by the `BasebackupCache` background task.
+pub struct BasebackupPrepareRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub lsn: Lsn,
+}
+
+// Halves of the unbounded channel that carries prepare requests to the cache.
+pub type BasebackupPrepareSender = UnboundedSender<BasebackupPrepareRequest>;
+pub type BasebackupPrepareReceiver = UnboundedReceiver<BasebackupPrepareRequest>;
+
+// Halves of the internal unbounded channel that carries paths of cache files
+// to be deleted by the background task.
+type BasebackupRemoveEntrySender = UnboundedSender<Utf8PathBuf>;
+type BasebackupRemoveEntryReceiver = UnboundedReceiver<Utf8PathBuf>;
+
+/// BasebackupCache stores cached basebackup archives for timelines on local disk.
+///
+/// The main purpose of this cache is to speed up the startup process of compute nodes
+/// after scaling to zero.
+/// Thus, the basebackup is stored only for the latest LSN of the timeline and with
+/// fixed set of parameters (gzip=true, full_backup=false, replica=false, prev_lsn=none).
+///
+/// The cache receives prepare requests through the `BasebackupPrepareSender` channel,
+/// generates a basebackup from the timeline in the background, and stores it on disk.
+///
+/// Basebackup requests are pretty rare. We expect ~thousands of entries in the cache
+/// and ~1 RPS for get requests.
+pub struct BasebackupCache {
+    /// Directory holding the cached archives; in-progress files live in its `tmp` subdirectory.
+    data_dir: Utf8PathBuf,
+    /// Cache settings (`cleanup_period`, `max_size_entries`). Defaults are used when
+    /// the cache is spawned without a config.
+    config: BasebackupCacheConfig,
+    /// Used to look up tenants and timelines when preparing and cleaning up entries.
+    tenant_manager: Arc<TenantManager>,
+    /// Paths of cache files to delete; consumed by the background task.
+    remove_entry_sender: BasebackupRemoveEntrySender,
+
+    /// In-memory index: the LSN of the cached basebackup for each timeline.
+    entries: std::sync::Mutex<HashMap<TenantTimelineId, Lsn>>,
+
+    /// Stops the background task when cancelled.
+    cancel: CancellationToken,
+
+    // `BASEBACKUP_CACHE_READ` counters, labelled "hit", "miss" and "error".
+    read_hit_count: GenericCounter<AtomicU64>,
+    read_miss_count: GenericCounter<AtomicU64>,
+    read_err_count: GenericCounter<AtomicU64>,
+
+    // `BASEBACKUP_CACHE_PREPARE` counters, labelled "ok", "skip" and "error".
+    prepare_ok_count: GenericCounter<AtomicU64>,
+    prepare_skip_count: GenericCounter<AtomicU64>,
+    prepare_err_count: GenericCounter<AtomicU64>,
+}
+
+impl BasebackupCache {
+    /// Builds the cache and, when a config is supplied, starts its background task
+    /// on `runtime_handle`.
+    ///
+    /// Initialization happens inside the background task, so this call does not
+    /// block; `get` returns `None` until initialization has finished. Without a
+    /// config the cache is disabled: no task is spawned, the receivers are dropped
+    /// and default config values are stored.
+    pub fn spawn(
+        runtime_handle: &tokio::runtime::Handle,
+        data_dir: Utf8PathBuf,
+        config: Option<BasebackupCacheConfig>,
+        prepare_receiver: BasebackupPrepareReceiver,
+        tenant_manager: Arc<TenantManager>,
+        cancel: CancellationToken,
+    ) -> Arc<Self> {
+        let (remove_entry_sender, remove_entry_receiver) = tokio::sync::mpsc::unbounded_channel();
+
+        // Remember whether a config was given before it is consumed below.
+        let is_enabled = config.is_some();
+
+        let cache = Arc::new(Self {
+            data_dir,
+            config: config.unwrap_or_default(),
+            tenant_manager,
+            remove_entry_sender,
+
+            entries: std::sync::Mutex::new(HashMap::new()),
+
+            cancel,
+
+            read_hit_count: BASEBACKUP_CACHE_READ.with_label_values(&["hit"]),
+            read_miss_count: BASEBACKUP_CACHE_READ.with_label_values(&["miss"]),
+            read_err_count: BASEBACKUP_CACHE_READ.with_label_values(&["error"]),
+
+            prepare_ok_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["ok"]),
+            prepare_skip_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["skip"]),
+            prepare_err_count: BASEBACKUP_CACHE_PREPARE.with_label_values(&["error"]),
+        });
+
+        if !is_enabled {
+            return cache;
+        }
+
+        let background_task =
+            Arc::clone(&cache).background(prepare_receiver, remove_entry_receiver);
+        runtime_handle.spawn(background_task);
+
+        cache
+    }
+
+    /// Looks up the cached basebackup of a timeline at exactly `lsn`.
+    ///
+    /// On a hit, the archive file is opened and returned. The open file descriptor
+    /// will prevent the file system from deleting the file even if the entry is
+    /// removed from the cache in the background. Returns `None` on a miss or if
+    /// the file cannot be opened.
+    pub async fn get(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Option<tokio::fs::File> {
+        // Fast path: consult the in-memory state before touching the file system.
+        let tti = TenantTimelineId::new(tenant_id, timeline_id);
+        let cached_lsn = self.entries.lock().unwrap().get(&tti).copied();
+        if cached_lsn != Some(lsn) {
+            self.read_miss_count.inc();
+            return None;
+        }
+
+        let path = self.entry_path(tenant_id, timeline_id, lsn);
+        let open_err = match tokio::fs::File::open(path).await {
+            Ok(file) => {
+                self.read_hit_count.inc();
+                return Some(file);
+            }
+            Err(err) => err,
+        };
+
+        if open_err.kind() == std::io::ErrorKind::NotFound {
+            // The basebackup may have been removed concurrently by the cleanup task.
+            self.read_miss_count.inc();
+        } else {
+            self.read_err_count.inc();
+            tracing::warn!(
+                "Unexpected error opening basebackup cache file: {:?}",
+                open_err
+            );
+        }
+        None
+    }
+
+    // Private methods.
+
+    /// Builds the file name of a cache entry:
+    /// `basebackup_<tenant_id>_<timeline_id>_<lsn>.tar.gz`, with the LSN written
+    /// as 16 upper-case hex digits.
+    fn entry_filename(tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> String {
+        // The default format for LSN is 0/ABCDEF.
+        // The slash is not filename friendly, so serialize it as plain hex.
+        let lsn = lsn.0;
+        format!("basebackup_{tenant_id}_{timeline_id}_{lsn:016X}.tar.gz")
+    }
+
+    /// Final on-disk location of a cache entry, directly inside `data_dir`.
+    fn entry_path(&self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn) -> Utf8PathBuf {
+        let filename = Self::entry_filename(tenant_id, timeline_id, lsn);
+        self.data_dir.join(filename)
+    }
+
+    /// Location of an entry's temporary file, inside the `tmp` subdirectory of
+    /// `data_dir`. It uses the same file name as the final entry.
+    fn entry_tmp_path(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Utf8PathBuf {
+        let filename = Self::entry_filename(tenant_id, timeline_id, lsn);
+        let tmp_dir = self.data_dir.join("tmp");
+        tmp_dir.join(filename)
+    }
+
+    /// Inverse of `entry_filename`: extracts the tenant ID, timeline ID and LSN
+    /// from a cache file name.
+    ///
+    /// Returns `None` if the name lacks the `basebackup_` prefix or `.tar.gz`
+    /// suffix, does not consist of exactly three `_`-separated fields, or any
+    /// field fails to parse (the LSN is parsed as hexadecimal).
+    fn parse_entry_filename(filename: &str) -> Option<(TenantId, TimelineId, Lsn)> {
+        let stem = filename
+            .strip_prefix("basebackup_")?
+            .strip_suffix(".tar.gz")?;
+
+        // Exactly three fields are expected; a fourth one makes the name invalid.
+        let mut fields = stem.split('_');
+        let (Some(tenant_field), Some(timeline_field), Some(lsn_field), None) =
+            (fields.next(), fields.next(), fields.next(), fields.next())
+        else {
+            return None;
+        };
+
+        let tenant_id = tenant_field.parse::<TenantId>().ok()?;
+        let timeline_id = timeline_field.parse::<TimelineId>().ok()?;
+        let lsn = Lsn(u64::from_str_radix(lsn_field, 16).ok()?);
+
+        Some((tenant_id, timeline_id, lsn))
+    }
+
+    /// Periodic maintenance: empties the tmp directory and drops outdated entries.
+    ///
+    /// An entry survives only if its timeline is listed by an attached shard-zero
+    /// tenant and the timeline's last record LSN has not advanced past the
+    /// entry's LSN. The files of all other entries are queued for removal, and
+    /// the in-memory state is replaced with the surviving entries.
+    ///
+    /// Returns an error if the tmp directory cannot be read.
+    async fn cleanup(&self) -> anyhow::Result<()> {
+        // Cleanup tmp directory.
+        let tmp_dir = self.data_dir.join("tmp");
+        let mut tmp_dir = tokio::fs::read_dir(&tmp_dir).await?;
+        while let Some(dir_entry) = tmp_dir.next_entry().await? {
+            // Best effort: a failed removal is only logged.
+            if let Err(e) = tokio::fs::remove_file(dir_entry.path()).await {
+                tracing::warn!("Failed to remove basebackup cache tmp file: {:#}", e);
+            }
+        }
+
+        // Remove outdated entries.
+        // Work on a snapshot so the lock is not held while iterating tenants.
+        let entries_old = self.entries.lock().unwrap().clone();
+        let mut entries_new = HashMap::new();
+        for (tenant_shard_id, tenant_slot) in self.tenant_manager.list() {
+            // Only shard zero is considered.
+            if !tenant_shard_id.is_shard_zero() {
+                continue;
+            }
+            let TenantSlot::Attached(tenant) = tenant_slot else {
+                continue;
+            };
+            let tenant_id = tenant_shard_id.tenant_id;
+
+            for timeline in tenant.list_timelines() {
+                let tti = TenantTimelineId::new(tenant_id, timeline.timeline_id);
+                if let Some(&entry_lsn) = entries_old.get(&tti) {
+                    // Keep the entry only while the timeline has not moved past its LSN.
+                    if timeline.get_last_record_lsn() <= entry_lsn {
+                        entries_new.insert(tti, entry_lsn);
+                    }
+                }
+            }
+        }
+
+        // Queue the files of all entries that did not survive for deletion.
+        for (&tti, &lsn) in entries_old.iter() {
+            if !entries_new.contains_key(&tti) {
+                self.remove_entry_sender
+                    .send(self.entry_path(tti.tenant_id, tti.timeline_id, lsn))
+                    .unwrap();
+            }
+        }
+
+        BASEBACKUP_CACHE_ENTRIES.set(entries_new.len() as i64);
+        *self.entries.lock().unwrap() = entries_new;
+
+        Ok(())
+    }
+
+    /// Initializes the cache from the contents of `data_dir`.
+    ///
+    /// Creates `data_dir` and its `tmp` subdirectory if needed, then rebuilds the
+    /// in-memory state from the file names found in `data_dir`. If several files
+    /// exist for the same timeline, only the one with the highest LSN is kept and
+    /// the others are queued for removal. Files with unparsable names are logged
+    /// and skipped.
+    ///
+    /// Returns an error if the directories cannot be created or read, or if two
+    /// file names parse to the same timeline and LSN.
+    async fn on_startup(&self) -> anyhow::Result<()> {
+        // Create data_dir and tmp directory if they do not exist.
+        tokio::fs::create_dir_all(&self.data_dir.join("tmp"))
+            .await
+            .map_err(|e| {
+                anyhow::anyhow!(
+                    "Failed to create basebackup cache data_dir {:?}: {:?}",
+                    self.data_dir,
+                    e
+                )
+            })?;
+
+        // Read existing entries from the data_dir and add them to in-memory state.
+        let mut entries = HashMap::new();
+        let mut dir = tokio::fs::read_dir(&self.data_dir).await?;
+        while let Some(dir_entry) = dir.next_entry().await? {
+            let filename = dir_entry.file_name();
+
+            if filename == "tmp" {
+                // Skip the tmp directory.
+                continue;
+            }
+
+            let parsed = Self::parse_entry_filename(filename.to_string_lossy().as_ref());
+            let Some((tenant_id, timeline_id, lsn)) = parsed else {
+                tracing::warn!("Invalid basebackup cache file name: {:?}", filename);
+                continue;
+            };
+
+            let tti = TenantTimelineId::new(tenant_id, timeline_id);
+
+            use std::collections::hash_map::Entry::*;
+
+            match entries.entry(tti) {
+                Occupied(mut entry) => {
+                    let entry_lsn = *entry.get();
+                    // Leave only the latest entry, remove the old one.
+                    if lsn < entry_lsn {
+                        // The file just found is older: queue it for removal.
+                        self.remove_entry_sender.send(self.entry_path(
+                            tenant_id,
+                            timeline_id,
+                            lsn,
+                        ))?;
+                    } else if lsn > entry_lsn {
+                        // The file just found is newer: queue the recorded one for
+                        // removal and record the new LSN instead.
+                        self.remove_entry_sender.send(self.entry_path(
+                            tenant_id,
+                            timeline_id,
+                            entry_lsn,
+                        ))?;
+                        entry.insert(lsn);
+                    } else {
+                        // Two different filenames parsed to the same timeline_id and LSN.
+                        // Should never happen.
+                        return Err(anyhow::anyhow!(
+                            "Duplicate basebackup cache entry with the same LSN: {:?}",
+                            filename
+                        ));
+                    }
+                }
+                Vacant(entry) => {
+                    entry.insert(lsn);
+                }
+            }
+        }
+
+        BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
+        *self.entries.lock().unwrap() = entries;
+
+        Ok(())
+    }
+
+    /// Main loop of the cache's background task.
+    ///
+    /// Runs `on_startup` first, then serves prepare requests, file removal
+    /// requests and periodic cleanup ticks until `cancel` is triggered.
+    async fn background(
+        self: Arc<Self>,
+        mut prepare_receiver: BasebackupPrepareReceiver,
+        mut remove_entry_receiver: BasebackupRemoveEntryReceiver,
+    ) {
+        // Panicking here is a safe fallback: it drops the receivers, which
+        // effectively disables the cache.
+        self.on_startup()
+            .await
+            .expect("Failed to initialize basebackup cache");
+
+        let mut cleanup_ticker = tokio::time::interval(self.config.cleanup_period);
+        cleanup_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+
+        loop {
+            tokio::select! {
+                Some(req) = prepare_receiver.recv() => {
+                    let BasebackupPrepareRequest {
+                        tenant_shard_id,
+                        timeline_id,
+                        lsn,
+                    } = req;
+                    let outcome = self
+                        .prepare_basebackup(tenant_shard_id, timeline_id, lsn)
+                        .await;
+                    if let Err(err) = outcome {
+                        tracing::info!("Failed to prepare basebackup: {:#}", err);
+                        self.prepare_err_count.inc();
+                    }
+                }
+                Some(path) = remove_entry_receiver.recv() => {
+                    if let Err(e) = tokio::fs::remove_file(path).await {
+                        tracing::warn!("Failed to remove basebackup cache file: {:#}", e);
+                    }
+                }
+                _ = cleanup_ticker.tick() => {
+                    if let Err(e) = self.cleanup().await {
+                        tracing::warn!("Failed to clean up basebackup cache: {:#}", e);
+                    }
+                }
+                _ = self.cancel.cancelled() => {
+                    tracing::info!("BasebackupCache background task cancelled");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Prepare a basebackup for the given timeline.
+    ///
+    /// The preparation is skipped (returning `Ok`) if a basebackup already exists
+    /// with an equal or higher LSN, if the cache already holds `max_size_entries`
+    /// entries, or if the timeline already has a higher last_record_lsn.
+    ///
+    /// The basebackup is prepared in a temporary directory and then moved to the final
+    /// location to make the operation atomic.
+    ///
+    /// Returns an error if the tenant shard is not attached or not active, the
+    /// timeline cannot be found, or writing or renaming the archive fails.
+    async fn prepare_basebackup(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        req_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        tracing::info!(
+            tenant_id = %tenant_shard_id.tenant_id,
+            %timeline_id,
+            %req_lsn,
+            "Preparing basebackup for timeline",
+        );
+
+        let tti = TenantTimelineId::new(tenant_shard_id.tenant_id, timeline_id);
+
+        // Scope the lock guard so it is released before the awaits below.
+        {
+            let entries = self.entries.lock().unwrap();
+            if let Some(&entry_lsn) = entries.get(&tti) {
+                if entry_lsn >= req_lsn {
+                    tracing::info!(
+                        %timeline_id,
+                        %req_lsn,
+                        %entry_lsn,
+                        "Basebackup entry already exists for timeline with higher LSN, skipping basebackup",
+                    );
+                    self.prepare_skip_count.inc();
+                    return Ok(());
+                }
+            }
+
+            // NB: this check also applies when an older entry for this timeline
+            // exists and would merely be replaced.
+            if entries.len() as i64 >= self.config.max_size_entries {
+                tracing::info!(
+                    %timeline_id,
+                    %req_lsn,
+                    "Basebackup cache is full, skipping basebackup",
+                );
+                self.prepare_skip_count.inc();
+                return Ok(());
+            }
+        }
+
+        let tenant = self
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        let tenant_state = tenant.current_state();
+        if tenant_state != TenantState::Active {
+            anyhow::bail!(
+                "Tenant {} is not active, current state: {:?}",
+                tenant_shard_id.tenant_id,
+                tenant_state
+            )
+        }
+
+        let timeline = tenant.get_timeline(timeline_id, true)?;
+
+        // The cache only stores the latest LSN of a timeline, so a request that
+        // the timeline has already moved past is not worth preparing.
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if last_record_lsn > req_lsn {
+            tracing::info!(
+                %timeline_id,
+                %req_lsn,
+                %last_record_lsn,
+                "Timeline has a higher LSN than the requested one, skipping basebackup",
+            );
+            self.prepare_skip_count.inc();
+            return Ok(());
+        }
+
+        let entry_tmp_path = self.entry_tmp_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
+
+        let res = self
+            .prepare_basebackup_tmp(&entry_tmp_path, &timeline, req_lsn)
+            .await;
+
+        if let Err(err) = res {
+            tracing::info!("Failed to prepare basebackup tmp file: {:#}", err);
+            // Try to clean up tmp file. If we fail, the background clean up task will take care of it.
+            match tokio::fs::remove_file(&entry_tmp_path).await {
+                Ok(_) => {}
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+                Err(e) => {
+                    tracing::info!("Failed to remove basebackup tmp file: {:?}", e);
+                }
+            }
+            return Err(err);
+        }
+
+        // Move the tmp file to the final location atomically.
+        let entry_path = self.entry_path(tenant_shard_id.tenant_id, timeline_id, req_lsn);
+        tokio::fs::rename(&entry_tmp_path, &entry_path).await?;
+
+        // Publish the new entry only after the file is in place.
+        let mut entries = self.entries.lock().unwrap();
+        if let Some(old_lsn) = entries.insert(tti, req_lsn) {
+            // Remove the old entry if it exists.
+            self.remove_entry_sender
+                .send(self.entry_path(tenant_shard_id.tenant_id, timeline_id, old_lsn))
+                .unwrap();
+        }
+        BASEBACKUP_CACHE_ENTRIES.set(entries.len() as i64);
+
+        self.prepare_ok_count.inc();
+        Ok(())
+    }
+
+    /// Writes a gzip-compressed basebackup of `timeline` at `req_lsn` to the
+    /// temporary file `entry_tmp_path`.
+    ///
+    /// The requested LSN is awaited before the file is created, so a failed wait
+    /// leaves nothing behind. On success the file has been flushed and synced to
+    /// disk. On a later error a partially written file may remain; the caller is
+    /// expected to remove it.
+    async fn prepare_basebackup_tmp(
+        &self,
+        entry_tmp_path: &Utf8Path,
+        timeline: &Arc<Timeline>,
+        req_lsn: Lsn,
+    ) -> anyhow::Result<()> {
+        let ctx = RequestContext::new(TaskKind::BasebackupCache, DownloadBehavior::Download);
+        let ctx = ctx.with_scope_timeline(timeline);
+
+        // We may receive a request before the WAL record is applied to the timeline.
+        // Wait for the requested LSN to be applied before creating any file.
+        timeline
+            .wait_lsn(
+                req_lsn,
+                crate::tenant::timeline::WaitLsnWaiter::BaseBackupCache,
+                crate::tenant::timeline::WaitLsnTimeout::Default,
+                &ctx,
+            )
+            .await?;
+
+        let file = tokio::fs::File::create(entry_tmp_path).await?;
+        let mut writer = BufWriter::new(file);
+
+        let mut encoder = GzipEncoder::with_quality(
+            &mut writer,
+            // Level::Best because compression is not on the hot path of basebackup requests.
+            // The decompression is almost not affected by the compression level.
+            async_compression::Level::Best,
+        );
+
+        send_basebackup_tarball(
+            &mut encoder,
+            timeline,
+            Some(req_lsn),
+            None,
+            false,
+            false,
+            &ctx,
+        )
+        .await?;
+
+        // Finish the gzip stream, then flush the buffered bytes and sync the file
+        // to disk before the caller moves it into place.
+        encoder.shutdown().await?;
+        writer.flush().await?;
+        writer.into_inner().sync_all().await?;
+
+        Ok(())
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -16,6 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
 use nix::sys::socket::{setsockopt, sockopt};
+use pageserver::basebackup_cache::BasebackupCache;
 use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
@@ -541,6 +542,8 @@ fn start_pageserver(
        pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone());

    // Scan the local 'tenants/' directory and start loading the tenants
+    let (basebackup_prepare_sender, basebackup_prepare_receiver) =
+        tokio::sync::mpsc::unbounded_channel();
    let deletion_queue_client = deletion_queue.new_client();
    let background_purges = mgr::BackgroundPurges::default();
    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
@@ -551,12 +554,22 @@ fn start_pageserver(
            remote_storage: remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
+            basebackup_prepare_sender,
        },
        order,
        shutdown_pageserver.clone(),
    ))?;
    let tenant_manager = Arc::new(tenant_manager);

+    let basebackup_cache = BasebackupCache::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        conf.basebackup_cache_dir(),
+        conf.basebackup_cache_config.clone(),
+        basebackup_prepare_receiver,
+        Arc::clone(&tenant_manager),
+        shutdown_pageserver.child_token(),
+    );
+
    BACKGROUND_RUNTIME.spawn({
        let shutdown_pageserver = shutdown_pageserver.clone();
        let drive_init = async move {
@@ -763,6 +776,7 @@ fn start_pageserver(
        } else {
            None
        },
+        basebackup_cache,
    );

    // All started up! Now just sit and wait for shutdown signal.
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -232,6 +232,8 @@ pub struct PageServerConf {
    pub dev_mode: bool,

    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
+
+    pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
 }

 /// Token for authentication to safekeepers
@@ -261,6 +263,10 @@ impl PageServerConf {
        self.workdir.join("metadata.json")
    }

+    pub fn basebackup_cache_dir(&self) -> Utf8PathBuf {
+        self.workdir.join("basebackup_cache")
+    }
+
    pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
@@ -407,6 +413,7 @@ impl PageServerConf {
            enable_tls_page_service_api,
            dev_mode,
            timeline_import_config,
+            basebackup_cache_config,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -461,6 +468,7 @@ impl PageServerConf {
            enable_tls_page_service_api,
            dev_mode,
            timeline_import_config,
+            basebackup_cache_config,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
@@ -544,6 +552,23 @@ impl PageServerConf {
                    ratio.numerator, ratio.denominator
                )
            );
+
+            let url = Url::parse(&tracing_config.export_config.endpoint)
+                .map_err(anyhow::Error::msg)
+                .with_context(|| {
+                    format!(
+                        "tracing endpoint URL is invalid : {}",
+                        tracing_config.export_config.endpoint
+                    )
+                })?;
+
+            ensure!(
+                url.scheme() == "http" || url.scheme() == "https",
+                format!(
+                    "tracing endpoint URL must start with http:// or https://: {}",
+                    tracing_config.export_config.endpoint
+                )
+            );
        }

        IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
@@ -660,4 +685,25 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }
+
+    #[test]
+    fn test_config_tracing_endpoint_is_invalid() {
+        let input = r#"
+            control_plane_api = "http://localhost:6666"
+
+            [tracing]
+            # The sampling ratio is valid, so only the scheme-less endpoint can be rejected.
+            sampling_ratio = { numerator = 1, denominator = 100 }
+
+            [tracing.export_config]
+            endpoint = "localhost:4317"
+            protocol = "http-binary"
+            timeout = "1ms"
+        "#;
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
+            .expect("config has valid fields");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
+            .expect_err("parse_and_validate should fail for endpoint without scheme");
+    }
 }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -18,12 +18,25 @@ use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 // management.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
 pub(super) enum Name {
-    /// Timeline last_record_lsn, absolute
+    /// Timeline last_record_lsn, absolute.
    #[serde(rename = "written_size")]
    WrittenSize,
    /// Timeline last_record_lsn, incremental
    #[serde(rename = "written_data_bytes_delta")]
    WrittenSizeDelta,
+    /// Written bytes only on this timeline (not including ancestors):
+    /// written_size - ancestor_lsn
+    ///
+    /// On the root branch, this is equivalent to `written_size`.
+    #[serde(rename = "written_size_since_parent")]
+    WrittenSizeSinceParent,
+    /// PITR history size only on this timeline (not including ancestors):
+    /// last_record_lsn - max(pitr_cutoff, ancestor_lsn).
+    ///
+    /// On the root branch, this is its entire PITR history size. Not emitted if GC hasn't computed
+    /// the PITR cutoff yet. 0 if PITR is disabled.
+    #[serde(rename = "pitr_history_size_since_parent")]
+    PitrHistorySizeSinceParent,
    /// Timeline logical size
    #[serde(rename = "timeline_logical_size")]
    LogicalSize,
@@ -157,6 +170,32 @@ impl MetricsKey {
        .incremental_values()
    }

+    /// Branch-local written size: `written_size` - `ancestor_lsn`.
+    const fn written_size_since_parent(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::WrittenSizeSinceParent,
+        }
+        .absolute_values()
+    }
+
+    /// Branch-local PITR history size: `written_size` - max(`pitr_cutoff`, `ancestor_lsn`).
+    const fn pitr_history_size_since_parent(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> AbsoluteValueFactory {
+        MetricsKey {
+            tenant_id,
+            timeline_id: Some(timeline_id),
+            metric: Name::PitrHistorySizeSinceParent,
+        }
+        .absolute_values()
+    }
+
    /// Exact [`Timeline::get_current_logical_size`].
    ///
    /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
@@ -334,7 +373,13 @@ impl TenantSnapshot {
 struct TimelineSnapshot {
    loaded_at: (Lsn, SystemTime),
    last_record_lsn: Lsn,
+    ancestor_lsn: Lsn,
    current_exact_logical_size: Option<u64>,
+    /// Whether PITR is enabled (pitr_interval > 0).
+    pitr_enabled: bool,
+    /// The PITR cutoff LSN. None if not yet initialized. If PITR is disabled, this is approximately
+    /// Some(last_record_lsn), but may lag behind it since it's computed periodically.
+    pitr_cutoff: Option<Lsn>,
 }

 impl TimelineSnapshot {
@@ -354,6 +399,9 @@ impl TimelineSnapshot {
        } else {
            let loaded_at = t.loaded_at;
            let last_record_lsn = t.get_last_record_lsn();
+            let ancestor_lsn = t.get_ancestor_lsn();
+            let pitr_enabled = !t.get_pitr_interval().is_zero();
+            let pitr_cutoff = t.gc_info.read().unwrap().cutoffs.time;

            let current_exact_logical_size = {
                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
@@ -373,7 +421,10 @@ impl TimelineSnapshot {
            Ok(Some(TimelineSnapshot {
                loaded_at,
                last_record_lsn,
+                ancestor_lsn,
                current_exact_logical_size,
+                pitr_enabled,
+                pitr_cutoff,
            }))
        }
    }
@@ -424,6 +475,8 @@ impl TimelineSnapshot {

        let up_to = now;

+        let written_size_last = written_size_now.value.max(prev.1); // don't regress
+
        if let Some(delta) = written_size_now.value.checked_sub(prev.1) {
            let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
            // written_size_delta
@@ -441,6 +494,27 @@ impl TimelineSnapshot {
            });
        }

+        // Compute the branch-local written size.
+        let written_size_since_parent_key =
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id);
+        metrics.push(
+            written_size_since_parent_key
+                .at(now, written_size_last.saturating_sub(self.ancestor_lsn.0)),
+        );
+
+        // Compute the branch-local PITR history size. Not emitted if GC hasn't yet computed the
+        // PITR cutoff. 0 if PITR is disabled.
+        let pitr_history_size_since_parent_key =
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id);
+        if !self.pitr_enabled {
+            metrics.push(pitr_history_size_since_parent_key.at(now, 0));
+        } else if let Some(pitr_cutoff) = self.pitr_cutoff {
+            metrics.push(pitr_history_size_since_parent_key.at(
+                now,
+                written_size_last.saturating_sub(pitr_cutoff.max(self.ancestor_lsn).0),
+            ));
+        }
+
        {
            let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
            let current_or_previous = self
--- a/pageserver/src/consumption_metrics/metrics/tests.rs
+++ b/pageserver/src/consumption_metrics/metrics/tests.rs
@@ -12,12 +12,17 @@ fn startup_collected_timeline_metrics_before_advancing() {
    let cache = HashMap::new();

    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;

    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, SystemTime::now()),
        last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: Some(logical_size),
+        pitr_enabled: true,
+        pitr_cutoff: Some(pitr_cutoff),
    };

    let now = DateTime::<Utc>::from(SystemTime::now());
@@ -33,7 +38,11 @@ fn startup_collected_timeline_metrics_before_advancing() {
                0
            ),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
 }
@@ -49,7 +58,9 @@ fn startup_collected_timeline_metrics_second_round() {
    let before = DateTime::<Utc>::from(before);

    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;

    let mut metrics = Vec::new();
    let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
@@ -59,7 +70,10 @@ fn startup_collected_timeline_metrics_second_round() {
    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: Some(logical_size),
+        pitr_enabled: true,
+        pitr_cutoff: Some(pitr_cutoff),
    };

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -69,7 +83,11 @@ fn startup_collected_timeline_metrics_second_round() {
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
 }
@@ -86,7 +104,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
    let before = DateTime::<Utc>::from(before);

    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+    let logical_size = 0x42000;

    let mut metrics = Vec::new();
    let cache = HashMap::from([
@@ -103,7 +123,10 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
    let snap = TimelineSnapshot {
        loaded_at: (disk_consistent_lsn, init),
        last_record_lsn: disk_consistent_lsn,
-        current_exact_logical_size: Some(0x42000),
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: Some(logical_size),
+        pitr_enabled: true,
+        pitr_cutoff: Some(pitr_cutoff),
    };

    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
@@ -113,16 +136,18 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
-            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0 - pitr_cutoff.0),
+            MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, logical_size)
        ]
    );
 }

+/// Tests that written sizes do not regress across restarts.
 #[test]
 fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
-    // it can happen that we lose the inmemorylayer but have previously sent metrics and we
-    // should never go backwards
-
    let tenant_id = TenantId::generate();
    let timeline_id = TimelineId::generate();

@@ -140,7 +165,10 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
    let snap = TimelineSnapshot {
        loaded_at: (Lsn(50), at_restart),
        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(0),
        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: Some(Lsn(20)),
    };

    let mut cache = HashMap::from([
@@ -169,6 +197,8 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
                0
            ),
            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 80),
        ]
    );

@@ -183,6 +213,157 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
        &[
            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 80),
+        ]
+    );
+}
+
+/// Tests that written sizes do not regress across restarts, even on child branches.
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let now = DateTime::<Utc>::from(now);
+    let later = DateTime::<Utc>::from(later);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(40),
+        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: Some(Lsn(20)),
+    };
+
+    let mut cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id)
+            .at(before_restart, 100)
+            .to_kv_pair(),
+        MetricsKey::written_size_delta(tenant_id, timeline_id)
+            .from_until(
+                way_before,
+                before_restart,
+                // not taken into account, but the timestamps are important
+                999_999_999,
+            )
+            .to_kv_pair(),
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                before_restart,
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 60),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
+        ]
+    );
+
+    // now if we cache these metrics, and re-run while "still in recovery"
+    cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
+
+    // "still in recovery", because our snapshot did not change
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 60),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
+        ]
+    );
+}
+
+/// Tests that written sizes do not regress across restarts, even on child branches and
+/// with a PITR cutoff after the branch point.
+#[test]
+fn post_restart_written_sizes_with_rolled_back_last_record_lsn_and_ancestor_lsn_and_pitr_cutoff() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let [later, now, at_restart] = time_backwards();
+
+    // FIXME: tests would be so much easier if we did not need to juggle back and forth
+    // SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
+    let now = DateTime::<Utc>::from(now);
+    let later = DateTime::<Utc>::from(later);
+    let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
+    let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
+    let before_restart = DateTime::<Utc>::from(before_restart);
+    let way_before = DateTime::<Utc>::from(way_before);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (Lsn(50), at_restart),
+        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(30),
+        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: Some(Lsn(40)),
+    };
+
+    let mut cache = HashMap::from([
+        MetricsKey::written_size(tenant_id, timeline_id)
+            .at(before_restart, 100)
+            .to_kv_pair(),
+        MetricsKey::written_size_delta(tenant_id, timeline_id)
+            .from_until(
+                way_before,
+                before_restart,
+                // not taken into account, but the timestamps are important
+                999_999_999,
+            )
+            .to_kv_pair(),
+    ]);
+
+    let mut metrics = Vec::new();
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                before_restart,
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 70),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 60),
+        ]
+    );
+
+    // now if we cache these metrics, and re-run while "still in recovery"
+    cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
+
+    // "still in recovery", because our snapshot did not change
+    snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
+            MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(later, 70),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(later, 60),
        ]
    );
 }
@@ -201,7 +382,10 @@ fn post_restart_current_exact_logical_size_uses_cached() {
    let snap = TimelineSnapshot {
        loaded_at: (Lsn(50), at_restart),
        last_record_lsn: Lsn(50),
+        ancestor_lsn: Lsn(0),
        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: None,
    };

    let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
@@ -286,16 +470,101 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
    times
 }

+/// Tests that disabled PITR history does not yield any history size, even when the PITR cutoff
+/// indicates otherwise.
+#[test]
+fn pitr_disabled_yields_no_history_size() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::new();
+
+    let initdb_lsn = Lsn(0x10000);
+    let pitr_cutoff = Lsn(0x11000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: None,
+        pitr_enabled: false,
+        pitr_cutoff: Some(pitr_cutoff),
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                snap.loaded_at.1.into(),
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+            MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
+        ]
+    );
+}
+
+/// Tests that uninitialized PITR cutoff does not emit any history size metric at all.
+#[test]
+fn pitr_uninitialized_does_not_emit_history_size() {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+
+    let mut metrics = Vec::new();
+    let cache = HashMap::new();
+
+    let initdb_lsn = Lsn(0x10000);
+    let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
+
+    let snap = TimelineSnapshot {
+        loaded_at: (disk_consistent_lsn, SystemTime::now()),
+        last_record_lsn: disk_consistent_lsn,
+        ancestor_lsn: Lsn(0),
+        current_exact_logical_size: None,
+        pitr_enabled: true,
+        pitr_cutoff: None,
+    };
+
+    let now = DateTime::<Utc>::from(SystemTime::now());
+
+    snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
+
+    assert_eq!(
+        metrics,
+        &[
+            MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
+                snap.loaded_at.1.into(),
+                now,
+                0
+            ),
+            MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
+            MetricsKey::written_size_since_parent(tenant_id, timeline_id)
+                .at(now, disk_consistent_lsn.0),
+        ]
+    );
+}
+
 pub(crate) const fn metric_examples_old(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [RawMetric; 5] {
+) -> [RawMetric; 7] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id)
            .from_until_old_format(before, now, 0),
+        MetricsKey::written_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
+        MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
        MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
@@ -307,10 +576,12 @@ pub(crate) const fn metric_examples(
    timeline_id: TimelineId,
    now: DateTime<Utc>,
    before: DateTime<Utc>,
-) -> [NewRawMetric; 5] {
+) -> [NewRawMetric; 7] {
    [
        MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
+        MetricsKey::written_size_since_parent(tenant_id, timeline_id).at(now, 0),
+        MetricsKey::pitr_history_size_since_parent(tenant_id, timeline_id).at(now, 0),
        MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
        MetricsKey::remote_storage_size(tenant_id).at(now, 0),
        MetricsKey::synthetic_size(tenant_id).at(now, 1),
--- a/pageserver/src/consumption_metrics/upload.rs
+++ b/pageserver/src/consumption_metrics/upload.rs
@@ -513,6 +513,14 @@ mod tests {
                line!(),
                r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
+            (
+                line!(),
+                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"pitr_history_size_since_parent","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
+            ),
            (
                line!(),
                r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
@@ -560,7 +568,7 @@ mod tests {
        assert_eq!(upgraded_samples, new_samples);
    }

-    fn metric_samples_old() -> [RawMetric; 5] {
+    fn metric_samples_old() -> [RawMetric; 7] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

@@ -572,7 +580,7 @@ mod tests {
        super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
    }

-    fn metric_samples() -> [NewRawMetric; 5] {
+    fn metric_samples() -> [NewRawMetric; 7] {
        let tenant_id = TenantId::from_array([0; 16]);
        let timeline_id = TimelineId::from_array([0xff; 16]);

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -449,7 +449,7 @@ async fn build_timeline_info_common(
    // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
    // actually trimmed data to), which can pass each other when PITR is changed.
    let min_readable_lsn = std::cmp::max(
-        timeline.get_gc_cutoff_lsn(),
+        timeline.get_gc_cutoff_lsn().unwrap_or_default(),
        *timeline.get_applied_gc_cutoff_lsn(),
    );

@@ -3199,7 +3199,7 @@ async fn list_aux_files(
            .await?;

    let io_concurrency = IoConcurrency::spawn_from_conf(
-        state.conf,
+        state.conf.get_vectored_concurrent_io,
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -3,6 +3,7 @@

 mod auth;
 pub mod basebackup;
+pub mod basebackup_cache;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -843,23 +843,50 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
    register_uint_gauge!(
-        "pageserver_relsize_cache_entries",
-        "Number of entries in the relation size cache",
+        "pageserver_relsize_latest_cache_entries",
+        "Number of entries in the latest relation size cache",
    )
    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
-        .expect("failed to define a metric")
+pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_latest_cache_hits",
+        "Latest relation size cache hits",
+    )
+    .expect("failed to define a metric")
 });

-pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
+pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
-        "pageserver_relsize_cache_misses",
-        "Relation size cache misses",
+        "pageserver_relsize_latest_cache_misses",
+        "Latest relation size cache misses",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_relsize_snapshot_cache_entries",
+        "Number of entries in the snapshot relation size cache",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_snapshot_cache_hits",
+        "Snapshot relation size cache hits",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_relsize_snapshot_cache_misses",
+        "Snapshot relation size cache misses",
     )
     .expect("failed to define a metric")
 });
@@ -1039,6 +1066,15 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
    .expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
 });

+pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_tenant_offloaded_timelines",
+        "Number of offloaded timelines of a tenant",
+        &["tenant_id", "shard_id"]
+    )
+    .expect("Failed to register pageserver_tenant_offloaded_timelines metric")
+});
+
 pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_eviction_iteration_duration_seconds_global",
@@ -3524,11 +3560,14 @@ impl TimelineMetrics {
 }

 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+    let tid = tenant_shard_id.tenant_id.to_string();
+    let shard_id = tenant_shard_id.shard_slug().to_string();
+
    // Only shard zero deals in synthetic sizes
    if tenant_shard_id.is_shard_zero() {
-        let tid = tenant_shard_id.tenant_id.to_string();
        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    }
+    let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);

    tenant_throttling::remove_tenant_metrics(tenant_shard_id);

@@ -4320,6 +4359,42 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
        .set(u64::try_from(num_threads.get()).unwrap());
 }

+pub(crate) static BASEBACKUP_CACHE_READ: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_basebackup_cache_read_total",
+        "Number of read accesses to the basebackup cache grouped by hit/miss/error",
+        &["result"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static BASEBACKUP_CACHE_PREPARE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_basebackup_cache_prepare_total",
+        "Number of prepare requests processed by the basebackup cache grouped by ok/skip/error",
+        &["result"]
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static BASEBACKUP_CACHE_ENTRIES: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_basebackup_cache_entries_total",
+        "Number of entries in the basebackup cache"
+    )
+    .expect("failed to define a metric")
+});
+
+// FIXME: Support basebackup cache size metrics.
+#[allow(dead_code)]
+pub(crate) static BASEBACKUP_CACHE_SIZE: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_basebackup_cache_size_bytes",
+        "Total size of all basebackup cache entries on disk in bytes"
+    )
+    .expect("failed to define a metric")
+});
+
 static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_config_ignored_items",
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,7 +9,6 @@ use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};

-use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, bail};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
@@ -18,7 +17,7 @@ use itertools::Itertools;
 use jsonwebtoken::TokenData;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
-    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
+    GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
@@ -52,8 +51,10 @@ use utils::simple_rcu::RcuReadGuard;
 use utils::sync::gate::{Gate, GateGuard};
 use utils::sync::spsc_fold;

+use crate::PERF_TRACE_TARGET;
 use crate::auth::check_permission;
 use crate::basebackup::BasebackupError;
+use crate::basebackup_cache::BasebackupCache;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -62,7 +63,7 @@ use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
    SmgrOpTimer, TimelineMetrics,
 };
-use crate::pgdatadir_mapping::Version;
+use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -107,6 +108,7 @@ pub fn spawn(
    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
    tls_config: Option<Arc<rustls::ServerConfig>>,
+    basebackup_cache: Arc<BasebackupCache>,
 ) -> Listener {
    let cancel = CancellationToken::new();
    let libpq_ctx = RequestContext::todo_child(
@@ -128,6 +130,7 @@ pub fn spawn(
            conf.pg_auth_type,
            tls_config,
            conf.page_service_pipelining.clone(),
+            basebackup_cache,
            libpq_ctx,
            cancel.clone(),
        )
@@ -186,6 +189,7 @@ pub async fn libpq_listener_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
+    basebackup_cache: Arc<BasebackupCache>,
    listener_ctx: RequestContext,
    listener_cancel: CancellationToken,
 ) -> Connections {
@@ -229,6 +233,7 @@ pub async fn libpq_listener_main(
                    auth_type,
                    tls_config.clone(),
                    pipelining_config.clone(),
+                    Arc::clone(&basebackup_cache),
                    connection_ctx,
                    connections_cancel.child_token(),
                    gate_guard,
@@ -271,6 +276,7 @@ async fn page_service_conn_main(
    auth_type: AuthType,
    tls_config: Option<Arc<rustls::ServerConfig>>,
    pipelining_config: PageServicePipeliningConfig,
+    basebackup_cache: Arc<BasebackupCache>,
    connection_ctx: RequestContext,
    cancel: CancellationToken,
    gate_guard: GateGuard,
@@ -331,11 +337,12 @@ async fn page_service_conn_main(
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
    let mut conn_handler = PageServerHandler::new(
-        conf,
        tenant_manager,
        auth,
        pipelining_config,
+        conf.get_vectored_concurrent_io,
        perf_span_fields,
+        basebackup_cache,
        connection_ctx,
        cancel.clone(),
        gate_guard,
@@ -371,7 +378,6 @@ async fn page_service_conn_main(
 }

 struct PageServerHandler {
-    conf: &'static PageServerConf,
    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

@@ -389,6 +395,9 @@ struct PageServerHandler {
    timeline_handles: Option<TimelineHandles>,

    pipelining_config: PageServicePipeliningConfig,
+    get_vectored_concurrent_io: GetVectoredConcurrentIo,
+
+    basebackup_cache: Arc<BasebackupCache>,

    gate_guard: GateGuard,
 }
@@ -642,7 +651,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
    timer: SmgrOpTimer,
-    effective_request_lsn: Lsn,
+    lsn_range: LsnRange,
    ctx: RequestContext,
 }

@@ -764,12 +773,12 @@ impl BatchedFeMessage {
                match batching_strategy {
                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
                        if let Some(last_in_batch) = accum_pages.last() {
-                            if last_in_batch.effective_request_lsn
-                                != this_pages[0].effective_request_lsn
+                            if last_in_batch.lsn_range.effective_lsn
+                                != this_pages[0].lsn_range.effective_lsn
                            {
                                trace!(
-                                    accum_lsn = %last_in_batch.effective_request_lsn,
-                                    this_lsn = %this_pages[0].effective_request_lsn,
+                                    accum_lsn = %last_in_batch.lsn_range.effective_lsn,
+                                    this_lsn = %this_pages[0].lsn_range.effective_lsn,
                                    "stopping batching because LSN changed"
                                );

@@ -784,15 +793,15 @@ impl BatchedFeMessage {
                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
                            batched.req.rel == this_pages[0].req.rel
                                && batched.req.blkno == this_pages[0].req.blkno
-                                && batched.effective_request_lsn
-                                    != this_pages[0].effective_request_lsn
+                                && batched.lsn_range.effective_lsn
+                                    != this_pages[0].lsn_range.effective_lsn
                        });

                        if same_page_different_lsn {
                            trace!(
                                rel=%this_pages[0].req.rel,
                                blkno=%this_pages[0].req.blkno,
-                                lsn=%this_pages[0].effective_request_lsn,
+                                lsn=%this_pages[0].lsn_range.effective_lsn,
                                "stopping batching because same page was requested at different LSNs"
                            );

@@ -844,17 +853,17 @@ impl BatchedFeMessage {
 impl PageServerHandler {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
-        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
+        get_vectored_concurrent_io: GetVectoredConcurrentIo,
        perf_span_fields: ConnectionPerfSpanFields,
+        basebackup_cache: Arc<BasebackupCache>,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
    ) -> Self {
        PageServerHandler {
-            conf,
            auth,
            claims: None,
            connection_ctx,
@@ -862,6 +871,8 @@ impl PageServerHandler {
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
+            get_vectored_concurrent_io,
+            basebackup_cache,
            gate_guard,
        }
    }
@@ -1158,7 +1169,7 @@ impl PageServerHandler {
                .await?;

                // We're holding the Handle
-                let effective_request_lsn = match Self::effective_request_lsn(
+                let effective_lsn = match Self::effective_request_lsn(
                    &shard,
                    shard.get_last_record_lsn(),
                    req.hdr.request_lsn,
@@ -1177,7 +1188,10 @@ impl PageServerHandler {
                    pages: smallvec::smallvec![BatchedGetPageRequest {
                        req,
                        timer,
-                        effective_request_lsn,
+                        lsn_range: LsnRange {
+                            effective_lsn,
+                            request_lsn: req.hdr.request_lsn
+                        },
                        ctx,
                    }],
                    // The executor grabs the batch when it becomes idle.
@@ -1623,7 +1637,7 @@ impl PageServerHandler {
        }

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.get_vectored_concurrent_io,
            match self.gate_guard.try_clone() {
                Ok(guard) => guard,
                Err(_) => {
@@ -2127,7 +2141,14 @@ impl PageServerHandler {
        .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_exists(
+                req.rel,
+                Version::LsnRange(LsnRange {
+                    effective_lsn: lsn,
+                    request_lsn: req.hdr.request_lsn,
+                }),
+                ctx,
+            )
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -2154,7 +2175,14 @@ impl PageServerHandler {
        .await?;

        let n_blocks = timeline
-            .get_rel_size(req.rel, Version::Lsn(lsn), ctx)
+            .get_rel_size(
+                req.rel,
+                Version::LsnRange(LsnRange {
+                    effective_lsn: lsn,
+                    request_lsn: req.hdr.request_lsn,
+                }),
+                ctx,
+            )
            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -2181,7 +2209,15 @@ impl PageServerHandler {
        .await?;

        let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::LsnRange(LsnRange {
+                    effective_lsn: lsn,
+                    request_lsn: req.hdr.request_lsn,
+                }),
+                ctx,
+            )
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -2214,7 +2250,7 @@ impl PageServerHandler {
                // Ignore error (trace buffer may be full or tracer may have disconnected).
                _ = page_trace.try_send(PageTraceEvent {
                    key,
-                    effective_lsn: batch.effective_request_lsn,
+                    effective_lsn: batch.lsn_range.effective_lsn,
                    time,
                });
            }
@@ -2229,7 +2265,7 @@ impl PageServerHandler {
                    perf_instrument = true;
                }

-                req.effective_request_lsn
+                req.lsn_range.effective_lsn
            })
            .max()
            .expect("batch is never empty");
@@ -2283,7 +2319,7 @@ impl PageServerHandler {
                    (
                        &p.req.rel,
                        &p.req.blkno,
-                        p.effective_request_lsn,
+                        p.lsn_range,
                        p.ctx.attached_child(),
                    )
                }),
@@ -2468,6 +2504,8 @@ impl PageServerHandler {
            .map_err(QueryError::Disconnected)?;
        self.flush_cancellable(pgb, &self.cancel).await?;

+        let mut from_cache = false;
+
        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
        if full_backup {
@@ -2485,7 +2523,33 @@ impl PageServerHandler {
            .map_err(map_basebackup_error)?;
        } else {
            let mut writer = BufWriter::new(pgb.copyout_writer());
-            if gzip {
+
+            let cached = {
+                // Basebackup is cached only for this combination of parameters.
+                if timeline.is_basebackup_cache_enabled()
+                    && gzip
+                    && lsn.is_some()
+                    && prev_lsn.is_none()
+                {
+                    self.basebackup_cache
+                        .get(tenant_id, timeline_id, lsn.unwrap())
+                        .await
+                } else {
+                    None
+                }
+            };
+
+            if let Some(mut cached) = cached {
+                from_cache = true;
+                tokio::io::copy(&mut cached, &mut writer)
+                    .await
+                    .map_err(|e| {
+                        map_basebackup_error(BasebackupError::Client(
+                            e,
+                            "handle_basebackup_request,cached,copy",
+                        ))
+                    })?;
+            } else if gzip {
                let mut encoder = GzipEncoder::with_quality(
                    &mut writer,
                    // NOTE using fast compression because it's on the critical path
@@ -2544,6 +2608,7 @@ impl PageServerHandler {
        info!(
            lsn_await_millis = lsn_awaited_after.as_millis(),
            basebackup_millis = basebackup_after.as_millis(),
+            %from_cache,
            "basebackup complete"
        );

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -43,7 +43,9 @@ use crate::aux_file;
 use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
-    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
+    RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
+    RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
+    RELSIZE_SNAPSHOT_CACHE_MISSES,
 };
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
@@ -90,6 +92,28 @@ pub enum LsnForTimestamp {
    NoData(Lsn),
 }

+/// Each request to the pageserver carries an LSN range: `not_modified_since..request_lsn`.
+/// See the comments in libs/pageserver_api/src/models.rs.
+/// Based on this range and `last_record_lsn`, the pageserver calculates `effective_lsn`.
+/// To distinguish requests from the primary and from replicas we also need to pass `request_lsn`.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct LsnRange {
+    pub effective_lsn: Lsn,
+    pub request_lsn: Lsn,
+}
+
+impl LsnRange {
+    pub fn at(lsn: Lsn) -> LsnRange {
+        LsnRange {
+            effective_lsn: lsn,
+            request_lsn: lsn,
+        }
+    }
+    pub fn is_latest(&self) -> bool {
+        self.request_lsn == Lsn::MAX
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum CalculateLogicalSizeError {
    #[error("cancelled")]
@@ -202,13 +226,13 @@ impl Timeline {
        io_concurrency: IoConcurrency,
    ) -> Result<Bytes, PageReconstructError> {
        match version {
-            Version::Lsn(effective_lsn) => {
+            Version::LsnRange(lsns) => {
                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                let res = self
                    .get_rel_page_at_lsn_batched(
-                        pages.iter().map(|(tag, blknum)| {
-                            (tag, blknum, effective_lsn, ctx.attached_child())
-                        }),
+                        pages
+                            .iter()
+                            .map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
                        io_concurrency.clone(),
                        ctx,
                    )
@@ -246,7 +270,7 @@ impl Timeline {
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
    ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -265,7 +289,7 @@ impl Timeline {
        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
            HashMap::with_capacity(pages.len());

-        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
+        for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
@@ -274,7 +298,7 @@ impl Timeline {
                slots_filled += 1;
                continue;
            }
-
+            let lsn = lsns.effective_lsn;
            let nblocks = {
                let ctx = RequestContextBuilder::from(&ctx)
                    .perf_span(|crnt_perf_span| {
@@ -289,7 +313,7 @@ impl Timeline {
                    .attached_child();

                match self
-                    .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
+                    .get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                    .await
                {
@@ -470,7 +494,7 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, version) {
            return Ok(nblocks);
        }

@@ -488,7 +512,7 @@ impl Timeline {
        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

-        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+        self.update_cached_rel_size(tag, version, nblocks);

        Ok(nblocks)
    }
@@ -510,7 +534,7 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
            return Ok(true);
        }
        // then check if the database was already initialized.
@@ -586,7 +610,7 @@ impl Timeline {
        // scan directory listing (new), merge with the old results
        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -632,7 +656,7 @@ impl Timeline {
    ) -> Result<Bytes, PageReconstructError> {
        assert!(self.tenant_shard_id.is_shard_zero());
        let n_blocks = self
-            .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
+            .get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
            .await?;

        let keyspace = KeySpace::single(
@@ -645,7 +669,7 @@ impl Timeline {
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
-            self.conf,
+            self.conf.get_vectored_concurrent_io,
            self.gate
                .enter()
                .map_err(|_| PageReconstructError::Cancelled)?,
@@ -867,11 +891,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
+            .list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
                .await?;

            let keyspace = KeySpace::single(
@@ -885,7 +909,7 @@ impl Timeline {
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| PageReconstructError::Cancelled)?,
@@ -1137,7 +1161,7 @@ impl Timeline {
        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
            for rel in self
-                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
                .await?
            {
                if self.cancel.is_cancelled() {
@@ -1212,7 +1236,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
+                .list_rels(spcnode, dbnode, Version::at(lsn), ctx)
                .await?
                .into_iter()
                .collect();
@@ -1329,59 +1353,75 @@ impl Timeline {
        Ok((dense_keyspace, sparse_keyspace))
    }

-    /// Get cached size of relation if it not updated after specified LSN
-    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
-        let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
-            if lsn >= *cached_lsn {
-                RELSIZE_CACHE_HITS.inc();
-                return Some(*nblocks);
+    /// Get the cached size of a relation. There are two caches: the latest cache, for primary updates, captures the
+    /// latest state of the timeline; the snapshot cache, whose key includes the LSN, can be used by replicas to get
+    /// the relation size at a particular LSN (snapshot).
+    pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option<BlockNumber> {
+        let lsn = version.get_lsn();
+        {
+            let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
+            if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+                if lsn >= *cached_lsn {
+                    RELSIZE_LATEST_CACHE_HITS.inc();
+                    return Some(*nblocks);
+                }
+                RELSIZE_CACHE_MISSES_OLD.inc();
            }
-            RELSIZE_CACHE_MISSES_OLD.inc();
        }
-        RELSIZE_CACHE_MISSES.inc();
+        {
+            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
+            if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
+                RELSIZE_SNAPSHOT_CACHE_HITS.inc();
+                return Some(*nblock);
+            }
+        }
+        if version.is_latest() {
+            RELSIZE_LATEST_CACHE_MISSES.inc();
+        } else {
+            RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
+        }
        None
    }

    /// Update cached relation size if there is no more recent update
-    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-
-        if lsn < rel_size_cache.complete_as_of {
-            // Do not cache old values. It's safe to cache the size on read, as long as
-            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
-            // never evict values from the cache, so if the relation size changed after
-            // 'lsn', the new value is already in the cache.
-            return;
-        }
-
-        match rel_size_cache.map.entry(tag) {
-            hash_map::Entry::Occupied(mut entry) => {
-                let cached_lsn = entry.get_mut();
-                if lsn >= cached_lsn.0 {
-                    *cached_lsn = (lsn, nblocks);
+    pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
+        let lsn = version.get_lsn();
+        if version.is_latest() {
+            let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
+            match rel_size_cache.entry(tag) {
+                hash_map::Entry::Occupied(mut entry) => {
+                    let cached_lsn = entry.get_mut();
+                    if lsn >= cached_lsn.0 {
+                        *cached_lsn = (lsn, nblocks);
+                    }
+                }
+                hash_map::Entry::Vacant(entry) => {
+                    entry.insert((lsn, nblocks));
+                    RELSIZE_LATEST_CACHE_ENTRIES.inc();
                }
            }
-            hash_map::Entry::Vacant(entry) => {
-                entry.insert((lsn, nblocks));
-                RELSIZE_CACHE_ENTRIES.inc();
+        } else {
+            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
+            if rel_size_cache.capacity() != 0 {
+                rel_size_cache.insert((lsn, tag), nblocks);
+                RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
            }
        }
    }

    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
-            RELSIZE_CACHE_ENTRIES.inc();
+        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
+        if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_LATEST_CACHE_ENTRIES.inc();
        }
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
-        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        if rel_size_cache.map.remove(tag).is_some() {
-            RELSIZE_CACHE_ENTRIES.dec();
+        let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
+        if rel_size_cache.remove(tag).is_some() {
+            RELSIZE_LATEST_CACHE_ENTRIES.dec();
        }
    }
 }
@@ -1585,7 +1625,10 @@ impl DatadirModification<'_> {
        //       check the cache too. This is because eagerly checking the cache results in
        //       less work overall and 10% better performance. It's more work on cache miss
        //       but cache miss is rare.
-        if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
+        if let Some(nblocks) = self
+            .tline
+            .get_cached_rel_size(&rel, Version::Modified(self))
+        {
            Ok(nblocks)
        } else if !self
            .tline
@@ -2667,7 +2710,7 @@ pub struct DatadirModificationStats {
 /// timeline to not miss the latest updates.
 #[derive(Clone, Copy)]
 pub enum Version<'a> {
-    Lsn(Lsn),
+    LsnRange(LsnRange),
    Modified(&'a DatadirModification<'a>),
 }

@@ -2679,7 +2722,7 @@ impl Version<'_> {
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        match self {
-            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
+            Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await,
            Version::Modified(modification) => modification.get(key, ctx).await,
        }
    }
@@ -2701,12 +2744,26 @@ impl Version<'_> {
        }
    }

-    fn get_lsn(&self) -> Lsn {
+    pub fn is_latest(&self) -> bool {
        match self {
-            Version::Lsn(lsn) => *lsn,
+            Version::LsnRange(lsns) => lsns.is_latest(),
+            Version::Modified(_) => true,
+        }
+    }
+
+    pub fn get_lsn(&self) -> Lsn {
+        match self {
+            Version::LsnRange(lsns) => lsns.effective_lsn,
            Version::Modified(modification) => modification.lsn,
        }
    }
+
+    pub fn at(lsn: Lsn) -> Self {
+        Version::LsnRange(LsnRange {
+            effective_lsn: lsn,
+            request_lsn: lsn,
+        })
+    }
 }

 //--- Metadata structs stored in key-value pairs in the repository.
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -380,6 +380,10 @@ pub enum TaskKind {
    DetachAncestor,

    ImportPgdata,
+
+    /// Background task of [`crate::basebackup_cache::BasebackupCache`].
+    /// Prepares basebackups and clears outdated entries.
+    BasebackupCache,
 }

 #[derive(Default)]
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -78,6 +78,7 @@ use self::timeline::uninit::{TimelineCreateGuard, TimelineExclusionError, Uninit
 use self::timeline::{
    EvictionTaskTenantState, GcCutoffs, TimelineDeleteProgress, TimelineResources, WaitLsnError,
 };
+use crate::basebackup_cache::BasebackupPrepareSender;
 use crate::config::PageServerConf;
 use crate::context;
 use crate::context::RequestContextBuilder;
@@ -86,8 +87,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
-    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
-    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
+    TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
 };
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -157,6 +158,7 @@ pub struct TenantSharedResources {
    pub remote_storage: GenericRemoteStorage,
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
+    pub basebackup_prepare_sender: BasebackupPrepareSender,
 }

 /// A [`TenantShard`] is really an _attached_ tenant.  The configuration
@@ -317,12 +319,15 @@ pub struct TenantShard {
    gc_cs: tokio::sync::Mutex<()>,
    walredo_mgr: Option<Arc<WalRedoManager>>,

-    // provides access to timeline data sitting in the remote storage
+    /// Provides access to timeline data sitting in the remote storage.
    pub(crate) remote_storage: GenericRemoteStorage,

-    // Access to global deletion queue for when this tenant wants to schedule a deletion
+    /// Access to global deletion queue for when this tenant wants to schedule a deletion.
    deletion_queue_client: DeletionQueueClient,

+    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
+    basebackup_prepare_sender: BasebackupPrepareSender,
+
    /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -1286,6 +1291,7 @@ impl TenantShard {
            remote_storage,
            deletion_queue_client,
            l0_flush_global_state,
+            basebackup_prepare_sender,
        } = resources;

        let attach_mode = attached_conf.location.attach_mode;
@@ -1301,6 +1307,7 @@ impl TenantShard {
            remote_storage.clone(),
            deletion_queue_client,
            l0_flush_global_state,
+            basebackup_prepare_sender,
        ));

        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -3348,6 +3355,13 @@ impl TenantShard {
                activated_timelines += 1;
            }

+            let tid = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id = self.tenant_shard_id.shard_slug().to_string();
+            let offloaded_timeline_count = timelines_offloaded_accessor.len();
+            TENANT_OFFLOADED_TIMELINES
+                .with_label_values(&[&tid, &shard_id])
+                .set(offloaded_timeline_count as u64);
+
            self.state.send_modify(move |current_state| {
                assert!(
                    matches!(current_state, TenantState::Activating(_)),
@@ -4232,6 +4246,7 @@ impl TenantShard {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
+        basebackup_prepare_sender: BasebackupPrepareSender,
    ) -> TenantShard {
        assert!(!attached_conf.location.generation.is_none());

@@ -4335,6 +4350,7 @@ impl TenantShard {
            ongoing_timeline_detach: std::sync::Mutex::default(),
            gc_block: Default::default(),
            l0_flush_global_state,
+            basebackup_prepare_sender,
        }
    }

@@ -4587,7 +4603,7 @@ impl TenantShard {

            target.cutoffs = GcCutoffs {
                space: space_cutoff,
-                time: Lsn::INVALID,
+                time: None,
            };
        }
    }
@@ -4671,7 +4687,7 @@ impl TenantShard {
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
                    if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
                        target.within_ancestor_pitr =
-                            timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
+                            Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time;
                    }
                }

@@ -4684,13 +4700,15 @@ impl TenantShard {
                    } else {
                        0
                    });
-                timeline.metrics.pitr_history_size.set(
-                    timeline
-                        .get_last_record_lsn()
-                        .checked_sub(target.cutoffs.time)
-                        .unwrap_or(Lsn(0))
-                        .0,
-                );
+                if let Some(time_cutoff) = target.cutoffs.time {
+                    timeline.metrics.pitr_history_size.set(
+                        timeline
+                            .get_last_record_lsn()
+                            .checked_sub(time_cutoff)
+                            .unwrap_or_default()
+                            .0,
+                    );
+                }

                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                // - this timeline was created while we were finding cutoffs
@@ -4699,8 +4717,8 @@ impl TenantShard {
                    let original_cutoffs = target.cutoffs.clone();
                    // GC cutoffs should never go back
                    target.cutoffs = GcCutoffs {
-                        space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
-                        time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
+                        space: cutoffs.space.max(original_cutoffs.space),
+                        time: cutoffs.time.max(original_cutoffs.time),
                    }
                }
            }
@@ -5252,6 +5270,7 @@ impl TenantShard {
            pagestream_throttle_metrics: self.pagestream_throttle_metrics.clone(),
            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
+            basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
        }
    }

@@ -5560,6 +5579,14 @@ impl TenantShard {
            }
        }

+        // Update metrics
+        let tid = self.tenant_shard_id.to_string();
+        let shard_id = self.tenant_shard_id.shard_slug().to_string();
+        let set_key = &[tid.as_str(), shard_id.as_str()][..];
+        TENANT_OFFLOADED_TIMELINES
+            .with_label_values(set_key)
+            .set(manifest.offloaded_timelines.len() as u64);
+
        // Upload the manifest. Remote storage does no retries internally, so retry here.
        match backoff::retry(
            || async {
@@ -5826,6 +5853,8 @@ pub(crate) mod harness {
        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

+            let (basebackup_requst_sender, _) = tokio::sync::mpsc::unbounded_channel();
+
            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
                self.conf,
@@ -5843,6 +5872,7 @@ pub(crate) mod harness {
                self.deletion_queue.new_client(),
                // TODO: ideally we should run all unit tests with both configs
                L0FlushGlobalState::new(L0FlushConfig::default()),
+                basebackup_requst_sender,
            ));

            let preload = tenant
@@ -8596,8 +8626,10 @@ mod tests {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Result<Option<Bytes>, GetVectoredError> {
-        let io_concurrency =
-            IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            tline.conf.get_vectored_concurrent_io,
+            tline.gate.enter().unwrap(),
+        );
        let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
        let mut res = tline
@@ -8935,7 +8967,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x30);
+            guard.cutoffs.time = Some(Lsn(0x30));
            guard.cutoffs.space = Lsn(0x30);
        }

@@ -9043,7 +9075,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.time = Some(Lsn(0x40));
            guard.cutoffs.space = Lsn(0x40);
        }
        tline
@@ -9461,7 +9493,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -9545,7 +9577,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x40);
+            guard.cutoffs.time = Some(Lsn(0x40));
            guard.cutoffs.space = Lsn(0x40);
        }
        tline
@@ -10016,7 +10048,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -10079,7 +10111,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -10157,7 +10189,7 @@ mod tests {
                .await;
            // Update GC info
            let mut guard = tline.gc_info.write().unwrap();
-            guard.cutoffs.time = Lsn(0x38);
+            guard.cutoffs.time = Some(Lsn(0x38));
            guard.cutoffs.space = Lsn(0x38);
        }
        tline
@@ -10265,7 +10297,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -10328,7 +10360,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -10514,7 +10546,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x10),
+                    time: Some(Lsn(0x10)),
                    space: Lsn(0x10),
                },
                leases: Default::default(),
@@ -10534,7 +10566,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x50),
+                    time: Some(Lsn(0x50)),
                    space: Lsn(0x50),
                },
                leases: Default::default(),
@@ -11255,7 +11287,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11644,7 +11676,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11707,7 +11739,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -11896,7 +11928,7 @@ mod tests {
                    (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
                ],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
@@ -11959,7 +11991,7 @@ mod tests {
        let verify_result = || async {
            let gc_horizon = {
                let gc_info = tline.gc_info.read().unwrap();
-                gc_info.cutoffs.time
+                gc_info.cutoffs.time.unwrap_or_default()
            };
            for idx in 0..10 {
                assert_eq!(
@@ -12222,7 +12254,7 @@ mod tests {
            *guard = GcInfo {
                retain_lsns: vec![],
                cutoffs: GcCutoffs {
-                    time: Lsn(0x30),
+                    time: Some(Lsn(0x30)),
                    space: Lsn(0x30),
                },
                leases: Default::default(),
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -235,7 +235,7 @@ pub(super) async fn gather_inputs(
        // than our internal space cutoff.  This means that if someone drops a database and waits for their
        // PITR interval, they will see synthetic size decrease, even if we are still storing data inside
        // the space cutoff.
-        let mut next_pitr_cutoff = gc_info.cutoffs.time;
+        let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None

        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -31,6 +31,7 @@ pub use inmemory_layer::InMemoryLayer;
 pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
 pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
+use pageserver_api::config::GetVectoredConcurrentIo;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
@@ -43,7 +44,6 @@ use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
-use crate::config::PageServerConf;
 use crate::context::{
    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
@@ -318,11 +318,10 @@ impl IoConcurrency {
    }

    pub(crate) fn spawn_from_conf(
-        conf: &'static PageServerConf,
+        conf: GetVectoredConcurrentIo,
        gate_guard: GateGuard,
    ) -> IoConcurrency {
-        use pageserver_api::config::GetVectoredConcurrentIo;
-        let selected = match conf.get_vectored_concurrent_io {
+        let selected = match conf {
            GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
            GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
        };
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -63,7 +63,28 @@ pub struct InMemoryLayer {

    opened_at: Instant,

-    /// The above fields never change, except for `end_lsn`, which is only set once.
+    /// All versions of all pages in the layer are kept here. Indexed
+    /// by block number and LSN. The [`IndexEntry`] is an offset into the
+    /// ephemeral file where the page version is stored.
+    ///
+    /// We use a separate lock for the index to reduce the critical section
+    /// during which reads cannot be planned.
+    ///
+    /// If you need access to both the index and the underlying file at the same time,
+    /// respect the following locking order to avoid deadlocks:
+    /// 1. [`InMemoryLayer::inner`]
+    /// 2. [`InMemoryLayer::index`]
+    ///
+    /// Note that the file backing [`InMemoryLayer::inner`] is append-only,
+    /// so it is not necessary to hold the [`InMemoryLayer::index`] lock while accessing the file.
+    /// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
+    /// In particular:
+    /// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
+    /// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
+    index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
+
+    /// The above fields never change, except for `end_lsn`, which is only set once,
+    /// and `index` (see rationale there).
    /// All other changing parts are in `inner`, and protected by a mutex.
    inner: RwLock<InMemoryLayerInner>,

@@ -81,11 +102,6 @@ impl std::fmt::Debug for InMemoryLayer {
 }

 pub struct InMemoryLayerInner {
-    /// All versions of all pages in the layer are kept here. Indexed
-    /// by block number and LSN. The [`IndexEntry`] is an offset into the
-    /// ephemeral file where the page version is stored.
-    index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
-
    /// The values are stored in a serialized format in this file.
    /// Each serialized Value is preceded by a 'u32' length field.
    /// PerSeg::page_versions map stores offsets into this file.
@@ -105,7 +121,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
    trailing_ones
 };

-/// See [`InMemoryLayerInner::index`].
+/// See [`InMemoryLayer::index`].
 ///
 /// For memory efficiency, the data is packed into a u64.
 ///
@@ -425,7 +441,7 @@ impl InMemoryLayer {
            .page_content_kind(PageContentKind::InMemoryLayer)
            .attached_child();

-        let inner = self.inner.read().await;
+        let index = self.index.read().await;

        struct ValueRead {
            entry_lsn: Lsn,
@@ -435,10 +451,7 @@ impl InMemoryLayer {
        let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();

        for range in keyspace.ranges.iter() {
-            for (key, vec_map) in inner
-                .index
-                .range(range.start.to_compact()..range.end.to_compact())
-            {
+            for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
                let key = Key::from_compact(*key);
                let slice = vec_map.slice_range(lsn_range.clone());

@@ -466,7 +479,7 @@ impl InMemoryLayer {
                }
            }
        }
-        drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
+        drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
        let read_from = Arc::clone(self);
        let read_ctx = ctx.attached_child();
        reconstruct_state
@@ -573,8 +586,8 @@ impl InMemoryLayer {
            start_lsn,
            end_lsn: OnceLock::new(),
            opened_at: Instant::now(),
+            index: RwLock::new(BTreeMap::new()),
            inner: RwLock::new(InMemoryLayerInner {
-                index: BTreeMap::new(),
                file,
                resource_units: GlobalResourceUnits::new(),
            }),
@@ -592,31 +605,39 @@ impl InMemoryLayer {
        serialized_batch: SerializedValueBatch,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let mut inner = self.inner.write().await;
-        self.assert_writable();
+        let (base_offset, metadata) = {
+            let mut inner = self.inner.write().await;
+            self.assert_writable();

-        let base_offset = inner.file.len();
+            let base_offset = inner.file.len();

-        let SerializedValueBatch {
-            raw,
-            metadata,
-            max_lsn: _,
-            len: _,
-        } = serialized_batch;
+            let SerializedValueBatch {
+                raw,
+                metadata,
+                max_lsn: _,
+                len: _,
+            } = serialized_batch;

-        // Write the batch to the file
-        inner.file.write_raw(&raw, ctx).await?;
-        let new_size = inner.file.len();
+            // Write the batch to the file
+            inner.file.write_raw(&raw, ctx).await?;
+            let new_size = inner.file.len();

-        let expected_new_len = base_offset
-            .checked_add(raw.len().into_u64())
-            // write_raw would error if we were to overflow u64.
-            // also IndexEntry and higher levels in
-            //the code don't allow the file to grow that large
-            .unwrap();
-        assert_eq!(new_size, expected_new_len);
+            let expected_new_len = base_offset
+                .checked_add(raw.len().into_u64())
+                // write_raw would error if we were to overflow u64.
+                // Also, IndexEntry and higher levels in
+                // the code don't allow the file to grow that large.
+                .unwrap();
+            assert_eq!(new_size, expected_new_len);
+
+            inner.resource_units.maybe_publish_size(new_size);
+
+            (base_offset, metadata)
+        };

        // Update the index with the new entries
+        let mut index = self.index.write().await;
+
        for meta in metadata {
            let SerializedValueMeta {
                key,
@@ -639,7 +660,7 @@ impl InMemoryLayer {
                will_init,
            })?;

-            let vec_map = inner.index.entry(key).or_default();
+            let vec_map = index.entry(key).or_default();
            let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
            if old.is_some() {
                // This should not break anything, but is unexpected: ingestion code aims to filter out
@@ -658,8 +679,6 @@ impl InMemoryLayer {
            );
        }

-        inner.resource_units.maybe_publish_size(new_size);
-
        Ok(())
    }

@@ -680,6 +699,18 @@ impl InMemoryLayer {

    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
+    ///
+    /// A note on locking:
+    /// The current API of [`InMemoryLayer`] does not ensure that there are no ongoing
+    /// writes while freezing the layer. This is enforced at a higher level via
+    /// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
+    /// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
+    ///    Timeline::write_lock for its lifetime. The rolling is handled in
+    ///    [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
+    ///    so can't be called from different threads.
+    /// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
+    ///    This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
+    ///    hence there can be no concurrent writes.
    pub async fn freeze(&self, end_lsn: Lsn) {
        assert!(
            self.start_lsn < end_lsn,
@@ -700,8 +731,8 @@ impl InMemoryLayer {

        #[cfg(debug_assertions)]
        {
-            let inner = self.inner.write().await;
-            for vec_map in inner.index.values() {
+            let index = self.index.read().await;
+            for vec_map in index.values() {
                for (lsn, _) in vec_map.as_slice() {
                    assert!(*lsn < end_lsn);
                }
@@ -724,14 +755,11 @@ impl InMemoryLayer {
    ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
        // Grab the lock in read-mode. We hold it over the I/O, but because this
        // layer is not writeable anymore, no one should be trying to acquire the
-        // write lock on it, so we shouldn't block anyone. There's one exception
-        // though: another thread might have grabbed a reference to this layer
-        // in `get_layer_for_write' just before the checkpointer called
-        // `freeze`, and then `write_to_disk` on it. When the thread gets the
-        // lock, it will see that it's not writeable anymore and retry, but it
-        // would have to wait until we release it. That race condition is very
-        // rare though, so we just accept the potential latency hit for now.
+        // write lock on it, so we shouldn't block anyone. See the comment on
+        // [`InMemoryLayer::freeze`] to understand how locking between the append path
+        // and layer flushing works.
        let inner = self.inner.read().await;
+        let index = self.index.read().await;

        use l0_flush::Inner;
        let _concurrency_permit = match l0_flush_global_state {
@@ -743,13 +771,9 @@ impl InMemoryLayer {
        let key_count = if let Some(key_range) = key_range {
            let key_range = key_range.start.to_compact()..key_range.end.to_compact();

-            inner
-                .index
-                .iter()
-                .filter(|(k, _)| key_range.contains(k))
-                .count()
+            index.iter().filter(|(k, _)| key_range.contains(k)).count()
        } else {
-            inner.index.len()
+            index.len()
        };
        if key_count == 0 {
            return Ok(None);
@@ -772,7 +796,7 @@ impl InMemoryLayer {
                let file_contents = inner.file.load_to_io_buf(ctx).await?;
                let file_contents = file_contents.freeze();

-                for (key, vec_map) in inner.index.iter() {
+                for (key, vec_map) in index.iter() {
                    // Write all page versions
                    for (lsn, entry) in vec_map
                        .as_slice()
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -14,6 +14,7 @@ pub mod span;
 pub mod uninit;
 mod walreceiver;

+use hashlink::LruCache;
 use std::array;
 use std::cmp::{max, min};
 use std::collections::btree_map::Entry;
@@ -23,8 +24,6 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

-use crate::PERF_TRACE_TARGET;
-use crate::walredo::RedoAttemptType;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -93,10 +92,12 @@ use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
 use super::upload_queue::NotInitialized;
 use super::{
-    AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
+    AttachedTenantConf, BasebackupPrepareSender, GcError, HeatMapTimeline, MaybeOffloaded,
    debug_assert_current_span_has_tenant_and_timeline_id,
 };
+use crate::PERF_TRACE_TARGET;
 use crate::aux_file::AuxFileSizeEstimator;
+use crate::basebackup_cache::BasebackupPrepareRequest;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -130,6 +131,7 @@ use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 use crate::walingest::WalLagCooldown;
+use crate::walredo::RedoAttemptType;
 use crate::{ZERO_PAGE, task_mgr, walredo};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -195,16 +197,7 @@ pub struct TimelineResources {
    pub pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
    pub l0_compaction_trigger: Arc<Notify>,
    pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
-}
-
-/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
-/// ingestion considerably, because WAL ingestion needs to check on most records if the record
-/// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
-/// of the timeline (disk_consistent_lsn).  It's used on reads of relation sizes to check if the
-/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
-pub(crate) struct RelSizeCache {
-    pub(crate) complete_as_of: Lsn,
-    pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
+    pub basebackup_prepare_sender: BasebackupPrepareSender,
 }

 pub struct Timeline {
@@ -365,7 +358,8 @@ pub struct Timeline {
    pub walreceiver: Mutex<Option<WalReceiver>>,

    /// Relation size cache
-    pub(crate) rel_size_cache: RwLock<RelSizeCache>,
+    pub(crate) rel_size_latest_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+    pub(crate) rel_size_snapshot_cache: Mutex<LruCache<(Lsn, RelTag), BlockNumber>>,

    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

@@ -447,6 +441,9 @@ pub struct Timeline {
    pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,

    wait_lsn_log_slow: tokio::sync::Semaphore,
+
+    /// A channel to send async requests to prepare a basebackup for the basebackup cache.
+    basebackup_prepare_sender: BasebackupPrepareSender,
 }

 pub(crate) enum PreviousHeatmap {
@@ -537,29 +534,24 @@ impl GcInfo {
 /// The `GcInfo` component describing which Lsns need to be retained.  Functionally, this
 /// is a single number (the oldest LSN which we must retain), but it internally distinguishes
 /// between time-based and space-based retention for observability and consumption metrics purposes.
-#[derive(Debug, Clone)]
+#[derive(Clone, Debug, Default)]
 pub(crate) struct GcCutoffs {
    /// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
    /// history we must keep to retain a specified number of bytes of WAL.
    pub(crate) space: Lsn,

-    /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much
-    /// history we must keep to enable reading back at least the PITR interval duration.
-    pub(crate) time: Lsn,
-}
-
-impl Default for GcCutoffs {
-    fn default() -> Self {
-        Self {
-            space: Lsn::INVALID,
-            time: Lsn::INVALID,
-        }
-    }
+    /// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates
+    /// how much history we must keep to enable reading back at least the PITR interval duration.
+    ///
+    /// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield
+    /// Some(last_record_lsn).
+    pub(crate) time: Option<Lsn>,
 }

 impl GcCutoffs {
    fn select_min(&self) -> Lsn {
-        std::cmp::min(self.space, self.time)
+        // NB: if we haven't computed the PITR cutoff yet, we can't GC anything.
+        self.space.min(self.time.unwrap_or_default())
    }
 }

@@ -1041,6 +1033,7 @@ pub(crate) enum WaitLsnWaiter<'a> {
    Tenant,
    PageService,
    HttpEndpoint,
+    BaseBackupCache,
 }

 /// Argument to [`Timeline::shutdown`].
@@ -1096,11 +1089,14 @@ impl Timeline {
    /// Get the bytes written since the PITR cutoff on this branch, and
    /// whether this branch's ancestor_lsn is within its parent's PITR.
    pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
+        // TODO: for backwards compatibility, we return the full history back to 0 when the PITR
+        // cutoff has not yet been initialized. This should return None instead, but this is exposed
+        // in external HTTP APIs and callers may not handle a null value.
        let gc_info = self.gc_info.read().unwrap();
        let history = self
            .get_last_record_lsn()
-            .checked_sub(gc_info.cutoffs.time)
-            .unwrap_or(Lsn(0))
+            .checked_sub(gc_info.cutoffs.time.unwrap_or_default())
+            .unwrap_or_default()
            .0;
        (history, gc_info.within_ancestor_pitr)
    }
@@ -1110,9 +1106,10 @@ impl Timeline {
        self.applied_gc_cutoff_lsn.read()
    }

-    /// Read timeline's planned GC cutoff: this is the logical end of history that users
-    /// are allowed to read (based on configured PITR), even if physically we have more history.
-    pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
+    /// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed
+    /// to read (based on configured PITR), even if physically we have more history. Returns None
+    /// if the PITR cutoff has not yet been initialized.
+    pub(crate) fn get_gc_cutoff_lsn(&self) -> Option<Lsn> {
        self.gc_info.read().unwrap().cutoffs.time
    }

@@ -1563,7 +1560,8 @@ impl Timeline {
                        }
                        WaitLsnWaiter::Tenant
                        | WaitLsnWaiter::PageService
-                        | WaitLsnWaiter::HttpEndpoint => unreachable!(
+                        | WaitLsnWaiter::HttpEndpoint
+                        | WaitLsnWaiter::BaseBackupCache => unreachable!(
                            "tenant or page_service context are not expected to have task kind {:?}",
                            ctx.task_kind()
                        ),
@@ -2468,6 +2466,41 @@ impl Timeline {
            false
        }
    }
+
+    pub(crate) fn is_basebackup_cache_enabled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .basebackup_cache_enabled
+            .unwrap_or(self.conf.default_tenant_conf.basebackup_cache_enabled)
+    }
+
+    /// Prepare basebackup for the given LSN and store it in the basebackup cache.
+    /// The method only sends a request to the basebackup cache and returns immediately.
+    /// The actual basebackup preparation is performed in the background
+    /// by the basebackup cache on a best-effort basis.
+    pub(crate) fn prepare_basebackup(&self, lsn: Lsn) {
+        if !self.is_basebackup_cache_enabled() {
+            return;
+        }
+        if !self.tenant_shard_id.is_shard_zero() {
+            // In theory we should never get here, but just in case check it.
+            // Preparing basebackup doesn't make sense for shards other than shard zero.
+            return;
+        }
+
+        let res = self
+            .basebackup_prepare_sender
+            .send(BasebackupPrepareRequest {
+                tenant_shard_id: self.tenant_shard_id,
+                timeline_id: self.timeline_id,
+                lsn,
+            });
+        if let Err(e) = res {
+            // May happen during shutdown, it's not critical.
+            info!("Failed to send basebackup prepare request: {e:#}");
+        }
+    }
 }

 /// Number of times we will compute partition within a checkpoint distance.
@@ -2545,6 +2578,13 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

+    pub(crate) fn get_pitr_interval(&self) -> Duration {
+        let tenant_conf = &self.tenant_conf.load().tenant_conf;
+        tenant_conf
+            .pitr_interval
+            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
+    }
+
    fn get_compaction_period(&self) -> Duration {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -2820,6 +2860,13 @@ impl Timeline {

            self.remote_client.update_config(&new_conf.location);

+            let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
+            if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
+                if new_capacity != rel_size_cache.capacity() {
+                    rel_size_cache.set_capacity(new_capacity);
+                }
+            }
+
            self.metrics
                .evictions_with_low_residence_duration
                .write()
@@ -2878,6 +2925,14 @@ impl Timeline {
            ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
        }

+        let relsize_snapshot_cache_capacity = {
+            let loaded_tenant_conf = tenant_conf.load();
+            loaded_tenant_conf
+                .tenant_conf
+                .relsize_snapshot_cache_capacity
+                .unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
+        };
+
        Arc::new_cyclic(|myself| {
            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
@@ -2969,10 +3024,8 @@ impl Timeline {
                last_image_layer_creation_check_instant: Mutex::new(None),

                last_received_wal: Mutex::new(None),
-                rel_size_cache: RwLock::new(RelSizeCache {
-                    complete_as_of: disk_consistent_lsn,
-                    map: HashMap::new(),
-                }),
+                rel_size_latest_cache: RwLock::new(HashMap::new()),
+                rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),

                download_all_remote_layers_task_info: RwLock::new(None),

@@ -3017,6 +3070,8 @@ impl Timeline {
                rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),

                wait_lsn_log_slow: tokio::sync::Semaphore::new(1),
+
+                basebackup_prepare_sender: resources.basebackup_prepare_sender,
            };

            result.repartition_threshold =
@@ -3530,7 +3585,7 @@ impl Timeline {
                };

                let io_concurrency = IoConcurrency::spawn_from_conf(
-                    self_ref.conf,
+                    self_ref.conf.get_vectored_concurrent_io,
                    self_ref
                        .gate
                        .enter()
@@ -5559,7 +5614,7 @@ impl Timeline {
            });

            let io_concurrency = IoConcurrency::spawn_from_conf(
-                self.conf,
+                self.conf.get_vectored_concurrent_io,
                self.gate
                    .enter()
                    .map_err(|_| CreateImageLayersError::Cancelled)?,
@@ -6230,14 +6285,12 @@ impl Timeline {

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

-        if cfg!(test) {
+        if cfg!(test) && pitr == Duration::ZERO {
            // Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
-            if pitr == Duration::ZERO {
-                return Ok(GcCutoffs {
-                    time: self.get_last_record_lsn(),
-                    space: space_cutoff,
-                });
-            }
+            return Ok(GcCutoffs {
+                time: Some(self.get_last_record_lsn()),
+                space: space_cutoff,
+            });
        }

        // Calculate a time-based limit on how much to retain:
@@ -6251,14 +6304,14 @@ impl Timeline {
                // PITR is not set. Retain the size-based limit, or the default time retention,
                // whichever requires less data.
                GcCutoffs {
-                    time: self.get_last_record_lsn(),
+                    time: Some(self.get_last_record_lsn()),
                    space: std::cmp::max(time_cutoff, space_cutoff),
                }
            }
            (Duration::ZERO, None) => {
                // PITR is not set, and time lookup failed
                GcCutoffs {
-                    time: self.get_last_record_lsn(),
+                    time: Some(self.get_last_record_lsn()),
                    space: space_cutoff,
                }
            }
@@ -6266,7 +6319,7 @@ impl Timeline {
                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
                // cannot advance beyond what was already GC'd, and respect space-based retention
                GcCutoffs {
-                    time: *self.get_applied_gc_cutoff_lsn(),
+                    time: Some(*self.get_applied_gc_cutoff_lsn()),
                    space: space_cutoff,
                }
            }
@@ -6274,7 +6327,7 @@ impl Timeline {
                // PITR interval is set and we looked up timestamp successfully.  Ignore
                // size based retention and make time cutoff authoritative
                GcCutoffs {
-                    time: time_cutoff,
+                    time: Some(time_cutoff),
                    space: time_cutoff,
                }
            }
@@ -6327,7 +6380,7 @@ impl Timeline {
            )
        };

-        let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
+        let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
        let standby_horizon = self.standby_horizon.load();
        // Hold GC for the standby, but as a safety guard do it only within some
        // reasonable lag.
@@ -6376,7 +6429,7 @@ impl Timeline {
    async fn gc_timeline(
        &self,
        space_cutoff: Lsn,
-        time_cutoff: Lsn,
+        time_cutoff: Option<Lsn>, // None if uninitialized
        retain_lsns: Vec<Lsn>,
        max_lsn_with_valid_lease: Option<Lsn>,
        new_gc_cutoff: Lsn,
@@ -6395,6 +6448,12 @@ impl Timeline {
            return Ok(result);
        }

+        let Some(time_cutoff) = time_cutoff else {
+            // The GC cutoff should have been computed by now, but let's be defensive.
+            info!("Nothing to GC: time_cutoff not yet computed");
+            return Ok(result);
+        };
+
        // We need to ensure that no one tries to read page versions or create
        // branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
        // for details. This will block until the old value is no longer in use.
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -1526,7 +1526,7 @@ impl Timeline {
        info!(
            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
                checked {layers_checked}/{layers_total} layers \
-                (latest_gc_cutoff={} pitr_cutoff={})",
+                (latest_gc_cutoff={} pitr_cutoff={:?})",
            layers_to_rewrite.len(),
            drop_layers.len(),
            *latest_gc_cutoff,
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
    );
    let io_concurrency = IoConcurrency::spawn_from_conf(
-        detached.conf,
+        detached.conf.get_vectored_concurrent_io,
        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
    );
    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1316,6 +1316,10 @@ impl WalIngest {
            }
        });

+        if info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN {
+            modification.tline.prepare_basebackup(lsn);
+        }
+
        Ok(())
    }

@@ -1684,31 +1688,31 @@ mod tests {
        // The relation was created at LSN 2, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            1
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            3
        );
@@ -1719,7 +1723,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x20)),
+                    Version::at(Lsn(0x20)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1733,7 +1737,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x30)),
+                    Version::at(Lsn(0x30)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1747,7 +1751,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x40)),
+                    Version::at(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1760,7 +1764,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x40)),
+                    Version::at(Lsn(0x40)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1774,7 +1778,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1787,7 +1791,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1800,7 +1804,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1820,7 +1824,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
                .await?,
            2
        );
@@ -1829,7 +1833,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x60)),
+                    Version::at(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1842,7 +1846,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x60)),
+                    Version::at(Lsn(0x60)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1854,7 +1858,7 @@ mod tests {
        // should still see the truncated block with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            3
        );
@@ -1863,7 +1867,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    2,
-                    Version::Lsn(Lsn(0x50)),
+                    Version::at(Lsn(0x50)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1880,7 +1884,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
                .await?,
            0
        );
@@ -1893,7 +1897,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
                .await?,
            2
        );
@@ -1902,7 +1906,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    0,
-                    Version::Lsn(Lsn(0x70)),
+                    Version::at(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1915,7 +1919,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1,
-                    Version::Lsn(Lsn(0x70)),
+                    Version::at(Lsn(0x70)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1932,7 +1936,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            1501
        );
@@ -1942,7 +1946,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blk,
-                        Version::Lsn(Lsn(0x80)),
+                        Version::at(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -1956,7 +1960,7 @@ mod tests {
                .get_rel_page_at_lsn(
                    TESTREL_A,
                    1500,
-                    Version::Lsn(Lsn(0x80)),
+                    Version::at(Lsn(0x80)),
                    &ctx,
                    io_concurrency.clone()
                )
@@ -1990,13 +1994,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            1
        );
@@ -2011,7 +2015,7 @@ mod tests {
        // Check that rel is not visible anymore
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
                .await?,
            false
        );
@@ -2029,13 +2033,13 @@ mod tests {
        // Check that rel exists and size is correct
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
                .await?,
            1
        );
@@ -2077,26 +2081,26 @@ mod tests {
        // The relation was created at LSN 20, not visible at LSN 1 yet.
        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await?,
            false
        );
        assert!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
                .await
                .is_err()
        );

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
                .await?,
            relsize
        );
@@ -2110,7 +2114,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(lsn),
+                        Version::at(lsn),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2131,7 +2135,7 @@ mod tests {
        // Check reported size and contents after truncation
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
                .await?,
            1
        );
@@ -2144,7 +2148,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(Lsn(0x60)),
+                        Version::at(Lsn(0x60)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2157,7 +2161,7 @@ mod tests {
        // should still see all blocks with older LSN
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
                .await?,
            relsize
        );
@@ -2169,7 +2173,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(Lsn(0x50)),
+                        Version::at(Lsn(0x50)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2193,13 +2197,13 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            true
        );
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
                .await?,
            relsize
        );
@@ -2212,7 +2216,7 @@ mod tests {
                    .get_rel_page_at_lsn(
                        TESTREL_A,
                        blkno,
-                        Version::Lsn(Lsn(0x80)),
+                        Version::at(Lsn(0x80)),
                        &ctx,
                        io_concurrency.clone()
                    )
@@ -2250,7 +2254,7 @@ mod tests {

        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE + 1
        );
@@ -2264,7 +2268,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE
        );
@@ -2279,7 +2283,7 @@ mod tests {
        m.commit(&ctx).await?;
        assert_eq!(
            tline
-                .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                .await?,
            RELSEG_SIZE - 1
        );
@@ -2297,7 +2301,7 @@ mod tests {
            m.commit(&ctx).await?;
            assert_eq!(
                tline
-                    .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
+                    .get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
                    .await?,
                size as BlockNumber
            );
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -717,7 +717,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->status == PRFS_REQUESTED);
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_receive);
-	Assert(readpage_reentrant_guard);
+	Assert(readpage_reentrant_guard || AmPrewarmWorker);

 	if (slot->status != PRFS_REQUESTED ||
 		slot->response != NULL ||
@@ -800,7 +800,7 @@ communicator_prefetch_receive(BufferTag tag)
 	PrfHashEntry *entry;
 	PrefetchRequest hashkey;

-	Assert(readpage_reentrant_guard);
+	Assert(readpage_reentrant_guard || AmPrewarmWorker); /* do not pump prefetch state in prewarm worker */
 	hashkey.buftag = tag;
 	entry = prfh_lookup(MyPState->prf_hash, &hashkey);
 	if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index))
@@ -2450,6 +2450,7 @@ void
 communicator_reconfigure_timeout_if_needed(void)
 {
 	bool	needs_set = MyPState->ring_receive != MyPState->ring_unused &&
+						!AmPrewarmWorker && /* do not pump prefetch state in prewarm worker */
 						readahead_getpage_pull_timeout_ms > 0;

 	if (needs_set != timeout_set)
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -201,6 +201,8 @@ static shmem_request_hook_type prev_shmem_request_hook;
 bool lfc_store_prefetch_result;
 bool lfc_prewarm_update_ws_estimation;

+bool AmPrewarmWorker;
+
 #define LFC_ENABLED() (lfc_ctl->limit != 0)

 /*
@@ -845,6 +847,8 @@ lfc_prewarm_main(Datum main_arg)
 	PrewarmWorkerState* ws;
 	uint32 worker_id = DatumGetInt32(main_arg);

+	AmPrewarmWorker = true;
+
 	pqsignal(SIGTERM, die);
 	BackgroundWorkerUnblockSignals();

--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -23,6 +23,8 @@ extern int	wal_acceptor_connection_timeout;
 extern int	readahead_getpage_pull_timeout_ms;
 extern bool	disable_wal_prev_lsn_checks;

+extern bool AmPrewarmWorker;
+
 #if PG_MAJORVERSION_NUM >= 17
 extern uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
 extern uint32		WAIT_EVENT_NEON_LFC_READ;
--- a/pgxn/neon/neon_walreader.c
+++ b/pgxn/neon/neon_walreader.c
@@ -69,6 +69,7 @@ struct NeonWALReader
 	WALSegmentContext segcxt;
 	WALOpenSegment seg;
 	int			wre_errno;
+	TimeLineID	local_active_tlid;
 	/* Explains failure to read, static for simplicity. */
 	char		err_msg[NEON_WALREADER_ERR_MSG_LEN];

@@ -106,7 +107,8 @@ struct NeonWALReader

 /* palloc and initialize NeonWALReader */
 NeonWALReader *
-NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix)
+NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn,
+					  char *log_prefix, TimeLineID tlid)
 {
 	NeonWALReader *reader;

@@ -118,6 +120,7 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
 		MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader));

 	reader->available_lsn = available_lsn;
+	reader->local_active_tlid = tlid;
 	reader->seg.ws_file = -1;
 	reader->seg.ws_segno = 0;
 	reader->seg.ws_tli = 0;
@@ -577,6 +580,17 @@ NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
 	return state->rem_state == RS_ESTABLISHED;
 }

+/*
+ * Returns the local timeline ID stored in the reader. The field is set from
+ * the `tlid` argument of NeonWALReaderAllocate(). This is a plain accessor:
+ * it only reads state->local_active_tlid and has no other effect.
+ */
+TimeLineID
+NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state)
+{
+	return state->local_active_tlid;
+}
+
 /*
 * Returns events user should wait on connection socket or 0 if remote
 * connection is not active.
--- a/pgxn/neon/neon_walreader.h
+++ b/pgxn/neon/neon_walreader.h
@@ -19,9 +19,12 @@ typedef enum
 	NEON_WALREAD_ERROR,
 } NeonWALReadResult;

-extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix);
+extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size,
+											XLogRecPtr available_lsn,
+											char *log_prefix, TimeLineID tlid);
 extern void NeonWALReaderFree(NeonWALReader *state);
 extern void NeonWALReaderResetRemote(NeonWALReader *state);
+extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state);
 extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
 extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
 extern uint32 NeonWALReaderEvents(NeonWALReader *state);
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -98,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp = palloc0(sizeof(WalProposer));
 	wp->config = config;
 	wp->api = api;
+	wp->localTimeLineID = config->pgTimeline;
 	wp->state = WPS_COLLECTING_TERMS;
 	wp->mconf.generation = INVALID_GENERATION;
 	wp->mconf.members.len = 0;
@@ -1379,7 +1380,7 @@ ProcessPropStartPos(WalProposer *wp)
 	 * we must bail out, as clog and other non rel data is inconsistent.
 	 */
 	walprop_shared = wp->api.get_shmem_state(wp);
-	if (!wp->config->syncSafekeepers)
+	if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote)
 	{
 		/*
 		 * Basebackup LSN always points to the beginning of the record (not
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -391,6 +391,7 @@ typedef struct WalproposerShmemState
 	/* last feedback from each shard */
 	PageserverFeedback shard_ps_feedback[MAX_SHARDS];
 	int			num_shards;
+	bool		replica_promote;

 	/* aggregated feedback with min LSNs across shards */
 	PageserverFeedback min_ps_feedback;
@@ -805,6 +806,8 @@ typedef struct WalProposer

 	/* WAL has been generated up to this point */
 	XLogRecPtr	availableLsn;
+	/* Current local TimeLineId in use */
+	TimeLineID	localTimeLineID;

 	/* cached GetAcknowledgedByQuorumWALPosition result */
 	XLogRecPtr	commitLsn;
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -35,6 +35,7 @@
 #include "storage/proc.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "tcop/tcopprot.h"
@@ -157,12 +158,23 @@ WalProposerMain(Datum main_arg)
 {
 	WalProposer *wp;

+	if (*wal_acceptors_list == '\0')
+	{
+		wpg_log(WARNING, "Safekeepers list is empty");
+		return;
+	}
+
 	init_walprop_config(false);
 	walprop_pg_init_bgworker();
 	am_walproposer = true;
 	walprop_pg_load_libpqwalreceiver();

 	wp = WalProposerCreate(&walprop_config, walprop_pg);
+#if PG_MAJORVERSION_NUM < 15
+	wp->localTimeLineID = ThisTimeLineID;
+#else
+	wp->localTimeLineID = GetWALInsertionTimeLine();
+#endif
 	wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);

 	walprop_pg_init_walsender();
@@ -294,16 +306,15 @@ safekeepers_cmp(char *old, char *new)
 	return true;
 }

-/*
- * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
- * the list changed.
- */
 static void
 assign_neon_safekeepers(const char *newval, void *extra)
 {
 	char	   *newval_copy;
 	char	   *oldval;

+	if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && RecoveryInProgress())
+		walprop_shared->replica_promote = true;
+
 	if (!am_walproposer)
 		return;

@@ -500,10 +511,6 @@ walprop_register_bgworker(void)
 {
 	BackgroundWorker bgw;

-	/* If no wal acceptors are specified, don't start the background worker. */
-	if (*wal_acceptors_list == '\0')
-		return;
-
 	memset(&bgw, 0, sizeof(bgw));
 	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
 	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
@@ -1496,7 +1503,10 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)

 	snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
 	Assert(!sk->xlogreader);
-	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix);
+	/* note that WalProposer shouldn't access safekeepers when active */
+	sk->xlogreader = NeonWALReaderAllocate(wal_segment_size,
+										   sk->wp->propTermStartLsn, log_prefix,
+										   sk->wp->localTimeLineID);
 	if (sk->xlogreader == NULL)
 		wpg_log(FATAL, "failed to allocate xlog reader");
 }
@@ -1510,7 +1520,7 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
 					  buf,
 					  startptr,
 					  count,
-					  walprop_pg_get_timeline_id());
+					  sk->wp->localTimeLineID);

 	if (res == NEON_WALREAD_SUCCESS)
 	{
--- a/pgxn/neon/walsender_hooks.c
+++ b/pgxn/neon/walsender_hooks.c
@@ -68,8 +68,7 @@ NeonWALReadWaitForWAL(XLogRecPtr loc)
 }

 static int
-NeonWALPageRead(
-				XLogReaderState *xlogreader,
+NeonWALPageRead(XLogReaderState *xlogreader,
 				XLogRecPtr targetPagePtr,
 				int reqLen,
 				XLogRecPtr targetRecPtr,
@@ -106,12 +105,11 @@ NeonWALPageRead(

 	for (;;)
 	{
-		NeonWALReadResult res = NeonWALRead(
-											wal_reader,
+		NeonWALReadResult res = NeonWALRead(wal_reader,
 											readBuf,
 											targetPagePtr,
 											count,
-											walprop_pg_get_timeline_id());
+											NeonWALReaderLocalActiveTimeLineID(wal_reader));

 		if (res == NEON_WALREAD_SUCCESS)
 		{
@@ -202,7 +200,8 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
 		{
 			elog(ERROR, "unable to start walsender when basebackupLsn is 0");
 		}
-		wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ");
+		wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn,
+										   "[walsender] ", 1);
 	}
 	xlr->page_read = NeonWALPageRead;
 	xlr->segment_open = NeonWALReadSegmentOpen;
--- a/poetry.lock
+++ b/poetry.lock
@@ -3170,19 +3170,24 @@ pbr = "*"

 [[package]]
 name = "setuptools"
-version = "70.0.0"
+version = "78.1.1"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
-    {file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
+    {file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"},
+    {file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"},
 ]

 [package.extras]
-docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
+core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
+type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]

 [[package]]
 name = "six"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -127,3 +127,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
+tracing-test = "0.2"
--- a/proxy/src/auth/backend/mod.rs
+++ b/proxy/src/auth/backend/mod.rs
@@ -80,10 +80,22 @@ impl std::fmt::Display for Backend<'_, ()> {
                    .field(&endpoint.url())
                    .finish(),
                #[cfg(any(test, feature = "testing"))]
-                ControlPlaneClient::PostgresMock(endpoint) => fmt
-                    .debug_tuple("ControlPlane::PostgresMock")
-                    .field(&endpoint.url())
-                    .finish(),
+                ControlPlaneClient::PostgresMock(endpoint) => {
+                    let url = endpoint.url();
+                    match url::Url::parse(url) {
+                        Ok(mut url) => {
+                            let _ = url.set_password(Some("_redacted_"));
+                            let url = url.as_str();
+                            fmt.debug_tuple("ControlPlane::PostgresMock")
+                                .field(&url)
+                                .finish()
+                        }
+                        Err(_) => fmt
+                            .debug_tuple("ControlPlane::PostgresMock")
+                            .field(&url)
+                            .finish(),
+                    }
+                }
                #[cfg(test)]
                ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
            },
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -1,9 +1,13 @@
+#[cfg(any(test, feature = "testing"))]
+use std::env;
 use std::net::SocketAddr;
 use std::path::PathBuf;
 use std::pin::pin;
 use std::sync::Arc;
 use std::time::Duration;

+#[cfg(any(test, feature = "testing"))]
+use anyhow::Context;
 use anyhow::{bail, ensure};
 use arc_swap::ArcSwapOption;
 use futures::future::Either;
@@ -35,6 +39,8 @@ use crate::scram::threadpool::ThreadPool;
 use crate::serverless::GlobalConnPoolOptions;
 use crate::serverless::cancel_set::CancelSet;
 use crate::tls::client_config::compute_client_config_with_root_certs;
+#[cfg(any(test, feature = "testing"))]
+use crate::url::ApiUrl;
 use crate::{auth, control_plane, http, serverless, usage_metrics};

 project_git_version!(GIT_VERSION);
@@ -161,8 +167,11 @@ struct ProxyCliArgs {
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
    /// Cancellation channel size (max queue size for redis kv client)
-    #[clap(long, default_value = "1024")]
+    #[clap(long, default_value_t = 1024)]
    cancellation_ch_size: usize,
+    /// Cancellation ops batch size for redis
+    #[clap(long, default_value_t = 8)]
+    cancellation_batch_size: usize,
    /// cache for `allowed_ips` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
@@ -542,7 +551,12 @@ pub async fn run() -> anyhow::Result<()> {
            if let Some(mut redis_kv_client) = redis_kv_client {
                maintenance_tasks.spawn(async move {
                    redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+                    handle_cancel_messages(
+                        &mut redis_kv_client,
+                        rx_cancel,
+                        args.cancellation_batch_size,
+                    )
+                    .await?;

                    drop(redis_kv_client);

@@ -769,7 +783,13 @@ fn build_auth_backend(

        #[cfg(any(test, feature = "testing"))]
        AuthBackendType::Postgres => {
-            let url = args.auth_endpoint.parse()?;
+            let mut url: ApiUrl = args.auth_endpoint.parse()?;
+            if url.password().is_none() {
+                let password = env::var("PGPASSWORD")
+                    .with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
+                url.set_password(Some(&password))
+                    .expect("Failed to set password");
+            }
            let api = control_plane::client::mock::MockControlPlane::new(
                url,
                !args.is_private_access_proxy,
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
 type IpSubnetKey = IpNet;

 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
-const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
-const BATCH_SIZE: usize = 8;

 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -231,12 +229,13 @@ impl CancelReplyOp {
 pub async fn handle_cancel_messages(
    client: &mut RedisKVClient,
    mut rx: mpsc::Receiver<CancelKeyOp>,
+    batch_size: usize,
 ) -> anyhow::Result<()> {
-    let mut batch = Vec::with_capacity(BATCH_SIZE);
-    let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
+    let mut batch = Vec::with_capacity(batch_size);
+    let mut pipeline = Pipeline::with_capacity(batch_size);

    loop {
-        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+        if rx.recv_many(&mut batch, batch_size).await == 0 {
            warn!("shutting down cancellation queue");
            break Ok(());
        }
@@ -367,8 +366,7 @@ impl CancellationHandler {
            return Err(CancelError::InternalError);
        };

-        tx.send_timeout(op, REDIS_SEND_TIMEOUT)
-            .await
+        tx.try_send(op)
            .map_err(|e| {
                tracing::warn!("failed to send GetCancelData for {key}: {e}");
            })
@@ -570,7 +568,7 @@ impl Session {
    }

    // Send the store key op to the cancellation handler and set TTL for the key
-    pub(crate) async fn write_cancel_key(
+    pub(crate) fn write_cancel_key(
        &self,
        cancel_closure: CancelClosure,
    ) -> Result<(), CancelError> {
@@ -596,14 +594,14 @@ impl Session {
            expire: CANCEL_KEY_TTL,
        };

-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
        });
        Ok(())
    }

-    pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
+    pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
        let Some(tx) = &self.cancellation_handler.tx else {
            tracing::warn!("cancellation handler is not available");
            return Err(CancelError::InternalError);
@@ -619,7 +617,7 @@ impl Session {
                .guard(RedisMsgKind::HDel),
        };

-        let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
+        let _ = tx.try_send(op).map_err(|e| {
            let key = self.key;
            tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
        });
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -244,9 +244,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -1,13 +1,11 @@
-use std::cell::{Cell, RefCell};
+use std::cell::RefCell;
 use std::collections::HashMap;
-use std::hash::BuildHasher;
+use std::sync::Arc;
 use std::sync::atomic::{AtomicU32, Ordering};
-use std::{array, env, fmt, io};
+use std::{env, io};

 use chrono::{DateTime, Utc};
-use indexmap::IndexSet;
 use opentelemetry::trace::TraceContextExt;
-use scopeguard::defer;
 use serde::ser::{SerializeMap, Serializer};
 use tracing::subscriber::Interest;
 use tracing::{Event, Metadata, Span, Subscriber, callsite, span};
@@ -19,7 +17,6 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields};
 use tracing_subscriber::layer::{Context, Layer};
 use tracing_subscriber::prelude::*;
 use tracing_subscriber::registry::{LookupSpan, SpanRef};
-use try_lock::TryLock;

 /// Initialize logging and OpenTelemetry tracing and exporter.
 ///
@@ -55,7 +52,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
            StderrWriter {
                stderr: std::io::stderr(),
            },
-            ["request_id", "session_id", "conn_id"],
+            &["request_id", "session_id", "conn_id"],
        ))
    } else {
        None
@@ -183,50 +180,65 @@ impl Clock for RealClock {
 /// Name of the field used by tracing crate to store the event message.
 const MESSAGE_FIELD: &str = "message";

+/// Tracing used to enforce that spans/events have no more than 32 fields.
+/// It seems this is no longer the case, but it's still documented in some places.
+/// Generally, we shouldn't expect more than 32 fields anyway, so we can try and
+/// rely on it for some (minor) performance gains.
+const MAX_TRACING_FIELDS: usize = 32;
+
 thread_local! {
-    /// Protects against deadlocks and double panics during log writing.
-    /// The current panic handler will use tracing to log panic information.
-    static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
    /// Thread-local instance with per-thread buffer for log writing.
-    static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
+    static EVENT_FORMATTER: RefCell<EventFormatter> = const { RefCell::new(EventFormatter::new()) };
    /// Cached OS thread ID.
    static THREAD_ID: u64 = gettid::gettid();
 }

+/// Map for values fixed at callsite registration.
+// We use papaya here because registration rarely happens post-startup.
+// papaya is good for read-heavy workloads.
+//
+// We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy,
+// since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy.
+type CallsiteMap<T> =
+    papaya::HashMap<callsite::Identifier, T, std::hash::BuildHasherDefault<rustc_hash::FxHasher>>;
+
 /// Implements tracing layer to handle events specific to logging.
-struct JsonLoggingLayer<C: Clock, W: MakeWriter, const F: usize> {
+struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
    clock: C,
-    skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
-    callsite_ids: papaya::HashMap<callsite::Identifier, CallsiteId>,
    writer: W,
-    // We use a const generic and arrays to bypass one heap allocation.
-    extract_fields: IndexSet<&'static str>,
-    _marker: std::marker::PhantomData<[&'static str; F]>,
+
+    /// tracks which fields of each **event** are duplicates
+    skipped_field_indices: CallsiteMap<SkippedFieldIndices>,
+
+    span_info: CallsiteMap<CallsiteSpanInfo>,
+
+    /// Fields we want to keep track of in a separate json object.
+    extract_fields: &'static [&'static str],
 }

-impl<C: Clock, W: MakeWriter, const F: usize> JsonLoggingLayer<C, W, F> {
-    fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self {
+impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
+    fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self {
        JsonLoggingLayer {
            clock,
-            skipped_field_indices: papaya::HashMap::default(),
-            callsite_ids: papaya::HashMap::default(),
+            skipped_field_indices: CallsiteMap::default(),
+            span_info: CallsiteMap::default(),
            writer,
-            extract_fields: IndexSet::from_iter(extract_fields),
-            _marker: std::marker::PhantomData,
+            extract_fields,
        }
    }

    #[inline]
-    fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId {
-        *self
-            .callsite_ids
+    fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo {
+        self.span_info
            .pin()
-            .get_or_insert_with(cs, CallsiteId::next)
+            .get_or_insert_with(metadata.callsite(), || {
+                CallsiteSpanInfo::new(metadata, self.extract_fields)
+            })
+            .clone()
    }
 }

-impl<S, C: Clock + 'static, W: MakeWriter + 'static, const F: usize> Layer<S>
-    for JsonLoggingLayer<C, W, F>
+impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
 where
    S: Subscriber + for<'a> LookupSpan<'a>,
 {
@@ -237,35 +249,25 @@ where
        //       early, before OTel machinery, and add as event extension.
        let now = self.clock.now();

-        let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
-            if entered.get() {
-                let mut formatter = EventFormatter::new();
-                formatter.format::<S, F>(
-                    now,
-                    event,
-                    &ctx,
-                    &self.skipped_field_indices,
-                    &self.callsite_ids,
-                    &self.extract_fields,
-                )?;
-                self.writer.make_writer().write_all(formatter.buffer())
-            } else {
-                entered.set(true);
-                defer!(entered.set(false););
+        let res: io::Result<()> = EVENT_FORMATTER.with(|f| {
+            let mut borrow = f.try_borrow_mut();
+            let formatter = match borrow.as_deref_mut() {
+                Ok(formatter) => formatter,
+                // If the thread local formatter is borrowed,
+                // then we likely hit an edge case where we panicked during formatting.
+                // We allow the logging to proceed with an uncached formatter.
+                Err(_) => &mut EventFormatter::new(),
+            };

-                EVENT_FORMATTER.with_borrow_mut(move |formatter| {
-                    formatter.reset();
-                    formatter.format::<S, F>(
-                        now,
-                        event,
-                        &ctx,
-                        &self.skipped_field_indices,
-                        &self.callsite_ids,
-                        &self.extract_fields,
-                    )?;
-                    self.writer.make_writer().write_all(formatter.buffer())
-                })
-            }
+            formatter.reset();
+            formatter.format(
+                now,
+                event,
+                &ctx,
+                &self.skipped_field_indices,
+                self.extract_fields,
+            )?;
+            self.writer.make_writer().write_all(formatter.buffer())
        });

        // In case logging fails we generate a simpler JSON object.
@@ -287,50 +289,48 @@ where
    /// Registers a SpanFields instance as span extension.
    fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
        let span = ctx.span(id).expect("span must exist");
-        let fields = SpanFields::default();
-        fields.record_fields(attrs);

-        // This could deadlock when there's a panic somewhere in the tracing
-        // event handling and a read or write guard is still held. This includes
-        // the OTel subscriber.
-        let mut exts = span.extensions_mut();
+        let mut fields = SpanFields::new(self.span_info(span.metadata()));
+        attrs.record(&mut fields);

-        exts.insert(fields);
+        // This is a new span: the extensions should not be locked
+        // unless some layer spawned a thread to process this span.
+        // I don't think any layers do that.
+        span.extensions_mut().insert(fields);
    }

    fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
        let span = ctx.span(id).expect("span must exist");
-        let ext = span.extensions();
-        if let Some(data) = ext.get::<SpanFields>() {
-            data.record_fields(values);
+
+        // assumption: `on_record` is rarely called.
+        // assumption: a span being updated by one thread,
+        //             and formatted by another thread is even rarer.
+        let mut ext = span.extensions_mut();
+        if let Some(fields) = ext.get_mut::<SpanFields>() {
+            values.record(fields);
        }
    }

-    /// Called (lazily) whenever a new log call is executed. We quickly check
-    /// for duplicate field names and record duplicates as skippable. Last one
-    /// wins.
+    /// Called (lazily) roughly once per event/span callsite. We quickly check
+    /// for duplicate field names and record duplicates as skippable. Last field wins.
    fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
+        debug_assert!(
+            metadata.fields().len() <= MAX_TRACING_FIELDS,
+            "callsite {metadata:?} has too many fields."
+        );
+
        if !metadata.is_event() {
-            self.callsite_id(metadata.callsite());
+            // register the span info.
+            self.span_info(metadata);
            // Must not be never because we wouldn't get trace and span data.
            return Interest::always();
        }

        let mut field_indices = SkippedFieldIndices::default();
-        let mut seen_fields = HashMap::<&'static str, usize>::new();
+        let mut seen_fields = HashMap::new();
        for field in metadata.fields() {
-            use std::collections::hash_map::Entry;
-            match seen_fields.entry(field.name()) {
-                Entry::Vacant(entry) => {
-                    // field not seen yet
-                    entry.insert(field.index());
-                }
-                Entry::Occupied(mut entry) => {
-                    // replace currently stored index
-                    let old_index = entry.insert(field.index());
-                    // ... and append it to list of skippable indices
-                    field_indices.push(old_index);
-                }
+            if let Some(old_index) = seen_fields.insert(field.name(), field.index()) {
+                field_indices.set(old_index);
            }
        }

@@ -344,110 +344,113 @@ where
    }
 }

-#[derive(Copy, Clone, Debug, Default)]
-#[repr(transparent)]
-struct CallsiteId(u32);
+/// Any span info that is fixed to a particular callsite. Not variable between span instances.
+#[derive(Clone)]
+struct CallsiteSpanInfo {
+    /// index of each field to extract. usize::MAX if not found.
+    extract: Arc<[usize]>,

-impl CallsiteId {
-    #[inline]
-    fn next() -> Self {
-        // Start at 1 to reserve 0 for default.
-        static COUNTER: AtomicU32 = AtomicU32::new(1);
-        CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed))
-    }
+    /// tracks the fixed "callsite ID" for each span.
+    /// note: this is not stable between runs.
+    normalized_name: Arc<str>,
 }

-impl fmt::Display for CallsiteId {
-    #[inline]
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        self.0.fmt(f)
+impl CallsiteSpanInfo {
+    fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self {
+        // Start at 1 to reserve 0 for default.
+        static COUNTER: AtomicU32 = AtomicU32::new(1);
+
+        let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();
+
+        // get the indices of the span fields we want to extract
+        let extract = extract_fields
+            .iter()
+            // use rposition, since we want last match wins.
+            .map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX))
+            .collect();
+
+        // normalized_name is unique for each callsite, but it is not
+        // unified across separate proxy instances.
+        // todo: can we do better here?
+        let cid = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let normalized_name = format!("{}#{cid}", metadata.name()).into();
+
+        Self {
+            extract,
+            normalized_name,
+        }
    }
 }

 /// Stores span field values recorded during the spans lifetime.
-#[derive(Default)]
 struct SpanFields {
-    // TODO: Switch to custom enum with lasso::Spur for Strings?
-    fields: papaya::HashMap<&'static str, serde_json::Value>,
+    values: [serde_json::Value; MAX_TRACING_FIELDS],
+
+    /// cached span info so we can avoid extra hashmap lookups in the hot path.
+    span_info: CallsiteSpanInfo,
 }

 impl SpanFields {
-    #[inline]
-    fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
-        fields.record(&mut SpanFieldsRecorder {
-            fields: self.fields.pin(),
-        });
+    fn new(span_info: CallsiteSpanInfo) -> Self {
+        Self {
+            span_info,
+            values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS],
+        }
    }
 }

-/// Implements a tracing field visitor to convert and store values.
-struct SpanFieldsRecorder<'m, S, G> {
-    fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
-}
-
-impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
+impl tracing::field::Visit for SpanFields {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(value));
+        self.values[field.index()] = serde_json::Value::from(value);
    }

    #[inline]
    fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(value));
+        self.values[field.index()] = serde_json::Value::from(value);
    }

    #[inline]
    fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(value));
+        self.values[field.index()] = serde_json::Value::from(value);
    }

    #[inline]
    fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
        if let Ok(value) = i64::try_from(value) {
-            self.fields
-                .insert(field.name(), serde_json::Value::from(value));
+            self.values[field.index()] = serde_json::Value::from(value);
        } else {
-            self.fields
-                .insert(field.name(), serde_json::Value::from(format!("{value}")));
+            self.values[field.index()] = serde_json::Value::from(format!("{value}"));
        }
    }

    #[inline]
    fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
        if let Ok(value) = u64::try_from(value) {
-            self.fields
-                .insert(field.name(), serde_json::Value::from(value));
+            self.values[field.index()] = serde_json::Value::from(value);
        } else {
-            self.fields
-                .insert(field.name(), serde_json::Value::from(format!("{value}")));
+            self.values[field.index()] = serde_json::Value::from(format!("{value}"));
        }
    }

    #[inline]
    fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(value));
+        self.values[field.index()] = serde_json::Value::from(value);
    }

    #[inline]
    fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(value));
+        self.values[field.index()] = serde_json::Value::from(value);
    }

    #[inline]
    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(value));
+        self.values[field.index()] = serde_json::Value::from(value);
    }

    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(format!("{value:?}")));
+        self.values[field.index()] = serde_json::Value::from(format!("{value:?}"));
    }

    #[inline]
@@ -456,38 +459,33 @@ impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecor
        field: &tracing::field::Field,
        value: &(dyn std::error::Error + 'static),
    ) {
-        self.fields
-            .insert(field.name(), serde_json::Value::from(format!("{value}")));
+        self.values[field.index()] = serde_json::Value::from(format!("{value}"));
    }
 }

 /// List of field indices skipped during logging. Can list duplicate fields or
 /// metafields not meant to be logged.
-#[derive(Clone, Default)]
+#[derive(Copy, Clone, Default)]
 struct SkippedFieldIndices {
-    bits: u64,
+    // 32-bits is large enough for `MAX_TRACING_FIELDS`
+    bits: u32,
 }

 impl SkippedFieldIndices {
    #[inline]
-    fn is_empty(&self) -> bool {
+    fn is_empty(self) -> bool {
        self.bits == 0
    }

    #[inline]
-    fn push(&mut self, index: usize) {
-        self.bits |= 1u64
-            .checked_shl(index as u32)
-            .expect("field index too large");
+    fn set(&mut self, index: usize) {
+        debug_assert!(index < 32, "index out of bounds of 32-bit set");
+        self.bits |= 1 << index; // `bits` is a u32, so only bit positions 0..=31 exist
     }

    #[inline]
-    fn contains(&self, index: usize) -> bool {
-        self.bits
-            & 1u64
-                .checked_shl(index as u32)
-                .expect("field index too large")
-            != 0
+    fn contains(self, index: usize) -> bool {
+        self.bits & (1 << index) != 0
    }
 }

@@ -499,7 +497,7 @@ struct EventFormatter {

 impl EventFormatter {
    #[inline]
-    fn new() -> Self {
+    const fn new() -> Self {
        EventFormatter {
            logline_buffer: Vec::new(),
        }
@@ -515,14 +513,13 @@ impl EventFormatter {
        self.logline_buffer.clear();
    }

-    fn format<S, const F: usize>(
+    fn format<S>(
        &mut self,
        now: DateTime<Utc>,
        event: &Event<'_>,
        ctx: &Context<'_, S>,
-        skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
-        callsite_ids: &papaya::HashMap<callsite::Identifier, CallsiteId>,
-        extract_fields: &IndexSet<&'static str>,
+        skipped_field_indices: &CallsiteMap<SkippedFieldIndices>,
+        extract_fields: &'static [&'static str],
    ) -> io::Result<()>
    where
        S: Subscriber + for<'a> LookupSpan<'a>,
@@ -533,8 +530,11 @@ impl EventFormatter {
        let normalized_meta = event.normalized_metadata();
        let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());

-        let skipped_field_indices = skipped_field_indices.pin();
-        let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
+        let skipped_field_indices = skipped_field_indices
+            .pin()
+            .get(&meta.callsite())
+            .copied()
+            .unwrap_or_default();

        let mut serialize = || {
            let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
@@ -565,9 +565,11 @@ impl EventFormatter {
            }

            let spans = SerializableSpans {
-                ctx,
-                callsite_ids,
-                extract: ExtractedSpanFields::<'_, F>::new(extract_fields),
+                // collect all spans from parent to root.
+                spans: ctx
+                    .event_span(event)
+                    .map_or(vec![], |parent| parent.scope().collect()),
+                extracted: ExtractedSpanFields::new(extract_fields),
            };
            serializer.serialize_entry("spans", &spans)?;

@@ -620,9 +622,9 @@ impl EventFormatter {
                }
            }

-            if spans.extract.has_values() {
+            if spans.extracted.has_values() {
                // TODO: add fields from event, too?
-                serializer.serialize_entry("extract", &spans.extract)?;
+                serializer.serialize_entry("extract", &spans.extracted)?;
            }

            serializer.end()
@@ -635,15 +637,15 @@ impl EventFormatter {
 }

 /// Extracts the message field that's mixed will other fields.
-struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
+struct MessageFieldExtractor<S: serde::ser::SerializeMap> {
    serializer: S,
-    skipped_field_indices: Option<&'a SkippedFieldIndices>,
+    skipped_field_indices: SkippedFieldIndices,
    state: Option<Result<(), S::Error>>,
 }

-impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
+impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
    #[inline]
-    fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
+    fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
        Self {
            serializer,
            skipped_field_indices,
@@ -665,13 +667,11 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
    fn accept_field(&self, field: &tracing::field::Field) -> bool {
        self.state.is_none()
            && field.name() == MESSAGE_FIELD
-            && !self
-                .skipped_field_indices
-                .is_some_and(|i| i.contains(field.index()))
+            && !self.skipped_field_indices.contains(field.index())
    }
 }

-impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
+impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<S> {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
        if self.accept_field(field) {
@@ -751,14 +751,14 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtracto
 /// can be skipped.
 // This is entirely optional and only cosmetic, though maybe helps a
 // bit during log parsing in dashboards when there's no field with empty object.
-struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
+struct FieldsPresent(pub bool, SkippedFieldIndices);

 // Even though some methods have an overhead (error, bytes) it is assumed the
 // compiler won't include this since we ignore the value entirely.
-impl tracing::field::Visit for FieldsPresent<'_> {
+impl tracing::field::Visit for FieldsPresent {
    #[inline]
    fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
-        if !self.1.is_some_and(|i| i.contains(field.index()))
+        if !self.1.contains(field.index())
            && field.name() != MESSAGE_FIELD
            && !field.name().starts_with("log.")
        {
@@ -768,10 +768,7 @@ impl tracing::field::Visit for FieldsPresent<'_> {
 }

 /// Serializes the fields directly supplied with a log event.
-struct SerializableEventFields<'a, 'event>(
-    &'a tracing::Event<'event>,
-    Option<&'a SkippedFieldIndices>,
-);
+struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices);

 impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -788,15 +785,15 @@ impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
 }

 /// A tracing field visitor that skips the message field.
-struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
+struct MessageFieldSkipper<S: serde::ser::SerializeMap> {
    serializer: S,
-    skipped_field_indices: Option<&'a SkippedFieldIndices>,
+    skipped_field_indices: SkippedFieldIndices,
    state: Result<(), S::Error>,
 }

-impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
+impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
    #[inline]
-    fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
+    fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
        Self {
            serializer,
            skipped_field_indices,
@@ -809,9 +806,7 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
        self.state.is_ok()
            && field.name() != MESSAGE_FIELD
            && !field.name().starts_with("log.")
-            && !self
-                .skipped_field_indices
-                .is_some_and(|i| i.contains(field.index()))
+            && !self.skipped_field_indices.contains(field.index())
    }

    #[inline]
@@ -821,7 +816,7 @@ impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
    }
 }

-impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
+impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<S> {
    #[inline]
    fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
        if self.accept_field(field) {
@@ -905,18 +900,17 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
 /// with the span names as keys. To prevent collision we append a numberic value
 /// to the name. Also, collects any span fields we're interested in. Last one
 /// wins.
-struct SerializableSpans<'a, 'ctx, Span, const F: usize>
+struct SerializableSpans<'ctx, S>
 where
-    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
+    S: for<'lookup> LookupSpan<'lookup>,
 {
-    ctx: &'a Context<'ctx, Span>,
-    callsite_ids: &'a papaya::HashMap<callsite::Identifier, CallsiteId>,
-    extract: ExtractedSpanFields<'a, F>,
+    spans: Vec<SpanRef<'ctx, S>>,
+    extracted: ExtractedSpanFields,
 }

-impl<Span, const F: usize> serde::ser::Serialize for SerializableSpans<'_, '_, Span, F>
+impl<S> serde::ser::Serialize for SerializableSpans<'_, S>
 where
-    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
+    S: for<'lookup> LookupSpan<'lookup>,
 {
    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
    where
@@ -924,25 +918,22 @@ where
    {
        let mut serializer = serializer.serialize_map(None)?;

-        if let Some(leaf_span) = self.ctx.lookup_current() {
-            for span in leaf_span.scope().from_root() {
-                // Append a numeric callsite ID to the span name to keep the name unique
-                // in the JSON object.
-                let cid = self
-                    .callsite_ids
-                    .pin()
-                    .get(&span.metadata().callsite())
-                    .copied()
-                    .unwrap_or_default();
+        for span in self.spans.iter().rev() {
+            let ext = span.extensions();

-                // Loki turns the # into an underscore during field name concatenation.
-                serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?;
+            // all spans should have this extension.
+            let Some(fields) = ext.get() else { continue };

-                serializer.serialize_value(&SerializableSpanFields {
-                    span: &span,
-                    extract: &self.extract,
-                })?;
-            }
+            self.extracted.layer_span(fields);
+
+            let SpanFields { values, span_info } = fields;
+            serializer.serialize_entry(
+                &*span_info.normalized_name,
+                &SerializableSpanFields {
+                    fields: span.metadata().fields(),
+                    values,
+                },
+            )?;
        }

        serializer.end()
@@ -950,80 +941,77 @@ where
 }

 /// Serializes the span fields as object.
-struct SerializableSpanFields<'a, 'span, Span, const F: usize>
-where
-    Span: for<'lookup> LookupSpan<'lookup>,
-{
-    span: &'a SpanRef<'span, Span>,
-    extract: &'a ExtractedSpanFields<'a, F>,
+struct SerializableSpanFields<'span> {
+    fields: &'span tracing::field::FieldSet,
+    values: &'span [serde_json::Value; MAX_TRACING_FIELDS],
 }

-impl<Span, const F: usize> serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F>
-where
-    Span: for<'lookup> LookupSpan<'lookup>,
-{
+impl serde::ser::Serialize for SerializableSpanFields<'_> {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        let mut serializer = serializer.serialize_map(None)?;

-        let ext = self.span.extensions();
-        if let Some(data) = ext.get::<SpanFields>() {
-            for (name, value) in &data.fields.pin() {
-                serializer.serialize_entry(name, value)?;
-                // TODO: replace clone with reference, if possible.
-                self.extract.set(name, value.clone());
+        for (field, value) in std::iter::zip(self.fields, self.values) {
+            if value.is_null() {
+                continue;
            }
+            serializer.serialize_entry(field.name(), value)?;
        }

        serializer.end()
    }
 }

-struct ExtractedSpanFields<'a, const F: usize> {
-    names: &'a IndexSet<&'static str>,
-    // TODO: replace TryLock with something local thread and interior mutability.
-    //       serde API doesn't let us use `mut`.
-    values: TryLock<([Option<serde_json::Value>; F], bool)>,
+struct ExtractedSpanFields {
+    names: &'static [&'static str],
+    values: RefCell<Vec<serde_json::Value>>,
 }

-impl<'a, const F: usize> ExtractedSpanFields<'a, F> {
-    fn new(names: &'a IndexSet<&'static str>) -> Self {
+impl ExtractedSpanFields {
+    fn new(names: &'static [&'static str]) -> Self {
        ExtractedSpanFields {
            names,
-            values: TryLock::new((array::from_fn(|_| Option::default()), false)),
+            values: RefCell::new(vec![serde_json::Value::Null; names.len()]),
        }
    }

-    #[inline]
-    fn set(&self, name: &'static str, value: serde_json::Value) {
-        if let Some((index, _)) = self.names.get_full(name) {
-            let mut fields = self.values.try_lock().expect("thread-local use");
-            fields.0[index] = Some(value);
-            fields.1 = true;
+    fn layer_span(&self, fields: &SpanFields) {
+        let mut v = self.values.borrow_mut();
+        let SpanFields { values, span_info } = fields;
+
+        // extract the fields
+        for (i, &j) in span_info.extract.iter().enumerate() {
+            let Some(value) = values.get(j) else { continue };
+
+            if !value.is_null() {
+                // TODO: replace clone with reference, if possible.
+                v[i] = value.clone();
+            }
        }
    }

    #[inline]
    fn has_values(&self) -> bool {
-        self.values.try_lock().expect("thread-local use").1
+        self.values.borrow().iter().any(|v| !v.is_null())
    }
 }

-impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
+impl serde::ser::Serialize for ExtractedSpanFields {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::ser::Serializer,
    {
        let mut serializer = serializer.serialize_map(None)?;

-        let values = self.values.try_lock().expect("thread-local use");
-        for (i, value) in values.0.iter().enumerate() {
-            if let Some(value) = value {
-                let key = self.names[i];
-                serializer.serialize_entry(key, value)?;
+        let values = self.values.borrow();
+        for (key, value) in std::iter::zip(self.names, &*values) {
+            if value.is_null() {
+                continue;
            }
+
+            serializer.serialize_entry(key, value)?;
        }

        serializer.end()
@@ -1032,7 +1020,6 @@ impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {

 #[cfg(test)]
 mod tests {
-    use std::marker::PhantomData;
    use std::sync::{Arc, Mutex, MutexGuard};

    use assert_json_diff::assert_json_eq;
@@ -1081,10 +1068,9 @@ mod tests {
        let log_layer = JsonLoggingLayer {
            clock: clock.clone(),
            skipped_field_indices: papaya::HashMap::default(),
-            callsite_ids: papaya::HashMap::default(),
+            span_info: papaya::HashMap::default(),
            writer: buffer.clone(),
-            extract_fields: IndexSet::from_iter(["x"]),
-            _marker: PhantomData::<[&'static str; 1]>,
+            extract_fields: &["x"],
        };

        let registry = tracing_subscriber::Registry::default().with(log_layer);
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -383,9 +383,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
    let session = cancellation_handler_clone.get_key();

-    session
-        .write_cancel_key(node.cancel_closure.clone())
-        .await?;
+    session.write_cancel_key(node.cancel_closure.clone())?;

    prepare_client_connection(&node, *session.key(), &mut stream).await?;

--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
            tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
        }

-        drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
+        drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error

        res
    }
--- a/proxy/src/proxy/retry.rs
+++ b/proxy/src/proxy/retry.rs
@@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
        use postgres_client::error::SqlState;
        // Here are errors that happens after the user successfully authenticated to the database.
        // TODO: there are pgbouncer errors that should be retried, but they are not listed here.
-        !matches!(
+        let non_retriable_pg_errors = matches!(
            self.code(),
            &SqlState::TOO_MANY_CONNECTIONS
                | &SqlState::OUT_OF_MEMORY
@@ -56,8 +56,20 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
                | &SqlState::T_R_SERIALIZATION_FAILURE
                | &SqlState::INVALID_CATALOG_NAME
                | &SqlState::INVALID_SCHEMA_NAME
-                | &SqlState::INVALID_PARAMETER_VALUE
-        )
+                | &SqlState::INVALID_PARAMETER_VALUE,
+        );
+        if non_retriable_pg_errors {
+            return false;
+        }
+        // PGBouncer errors that should not trigger a wake_compute retry.
+        if self.code() == &SqlState::PROTOCOL_VIOLATION {
+            // Source for the error message:
+            // https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
+            return !self
+                .message()
+                .contains("no more connections allowed (max_client_conn)");
+        }
+        true
    }
 }

@@ -110,3 +122,55 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati
        .base_delay
        .mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::ShouldRetryWakeCompute;
+    use postgres_client::error::{DbError, SqlState};
+
+    #[test]
+    fn should_retry_wake_compute_for_db_error() {
+        // These SQLStates should NOT trigger a wake_compute retry.
+        let non_retry_states = [
+            SqlState::TOO_MANY_CONNECTIONS,
+            SqlState::OUT_OF_MEMORY,
+            SqlState::SYNTAX_ERROR,
+            SqlState::T_R_SERIALIZATION_FAILURE,
+            SqlState::INVALID_CATALOG_NAME,
+            SqlState::INVALID_SCHEMA_NAME,
+            SqlState::INVALID_PARAMETER_VALUE,
+        ];
+        for state in non_retry_states {
+            let err = DbError::new_test_error(state.clone(), "oops".to_string());
+            assert!(
+                !err.should_retry_wake_compute(),
+                "State {state:?} unexpectedly retried"
+            );
+        }
+
+        // Errors coming from pgbouncer should not trigger a wake_compute retry
+        let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
+        for error in non_retry_pgbouncer_errors {
+            let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
+            assert!(
+                !err.should_retry_wake_compute(),
+                "PGBouncer error {error:?} unexpectedly retried"
+            );
+        }
+
+        // These SQLStates should trigger a wake_compute retry.
+        let retry_states = [
+            SqlState::CONNECTION_FAILURE,
+            SqlState::CONNECTION_EXCEPTION,
+            SqlState::CONNECTION_DOES_NOT_EXIST,
+            SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        ];
+        for state in retry_states {
+            let err = DbError::new_test_error(state.clone(), "oops".to_string());
+            assert!(
+                err.should_retry_wake_compute(),
+                "State {state:?} unexpectedly skipped retry"
+            );
+        }
+    }
+}
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -15,6 +15,7 @@ use rstest::rstest;
 use rustls::crypto::ring;
 use rustls::pki_types;
 use tokio::io::DuplexStream;
+use tracing_test::traced_test;

 use super::connect_compute::ConnectMechanism;
 use super::retry::CouldRetry;
@@ -381,8 +382,14 @@ enum ConnectAction {
    WakeFail,
    WakeRetry,
    Connect,
+    // connect_once -> Err, could_retry = true, should_retry_wake_compute = true
    Retry,
+    // connect_once -> Err, could_retry = true, should_retry_wake_compute = false
+    RetryNoWake,
+    // connect_once -> Err, could_retry = false, should_retry_wake_compute = true
    Fail,
+    // connect_once -> Err, could_retry = false, should_retry_wake_compute = false
+    FailNoWake,
 }

 #[derive(Clone)]
@@ -424,6 +431,7 @@ struct TestConnection;
 #[derive(Debug)]
 struct TestConnectError {
    retryable: bool,
+    wakeable: bool,
    kind: crate::error::ErrorKind,
 }

@@ -448,7 +456,7 @@ impl CouldRetry for TestConnectError {
 }
 impl ShouldRetryWakeCompute for TestConnectError {
    fn should_retry_wake_compute(&self) -> bool {
-        true
+        self.wakeable
    }
 }

@@ -471,10 +479,22 @@ impl ConnectMechanism for TestConnectMechanism {
            ConnectAction::Connect => Ok(TestConnection),
            ConnectAction::Retry => Err(TestConnectError {
                retryable: true,
+                wakeable: true,
+                kind: ErrorKind::Compute,
+            }),
+            ConnectAction::RetryNoWake => Err(TestConnectError {
+                retryable: true,
+                wakeable: false,
                kind: ErrorKind::Compute,
            }),
            ConnectAction::Fail => Err(TestConnectError {
                retryable: false,
+                wakeable: true,
+                kind: ErrorKind::Compute,
+            }),
+            ConnectAction::FailNoWake => Err(TestConnectError {
+                retryable: false,
+                wakeable: false,
                kind: ErrorKind::Compute,
            }),
            x => panic!("expecting action {x:?}, connect is called instead"),
@@ -709,3 +729,92 @@ async fn wake_non_retry() {
        .unwrap_err();
    mechanism.verify();
 }
+
+#[tokio::test]
+#[traced_test]
+async fn fail_but_wake_invalidates_cache() {
+    let ctx = RequestContext::test();
+    let mech = TestConnectMechanism::new(vec![
+        ConnectAction::Wake,
+        ConnectAction::Fail,
+        ConnectAction::Wake,
+        ConnectAction::Connect,
+    ]);
+    let user = helper_create_connect_info(&mech);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
+        .await
+        .unwrap();
+
+    assert!(logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
+
+#[tokio::test]
+#[traced_test]
+async fn fail_no_wake_skips_cache_invalidation() {
+    let ctx = RequestContext::test();
+    let mech = TestConnectMechanism::new(vec![
+        ConnectAction::Wake,
+        ConnectAction::FailNoWake,
+        ConnectAction::Connect,
+    ]);
+    let user = helper_create_connect_info(&mech);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
+        .await
+        .unwrap();
+
+    assert!(!logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
+
+#[tokio::test]
+#[traced_test]
+async fn retry_but_wake_invalidates_cache() {
+    let _ = env_logger::try_init();
+    use ConnectAction::*;
+
+    let ctx = RequestContext::test();
+    // Wake → Retry (retryable + wakeable) → Wake → Connect
+    let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
+    let user_info = helper_create_connect_info(&mechanism);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
+        .await
+        .unwrap();
+    mechanism.verify();
+
+    // Because Retry has wakeable=true, we should see invalidate_cache
+    assert!(logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
+
+#[tokio::test]
+#[traced_test]
+async fn retry_no_wake_skips_invalidation() {
+    let _ = env_logger::try_init();
+    use ConnectAction::*;
+
+    let ctx = RequestContext::test();
+    // Wake → RetryNoWake (retryable + NOT wakeable)
+    let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
+    let user_info = helper_create_connect_info(&mechanism);
+    let cfg = config();
+
+    connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
+        .await
+        .unwrap_err();
+    mechanism.verify();
+
+    // Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache
+    assert!(!logs_contain(
+        "invalidating stalled compute node info cache entry"
+    ));
+}
--- a/proxy/src/scram/pbkdf2.rs
+++ b/proxy/src/scram/pbkdf2.rs
@@ -13,22 +13,19 @@ pub(crate) struct Pbkdf2 {
 // inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
 impl Pbkdf2 {
    pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
-        let hmac =
+        // key the HMAC and derive the first block in-place
+        let mut hmac =
            Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
-
-        let prev = hmac
-            .clone()
-            .chain_update(salt)
-            .chain_update(1u32.to_be_bytes())
-            .finalize()
-            .into_bytes();
+        hmac.update(salt);
+        hmac.update(&1u32.to_be_bytes());
+        let init_block = hmac.finalize_reset().into_bytes();

        Self {
            hmac,
-            // one consumed for the hash above
+            // one iteration spent above
            iterations: iterations - 1,
-            hi: prev,
-            prev,
+            hi: init_block,
+            prev: init_block,
        }
    }

@@ -44,14 +41,17 @@ impl Pbkdf2 {
            iterations,
        } = self;

-        // only do 4096 iterations per turn before sharing the thread for fairness
+        // only do up to 4096 iterations per turn for fairness
        let n = (*iterations).clamp(0, 4096);
        for _ in 0..n {
-            *prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
+            hmac.update(prev);
+            let block = hmac.finalize_reset().into_bytes();

-            for (hi, prev) in hi.iter_mut().zip(*prev) {
-                *hi ^= prev;
+            for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) {
+                *hi_byte ^= b;
            }
+
+            *prev = block;
        }

        *iterations -= n;
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -14,7 +14,9 @@ use hyper::http::{HeaderName, HeaderValue};
 use hyper::{HeaderMap, Request, Response, StatusCode, header};
 use indexmap::IndexMap;
 use postgres_client::error::{DbError, ErrorPosition, SqlState};
-use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
+use postgres_client::{
+    GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction,
+};
 use pq_proto::StartupMessageParamsBuilder;
 use serde::Serialize;
 use serde_json::Value;
@@ -1092,22 +1094,41 @@ async fn query_to_json<T: GenericClient>(
    let query_start = Instant::now();

    let query_params = data.params;
-    let mut row_stream = std::pin::pin!(
-        client
-            .query_raw_txt(&data.query, query_params)
-            .await
-            .map_err(SqlOverHttpError::Postgres)?
-    );
+    let mut row_stream = client
+        .query_raw_txt(&data.query, query_params)
+        .await
+        .map_err(SqlOverHttpError::Postgres)?;
    let query_acknowledged = Instant::now();

+    let columns_len = row_stream.statement.columns().len();
+    let mut fields = Vec::with_capacity(columns_len);
+    let mut types = Vec::with_capacity(columns_len);
+
+    for c in row_stream.statement.columns() {
+        fields.push(json!({
+            "name": c.name().to_owned(),
+            "dataTypeID": c.type_().oid(),
+            "tableID": c.table_oid(),
+            "columnID": c.column_id(),
+            "dataTypeSize": c.type_size(),
+            "dataTypeModifier": c.type_modifier(),
+            "format": "text",
+        }));
+
+        types.push(c.type_().clone());
+    }
+
+    let raw_output = parsed_headers.raw_output;
+    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
+
    // Manually drain the stream into a vector to leave row_stream hanging
    // around to get a command tag. Also check that the response is not too
    // big.
-    let mut rows: Vec<postgres_client::Row> = Vec::new();
+    let mut rows = Vec::new();
    while let Some(row) = row_stream.next().await {
        let row = row.map_err(SqlOverHttpError::Postgres)?;
        *current_size += row.body_len();
-        rows.push(row);
+
        // we don't have a streaming response support yet so this is to prevent OOM
        // from a malicious query (eg a cross join)
        if *current_size > config.max_response_size_bytes {
@@ -1115,13 +1136,26 @@ async fn query_to_json<T: GenericClient>(
                config.max_response_size_bytes,
            ));
        }
+
+        let row = pg_text_row_to_json(&row, &types, raw_output, array_mode)?;
+        rows.push(row);
+
+        // assumption: parsing pg text and converting to json takes CPU time.
+        // let's assume it is slightly expensive, so we should consume some cooperative budget.
+        // Especially considering that `RowStream::next` might be pulling from a batch
+        // of rows and never hit the tokio mpsc for a long time (although unlikely).
+        tokio::task::consume_budget().await;
    }

    let query_resp_end = Instant::now();
-    let ready = row_stream.ready_status();
+    let RowStream {
+        command_tag,
+        status: ready,
+        ..
+    } = row_stream;

    // grab the command tag and number of rows affected
-    let command_tag = row_stream.command_tag().unwrap_or_default();
+    let command_tag = command_tag.unwrap_or_default();
    let mut command_tag_split = command_tag.split(' ');
    let command_tag_name = command_tag_split.next().unwrap_or_default();
    let command_tag_count = if command_tag_name == "INSERT" {
@@ -1142,38 +1176,6 @@ async fn query_to_json<T: GenericClient>(
        "finished executing query"
    );

-    let columns_len = row_stream.columns().len();
-    let mut fields = Vec::with_capacity(columns_len);
-    let mut columns = Vec::with_capacity(columns_len);
-
-    for c in row_stream.columns() {
-        fields.push(json!({
-            "name": c.name().to_owned(),
-            "dataTypeID": c.type_().oid(),
-            "tableID": c.table_oid(),
-            "columnID": c.column_id(),
-            "dataTypeSize": c.type_size(),
-            "dataTypeModifier": c.type_modifier(),
-            "format": "text",
-        }));
-
-        match client.get_type(c.type_oid()).await {
-            Ok(t) => columns.push(t),
-            Err(err) => {
-                tracing::warn!(?err, "unable to query type information");
-                return Err(SqlOverHttpError::InternalPostgres(err));
-            }
-        }
-    }
-
-    let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
-
-    // convert rows to JSON
-    let rows = rows
-        .iter()
-        .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
-        .collect::<Result<Vec<_>, _>>()?;
-
    // Resulting JSON format is based on the format of node-postgres result.
    let results = json!({
        "command": command_tag_name.to_string(),
--- a/proxy/src/url.rs
+++ b/proxy/src/url.rs
@@ -43,6 +43,12 @@ impl std::ops::Deref for ApiUrl {
    }
 }

+impl std::ops::DerefMut for ApiUrl {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
 impl std::fmt::Display for ApiUrl {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
--- a/scripts/benchmark_durations.py
+++ b/scripts/benchmark_durations.py
@@ -32,12 +32,6 @@ BENCHMARKS_DURATION_QUERY = """
 # the total duration varies from 8 to 40 minutes.
 # We use some pre-collected durations as a fallback to have a better distribution.
 FALLBACK_DURATION = {
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742,
-    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073,
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -17,12 +17,14 @@ use std::pin::Pin;
 use std::sync::Arc;
 use std::time::Duration;

+use bytes::Bytes;
 use camino::Utf8PathBuf;
 use clap::{Parser, command};
 use futures::future::OptionFuture;
 use futures_core::Stream;
 use futures_util::StreamExt;
-use http_body_util::Full;
+use http_body_util::combinators::BoxBody;
+use http_body_util::{Empty, Full};
 use http_utils::tls_certs::ReloadingCertificateResolver;
 use hyper::body::Incoming;
 use hyper::header::CONTENT_TYPE;
@@ -46,7 +48,6 @@ use tokio::net::TcpListener;
 use tokio::sync::broadcast;
 use tokio::sync::broadcast::error::RecvError;
 use tokio::time;
-use tonic::body::{self, BoxBody, empty_body};
 use tonic::codegen::Service;
 use tonic::{Code, Request, Response, Status};
 use tracing::*;
@@ -634,7 +635,7 @@ impl BrokerService for Broker {
 // We serve only metrics and healthcheck through http1.
 async fn http1_handler(
    req: hyper::Request<Incoming>,
-) -> Result<hyper::Response<BoxBody>, Infallible> {
+) -> Result<hyper::Response<BoxBody<Bytes, Infallible>>, Infallible> {
    let resp = match (req.method(), req.uri().path()) {
        (&Method::GET, "/metrics") => {
            let mut buffer = vec![];
@@ -645,16 +646,16 @@ async fn http1_handler(
            hyper::Response::builder()
                .status(StatusCode::OK)
                .header(CONTENT_TYPE, encoder.format_type())
-                .body(body::boxed(Full::new(bytes::Bytes::from(buffer))))
+                .body(BoxBody::new(Full::new(Bytes::from(buffer))))
                .unwrap()
        }
        (&Method::GET, "/status") => hyper::Response::builder()
            .status(StatusCode::OK)
-            .body(empty_body())
+            .body(BoxBody::new(Empty::new()))
            .unwrap(),
        _ => hyper::Response::builder()
            .status(StatusCode::NOT_FOUND)
-            .body(empty_body())
+            .body(BoxBody::new(Empty::new()))
            .unwrap(),
    };
    Ok(resp)
--- a/test_runner/cloud_regress/test_cloud_regress.py
+++ b/test_runner/cloud_regress/test_cloud_regress.py
@@ -15,7 +15,7 @@ if TYPE_CHECKING:
    from fixtures.pg_version import PgVersion


-@pytest.mark.timeout(4*3600)
+@pytest.mark.timeout(7200)
@pytest.mark.remote_cluster
 def test_cloud_regress(
    remote_pg: RemotePostgres,
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -184,6 +184,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    "pageserver_evictions_with_low_residence_duration_total",
    "pageserver_aux_file_estimated_size",
    "pageserver_valid_lsn_lease_count",
+    "pageserver_tenant_offloaded_timelines",
    counter("pageserver_tenant_throttling_count_accounted_start"),
    counter("pageserver_tenant_throttling_count_accounted_finish"),
    counter("pageserver_tenant_throttling_wait_usecs_sum"),
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -682,7 +682,7 @@ class NeonEnvBuilder:
                log.info(
                    f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
                )
-                shutil.copytree(tenants_from_dir, tenants_to_dir)
+                subprocess.run(["cp", "-a", tenants_from_dir, tenants_to_dir], check=True)
            else:
                log.info(
                    f"Creating overlayfs mount of pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}"
@@ -698,8 +698,9 @@ class NeonEnvBuilder:
        shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True)
        if self.test_overlay_dir is None:
            log.info("Copying local_fs_remote_storage directory from snapshot")
-            shutil.copytree(
-                repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage"
+            subprocess.run(
+                ["cp", "-a", f"{repo_dir / 'local_fs_remote_storage'}", f"{self.repo_dir}"],
+                check=True,
            )
        else:
            log.info("Creating overlayfs mount of local_fs_remote_storage directory from snapshot")
@@ -4657,7 +4658,7 @@ class EndpointFactory:
        origin: Endpoint,
        endpoint_id: str | None = None,
        config_lines: list[str] | None = None,
-    ):
+    ) -> Endpoint:
        branch_name = origin.branch_name
        assert origin in self.endpoints
        assert branch_name is not None
@@ -4676,7 +4677,7 @@ class EndpointFactory:
        origin: Endpoint,
        endpoint_id: str | None = None,
        config_lines: list[str] | None = None,
-    ):
+    ) -> Endpoint:
        branch_name = origin.branch_name
        assert origin in self.endpoints
        assert branch_name is not None
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -14,7 +14,7 @@ from fixtures.neon_fixtures import (
    PgBin,
    wait_for_last_flush_lsn,
 )
-from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci
+from fixtures.utils import get_scale_for_db, humantime_to_ms

 from performance.pageserver.util import setup_pageserver_with_tenants

@@ -36,9 +36,6 @@ if TYPE_CHECKING:
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
@pytest.mark.parametrize("n_tenants", [500])
@pytest.mark.timeout(10000)
-@skip_on_ci(
-    "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI"
-)
 def test_pageserver_characterize_throughput_with_n_tenants(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
@@ -63,9 +60,6 @@ def test_pageserver_characterize_throughput_with_n_tenants(
@pytest.mark.parametrize("n_clients", [1, 64])
@pytest.mark.parametrize("n_tenants", [1])
@pytest.mark.timeout(2400)
-@skip_on_ci(
-    "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI"
-)
 def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
            "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
        },
        "rel_size_v2_enabled": True,
+        "relsize_snapshot_cache_capacity": 10000,
        "gc_compaction_enabled": True,
        "gc_compaction_verification": False,
        "gc_compaction_initial_threshold_kb": 1024000,
--- a/test_runner/regress/test_basebackup.py
+++ b/test_runner/regress/test_basebackup.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from fixtures.utils import wait_until
+
+if TYPE_CHECKING:
+    from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
+    """
+    Simple test for basebackup cache.
+    1. Check that we always hit the cache after compute restart.
+    2. Check that we eventually delete old basebackup files, but not the latest one.
+    3. Check that we delete basebackup file for timeline with active compute.
+    """
+
+    neon_env_builder.pageserver_config_override = """
+        tenant_config = { basebackup_cache_enabled = true }
+        basebackup_cache_config = { cleanup_period = '1s' }
+    """
+
+    env = neon_env_builder.init_start()
+    ep = env.endpoints.create("main")
+    ps = env.pageserver
+    ps_http = ps.http_client()
+
+    # 1. Check that we always hit the cache after compute restart.
+    for i in range(3):
+        ep.start()
+        ep.stop()
+
+        def check_metrics(i=i):
+            metrics = ps_http.get_metrics()
+            # Never miss.
+            # The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests.
+            # All other requests should be a hit
+            assert (
+                metrics.query_one(
+                    "pageserver_basebackup_cache_read_total", {"result": "miss"}
+                ).value
+                == 0
+            )
+            # All but the first requests are hits.
+            assert (
+                metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value
+                == i
+            )
+            # Every compute shutdown should trigger a prepare request.
+            assert (
+                metrics.query_one(
+                    "pageserver_basebackup_cache_prepare_total", {"result": "ok"}
+                ).value
+                == i + 1
+            )
+
+        wait_until(check_metrics)
+
+    # 2. Check that we eventually delete old basebackup files, but not the latest one.
+    def check_bb_file_count():
+        bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir())
+        # tmp dir + 1 basebackup file.
+        assert len(bb_files) == 2
+
+    wait_until(check_bb_file_count)
+
+    # 3. Check that we delete basebackup file for timeline with active compute.
+    ep.start()
+    ep.safe_psql("create table t1 as select generate_series(1, 10) as n")
+
+    def check_bb_dir_empty():
+        bb_files = list(ps.workdir.joinpath("basebackup_cache").iterdir())
+        # only tmp dir.
+        assert len(bb_files) == 1
+
+    wait_until(check_bb_dir_empty)
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -19,6 +19,16 @@ TEST_ROLE_NAMES = [
    {"name": "role$"},
    {"name": "role$$"},
    {"name": "role$x$"},
+    {"name": "x"},
+    {"name": "xx"},
+    {"name": "$x"},
+    {"name": "x$"},
+    {"name": "$x$"},
+    {"name": "xx$"},
+    {"name": "$xx"},
+    {"name": "$xx$"},
+    # 63 bytes is the limit for role/DB names in Postgres
+    {"name": "x" * 63},
 ]

 TEST_DB_NAMES = [
@@ -74,6 +84,43 @@ TEST_DB_NAMES = [
        "name": "db name$x$",
        "owner": "role$x$",
    },
+    {
+        "name": "x",
+        "owner": "x",
+    },
+    {
+        "name": "xx",
+        "owner": "xx",
+    },
+    {
+        "name": "$x",
+        "owner": "$x",
+    },
+    {
+        "name": "x$",
+        "owner": "x$",
+    },
+    {
+        "name": "$x$",
+        "owner": "$x$",
+    },
+    {
+        "name": "xx$",
+        "owner": "xx$",
+    },
+    {
+        "name": "$xx",
+        "owner": "$xx",
+    },
+    {
+        "name": "$xx$",
+        "owner": "$xx$",
+    },
+    # 63 bytes is the limit for role/DB names in Postgres
+    {
+        "name": "x" * 63,
+        "owner": "x" * 63,
+    },
 ]


@@ -146,6 +193,10 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
    """
    Test that compute_ctl can create and work with databases and roles
    with special characters (whitespaces, %, tabs, etc.) in the name.
+    Also use `drop_subscriptions_before_start: true`. We do not actually
+    have any subscriptions in this test, so it should be a no-op, but it
+    i) simulates the case when we create a second dev branch together with
+    a new project creation, and ii) just generally stresses more code paths.
    """
    env = neon_simple_env

@@ -159,6 +210,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
        **{
            "spec": {
                "skip_pg_catalog_updates": False,
+                "drop_subscriptions_before_start": True,
                "cluster": {
                    "roles": TEST_ROLE_NAMES,
                    "databases": TEST_DB_NAMES,
@@ -202,6 +254,7 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
        **{
            "spec": {
                "skip_pg_catalog_updates": False,
+                "drop_subscriptions_before_start": True,
                "cluster": {
                    "roles": [],
                    "databases": [],
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -217,11 +217,11 @@ if SQL_EXPORTER is None:
            self, logs_dir: Path, config_file: Path, collector_file: Path, port: int
        ) -> None:
            # NOTE: Keep the version the same as in
-            # compute/Dockerfile.compute-node and Dockerfile.build-tools.
+            # compute/compute-node.Dockerfile and build-tools.Dockerfile.
            #
            # The "host" network mode allows sql_exporter to talk to the
            # endpoint which is running on the host.
-            super().__init__("docker.io/burningalchemist/sql_exporter:0.17.0", network_mode="host")
+            super().__init__("docker.io/burningalchemist/sql_exporter:0.17.3", network_mode="host")

            self.__logs_dir = logs_dir
            self.__port = port
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv):
                for query in queries:
                    with s_con.cursor() as secondary_cursor:
                        secondary_cursor.execute(query)
-                        response = secondary_cursor.fetchone()
-                        assert response is not None
+                        res = secondary_cursor.fetchone()
+                        assert res is not None
+                        response = res
                        assert response == responses[query]

            # Check for corrupted WAL messages which might otherwise go unnoticed if
@@ -164,7 +165,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):

            s_cur.execute("SELECT COUNT(*) FROM test")
            res = s_cur.fetchone()
-            assert res[0] == 10000
+            assert res == (10000,)

            # Clear the cache in the standby, so that when we
            # re-execute the query, it will make GetPage
@@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
            s_cur.execute("SELECT COUNT(*) FROM test")
            log_replica_lag(primary, secondary)
            res = s_cur.fetchone()
-            assert res[0] == 10000
+            assert res == (10000,)


 def run_pgbench(connstr: str, pg_bin: PgBin):
--- a/test_runner/regress/test_pageserver_metric_collection.py
+++ b/test_runner/regress/test_pageserver_metric_collection.py
@@ -508,6 +508,9 @@ PER_METRIC_VERIFIERS = {
    "remote_storage_size": CannotVerifyAnything,
    "written_size": WrittenDataVerifier,
    "written_data_bytes_delta": WrittenDataDeltaVerifier,
+    "written_size_since_parent": WrittenDataVerifier,  # same as written_size on root
+    "pitr_cutoff": CannotVerifyAnything,
+    "pitr_history_size_since_parent": WrittenDataVerifier,  # same as written_size on root w/o GC
    "timeline_logical_size": CannotVerifyAnything,
    "synthetic_storage_size": SyntheticSizeVerifier,
 }
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -0,0 +1,129 @@
+"""
+File with secondary->primary promotion testing.
+
+So far, it only contains a test that promotion works and that the data is persisted.
+"""
+
+import psycopg2
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
+from fixtures.pg_version import PgVersion
+from pytest import raises
+
+
+def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
+    """
+    Test that a replica safely promotes, and can commit data updates which
+    show up when the primary boots up after the promoted secondary endpoint
+    shut down.
+    """
+
+    # Initialize the primary and a replica of it; the test table is created and
+    # populated on the primary below.
+    env: NeonEnv = neon_simple_env
+    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+
+    with primary.connect() as primary_conn:
+        primary_cur = primary_conn.cursor()
+        primary_cur.execute(
+            "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
+        )
+        primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
+        primary_cur.execute(
+            """
+            SELECT pg_current_wal_insert_lsn(),
+                   pg_current_wal_lsn(),
+                   pg_current_wal_flush_lsn()
+            """
+        )
+        log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}")
+
+    wait_replica_caughtup(primary, secondary)
+
+    with secondary.connect() as secondary_conn:
+        secondary_cur = secondary_conn.cursor()
+        secondary_cur.execute("select count(*) from t")
+
+        assert secondary_cur.fetchone() == (100,)
+
+        with raises(psycopg2.Error):
+            secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)")
+            secondary_conn.commit()
+
+        secondary_conn.rollback()
+        secondary_cur.execute("select count(*) from t")
+        assert secondary_cur.fetchone() == (100,)
+
+    primary.stop_and_destroy(mode="immediate")
+
+    # Connect to the secondary and promote it via pg_promote()
+    with secondary.connect() as promo_conn:
+        promo_cur = promo_conn.cursor()
+
+        promo_cur.execute("SELECT * FROM pg_promote()")
+        assert promo_cur.fetchone() == (True,)
+        promo_cur.execute(
+            """
+            SELECT pg_current_wal_insert_lsn(),
+                   pg_current_wal_lsn(),
+                   pg_current_wal_flush_lsn()
+            """
+        )
+        log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
+
+    # Reconnect to the secondary to make sure we get a read-write connection
+    with secondary.connect() as new_primary_conn:
+        new_primary_cur = new_primary_conn.cursor()
+        new_primary_cur.execute("select count(*) from t")
+        assert new_primary_cur.fetchone() == (100,)
+
+        new_primary_cur.execute(
+            "INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload"
+        )
+        assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
+
+        new_primary_cur = new_primary_conn.cursor()
+        new_primary_cur.execute("select payload from t")
+        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
+
+        new_primary_cur.execute("select count(*) from t")
+        assert new_primary_cur.fetchone() == (200,)
+        new_primary_cur.execute(
+            """
+            SELECT pg_current_wal_insert_lsn(),
+                   pg_current_wal_lsn(),
+                   pg_current_wal_flush_lsn()
+            """
+        )
+        log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")
+
+    with secondary.connect() as second_viewpoint_conn:
+        new_primary_cur = second_viewpoint_conn.cursor()
+        new_primary_cur.execute("select payload from t")
+        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
+
+    wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
+
+    secondary.stop_and_destroy()
+
+    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+
+    with primary.connect() as new_primary:
+        new_primary_cur = new_primary.cursor()
+        new_primary_cur.execute(
+            """
+            SELECT pg_current_wal_insert_lsn(),
+                   pg_current_wal_lsn(),
+                   pg_current_wal_flush_lsn()
+            """
+        )
+        log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}")
+
+        new_primary_cur.execute("select count(*) from t")
+        assert new_primary_cur.fetchone() == (200,)
+        new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
+        new_primary_cur.execute("select count(*) from t")
+        assert new_primary_cur.fetchone() == (300,)
+
+    primary.stop(mode="immediate")
--- a/test_runner/regress/test_replica_start.py
+++ b/test_runner/regress/test_replica_start.py
@@ -27,8 +27,9 @@ from contextlib import closing

 import psycopg2
 import pytest
+from fixtures.common_types import Lsn
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
+from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup
 from fixtures.pg_version import PgVersion
 from fixtures.utils import query_scalar, skip_on_postgres, wait_until

@@ -695,3 +696,110 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
    with secondary.cursor() as secondary_cur:
        secondary_cur.execute("select count(*) from t")
        assert secondary_cur.fetchone() == (n_restarts,)
+
+
+def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin):
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    sql = """
+CREATE TABLE CHAR_TBL(f1 char(4));
+CREATE TABLE FLOAT8_TBL(f1 float8);
+CREATE TABLE INT2_TBL(f1 int2);
+CREATE TABLE INT4_TBL(f1 int4);
+CREATE TABLE INT8_TBL(q1 int8, q2 int8);
+CREATE TABLE POINT_TBL(f1 point);
+CREATE TABLE TEXT_TBL (f1 text);
+CREATE TABLE VARCHAR_TBL(f1 varchar(4));
+CREATE TABLE onek (unique1		int4);
+CREATE TABLE onek2 AS SELECT * FROM onek;
+CREATE TABLE tenk1 (unique1		int4);
+CREATE TABLE tenk2 AS SELECT * FROM tenk1;
+CREATE TABLE person (name text, age int4,location point);
+CREATE TABLE emp (salary int4, manager name) INHERITS (person);
+CREATE TABLE student (gpa float8) INHERITS (person);
+CREATE TABLE stud_emp (	percent 	int4) INHERITS (emp, student);
+CREATE TABLE road (name		text,thepath 	path);
+CREATE TABLE ihighway () INHERITS (road);
+CREATE TABLE shighway(surface		text) INHERITS (road);
+CREATE TABLE BOOLTBL3 (d text, b bool, o int);
+CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool);
+DROP TABLE BOOLTBL3;
+DROP TABLE BOOLTBL4;
+CREATE TABLE ceil_floor_round (a numeric);
+DROP TABLE ceil_floor_round;
+CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
+DROP TABLE width_bucket_test;
+CREATE TABLE num_input_test (n1 numeric);
+CREATE TABLE num_variance (a numeric);
+INSERT INTO num_variance VALUES (0);
+CREATE TABLE snapshot_test (nr	integer, snap	txid_snapshot);
+CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now()));
+CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now()));
+CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
+CREATE INDEX guid1_hash  ON guid1 USING HASH  (guid_field);
+TRUNCATE guid1;
+DROP TABLE guid1;
+DROP TABLE guid2 CASCADE;
+CREATE TABLE numrange_test (nr NUMRANGE);
+CREATE INDEX numrange_test_btree on numrange_test(nr);
+CREATE TABLE numrange_test2(nr numrange);
+CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr);
+INSERT INTO numrange_test2 VALUES('[, 5)');
+CREATE TABLE textrange_test (tr text);
+CREATE INDEX textrange_test_btree on textrange_test(tr);
+CREATE TABLE test_range_gist(ir int4range);
+CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
+DROP INDEX test_range_gist_idx;
+CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
+CREATE TABLE test_range_spgist(ir int4range);
+CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
+DROP INDEX test_range_spgist_idx;
+CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
+CREATE TABLE test_range_elem(i int4);
+CREATE INDEX test_range_elem_idx on test_range_elem (i);
+CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10));
+DROP TABLE test_range_elem;
+CREATE TABLE test_range_excl(room int4range, speaker int4range, during tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&));
+CREATE TABLE f_test(f text, i int);
+CREATE TABLE i8r_array (f1 int, f2 text);
+CREATE TYPE arrayrange as range (subtype=int4[]);
+CREATE TYPE two_ints as (a int, b int);
+DROP TYPE two_ints cascade;
+CREATE TABLE text_support_test (t text);
+CREATE TABLE TEMP_FLOAT (f1 FLOAT8);
+CREATE TABLE TEMP_INT4 (f1 INT4);
+CREATE TABLE TEMP_INT2 (f1 INT2);
+CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8);
+CREATE TABLE POLYGON_TBL(f1 polygon);
+CREATE TABLE quad_poly_tbl (id int, p polygon);
+INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y;
+CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl;
+CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl;
+"""
+
+    with endpoint.cursor() as cur:
+        lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        env.endpoints.create_start(branch_name="main", lsn=lsn)
+        log.info(f"lsn: {lsn}")
+
+        for line in sql.split("\n"):
+            if len(line.strip()) == 0 or line.startswith("--"):
+                continue
+            cur.execute(line)
+
+        lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+        env.endpoints.create_start(branch_name="main", lsn=lsn)
+        log.info(f"lsn: {lsn}")
+
+        cur.execute("VACUUM FULL pg_class;")
+
+    for ep in env.endpoints.endpoints:
+        log.info(f"{ep.endpoint_id} / {ep.pg_port}")
+        pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"]
+        env_vars = {
+            "PGPORT": str(ep.pg_port),
+            "PGUSER": endpoint.default_options["user"],
+            "PGHOST": endpoint.default_options["host"],
+        }
+        pg_bin.run_capture(pg_dump_command, env=env_vars)
--- a/test_runner/regress/test_timeline_archive.py
+++ b/test_runner/regress/test_timeline_archive.py
@@ -193,6 +193,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
        "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
    )

+    offloaded_count = ps_http.get_metric_value(
+        "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
+    )
+    assert offloaded_count == 0
+
    ps_http.timeline_archival_config(
        tenant_id,
        leaf_timeline_id,
@@ -244,6 +249,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
    wait_until(leaf_offloaded)
    wait_until(parent_offloaded)

+    offloaded_count = ps_http.get_metric_value(
+        "pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
+    )
+    assert offloaded_count == 2
+
    # Offloaded child timelines should still prevent deletion
    with pytest.raises(
        PageserverApiException,
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/Show More
+++ b/Show More