Add skeleton of parallel xxHash implementation

Merge branch 'quantumish/lfc-resizable-map' into quantumish/lfc-soa-map
Remove prev entry tracking, refactor HashMapInit into proper builder
2026-02-04 11:10:37 +00:00 · 2025-07-03 13:12:41 -07:00 · 2025-06-24 14:36:43 -07:00 · 2025-06-24 13:34:22 -07:00 · 2025-06-23 16:15:43 -07:00 · 2025-06-23 15:38:49 -07:00
219 changed files with 11761 additions and 5073 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -314,7 +314,8 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
+          # test_pageserver_max_throughput_getpage_at_latest_lsn runs in the separate workflow periodic_pagebench.yml because it needs snapshots
+          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }} --ignore=test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
          pg_version: v16
          aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
+name: Periodic pagebench performance test on unit-perf hetzner runner

 on:
  schedule:
@@ -8,7 +8,7 @@ on:
    #        │   │ ┌───────────── day of the month (1 - 31)
    #        │   │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #        │   │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron: '0 */3 * * *' # Runs every 3 hours
+    - cron: '0 */4 * * *' # Runs every 4 hours
  workflow_dispatch: # Allows manual triggering of the workflow
    inputs:
      commit_hash:
@@ -16,6 +16,11 @@ on:
        description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
        required: false
        default: ''
+      recreate_snapshots:
+        type: boolean
+        description: 'Recreate snapshots - !!!WARNING!!! We should only recreate snapshots if the previous ones are no longer compatible. Otherwise benchmarking results are not comparable across runs.'
+        required: false
+        default: false

 defaults:
  run:
@@ -29,13 +34,13 @@ permissions:
  contents: read

 jobs:
-  trigger_bench_on_ec2_machine_in_eu_central_1:
+  run_periodic_pagebench_test:
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
      contents: write
      pull-requests: write
-    runs-on: [ self-hosted, small ]
+    runs-on: [ self-hosted, unit-perf ]
    container:
      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
      credentials:
@@ -44,10 +49,13 @@ jobs:
      options: --init
    timeout-minutes: 360  # Set the timeout to 6 hours
    env:
-      API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
      RUN_ID: ${{ github.run_id }}
-      AWS_DEFAULT_REGION : "eu-central-1"
-      AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
+      DEFAULT_PG_VERSION: 16
+      BUILD_TYPE: release
+      RUST_BACKTRACE: 1
+      # NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS: 1 - doesn't work without root in container
+      S3_BUCKET: neon-github-public-dev
+      PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
    steps:
    # we don't need the neon source code because we run everything remotely
    # however we still need the local github actions to run the allure step below
@@ -56,99 +64,194 @@ jobs:
      with:
        egress-policy: audit

-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+    - name: Set up the environment which depends on $RUNNER_TEMP on nvme drive
+      id: set-env
+      shell: bash -euxo pipefail {0}
+      run: |
+        {
+          echo "NEON_DIR=${RUNNER_TEMP}/neon"
+          echo "NEON_BIN=${RUNNER_TEMP}/neon/bin"
+          echo "POSTGRES_DISTRIB_DIR=${RUNNER_TEMP}/neon/pg_install"
+          echo "LD_LIBRARY_PATH=${RUNNER_TEMP}/neon/pg_install/v${DEFAULT_PG_VERSION}/lib"
+          echo "BACKUP_DIR=${RUNNER_TEMP}/instance_store/saved_snapshots"
+          echo "TEST_OUTPUT=${RUNNER_TEMP}/neon/test_output"
+          echo "PERF_REPORT_DIR=${RUNNER_TEMP}/neon/test_output/perf-report-local"
+          echo "ALLURE_DIR=${RUNNER_TEMP}/neon/test_output/allure-results"
+          echo "ALLURE_RESULTS_DIR=${RUNNER_TEMP}/neon/test_output/allure-results/results"
+        } >> "$GITHUB_ENV"

-    - name: Show my own (github runner) external IP address - usefull for IP allowlisting
-      run: curl https://ifconfig.me
+        echo "allure_results_dir=${RUNNER_TEMP}/neon/test_output/allure-results/results" >> "$GITHUB_OUTPUT"

-    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
-      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+    - uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
      with:
        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
-        role-duration-seconds: 3600
-
-    - name: Start EC2 instance and wait for the instance to boot up
-      run: |
-        aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
-        sleep 60 # sleep some time to allow cloudinit and our API server to start up
-
-    - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
-      run: |
-        public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
-        echo "Public IP of the EC2 instance: $public_ip"
-        echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
-
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # max 5 hours (needed in case commit hash is still being built)
    - name: Determine commit hash
+      id: commit_hash
+      shell: bash -euxo pipefail {0}
      env:
        INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
      run: |
-        if [ -z "$INPUT_COMMIT_HASH" ]; then
-          echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+        if [[ -z "${INPUT_COMMIT_HASH}" ]]; then
+          COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')
+          echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
+          echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
          echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
        else
-          echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          COMMIT_HASH="${INPUT_COMMIT_HASH}"
+          echo "COMMIT_HASH=$COMMIT_HASH" >> $GITHUB_ENV
+          echo "commit_hash=$COMMIT_HASH" >> "$GITHUB_OUTPUT"
          echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
        fi
+    - name: Checkout the neon repository at given commit hash
+      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      with:
+        ref: ${{ steps.commit_hash.outputs.commit_hash }}

-    - name: Start Bench with run_id
+    # does not reuse ./.github/actions/download because we need to download the artifact for the given commit hash
+    # example artifact
+    # s3://neon-github-public-dev/artifacts/48b870bc078bd2c450eb7b468e743b9c118549bf/15036827400/1/neon-Linux-X64-release-artifact.tar.zst /instance_store/artifacts/neon-Linux-release-artifact.tar.zst
+    - name: Determine artifact S3_KEY for given commit hash and download and extract artifact
+      id: artifact_prefix
+      shell: bash -euxo pipefail {0}
+      env:
+        ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
+        COMMIT_HASH: ${{ env.COMMIT_HASH }}
+        COMMIT_HASH_TYPE: ${{ env.COMMIT_HASH_TYPE }}
      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H 'Content-Type: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
+        attempt=0
+        max_attempts=24 # 5 minutes * 24 = 2 hours

-    - name: Poll Test Status
-      id: poll_step
-      run: |
-        status=""
-        while [[ "$status" != "failure" && "$status" != "success" ]]; do
-          response=$(curl -k -X 'GET' \
-          "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
-          -H 'accept: application/json' \
-          -H "Authorization: Bearer $API_KEY")
-          echo "Response: $response"
-          set +x
-          status=$(echo $response | jq -r '.status')
-          echo "Test status: $status"
-          if [[ "$status" == "failure" ]]; then
-            echo "Test failed"
-            exit 1 # Fail the job step if status is failure
-          elif [[ "$status" == "success" || "$status" == "null" ]]; then
+        while [[ $attempt -lt $max_attempts ]]; do
+          # the following command will fail until the artifacts are available ...
+          S3_KEY=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix "artifacts/$COMMIT_HASH/" \
+            | jq -r '.Contents[]?.Key' \
+            | grep "neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst" \
+            | sort --version-sort \
+            | tail -1) || true # ... thus ignore errors from the command
+          if [[ -n "${S3_KEY}" ]]; then
+            echo "Artifact found: $S3_KEY"
+            echo "S3_KEY=$S3_KEY" >> $GITHUB_ENV
            break
-          elif [[ "$status" == "too_many_runs" ]]; then
-            echo "Too many runs already running"
-            echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
-            exit 1
          fi
-
-          sleep 60 # Poll every 60 seconds
+          
+          # Increment attempt counter and sleep for 5 minutes
+          attempt=$((attempt + 1))
+          echo "Attempt $attempt of $max_attempts to find artifacts in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH failed. Retrying in 5 minutes..."
+          sleep 300 # Sleep for 5 minutes
        done

-    - name: Retrieve Test Logs
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        curl -k -X 'GET' \
-        "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
-        -H 'accept: application/gzip' \
-        -H "Authorization: Bearer $API_KEY" \
-        --output "test_log_${GITHUB_RUN_ID}.gz"
+        if [[ -z "${S3_KEY}" ]]; then
+          echo "Error: artifact not found in S3 bucket s3://$S3_BUCKET/artifacts/$COMMIT_HASH after 2 hours" >&2
+          exit 1 # without the artifact there are no binaries to benchmark, so fail the step here
+        fi
+        mkdir -p "$(dirname "${ARCHIVE}")"
+        time aws s3 cp --only-show-errors "s3://${S3_BUCKET}/${S3_KEY}" "${ARCHIVE}"
+        mkdir -p "${NEON_DIR}"
+        time tar -xf "${ARCHIVE}" -C "${NEON_DIR}"
+        rm -f "${ARCHIVE}"

-    - name: Unzip Test Log and Print it into this job's log
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
+    - name: Download snapshots from S3
+      if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.recreate_snapshots == 'false' || github.event.inputs.recreate_snapshots == '' }}
+      id: download_snapshots
+      shell: bash -euxo pipefail {0}
      run: |
-        gzip -d "test_log_${GITHUB_RUN_ID}.gz"
-        cat "test_log_${GITHUB_RUN_ID}"
+        # Download the most recent snapshot parts from S3 and unpack them into TEST_OUTPUT
+        mkdir -p "${TEST_OUTPUT}" "${BACKUP_DIR}/parts"
+        cd "${BACKUP_DIR}/parts"
+        # With `-e -o pipefail` a grep without matches would abort the script right here,
+        # so tolerate a failing pipeline and report the problem via the explicit check below.
+        PART=$(aws s3api list-objects-v2 --bucket "$S3_BUCKET" --prefix performance/pagebench/ \
+          | jq -r '.Contents[]?.Key' \
+          | grep -E 'shared-snapshots-[0-9]{4}-[0-9]{2}-[0-9]{2}' \
+          | sort \
+          | tail -1) || true
+        echo "Latest PART: $PART"
+        if [[ -z "$PART" ]]; then
+          echo "ERROR: No matching S3 key found" >&2
+          exit 1
+        fi
+        S3_KEY=$(dirname "$PART")
+        time aws s3 cp --only-show-errors --recursive "s3://${S3_BUCKET}/${S3_KEY}/" .
+        cd "${TEST_OUTPUT}"
+        # concatenate the downloaded parts in glob (lexicographic) order and unpack the stream
+        time cat "${BACKUP_DIR}"/parts/* | zstdcat | tar --extract --preserve-permissions
+        rm -rf "${BACKUP_DIR}"
+
+    - name: Cache poetry deps
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -euxo pipefail {0}
+      run: ./scripts/pysync
+
+    # we need high number of open files for pagebench
+    - name: show ulimits
+      shell: bash -euxo pipefail {0}
+      run: |
+        ulimit -a
+
+    - name: Run pagebench testcase
+      shell: bash -euxo pipefail {0}
+      env:
+        CI: false  # need to override this env variable set by github to enforce using snapshots
+      run: |
+        export PLATFORM=hetzner-unit-perf-${COMMIT_HASH_TYPE}
+        # report the commit hash of the neon repository in the revision of the test results
+        export GITHUB_SHA=${COMMIT_HASH}
+        rm -rf ${PERF_REPORT_DIR}
+        rm -rf ${ALLURE_RESULTS_DIR}
+        mkdir -p ${PERF_REPORT_DIR}
+        mkdir -p ${ALLURE_RESULTS_DIR}
+        PARAMS="--alluredir=${ALLURE_RESULTS_DIR} --tb=short --verbose -rA"
+        EXTRA_PARAMS="--out-dir ${PERF_REPORT_DIR} --durations-path $TEST_OUTPUT/benchmark_durations.json"
+        # run only two selected tests
+        # environment set by parent:
+        # RUST_BACKTRACE=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release
+        ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_throughput_with_n_tenants ${EXTRA_PARAMS}
+        ./scripts/pytest ${PARAMS} test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant ${EXTRA_PARAMS}
+
+    - name: upload the performance metrics to the Neon performance database which is used by grafana dashboards to display the results
+      shell: bash -euxo pipefail {0}
+      run: |
+        export REPORT_FROM="$PERF_REPORT_DIR"
+        export GITHUB_SHA=${COMMIT_HASH}
+        time ./scripts/generate_and_push_perf_report.sh
+
+    - name: Upload test results
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-store
+      with:
+        report-dir:  ${{ steps.set-env.outputs.allure_results_dir }}
+        unique-key: ${{ env.BUILD_TYPE }}-${{ env.DEFAULT_PG_VERSION }}-${{ runner.arch }}
+        aws-oidc-role-arn:  ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

    - name: Create Allure report
+      id: create-allure-report
      if: ${{ !cancelled() }}
      uses: ./.github/actions/allure-report-generate
      with:
        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}

+    - name: Upload snapshots
+      if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.recreate_snapshots != 'false' && github.event.inputs.recreate_snapshots != '' }}
+      id: upload_snapshots
+      shell: bash -euxo pipefail {0}
+      run: |
+        mkdir -p $BACKUP_DIR
+        cd $TEST_OUTPUT
+        tar --create --preserve-permissions --file - shared-snapshots | zstd -o $BACKUP_DIR/shared_snapshots.tar.zst
+        cd $BACKUP_DIR
+        mkdir parts
+        split -b 1G shared_snapshots.tar.zst ./parts/shared_snapshots.tar.zst.part.
+        SNAPSHOT_DATE=$(date +%F)  # YYYY-MM-DD
+        cd parts
+        time aws s3 cp --recursive . s3://${S3_BUCKET}/performance/pagebench/shared-snapshots-${SNAPSHOT_DATE}/
+
    - name: Post to a Slack channel
      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@fcfb566f8b0aab22203f066d80ca1d7e4b5d05b3 # v1.27.1
@@ -157,26 +260,22 @@ jobs:
        slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
+        
    - name: Cleanup Test Resources
      if: always()
+      shell: bash -euxo pipefail {0}
+      env:
+        ARCHIVE: ${{ runner.temp }}/downloads/neon-${{ runner.os }}-${{ runner.arch }}-release-artifact.tar.zst
      run: |
-        curl -k -X 'POST' \
-        "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
-        -H 'accept: application/json' \
-        -H "Authorization: Bearer $API_KEY" \
-        -d ''
+        # Cleanup the test resources. This step runs under `if: always()`, i.e. possibly
+        # before the directory variables were exported to GITHUB_ENV, so default them to
+        # empty (`${VAR:-}`) to keep `set -u` from aborting the cleanup half-way.
+        for dir in "${BACKUP_DIR:-}" "${TEST_OUTPUT:-}" "${NEON_DIR:-}"; do
+          if [[ -n "${dir}" && -d "${dir}" ]]; then
+            rm -rf "${dir}"
+          fi
+        done
+        # ARCHIVE comes from this step's own env block; remove its whole download directory
+        # (quoted so that a path containing spaces is not word-split)
+        rm -rf "$(dirname "${ARCHIVE}")"

-    - name: Assume AWS OIDC role that allows to manage (start/stop/describe... EC machine)
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
-      with:
-        aws-region: eu-central-1
-        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN }}
-        role-duration-seconds: 3600
-
-    - name: Stop EC2 instance and wait for the instance to be stopped
-      if: always() && steps.poll_step.outputs.too_many_runs != 'true'
-      run: |
-        aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
-        aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1086,6 +1086,25 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

+[[package]]
+name = "cbindgen"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
+dependencies = [
+ "clap",
+ "heck 0.4.1",
+ "indexmap 2.9.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.100",
+ "tempfile",
+ "toml",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.16"
@@ -1212,7 +1231,7 @@ version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -1270,13 +1289,21 @@ dependencies = [
 "unicode-width",
 ]

+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "cbindgen",
+ "neon-shmem",
+]
+
 [[package]]
 name = "compute_api"
 version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "jsonwebtoken",
 "regex",
 "remote_storage",
@@ -1308,7 +1335,7 @@ dependencies = [
 "flate2",
 "futures",
 "http 1.1.0",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "itertools 0.10.5",
 "jsonwebtoken",
 "metrics",
@@ -1936,7 +1963,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
 dependencies = [
 "darling",
 "either",
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -2500,6 +2527,18 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "getrandom"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasi 0.14.2+wasi-0.2.4",
+]
+
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -2597,7 +2636,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 0.2.9",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "slab",
 "tokio",
 "tokio-util",
@@ -2616,7 +2655,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http 1.1.0",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "slab",
 "tokio",
 "tokio-util",
@@ -2712,6 +2751,12 @@ dependencies = [
 "http 1.1.0",
 ]

+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2863,14 +2908,14 @@ dependencies = [
 "pprof",
 "regex",
 "routerify",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pemfile 2.1.1",
 "serde",
 "serde_json",
 "serde_path_to_error",
 "thiserror 1.0.69",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-util",
 "tracing",
@@ -3200,12 +3245,12 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "2.0.1"
+version = "2.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
+checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
 dependencies = [
 "equivalent",
- "hashbrown 0.14.5",
+ "hashbrown 0.15.2",
 "serde",
 ]

@@ -3228,7 +3273,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88"
 dependencies = [
 "ahash",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "is-terminal",
 "itoa",
 "log",
@@ -3251,7 +3296,7 @@ dependencies = [
 "crossbeam-utils",
 "dashmap 6.1.0",
 "env_logger",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "itoa",
 "log",
 "num-format",
@@ -3648,7 +3693,7 @@ version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "syn 2.0.100",
@@ -3710,7 +3755,7 @@ dependencies = [
 "procfs",
 "prometheus",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "twox-hash",
 ]

@@ -3798,7 +3843,11 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 name = "neon-shmem"
 version = "0.1.0"
 dependencies = [
+ "criterion",
 "nix 0.30.1",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "rustc-hash 1.1.0",
 "tempfile",
 "thiserror 1.0.69",
 "workspace_hack",
@@ -4112,7 +4161,7 @@ dependencies = [
 "opentelemetry-http",
 "opentelemetry-proto",
 "opentelemetry_sdk",
- "prost 0.13.3",
+ "prost 0.13.5",
 "reqwest",
 "thiserror 1.0.69",
 ]
@@ -4125,8 +4174,8 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
 dependencies = [
 "opentelemetry",
 "opentelemetry_sdk",
- "prost 0.13.3",
- "tonic",
+ "prost 0.13.5",
+ "tonic 0.12.3",
 ]

 [[package]]
@@ -4236,6 +4285,8 @@ name = "pagebench"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "async-trait",
+ "bytes",
 "camino",
 "clap",
 "futures",
@@ -4244,12 +4295,15 @@ dependencies = [
 "humantime-serde",
 "pageserver_api",
 "pageserver_client",
+ "pageserver_page_api",
 "rand 0.8.5",
 "reqwest",
 "serde",
 "serde_json",
 "tokio",
+ "tokio-stream",
 "tokio-util",
+ "tonic 0.13.1",
 "tracing",
 "utils",
 "workspace_hack",
@@ -4305,6 +4359,7 @@ dependencies = [
 "hashlink",
 "hex",
 "hex-literal",
+ "http 1.1.0",
 "http-utils",
 "humantime",
 "humantime-serde",
@@ -4321,6 +4376,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "pageserver_compaction",
+ "pageserver_page_api",
 "pem",
 "pin-project-lite",
 "postgres-protocol",
@@ -4329,6 +4385,7 @@ dependencies = [
 "postgres_connection",
 "postgres_ffi",
 "postgres_initdb",
+ "posthog_client_lite",
 "pprof",
 "pq_proto",
 "procfs",
@@ -4339,7 +4396,7 @@ dependencies = [
 "reqwest",
 "rpds",
 "rstest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "scopeguard",
 "send-future",
 "serde",
@@ -4358,11 +4415,14 @@ dependencies = [
 "tokio-epoll-uring",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
 "toml_edit",
+ "tonic 0.13.1",
+ "tonic-reflection",
+ "tower 0.5.2",
 "tracing",
 "tracing-utils",
 "twox-hash",
@@ -4455,9 +4515,14 @@ dependencies = [
 name = "pageserver_page_api"
 version = "0.1.0"
 dependencies = [
- "prost 0.13.3",
- "tonic",
+ "bytes",
+ "pageserver_api",
+ "postgres_ffi",
+ "prost 0.13.5",
+ "thiserror 1.0.69",
+ "tonic 0.13.1",
 "tonic-build",
+ "utils",
 "workspace_hack",
 ]

@@ -4837,14 +4902,14 @@ dependencies = [
 "bytes",
 "once_cell",
 "pq_proto",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror 1.0.69",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-util",
 "tracing",
 ]
@@ -4898,11 +4963,16 @@ name = "posthog_client_lite"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "arc-swap",
 "reqwest",
 "serde",
 "serde_json",
 "sha2",
 "thiserror 1.0.69",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "tracing-utils",
 "workspace_hack",
 ]

@@ -4951,7 +5021,7 @@ dependencies = [
 "inferno 0.12.0",
 "num",
 "paste",
- "prost 0.13.3",
+ "prost 0.13.5",
 ]

 [[package]]
@@ -5056,12 +5126,12 @@ dependencies = [

 [[package]]
 name = "prost"
-version = "0.13.3"
+version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
+checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
 dependencies = [
 "bytes",
- "prost-derive 0.13.3",
+ "prost-derive 0.13.5",
 ]

 [[package]]
@@ -5071,7 +5141,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
 "bytes",
- "heck",
+ "heck 0.5.0",
 "itertools 0.12.1",
 "log",
 "multimap",
@@ -5092,14 +5162,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
 "bytes",
- "heck",
+ "heck 0.5.0",
 "itertools 0.12.1",
 "log",
 "multimap",
 "once_cell",
 "petgraph",
 "prettyplease",
- "prost 0.13.3",
+ "prost 0.13.5",
 "prost-types 0.13.3",
 "regex",
 "syn 2.0.100",
@@ -5121,9 +5191,9 @@ dependencies = [

 [[package]]
 name = "prost-derive"
-version = "0.13.3"
+version = "0.13.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
+checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d"
 dependencies = [
 "anyhow",
 "itertools 0.12.1",
@@ -5147,7 +5217,7 @@ version = "0.13.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
 dependencies = [
- "prost 0.13.3",
+ "prost 0.13.5",
 ]

 [[package]]
@@ -5195,7 +5265,7 @@ dependencies = [
 "hyper 0.14.30",
 "hyper 1.4.1",
 "hyper-util",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "ipnet",
 "itertools 0.10.5",
 "itoa",
@@ -5217,7 +5287,7 @@ dependencies = [
 "postgres_backend",
 "pq_proto",
 "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
 "rcgen",
 "redis",
 "regex",
@@ -5229,7 +5299,7 @@ dependencies = [
 "rsa",
 "rstest",
 "rustc-hash 1.1.0",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "rustls-pemfile 2.1.1",
 "scopeguard",
@@ -5248,7 +5318,7 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-postgres2",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-tungstenite 0.21.0",
 "tokio-util",
 "tracing",
@@ -5321,6 +5391,12 @@ dependencies = [
 "proc-macro2",
 ]

+[[package]]
+name = "r-efi"
+version = "5.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5345,6 +5421,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5365,6 +5451,16 @@ dependencies = [
 "rand_core 0.6.4",
 ]

+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5383,6 +5479,15 @@ dependencies = [
 "getrandom 0.2.11",
 ]

+[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.3",
+]
+
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -5393,6 +5498,16 @@ dependencies = [
 "rand 0.8.5",
 ]

+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -5472,13 +5587,13 @@ dependencies = [
 "num-bigint",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "ryu",
 "sha1_smol",
 "socket2",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-util",
 "url",
 ]
@@ -5926,15 +6041,15 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.23.18"
+version = "0.23.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c9cc1d47e243d655ace55ed38201c19ae02c148ae56412ab8750e8f0166ab7f"
+checksum = "730944ca083c1c233a75c09f199e973ca499344a2b7ba9e755c457e86fb4a321"
 dependencies = [
 "log",
 "once_cell",
 "ring",
 "rustls-pki-types",
- "rustls-webpki 0.102.8",
+ "rustls-webpki 0.103.3",
 "subtle",
 "zeroize",
 ]
@@ -6023,6 +6138,17 @@ dependencies = [
 "untrusted",
 ]

+[[package]]
+name = "rustls-webpki"
+version = "0.103.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
 [[package]]
 name = "rustversion"
 version = "1.0.12"
@@ -6074,7 +6200,7 @@ dependencies = [
 "regex",
 "remote_storage",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "safekeeper_api",
 "safekeeper_client",
 "scopeguard",
@@ -6091,7 +6217,7 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-tar",
 "tokio-util",
@@ -6263,7 +6389,7 @@ checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -6692,11 +6818,11 @@ dependencies = [
 "metrics",
 "once_cell",
 "parking_lot 0.12.1",
- "prost 0.13.3",
- "rustls 0.23.18",
+ "prost 0.13.5",
+ "rustls 0.23.27",
 "tokio",
- "tokio-rustls 0.26.0",
- "tonic",
+ "tokio-rustls 0.26.2",
+ "tonic 0.13.1",
 "tonic-build",
 "tracing",
 "utils",
@@ -6738,7 +6864,7 @@ dependencies = [
 "regex",
 "reqwest",
 "routerify",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "safekeeper_api",
 "safekeeper_client",
@@ -6753,7 +6879,7 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-util",
 "tracing",
 "utils",
@@ -6791,7 +6917,7 @@ dependencies = [
 "postgres_ffi",
 "remote_storage",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-native-certs 0.8.0",
 "serde",
 "serde_json",
@@ -6868,7 +6994,7 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
 "proc-macro2",
 "quote",
 "rustversion",
@@ -7325,10 +7451,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
 dependencies = [
 "ring",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "tokio",
 "tokio-postgres",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "x509-certificate",
 ]

@@ -7372,12 +7498,11 @@ dependencies = [

 [[package]]
 name = "tokio-rustls"
-version = "0.26.0"
+version = "0.26.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
+checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
 dependencies = [
- "rustls 0.23.18",
- "rustls-pki-types",
+ "rustls 0.23.27",
 "tokio",
 ]

@@ -7475,7 +7600,7 @@ version = "0.22.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38"
 dependencies = [
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "serde",
 "serde_spanned",
 "toml_datetime",
@@ -7494,18 +7619,41 @@ dependencies = [
 "http 1.1.0",
 "http-body 1.0.0",
 "http-body-util",
+ "percent-encoding",
+ "pin-project",
+ "prost 0.13.5",
+ "tokio-stream",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9"
+dependencies = [
+ "async-trait",
+ "axum",
+ "base64 0.22.1",
+ "bytes",
+ "h2 0.4.4",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
 "hyper 1.4.1",
 "hyper-timeout",
 "hyper-util",
 "percent-encoding",
 "pin-project",
- "prost 0.13.3",
+ "prost 0.13.5",
 "rustls-native-certs 0.8.0",
- "rustls-pemfile 2.1.1",
+ "socket2",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
- "tower 0.4.13",
+ "tower 0.5.2",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7513,9 +7661,9 @@ dependencies = [

 [[package]]
 name = "tonic-build"
-version = "0.12.3"
+version = "0.13.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
+checksum = "eac6f67be712d12f0b41328db3137e0d0757645d8904b4cb7d51cd9c2279e847"
 dependencies = [
 "prettyplease",
 "proc-macro2",
@@ -7525,6 +7673,19 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "tonic-reflection"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1"
+dependencies = [
+ "prost 0.13.5",
+ "prost-types 0.13.3",
+ "tokio",
+ "tokio-stream",
+ "tonic 0.13.1",
+]
+
 [[package]]
 name = "tower"
 version = "0.4.13"
@@ -7533,16 +7694,11 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
 "futures-core",
 "futures-util",
- "indexmap 1.9.3",
 "pin-project",
 "pin-project-lite",
- "rand 0.8.5",
- "slab",
 "tokio",
- "tokio-util",
 "tower-layer",
 "tower-service",
- "tracing",
 ]

 [[package]]
@@ -7553,9 +7709,12 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
 dependencies = [
 "futures-core",
 "futures-util",
+ "indexmap 2.9.0",
 "pin-project-lite",
+ "slab",
 "sync_wrapper 1.0.1",
 "tokio",
+ "tokio-util",
 "tower-layer",
 "tower-service",
 "tracing",
@@ -7883,7 +8042,7 @@ dependencies = [
 "base64 0.22.1",
 "log",
 "once_cell",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pki-types",
 "url",
 "webpki-roots",
@@ -8078,7 +8237,7 @@ dependencies = [
 "pageserver_api",
 "postgres_ffi",
 "pprof",
- "prost 0.13.3",
+ "prost 0.13.5",
 "remote_storage",
 "serde",
 "serde_json",
@@ -8134,6 +8293,15 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"

+[[package]]
+name = "wasi"
+version = "0.14.2+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+dependencies = [
+ "wit-bindgen-rt",
+]
+
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8491,6 +8659,15 @@ dependencies = [
 "windows-sys 0.48.0",
 ]

+[[package]]
+name = "wit-bindgen-rt"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
@@ -8498,6 +8675,8 @@ dependencies = [
 "ahash",
 "anstream",
 "anyhow",
+ "axum",
+ "axum-core",
 "base64 0.13.1",
 "base64 0.21.7",
 "base64ct",
@@ -8520,10 +8699,8 @@ dependencies = [
 "fail",
 "form_urlencoded",
 "futures-channel",
- "futures-core",
 "futures-executor",
 "futures-io",
- "futures-task",
 "futures-util",
 "generic-array",
 "getrandom 0.2.11",
@@ -8534,8 +8711,7 @@ dependencies = [
 "hyper 0.14.30",
 "hyper 1.4.1",
 "hyper-util",
- "indexmap 1.9.3",
- "indexmap 2.0.1",
+ "indexmap 2.9.0",
 "itertools 0.12.1",
 "lazy_static",
 "libc",
@@ -8554,19 +8730,18 @@ dependencies = [
 "once_cell",
 "p256 0.13.2",
 "parquet",
- "percent-encoding",
 "prettyplease",
 "proc-macro2",
- "prost 0.13.3",
+ "prost 0.13.5",
 "quote",
 "rand 0.8.5",
 "regex",
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest",
- "rustls 0.23.18",
+ "rustls 0.23.27",
 "rustls-pki-types",
- "rustls-webpki 0.102.8",
+ "rustls-webpki 0.103.3",
 "scopeguard",
 "sec1 0.7.3",
 "serde",
@@ -8584,12 +8759,11 @@ dependencies = [
 "time",
 "time-macros",
 "tokio",
- "tokio-rustls 0.26.0",
+ "tokio-rustls 0.26.2",
 "tokio-stream",
 "tokio-util",
 "toml_edit",
- "tonic",
- "tower 0.4.13",
+ "tower 0.5.2",
 "tracing",
 "tracing-core",
 "tracing-log",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -44,6 +44,7 @@ members = [
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
    "endpoint_storage",
+    "pgxn/neon/communicator",
 ]

 [workspace.package]
@@ -149,7 +150,7 @@ pin-project-lite = "0.2"
 pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
 procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
-prost = "0.13"
+prost = "0.13.5"
 rand = "0.8"
 redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
@@ -199,7 +200,8 @@ tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.8"
 toml_edit = "0.22"
-tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
+tonic = { version = "0.13.1", default-features = false, features = ["channel", "codegen", "prost", "router", "server", "tls-ring", "tls-native-roots"] }
+tonic-reflection = { version = "0.13.1", features = ["server"] }
 tower = { version = "0.5.2", default-features = false }
 tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }

@@ -246,9 +248,11 @@ azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rus
 ## Local libraries
 compute_api = { version = "0.1", path = "./libs/compute_api/" }
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
+desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
@@ -258,29 +262,30 @@ postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
 postgres_initdb = { path = "./libs/postgres_initdb" }
+posthog_client_lite = { version = "0.1", path = "./libs/posthog_client_lite" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
 remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
 safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
 safekeeper_client = { path = "./safekeeper/client" }
-desim = { version = "0.1", path = "./libs/desim" }
 storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
 storage_controller_client = { path = "./storage_controller/client" }
 tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
 tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
-walproposer = { version = "0.1", path = "./libs/walproposer/" }
 wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
+walproposer = { version = "0.1", path = "./libs/walproposer/" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }

 ## Build dependencies
+cbindgen = "0.28.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
 camino-tempfile = "1.0.2"
-tonic-build = "0.12"
+tonic-build = "0.13.1"

 [patch.crates-io]

--- a/Makefile
+++ b/Makefile
@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
 	PG_LDFLAGS = $(LDFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%

 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
+	+@echo "Compiling communicator $*"
+	$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
+
 	+@echo "Compiling neon $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+		LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
+
 	+@echo "Compiling neon_walredo $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -155,7 +155,7 @@ RUN set -e \

 # Keep the version the same as in compute/compute-node.Dockerfile and
 # test_runner/regress/test_compute_metrics.py.
-ENV SQL_EXPORTER_VERSION=0.17.0
+ENV SQL_EXPORTER_VERSION=0.17.3
 RUN curl -fsSL \
    "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
    --output sql_exporter.tar.gz \
@@ -310,13 +310,13 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    . "$HOME/.cargo/env" && \
    cargo --version && rustup --version && \
    rustup component add llvm-tools rustfmt clippy && \
-    cargo install rustfilt            --version ${RUSTFILT_VERSION} && \
-    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} && \
-    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
-    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
-    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
-    cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
-    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} \
+    cargo install rustfilt            --version ${RUSTFILT_VERSION} --locked && \
+    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} --locked && \
+    cargo install cargo-deny          --version ${CARGO_DENY_VERSION} --locked && \
+    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} --locked && \
+    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} --locked && \
+    cargo install cargo-chef          --version ${CARGO_CHEF_VERSION} --locked && \
+    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} --locked \
                                      --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
    rm -rf /home/nonroot/.cargo/git
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -297,6 +297,7 @@ RUN ./autogen.sh && \
    ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make staged-install && \
    cd extensions/postgis && \
    make clean && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -582,6 +583,38 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control

+#########################################################################################
+#
+# Layer "online_advisor-build"
+# compile online_advisor extension
+#
+#########################################################################################
+FROM build-deps AS online_advisor-src
+ARG PG_VERSION
+
+# online_advisor supports all Postgres versions starting from PG14, but prior to PG17 has to be included in shared_preload_libraries
+# last release 1.0 - May 15, 2025
+WORKDIR /ext-src
+RUN case "${PG_VERSION:?}" in \
+    "v17") \
+        ;; \
+    *) \
+        echo "skipping the version of online_advisor for $PG_VERSION" && exit 0 \
+        ;; \
+    esac && \
+    wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
+    echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
+    mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
+
+FROM pg-build AS online_advisor-build
+COPY --from=online_advisor-src /ext-src/ /ext-src/
+WORKDIR /ext-src/
+RUN if [ -d online_advisor-src ]; then \
+        cd online_advisor-src && \
+        make -j $(getconf _NPROCESSORS_ONLN) install && \
+        echo 'trusted = true' >> /usr/local/pgsql/share/extension/online_advisor.control; \
+    fi
+
 #########################################################################################
 #
 # Layer "pg_hashids-build"
@@ -1148,14 +1181,14 @@ RUN cd exts/rag && \
 RUN cd exts/rag_bge_small_en_v15 && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
-        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
+        REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/bge_small_en_v15.onnx \
        cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control

 RUN cd exts/rag_jina_reranker_v1_tiny_en && \
    sed -i 's/pgrx = "0.14.1"/pgrx = { version = "0.14.1", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    ORT_LIB_LOCATION=/ext-src/onnxruntime-src/build/Linux \
-        REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
+        REMOTE_ONNX_URL=http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/pgrag-data/jina_reranker_v1_tiny_en.onnx \
        cargo pgrx install --release --features remote_onnx && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control

@@ -1648,6 +1681,7 @@ COPY --from=pg_jsonschema-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_graphql-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_tiktoken-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=hypopg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=online_advisor-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_hashids-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rum-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pgtap-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1751,17 +1785,17 @@ ARG TARGETARCH
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
        postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
        pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
-        sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
+        sql_exporter_sha256='9a41127a493e8bfebfe692bf78c7ed2872a58a3f961ee534d1b0da9ae584aaab';\
    else\
        postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
        pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
-        sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
+        sql_exporter_sha256='530e6afc77c043497ed965532c4c9dfa873bc2a4f0b3047fad367715c0081d6a';\
    fi\
    && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
     | tar xzf - --strip-components=1 -C.\
    && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
     | tar xzf - --strip-components=1 -C.\
-    && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.0/sql_exporter-0.17.0.linux-${TARGETARCH}.tar.gz\
+    && curl -sL https://github.com/burningalchemist/sql_exporter/releases/download/0.17.3/sql_exporter-0.17.3.linux-${TARGETARCH}.tar.gz\
     | tar xzf - --strip-components=1 -C.\
    && echo "${postgres_exporter_sha256} postgres_exporter" | sha256sum -c -\
    && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
@@ -1809,12 +1843,27 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute

 FROM pg-build AS extension-tests
 ARG PG_VERSION
+# This is required for the PostGIS test
+RUN apt-get update && case $DEBIAN_VERSION in \
+      bullseye) \
+        apt-get install -y libproj19 libgdal28 time; \
+      ;; \
+      bookworm) \
+        apt-get install -y libgdal32 libproj25 time; \
+      ;; \
+      *) \
+        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
+      ;; \
+    esac
+
 COPY docker-compose/ext-src/ /ext-src/

 COPY --from=pg-build /postgres /postgres
-#COPY --from=postgis-src /ext-src/ /ext-src/
+COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src
+COPY --from=postgis-build /sfcgal/* /usr
 COPY --from=plv8-src /ext-src/ /ext-src/
-#COPY --from=h3-pg-src /ext-src/ /ext-src/
+COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src
 COPY --from=postgresql-unit-src /ext-src/ /ext-src/
 COPY --from=pgvector-src /ext-src/ /ext-src/
 COPY --from=pgjwt-src /ext-src/ /ext-src/
@@ -1823,6 +1872,7 @@ COPY --from=pgjwt-src /ext-src/ /ext-src/
 COPY --from=pg_graphql-src /ext-src/ /ext-src/
 #COPY --from=pg_tiktoken-src /ext-src/ /ext-src/
 COPY --from=hypopg-src /ext-src/ /ext-src/
+COPY --from=online_advisor-src /ext-src/ /ext-src/
 COPY --from=pg_hashids-src /ext-src/ /ext-src/
 COPY --from=rum-src /ext-src/ /ext-src/
 COPY --from=pgtap-src /ext-src/ /ext-src/
@@ -1852,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src
 RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch

 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
+RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
 RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
   && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
 ENV PATH=/usr/local/pgsql/bin:$PATH
--- a/compute/manifest.yaml
+++ b/compute/manifest.yaml
@@ -0,0 +1,121 @@
+pg_settings:
+  # Common settings for primaries and replicas of all versions.
+  common:
+    # Check for client disconnection every 1 minute. By default, Postgres will detect the
+    # loss of the connection only at the next interaction with the socket, when it waits
+    # for, receives or sends data, so it will likely waste resources till the end of the
+    # query execution. There should be no drawbacks in setting this for everyone, so enable
+    # it by default. If anyone will complain, we can allow editing it.
+    # https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL
+    client_connection_check_interval: "60000" # 1 minute
+    # --- IO ---
+    effective_io_concurrency: "20"
+    maintenance_io_concurrency: "100"
+    fsync: "off"
+    hot_standby: "off"
+    # We allow users to change this if needed, but by default we
+    # just don't want to see long-lasting idle transactions, as they
+    # prevent activity monitor from suspending projects.
+    idle_in_transaction_session_timeout: "300000" # 5 minutes
+    listen_addresses: "*"
+    # --- LOGGING ---- helps investigations
+    log_connections: "on"
+    log_disconnections: "on"
+    # 1GB, unit is KB
+    log_temp_files: "1048576"
+    # Disable dumping customer data to logs, both to increase data privacy
+    # and to reduce the amount of logs.
+    log_error_verbosity: "terse"
+    log_min_error_statement: "panic"
+    max_connections: "100"
+    # --- WAL ---
+    # - flush lag is the max amount of WAL that has been generated but not yet stored
+    # to disk in the page server. A smaller value means less delay after a pageserver
+    # restart, but if you set it too small you might again need to slow down writes if the
+    # pageserver cannot flush incoming WAL to disk fast enough. This must be larger
+    # than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a
+    # deadlock where the compute node refuses to generate more WAL before the
+    # old WAL has been uploaded to S3, but the pageserver is waiting for more WAL
+    # to be generated before it is uploaded to S3.
+    max_replication_flush_lag: "10GB"
+    max_replication_slots: "10"
+    # Backpressure configuration:
+    # - write lag is the max amount of WAL that has been generated by Postgres but not yet
+    # processed by the page server. Making this smaller reduces the worst case latency
+    # of a GetPage request, if you request a page that was recently modified. On the other
+    # hand, if this is too small, the compute node might need to wait on a write if there is a
+    # hiccup in the network or page server so that the page server has temporarily fallen
+    # behind.
+    #
+    # Previously it was set to 500 MB, but it caused compute being unresponsive under load
+    # https://github.com/neondatabase/neon/issues/2028
+    max_replication_write_lag: "500MB"
+    max_wal_senders: "10"
+    # A Postgres checkpoint is cheap in storage, as it doesn't involve any significant amount
+    # of real I/O. Only the SLRU buffers and some other small files are flushed to disk.
+    # However, as long as we have full_page_writes=on, page updates after a checkpoint
+    # include full-page images which bloat the WAL. So we may want to bump max_wal_size to
+    # reduce the WAL bloat, but at the same time it will increase pg_wal directory size on
+    # compute and can lead to out-of-disk errors on k8s nodes.
+    max_wal_size: "1024"
+    wal_keep_size: "0"
+    wal_level: "replica"
+    # Reduce amount of WAL generated by default.
+    wal_log_hints: "off"
+    # - without wal_sender_timeout set we don't get feedback messages,
+    # required for backpressure.
+    wal_sender_timeout: "10000"
+    # We have some experimental extensions, which we don't want users to install unintentionally.
+    # To install them, users would need to set the `neon.allow_unstable_extensions` setting.
+    # There are two of them currently:
+    # - `pgrag` - https://github.com/neondatabase-labs/pgrag - extension is actually called just `rag`,
+    #                                                          and two dependencies:
+    #                                                          - `rag_bge_small_en_v15`
+    #                                                          - `rag_jina_reranker_v1_tiny_en`
+    # - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/  
+    neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon"
+    neon.protocol_version: "3"
+    password_encryption: "scram-sha-256"
+    # This is important to prevent Postgres from trying to perform
+    # a local WAL redo after backend crash. It should exit and let
+    # the systemd or k8s to do a fresh startup with compute_ctl.
+    restart_after_crash: "off"
+    # By default 3. We have the following persistent connections in the VM:
+    # * compute_activity_monitor (from compute_ctl)
+    # * postgres-exporter (metrics collector; it has 2 connections)
+    # * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling])
+    # * vm-monitor (to query & change file cache size)
+    # i.e. total of 6. Let's reserve 7, so there's still at least one left over.
+    superuser_reserved_connections: "7"
+    synchronous_standby_names: "walproposer"
+
+  replica:
+    hot_standby: "on"
+
+  per_version:
+    17:
+      common:
+        # PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some
+        # size. It still has some issues with readahead, though, so we default to disabled/
+        # "no combining of IOs" to make sure we get the maximum prefetch depth.
+        # See also: https://github.com/neondatabase/neon/pull/9860
+        io_combine_limit: "1"
+      replica:
+        # prefetching of blocks referenced in WAL doesn't make sense for us
+        # Neon hot standby ignores pages that are not in the shared_buffers
+        recovery_prefetch: "off"
+    16:
+      common:
+      replica:
+        # prefetching of blocks referenced in WAL doesn't make sense for us
+        # Neon hot standby ignores pages that are not in the shared_buffers
+        recovery_prefetch: "off"
+    15:
+      common:
+      replica:
+        # prefetching of blocks referenced in WAL doesn't make sense for us
+        # Neon hot standby ignores pages that are not in the shared_buffers
+        recovery_prefetch: "off"
+    14:
+      common:
+      replica:
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -40,7 +40,7 @@ use std::sync::mpsc;
 use std::thread;
 use std::time::Duration;

-use anyhow::{Context, Result};
+use anyhow::{Context, Result, bail};
 use clap::Parser;
 use compute_api::responses::ComputeConfig;
 use compute_tools::compute::{
@@ -57,31 +57,15 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

-// Compatibility hack: if the control plane specified any remote-ext-config
-// use the default value for extension storage proxy gateway.
-// Remove this once the control plane is updated to pass the gateway URL
-fn parse_remote_ext_base_url(arg: &str) -> Result<String> {
-    const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str =
-        "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local";
-
-    Ok(if arg.starts_with("http") {
-        arg
-    } else {
-        FALLBACK_PG_EXT_GATEWAY_BASE_URL
-    }
-    .to_owned())
-}
-
-#[derive(Parser)]
+#[derive(Debug, Parser)]
 #[command(rename_all = "kebab-case")]
 struct Cli {
    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
    pub pgbin: String,

    /// The base URL for the remote extension storage proxy gateway.
-    /// Should be in the form of `http(s)://<gateway-hostname>[:<port>]`.
-    #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")]
-    pub remote_ext_base_url: Option<String>,
+    #[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)]
+    pub remote_ext_base_url: Option<Url>,

    /// The port to bind the external listening HTTP server to. Clients running
    /// outside the compute will talk to the compute through this port. Keep
@@ -136,6 +120,29 @@ struct Cli {
        requires = "compute-id"
    )]
    pub control_plane_uri: Option<String>,
+
+    /// Interval in seconds for collecting installed extensions statistics
+    #[arg(long, default_value = "3600")]
+    pub installed_extensions_collection_interval: u64,
+}
+
+impl Cli {
+    /// Parse and sanity-check the remote extensions base URL: normalize it to
+    /// end in exactly one `/` and reject URLs carrying query parameters.
+    fn parse_remote_ext_base_url(value: &str) -> Result<Url> {
+        // Remove extra trailing slashes, and add exactly one. We use
+        // Url::join() later when downloading remote extensions, and without a
+        // trailing slash a base like http://example.com/pg-ext-s3-gateway
+        // joined with "xyz" yields http://example.com/xyz (last segment lost).
+        let value = value.trim_end_matches('/').to_owned() + "/";
+        let url = Url::parse(&value)?;
+
+        if url.query_pairs().count() != 0 {
+            bail!("parameters detected in remote extensions base URL")
+        }
+
+        Ok(url)
+    }
 }

 fn main() -> Result<()> {
@@ -179,6 +186,7 @@ fn main() -> Result<()> {
            cgroup: cli.cgroup,
            #[cfg(target_os = "linux")]
            vm_monitor_addr: cli.vm_monitor_addr,
+            installed_extensions_collection_interval: cli.installed_extensions_collection_interval,
        },
        config,
    )?;
@@ -263,7 +271,8 @@ fn handle_exit_signal(sig: i32) {

 #[cfg(test)]
 mod test {
-    use clap::CommandFactory;
+    use clap::{CommandFactory, Parser};
+    use url::Url;

    use super::Cli;

@@ -273,16 +282,41 @@ mod test {
    }

    #[test]
-    fn parse_pg_ext_gateway_base_url() {
-        let arg = "http://pg-ext-s3-gateway2";
-        let result = super::parse_remote_ext_base_url(arg).unwrap();
-        assert_eq!(result, arg);
-
-        let arg = "pg-ext-s3-gateway";
-        let result = super::parse_remote_ext_base_url(arg).unwrap();
+    fn verify_remote_ext_base_url() {
+        let cli = Cli::parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--remote-ext-base-url",
+            "https://example.com/subpath",
+        ]);
        assert_eq!(
-            result,
-            "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"
+            cli.remote_ext_base_url.unwrap(),
+            Url::parse("https://example.com/subpath/").unwrap()
        );
+
+        let cli = Cli::parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--remote-ext-base-url",
+            "https://example.com//",
+        ]);
+        assert_eq!(
+            cli.remote_ext_base_url.unwrap(),
+            Url::parse("https://example.com").unwrap()
+        );
+
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--remote-ext-base-url",
+            "https://example.com?hello=world",
+        ])
+        .expect_err("URL parameters are not allowed");
    }
 }
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -339,6 +339,8 @@ async fn run_dump_restore(
    destination_connstring: String,
 ) -> Result<(), anyhow::Error> {
    let dumpdir = workdir.join("dumpdir");
+    let num_jobs = num_cpus::get().to_string();
+    info!("using {num_jobs} jobs for dump/restore");

    let common_args = [
        // schema mapping (prob suffices to specify them on one side)
@@ -354,7 +356,7 @@ async fn run_dump_restore(
        "directory".to_string(),
        // concurrency
        "--jobs".to_string(),
-        num_cpus::get().to_string(),
+        num_jobs,
        // progress updates
        "--verbose".to_string(),
    ];
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
    ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState,
+    LfcPrewarmState, TlsConfig,
 };
 use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
@@ -31,6 +31,7 @@ use std::time::{Duration, Instant};
 use std::{env, fs};
 use tokio::spawn;
 use tracing::{Instrument, debug, error, info, instrument, warn};
+use url::Url;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
@@ -96,7 +97,10 @@ pub struct ComputeNodeParams {
    pub internal_http_port: u16,

    /// the address of extension storage proxy gateway
-    pub remote_ext_base_url: Option<String>,
+    pub remote_ext_base_url: Option<Url>,
+
+    /// Interval for installed extensions collection
+    pub installed_extensions_collection_interval: u64,
 }

 /// Compute node info shared across several `compute_ctl` threads.
@@ -392,7 +396,7 @@ impl ComputeNode {
        // because QEMU will already have its memory allocated from the host, and
        // the necessary binaries will already be cached.
        if cli_spec.is_none() {
-            this.prewarm_postgres()?;
+            this.prewarm_postgres_vm_memory()?;
        }

        // Set the up metric with Empty status before starting the HTTP server.
@@ -599,6 +603,8 @@ impl ComputeNode {
            });
        }

+        let tls_config = self.tls_config(&pspec.spec);
+
        // If there are any remote extensions in shared_preload_libraries, start downloading them
        if pspec.spec.remote_extensions.is_some() {
            let (this, spec) = (self.clone(), pspec.spec.clone());
@@ -655,7 +661,7 @@ impl ComputeNode {
            info!("tuning pgbouncer");

            let pgbouncer_settings = pgbouncer_settings.clone();
-            let tls_config = self.compute_ctl_config.tls.clone();
+            let tls_config = tls_config.clone();

            // Spawn a background task to do the tuning,
            // so that we don't block the main thread that starts Postgres.
@@ -674,7 +680,10 @@ impl ComputeNode {

            // Spawn a background task to do the configuration,
            // so that we don't block the main thread that starts Postgres.
-            let local_proxy = local_proxy.clone();
+
+            let mut local_proxy = local_proxy.clone();
+            local_proxy.tls = tls_config.clone();
+
            let _handle = tokio::spawn(async move {
                if let Err(err) = local_proxy::configure(&local_proxy) {
                    error!("error while configuring local_proxy: {err:?}");
@@ -695,25 +704,18 @@ impl ComputeNode {
                let log_directory_path = Path::new(&self.params.pgdata).join("log");
                let log_directory_path = log_directory_path.to_string_lossy().to_string();

-                // Add project_id,endpoint_id tag to identify the logs.
+                // Add project_id,endpoint_id to identify the logs.
                //
                // These ids are passed from cplane,
-                // for backwards compatibility (old computes that don't have them),
-                // we set them to None.
-                // TODO: Clean up this code when all computes have them.
-                let tag: Option<String> = match (
-                    pspec.spec.project_id.as_deref(),
-                    pspec.spec.endpoint_id.as_deref(),
-                ) {
-                    (Some(project_id), Some(endpoint_id)) => {
-                        Some(format!("{project_id}/{endpoint_id}"))
-                    }
-                    (Some(project_id), None) => Some(format!("{project_id}/None")),
-                    (None, Some(endpoint_id)) => Some(format!("None,{endpoint_id}")),
-                    (None, None) => None,
-                };
+                let endpoint_id = pspec.spec.endpoint_id.as_deref().unwrap_or("");
+                let project_id = pspec.spec.project_id.as_deref().unwrap_or("");

-                configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;
+                configure_audit_rsyslog(
+                    log_directory_path.clone(),
+                    endpoint_id,
+                    project_id,
+                    &remote_endpoint,
+                )?;

                // Launch a background task to clean up the audit logs
                launch_pgaudit_gc(log_directory_path);
@@ -749,17 +751,7 @@ impl ComputeNode {

            let conf = self.get_tokio_conn_conf(None);
            tokio::task::spawn(async {
-                let res = get_installed_extensions(conf).await;
-                match res {
-                    Ok(extensions) => {
-                        info!(
-                            "[NEON_EXT_STAT] {}",
-                            serde_json::to_string(&extensions)
-                                .expect("failed to serialize extensions list")
-                        );
-                    }
-                    Err(err) => error!("could not get installed extensions: {err:?}"),
-                }
+                let _ = installed_extensions(conf).await;
            });
        }

@@ -789,7 +781,10 @@ impl ComputeNode {
        // Log metrics so that we can search for slow operations in logs
        info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");

-        if pspec.spec.prewarm_lfc_on_startup {
+        // Spawn the extension stats background task
+        self.spawn_extension_stats_task();
+
+        if pspec.spec.autoprewarm {
            self.prewarm_lfc();
        }
        Ok(())
@@ -1215,13 +1210,15 @@ impl ComputeNode {
        let spec = &pspec.spec;
        let pgdata_path = Path::new(&self.params.pgdata);

+        let tls_config = self.tls_config(&pspec.spec);
+
        // Remove/create an empty pgdata directory and put configuration there.
        self.create_pgdata()?;
        config::write_postgres_conf(
            pgdata_path,
            &pspec.spec,
            self.params.internal_http_port,
-            &self.compute_ctl_config.tls,
+            tls_config,
        )?;

        // Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1317,8 +1314,8 @@ impl ComputeNode {
    }

    /// Start and stop a postgres process to warm up the VM for startup.
-    pub fn prewarm_postgres(&self) -> Result<()> {
-        info!("prewarming");
+    pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
+        info!("prewarming VM memory");

        // Create pgdata
        let pgdata = &format!("{}.warmup", self.params.pgdata);
@@ -1360,7 +1357,7 @@ impl ComputeNode {
        kill(pm_pid, Signal::SIGQUIT)?;
        info!("sent SIGQUIT signal");
        pg.wait()?;
-        info!("done prewarming");
+        info!("done prewarming vm memory");

        // clean up
        let _ok = fs::remove_dir_all(pgdata);
@@ -1546,14 +1543,22 @@ impl ComputeNode {
                .clone(),
        );

+        let mut tls_config = None::<TlsConfig>;
+        if spec.features.contains(&ComputeFeature::TlsExperimental) {
+            tls_config = self.compute_ctl_config.tls.clone();
+        }
+
        let max_concurrent_connections = self.max_service_connections(compute_state, &spec);

        // Merge-apply spec & changes to PostgreSQL state.
        self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;

        if let Some(local_proxy) = &spec.clone().local_proxy_config {
+            let mut local_proxy = local_proxy.clone();
+            local_proxy.tls = tls_config.clone();
+
            info!("configuring local_proxy");
-            local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
+            local_proxy::configure(&local_proxy).context("apply_config local_proxy")?;
        }

        // Run migrations separately to not hold up cold starts
@@ -1605,11 +1610,13 @@ impl ComputeNode {
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

+        let tls_config = self.tls_config(&spec);
+
        if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
            info!("tuning pgbouncer");

            let pgbouncer_settings = pgbouncer_settings.clone();
-            let tls_config = self.compute_ctl_config.tls.clone();
+            let tls_config = tls_config.clone();

            // Spawn a background task to do the tuning,
            // so that we don't block the main thread that starts Postgres.
@@ -1627,7 +1634,7 @@ impl ComputeNode {
            // Spawn a background task to do the configuration,
            // so that we don't block the main thread that starts Postgres.
            let mut local_proxy = local_proxy.clone();
-            local_proxy.tls = self.compute_ctl_config.tls.clone();
+            local_proxy.tls = tls_config.clone();
            tokio::spawn(async move {
                if let Err(err) = local_proxy::configure(&local_proxy) {
                    error!("error while configuring local_proxy: {err:?}");
@@ -1645,7 +1652,7 @@ impl ComputeNode {
            pgdata_path,
            &spec,
            self.params.internal_http_port,
-            &self.compute_ctl_config.tls,
+            tls_config,
        )?;

        if !spec.skip_pg_catalog_updates {
@@ -1765,6 +1772,14 @@ impl ComputeNode {
        }
    }

+    pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
+        if spec.features.contains(&ComputeFeature::TlsExperimental) {
+            &self.compute_ctl_config.tls
+        } else {
+            &None::<TlsConfig>
+        }
+    }
+
    /// Update the `last_active` in the shared state, but ensure that it's a more recent one.
    pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
        let mut state = self.state.lock().unwrap();
@@ -2199,6 +2214,41 @@ LIMIT 100",
            info!("Pageserver config changed");
        }
    }
+
+    pub fn spawn_extension_stats_task(&self) {
+        let conf = self.tokio_conn_conf.clone();
+        let installed_extensions_collection_interval =
+            self.params.installed_extensions_collection_interval;
+        tokio::spawn(async move {
+            // An initial sleep is added to ensure that two collections don't happen at the same time.
+            // The first collection happens during compute startup.
+            tokio::time::sleep(tokio::time::Duration::from_secs(
+                installed_extensions_collection_interval,
+            ))
+            .await;
+            let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
+                installed_extensions_collection_interval,
+            ));
+            loop {
+                interval.tick().await;
+                let _ = installed_extensions(conf.clone()).await;
+            }
+        });
+    }
+}
+
+pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
+    let res = get_installed_extensions(conf).await;
+    match res {
+        Ok(extensions) => {
+            info!(
+                "[NEON_EXT_STAT] {}",
+                serde_json::to_string(&extensions).expect("failed to serialize extensions list")
+            );
+        }
+        Err(err) => error!("could not get installed extensions: {err:?}"),
+    }
+    Ok(())
 }

 pub fn forward_termination_signal() {
--- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf
+++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf
@@ -2,10 +2,24 @@
 module(load="imfile")

 # Input configuration for log files in the specified directory
-# Replace {log_directory} with the directory containing the log files
-input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
+# The messages can be multiline. The start of the message is a timestamp
+# in "%Y-%m-%d %H:%M:%S.%3N GMT" (so timezone hardcoded).
+# Replace log_directory with the directory containing the log files
+input(type="imfile" File="{log_directory}/*.log"
+  Tag="pgaudit_log" Severity="info" Facility="local5"
+  startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}[.][[:digit:]]{{3}} GMT,")
+
 # the directory to store rsyslog state files
 global(workDirectory="/var/log/rsyslog")

-# Forward logs to remote syslog server
-*.* @@{remote_endpoint}
+# Construct json, endpoint_id and project_id as additional metadata
+set $.json_log!endpoint_id = "{endpoint_id}";
+set $.json_log!project_id = "{project_id}";
+set $.json_log!msg = $msg;
+
+# Template suitable for rfc5424 syslog format
+template(name="PgAuditLog" type="string"
+    string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%")
+
+# Forward to remote syslog receiver (@@<hostname>:<port>;format)
+local5.info @@{remote_endpoint};PgAuditLog
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -83,6 +83,7 @@ use reqwest::StatusCode;
 use tar::Archive;
 use tracing::info;
 use tracing::log::warn;
+use url::Url;
 use zstd::stream::read::Decoder;

 use crate::metrics::{REMOTE_EXT_REQUESTS_TOTAL, UNKNOWN_HTTP_STATUS};
@@ -158,7 +159,7 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion {
 pub async fn download_extension(
    ext_name: &str,
    ext_path: &RemotePath,
-    remote_ext_base_url: &str,
+    remote_ext_base_url: &Url,
    pgbin: &str,
 ) -> Result<u64> {
    info!("Download extension {:?} from {:?}", ext_name, ext_path);
@@ -270,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 }

 // Do request to extension storage proxy, e.g.,
-// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst
 // using HTTP GET and return the response body as bytes.
-async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
-    let uri = format!("{}/{}", remote_ext_base_url, ext_path);
+async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result<Bytes> {
+    let uri = remote_ext_base_url.join(ext_path).with_context(|| {
+        format!(
+            "failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}"
+        )
+    })?;
    let filename = Path::new(ext_path)
        .file_name()
        .unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
@@ -283,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re

    info!("Downloading extension file '{}' from uri {}", filename, uri);

-    match do_extension_server_request(&uri).await {
+    match do_extension_server_request(uri).await {
        Ok(resp) => {
            info!("Successfully downloaded remote extension data {}", ext_path);
            REMOTE_EXT_REQUESTS_TOTAL
@@ -302,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re

 // Do a single remote extensions server request.
 // Return result or (error message + stringified status code) in case of any failures.
-async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
+async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
    let resp = reqwest::get(uri).await.map_err(|e| {
        (
            format!(
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -48,11 +48,9 @@ impl JsonResponse {

    /// Create an error response related to the compute being in an invalid state
    pub(self) fn invalid_status(status: ComputeStatus) -> Response {
-        Self::create_response(
+        Self::error(
            StatusCode::PRECONDITION_FAILED,
-            &GenericAPIError {
-                error: format!("invalid compute status: {status}"),
-            },
+            format!("invalid compute status: {status}"),
        )
    }
 }
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
    State(compute): State<Arc<ComputeNode>>,
    request: Json<ConfigurationRequest>,
 ) -> Response {
-    let pspec = match ParsedSpec::try_from(request.spec.clone()) {
+    let pspec = match ParsedSpec::try_from(request.0.spec) {
        Ok(p) => p,
        Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
    };
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};

 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

+/// Struct to store runtime state of the compute monitor thread.
+/// In theory, this could be a part of `ComputeNode`, but i)
+/// this state is expected to be accessed only by a single thread,
+/// so we don't need to care about locking; ii) `ComputeNode` is
+/// already quite big. Thus, it seems to be a good idea to keep
+/// all the activity/health monitoring parts here.
 struct ComputeMonitor {
    compute: Arc<ComputeNode>,

@@ -70,12 +76,36 @@ impl ComputeMonitor {
        )
    }

+    /// Check if compute is in some terminal or soon-to-be-terminal
+    /// state, then return `true`, signalling the caller that it
+    /// should exit gracefully. Otherwise, return `false`.
+    fn check_interrupts(&mut self) -> bool {
+        let compute_status = self.compute.get_status();
+        if matches!(
+            compute_status,
+            ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
+        ) {
+            info!(
+                "compute is in {} status, stopping compute monitor",
+                compute_status
+            );
+            return true;
+        }
+
+        false
+    }
+
    /// Spin in a loop and figure out the last activity time in the Postgres.
-    /// Then update it in the shared state. This function never errors out.
+    /// Then update it in the shared state. This function currently never
+    /// errors out explicitly, but there is a graceful termination path.
+    /// Every time we receive an error trying to check Postgres, we call
+    /// [`ComputeMonitor::check_interrupts()`], because the compute may
+    /// already be terminating; in that case we exit gracefully instead
+    /// of filling the log with spurious errors.
    /// NB: the only expected panic is at `Mutex` unwrap(), all other errors
    /// should be handled gracefully.
    #[instrument(skip_all)]
-    pub fn run(&mut self) {
+    pub fn run(&mut self) -> anyhow::Result<()> {
        // Suppose that `connstr` doesn't change
        let connstr = self.compute.params.connstr.clone();
        let conf = self
@@ -93,6 +123,10 @@ impl ComputeMonitor {
        info!("starting compute monitor for {}", connstr);

        loop {
+            if self.check_interrupts() {
+                break;
+            }
+
            match &mut client {
                Ok(cli) => {
                    if cli.is_closed() {
@@ -100,6 +134,10 @@ impl ComputeMonitor {
                            downtime_info = self.downtime_info(),
                            "connection to Postgres is closed, trying to reconnect"
                        );
+                        if self.check_interrupts() {
+                            break;
+                        }
+
                        self.report_down();

                        // Connection is closed, reconnect and try again.
@@ -111,15 +149,19 @@ impl ComputeMonitor {
                                self.compute.update_last_active(self.last_active);
                            }
                            Err(e) => {
+                                error!(
+                                    downtime_info = self.downtime_info(),
+                                    "could not check Postgres: {}", e
+                                );
+                                if self.check_interrupts() {
+                                    break;
+                                }
+
                                // Although we have many places where we can return errors in `check()`,
                                // normally it shouldn't happen. I.e., we will likely return error if
                                // connection got broken, query timed out, Postgres returned invalid data, etc.
                                // In all such cases it's suspicious, so let's report this as downtime.
                                self.report_down();
-                                error!(
-                                    downtime_info = self.downtime_info(),
-                                    "could not check Postgres: {}", e
-                                );

                                // Reconnect to Postgres just in case. During tests, I noticed
                                // that queries in `check()` can fail with `connection closed`,
@@ -136,6 +178,10 @@ impl ComputeMonitor {
                        downtime_info = self.downtime_info(),
                        "could not connect to Postgres: {}, retrying", e
                    );
+                    if self.check_interrupts() {
+                        break;
+                    }
+
                    self.report_down();

                    // Establish a new connection and try again.
@@ -147,6 +193,9 @@ impl ComputeMonitor {
            self.last_checked = Utc::now();
            thread::sleep(MONITOR_CHECK_INTERVAL);
        }
+
+        // Graceful termination path
+        Ok(())
    }

    #[instrument(skip_all)]
@@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
        .spawn(move || {
            let span = span!(Level::INFO, "compute_monitor");
            let _enter = span.enter();
-            monitor.run();
+            match monitor.run() {
+                Ok(_) => info!("compute monitor thread terminated gracefully"),
+                Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
+            }
        })
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -28,20 +28,37 @@ fn get_rsyslog_pid() -> Option<String> {
 }

 fn wait_for_rsyslog_pid() -> Result<String, anyhow::Error> {
-    for attempt in 1..=50 {
+    const MAX_WAIT: Duration = Duration::from_secs(5);
+    const INITIAL_SLEEP: Duration = Duration::from_millis(2);
+
+    let mut sleep_duration = INITIAL_SLEEP;
+    let start = std::time::Instant::now();
+    let mut attempts = 1;
+
+    for attempt in 1.. {
+        attempts = attempt;
        match get_rsyslog_pid() {
            Some(pid) => return Ok(pid),
            None => {
+                if start.elapsed() >= MAX_WAIT {
+                    break;
+                }
                info!(
-                    "rsyslogd is not running, attempt {}/50. Waiting...",
-                    attempt
+                    "rsyslogd is not running, attempt {}. Sleeping for {} ms",
+                    attempt,
+                    sleep_duration.as_millis()
                );
-                std::thread::sleep(std::time::Duration::from_millis(2));
+                std::thread::sleep(sleep_duration);
+                sleep_duration *= 2;
            }
        }
    }

-    Err(anyhow::anyhow!("rsyslogd did not start after 50 attempts"))
+    Err(anyhow::anyhow!(
+        "rsyslogd is not running after waiting for {} seconds and {} attempts",
+        start.elapsed().as_secs(),
+        attempts
+    ))
 }

 // Restart rsyslogd to apply the new configuration.
@@ -67,13 +84,15 @@ fn restart_rsyslog() -> Result<()> {

 pub fn configure_audit_rsyslog(
    log_directory: String,
-    tag: Option<String>,
+    endpoint_id: &str,
+    project_id: &str,
    remote_endpoint: &str,
 ) -> Result<()> {
    let config_content: String = format!(
        include_str!("config_template/compute_audit_rsyslog_template.conf"),
        log_directory = log_directory,
-        tag = tag.unwrap_or("".to_string()),
+        endpoint_id = endpoint_id,
+        project_id = project_id,
        remote_endpoint = remote_endpoint
    );

--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -30,7 +30,7 @@ mod pg_helpers_tests {
            r#"fsync = off
 wal_level = logical
 hot_standby = on
-prewarm_lfc_on_startup = off
+autoprewarm = off
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
 log_connections = on
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -2,8 +2,10 @@
 [pageserver]
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
+listen_grpc_addr = '127.0.0.1:51051'
 pg_auth_type = 'Trust'
 http_auth_type = 'Trust'
+grpc_auth_type = 'Trust'

 [[safekeepers]]
 id = 1
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -4,8 +4,10 @@
 id=1
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
+listen_grpc_addr = '127.0.0.1:51051'
 pg_auth_type = 'Trust'
 http_auth_type = 'Trust'
+grpc_auth_type = 'Trust'

 [[safekeepers]]
 id = 1
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -32,6 +32,7 @@ use control_plane::storage_controller::{
 };
 use nix::fcntl::{Flock, FlockArg};
 use pageserver_api::config::{
+    DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT,
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
@@ -1007,13 +1008,16 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                    let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
                    let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
                    let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
+                    let grpc_port = DEFAULT_PAGESERVER_GRPC_PORT + i;
                    NeonLocalInitPageserverConf {
                        id: pageserver_id,
                        listen_pg_addr: format!("127.0.0.1:{pg_port}"),
                        listen_http_addr: format!("127.0.0.1:{http_port}"),
                        listen_https_addr: None,
+                        listen_grpc_addr: Some(format!("127.0.0.1:{grpc_port}")),
                        pg_auth_type: AuthType::Trust,
                        http_auth_type: AuthType::Trust,
+                        grpc_auth_type: AuthType::Trust,
                        other: Default::default(),
                        // Typical developer machines use disks with slow fsync, and we don't care
                        // about data integrity: disable disk syncs.
@@ -1275,6 +1279,7 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re
                mode: pageserver_api::models::TimelineCreateRequestMode::Branch {
                    ancestor_timeline_id,
                    ancestor_start_lsn: start_lsn,
+                    read_only: false,
                    pg_version: None,
                },
            };
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -747,7 +747,7 @@ impl Endpoint {
                logs_export_host: None::<String>,
                endpoint_storage_addr: Some(endpoint_storage_addr),
                endpoint_storage_token: Some(endpoint_storage_token),
-                prewarm_lfc_on_startup: false,
+                autoprewarm: false,
            };

            // this strange code is needed to support respec() in tests
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -278,8 +278,10 @@ pub struct PageServerConf {
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    pub listen_https_addr: Option<String>,
+    pub listen_grpc_addr: Option<String>,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
+    pub grpc_auth_type: AuthType,
    pub no_sync: bool,
 }

@@ -290,8 +292,10 @@ impl Default for PageServerConf {
            listen_pg_addr: String::new(),
            listen_http_addr: String::new(),
            listen_https_addr: None,
+            listen_grpc_addr: None,
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
+            grpc_auth_type: AuthType::Trust,
            no_sync: false,
        }
    }
@@ -306,8 +310,10 @@ pub struct NeonLocalInitPageserverConf {
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    pub listen_https_addr: Option<String>,
+    pub listen_grpc_addr: Option<String>,
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
+    pub grpc_auth_type: AuthType,
    #[serde(default, skip_serializing_if = "std::ops::Not::not")]
    pub no_sync: bool,
    #[serde(flatten)]
@@ -321,8 +327,10 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
            listen_pg_addr,
            listen_http_addr,
            listen_https_addr,
+            listen_grpc_addr,
            pg_auth_type,
            http_auth_type,
+            grpc_auth_type,
            no_sync,
            other: _,
        } = conf;
@@ -331,7 +339,9 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
            listen_pg_addr: listen_pg_addr.clone(),
            listen_http_addr: listen_http_addr.clone(),
            listen_https_addr: listen_https_addr.clone(),
+            listen_grpc_addr: listen_grpc_addr.clone(),
            pg_auth_type: *pg_auth_type,
+            grpc_auth_type: *grpc_auth_type,
            http_auth_type: *http_auth_type,
            no_sync: *no_sync,
        }
@@ -707,8 +717,10 @@ impl LocalEnv {
                    listen_pg_addr: String,
                    listen_http_addr: String,
                    listen_https_addr: Option<String>,
+                    listen_grpc_addr: Option<String>,
                    pg_auth_type: AuthType,
                    http_auth_type: AuthType,
+                    grpc_auth_type: AuthType,
                    #[serde(default)]
                    no_sync: bool,
                }
@@ -732,8 +744,10 @@ impl LocalEnv {
                    listen_pg_addr,
                    listen_http_addr,
                    listen_https_addr,
+                    listen_grpc_addr,
                    pg_auth_type,
                    http_auth_type,
+                    grpc_auth_type,
                    no_sync,
                } = config_toml;
                let IdentityTomlSubset {
@@ -750,8 +764,10 @@ impl LocalEnv {
                    listen_pg_addr,
                    listen_http_addr,
                    listen_https_addr,
+                    listen_grpc_addr,
                    pg_auth_type,
                    http_auth_type,
+                    grpc_auth_type,
                    no_sync,
                };
                pageservers.push(conf);
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -129,7 +129,9 @@ impl PageServerNode {
            ));
        }

-        if conf.http_auth_type != AuthType::Trust || conf.pg_auth_type != AuthType::Trust {
+        if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type]
+            .contains(&AuthType::NeonJWT)
+        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -511,11 +513,6 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'timeline_offloading' as bool")?,
-            wal_receiver_protocol_override: settings
-                .remove("wal_receiver_protocol_override")
-                .map(serde_json::from_str)
-                .transpose()
-                .context("parse `wal_receiver_protocol_override` from json")?,
            rel_size_v2_enabled: settings
                .remove("rel_size_v2_enabled")
                .map(|x| x.parse::<bool>())
--- a/docker-compose/compute_wrapper/Dockerfile
+++ b/docker-compose/compute_wrapper/Dockerfile
@@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
                       jq   \
                       netcat-openbsd
 #This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw
+RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src

 USER postgres
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -1,18 +1,18 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -eux

 # Generate a random tenant or timeline ID
 #
 # Takes a variable name as argument. The result is stored in that variable.
 generate_id() {
-    local -n resvar=$1
-    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
+    local -n resvar=${1}
+    printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
 }

 PG_VERSION=${PG_VERSION:-14}

-CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
-CONFIG_FILE=/tmp/config.json
+readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
+readonly CONFIG_FILE=/tmp/config.json

 # Test that the first library path that the dynamic loader looks in is the path
 # that we use for custom compiled software
@@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \
    | grep --invert-match ^$'\t' \
    | cut --delimiter=: --fields=1 \
    | head --lines=1)"
-test "$first_path" == '/usr/local/lib' || true # Remove the || true in a follow-up PR. Needed for backwards compat.
+test "${first_path}" = '/usr/local/lib'

 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
-     sleep 1;
+     sleep 1
 done
 echo "Page server is ready."

-cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
+cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"

- if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
+ if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
   tenant_id=${TENANT_ID}
   timeline_id=${TIMELINE_ID}
 else
@@ -41,7 +41,7 @@ else
       "http://pageserver:9898/v1/tenant"
  )
  tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
-  if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
+  if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
    echo "Create a tenant"
    generate_id tenant_id
    PARAMS=(
@@ -51,7 +51,7 @@ else
        "http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
    )
    result=$(curl "${PARAMS[@]}")
-    echo $result | jq .
+    printf '%s\n' "${result}" | jq .
  fi

  echo "Check if a timeline present"
@@ -61,7 +61,7 @@ else
       "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
  )
  timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
-  if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
+  if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
    generate_id timeline_id
    PARAMS=(
        -sbf
@@ -71,7 +71,7 @@ else
        "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
    )
    result=$(curl "${PARAMS[@]}")
-    echo $result | jq .
+    printf '%s\n' "${result}" | jq .
  fi
 fi

@@ -82,10 +82,10 @@ else
 fi
 echo "Adding pgx_ulid"
 shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
-sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
+sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE}
 echo "Overwrite tenant id and timeline id in spec file"
-sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
+sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE}
+sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE}

 cat ${CONFIG_FILE}

@@ -93,5 +93,5 @@ echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
-     --compute-id "compute-$RANDOM"                          \
-     --config "$CONFIG_FILE"
+     --compute-id "compute-${RANDOM}"                          \
+     --config "${CONFIG_FILE}"
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -186,13 +186,14 @@ services:

  neon-test-extensions:
    profiles: ["test-extensions"]
-    image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
+    image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
    environment:
-      - PGPASSWORD=cloud_admin
+      - PGUSER=${PGUSER:-cloud_admin}
+      - PGPASSWORD=${PGPASSWORD:-cloud_admin}
    entrypoint:
      - "/bin/bash"
      - "-c"
    command:
-      - sleep 1800
+      - sleep 3600
    depends_on:
      - compute
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
        echo Adding dummy config
        docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+        # Prepare for the PostGIS test; a dedicated variable is used because mktemp itself reads TMPDIR, which must never point at a removed directory
+        docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
+        PGIS_TMPDIR=$(mktemp -d)
+        docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${PGIS_TMPDIR}"
+        docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${PGIS_TMPDIR}"
+        docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
+        docker compose cp "${PGIS_TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
+        docker compose cp "${PGIS_TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
+        rm -rf "${PGIS_TMPDIR}"
        # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
        TMPDIR=$(mktemp -d)
        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
@@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
        docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
        # We are running tests now
        rm -f testout.txt testout_contrib.txt
-        docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
+        docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
        neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
        docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
        neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
--- a/docker-compose/ext-src/h3-pg-src/neon-test.sh
+++ b/docker-compose/ext-src/h3-pg-src/neon-test.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+set -ex  # abort on the first failing command and trace every command
+cd "$(dirname "${0}")"  # work relative to the directory that contains this script
+PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress  # pg_regress path derived from the location reported by pg_config --pgxs
+dropdb --if-exists contrib_regression  # recreate the database from scratch for the h3_postgis run
+createdb contrib_regression
+cd h3_postgis/test
+psql -d contrib_regression -c "CREATE EXTENSION postgis" -c "CREATE EXTENSION postgis_raster" -c "CREATE EXTENSION h3" -c "CREATE EXTENSION h3_postgis"  # install the four extensions before the h3_postgis tests run
+TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')  # list of test names: files under sql/ with "sql/" and ".sql" stripped
+${PG_REGRESS} --use-existing --dbname contrib_regression ${TESTS}  # TESTS is unquoted, so it word-splits into one argument per test name
+cd ../../h3/test
+TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')  # same derivation, now for the plain h3 test directory
+dropdb --if-exists contrib_regression  # recreate the database again, this time with only h3 installed
+createdb contrib_regression
+psql -d contrib_regression -c "CREATE EXTENSION h3"
+${PG_REGRESS} --use-existing --dbname contrib_regression ${TESTS}  # run the h3 tests against the existing database
--- a/docker-compose/ext-src/h3-pg-src/test-upgrade.sh
+++ b/docker-compose/ext-src/h3-pg-src/test-upgrade.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+set -ex  # abort on the first failing command and trace every command
+cd "$(dirname "${0}")"  # script path is quoted so directories containing spaces still work
+PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress  # pg_regress path derived from the location reported by pg_config --pgxs
+cd h3/test
+TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')  # list of test names: files under sql/ with "sql/" and ".sql" stripped
+${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin'  --dbname=contrib_regression  ${TESTS}  # runs against the already existing contrib_regression database
--- a/docker-compose/ext-src/online_advisor-src/neon-test.sh
+++ b/docker-compose/ext-src/online_advisor-src/neon-test.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+set -ex  # abort on the first failing command and trace every command
+cd "$(dirname "${0}")"  # work relative to the directory that contains this script
+if [ -f Makefile ]; then  # without a Makefile in this directory the script does nothing and exits successfully
+  make installcheck
+fi
--- a/docker-compose/ext-src/online_advisor-src/regular-test.sh
+++ b/docker-compose/ext-src/online_advisor-src/regular-test.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+set -ex  # abort on the first failing command and trace every command
+cd "$(dirname "${0}")"  # script path is quoted so directories containing spaces still work
+[ -f Makefile ] || exit 0  # nothing to test when this directory has no Makefile
+dropdb --if-exists contrib_regression  # documented spelling of the flag; do not rely on option abbreviation
+createdb contrib_regression
+PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress  # pg_regress path derived from the location reported by pg_config --pgxs
+TESTS=$(echo sql/* | sed 's|sql/||g; s|\.sql||g')  # list of test names: files under sql/ with "sql/" and ".sql" stripped
+${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression ${TESTS}  # run against the database created above
--- a/docker-compose/ext-src/postgis-src/README-Neon.md
+++ b/docker-compose/ext-src/postgis-src/README-Neon.md
@@ -0,0 +1,70 @@
+# PostGIS Testing in Neon
+
+This directory contains configuration files and patches for running PostGIS tests in the Neon database environment.
+
+## Overview
+
+PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS compatibility ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension.
+
+## PostGIS Versions
+
+- PostgreSQL v17: PostGIS 3.5.0
+- PostgreSQL v14/v15/v16: PostGIS 3.3.3
+
+## Test Configuration
+
+The test setup includes:
+
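+- `postgis-common-v16.patch` / `postgis-common-v17.patch`: Version-specific patches applied by both test scripts; they disable upgrade tests by removing the upgrade test section from regress/runtest.mk and drop the `computed_columns` test from regress/core/tests.mk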
+- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16
+- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17
+- `regular-test.sh`: Script to run PostGIS tests as a regular user
+- `neon-test.sh`: Script to handle version-specific test configurations
+- `raster_outdb_template.sql`: Template for raster tests with explicit file paths
+
+## Excluded Tests
+
+**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions.
+
+### Tests Requiring Superuser Permissions
+
+These tests cannot be run as a regular user:
+- `estimatedextent`
+- `regress/core/legacy`
+- `regress/core/typmod`
+- `regress/loader/TestSkipANALYZE`
+- `regress/loader/TestANALYZE`
+
+### Tests Requiring Filesystem Access
+
+These tests need direct filesystem access that is only possible for superusers:
+- `loader/load_outdb`
+
+### Tests with Flaky Results
+
+These tests have assumptions that don't always hold true:
+- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true
+
+### Tests Requiring Tunable Parameter Modifications
+
+These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers:
+- `raster/test/regress/rt_wkb`
+- `raster/test/regress/rt_addband`
+- `raster/test/regress/rt_setbandpath`
+- `raster/test/regress/rt_fromgdalraster`
+- `raster/test/regress/rt_asgdalraster`
+- `raster/test/regress/rt_astiff`
+- `raster/test/regress/rt_asjpeg`
+- `raster/test/regress/rt_aspng`
+- `raster/test/regress/permitted_gdal_drivers`
+- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn`
+
+### Topology Tests (v17 only)
+- `populate_topology_layer`
+- `renametopogeometrycolumn`
+
+## Other Modifications
+
+- Binary.sql tests are modified to use explicit file paths
+- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands
+- Upgrade tests are disabled
--- a/docker-compose/ext-src/postgis-src/neon-test.sh
+++ b/docker-compose/ext-src/postgis-src/neon-test.sh
@@ -0,0 +1,6 @@
+#!/bin/sh
+set -ex  # abort on the first failing command and trace every command
+cd "$(dirname "$0")"  # work relative to the directory that contains this script
+patch -p1 <"postgis-common-${PG_VERSION}.patch"  # apply the patch selected by PG_VERSION
+trap 'echo Cleaning up; patch -R -p1 <postgis-common-${PG_VERSION}.patch' EXIT  # reverse the patch whenever the script exits, whether the tests passed or not
+make installcheck-base
--- a/docker-compose/ext-src/postgis-src/postgis-common-v16.patch
+++ b/docker-compose/ext-src/postgis-src/postgis-common-v16.patch
@@ -0,0 +1,37 @@
+diff --git a/regress/core/tests.mk b/regress/core/tests.mk
+index 3abd7bc..64a9254 100644
+--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
+@@ -144,11 +144,6 @@ TESTS_SLOW = \
+ 	$(top_srcdir)/regress/core/concave_hull_hard \
+ 	$(top_srcdir)/regress/core/knn_recheck
+ 
+-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
+-	TESTS += \
+-		$(top_srcdir)/regress/core/computed_columns
+-endif
+-
+ ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
+ 	# GEOS-3.7 adds:
+ 	# ST_FrechetDistance
+diff --git a/regress/runtest.mk b/regress/runtest.mk
+index c051f03..010e493 100644
+--- a/regress/runtest.mk
+++ b/regress/runtest.mk
+@@ -24,16 +24,6 @@ check-regress:
+ 
+ 	POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
+ 
+-	@if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
+-		echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
+-		POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
+-      --upgrade \
+-      $(RUNTESTFLAGS) \
+-      $(RUNTESTFLAGS_INTERNAL) \
+-      $(TESTS); \
+-	else \
+-		echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
+-	fi
+ 
+ check-long:
+ 	$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
--- a/docker-compose/ext-src/postgis-src/postgis-common-v17.patch
+++ b/docker-compose/ext-src/postgis-src/postgis-common-v17.patch
@@ -0,0 +1,35 @@
+diff --git a/regress/core/tests.mk b/regress/core/tests.mk
+index 9e05244..90987df 100644
+--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
+@@ -143,8 +143,7 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/oriented_envelope \
+ 	$(top_srcdir)/regress/core/point_coordinates \
+ 	$(top_srcdir)/regress/core/out_geojson \
+-  $(top_srcdir)/regress/core/wrapx \
+-	$(top_srcdir)/regress/core/computed_columns
+  $(top_srcdir)/regress/core/wrapx
+ 
+ # Slow slow tests
+ TESTS_SLOW = \
+diff --git a/regress/runtest.mk b/regress/runtest.mk
+index 4b95b7e..449d5a2 100644
+--- a/regress/runtest.mk
+++ b/regress/runtest.mk
+@@ -24,16 +24,6 @@ check-regress:
+ 
+ 	@POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
+ 
+-	@if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
+-		echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
+-		POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
+-      --upgrade \
+-      $(RUNTESTFLAGS) \
+-      $(RUNTESTFLAGS_INTERNAL) \
+-      $(TESTS); \
+-	else \
+-		echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
+-	fi
+ 
+ check-long:
+ 	$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
--- a/docker-compose/ext-src/postgis-src/postgis-regular-v16.patch
+++ b/docker-compose/ext-src/postgis-src/postgis-regular-v16.patch
@@ -0,0 +1,186 @@
+diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
+index 00918e1..7e2b6cd 100644
+--- a/raster/test/regress/tests.mk
+++ b/raster/test/regress/tests.mk
+@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
+   $(RUNTESTFLAGS_INTERNAL) \
+   --after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
+ 
+-RASTER_TEST_FIRST = \
+-	$(top_srcdir)/raster/test/regress/check_gdal \
+-	$(top_srcdir)/raster/test/regress/loader/load_outdb
+RASTER_TEST_FIRST =
+ 
+ RASTER_TEST_LAST = \
+ 	$(top_srcdir)/raster/test/regress/clean
+@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
+ 
+ RASTER_TEST_BASIC_FUNC = \
+ 	$(top_srcdir)/raster/test/regress/rt_bytea \
+-	$(top_srcdir)/raster/test/regress/rt_wkb \
+ 	$(top_srcdir)/raster/test/regress/box3d \
+-	$(top_srcdir)/raster/test/regress/rt_addband \
+ 	$(top_srcdir)/raster/test/regress/rt_band \
+ 	$(top_srcdir)/raster/test/regress/rt_tile
+ 
+@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
+ 	$(top_srcdir)/raster/test/regress/rt_neighborhood \
+ 	$(top_srcdir)/raster/test/regress/rt_nearestvalue \
+ 	$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
+-	$(top_srcdir)/raster/test/regress/rt_polygon \
+-	$(top_srcdir)/raster/test/regress/rt_setbandpath
+	$(top_srcdir)/raster/test/regress/rt_polygon
+ 
+ RASTER_TEST_UTILITY = \
+ 	$(top_srcdir)/raster/test/regress/rt_utility \
+-	$(top_srcdir)/raster/test/regress/rt_fromgdalraster \
+-	$(top_srcdir)/raster/test/regress/rt_asgdalraster \
+-	$(top_srcdir)/raster/test/regress/rt_astiff \
+-	$(top_srcdir)/raster/test/regress/rt_asjpeg \
+-	$(top_srcdir)/raster/test/regress/rt_aspng \
+ 	$(top_srcdir)/raster/test/regress/rt_reclass \
+ 	$(top_srcdir)/raster/test/regress/rt_gdalwarp \
+ 	$(top_srcdir)/raster/test/regress/rt_gdalcontour \
+@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
+ 
+ RASTER_TEST_BUGS = \
+ 	$(top_srcdir)/raster/test/regress/bug_test_car5 \
+-	$(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
+ 	$(top_srcdir)/raster/test/regress/tickets
+ 
+ RASTER_TEST_LOADER = \
+ 	$(top_srcdir)/raster/test/regress/loader/Basic \
+ 	$(top_srcdir)/raster/test/regress/loader/Projected \
+ 	$(top_srcdir)/raster/test/regress/loader/BasicCopy \
+-	$(top_srcdir)/raster/test/regress/loader/BasicFilename \
+-	$(top_srcdir)/raster/test/regress/loader/BasicOutDB \
+-	$(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
+-	$(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
+-	$(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
+-	$(top_srcdir)/raster/test/regress/loader/TiledAuto \
+-	$(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
+-	$(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
+	$(top_srcdir)/raster/test/regress/loader/BasicFilename
+ 
+ RASTER_TESTS := $(RASTER_TEST_FIRST) \
+ 	$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
+diff --git a/regress/core/binary.sql b/regress/core/binary.sql
+index 7a36b65..ad78fc7 100644
+--- a/regress/core/binary.sql
+++ b/regress/core/binary.sql
+@@ -1,4 +1,5 @@
+ SET client_min_messages TO warning;
+
+ CREATE SCHEMA tm;
+ 
+ CREATE TABLE tm.geoms (id serial, g geometry);
+@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
+ INSERT INTO tm.geoms(g)
+ SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
+ 
+-COPY tm.geoms TO :tmpfile WITH BINARY;
+-- define temp file path
+\set tmpfile '/tmp/postgis_binary_test.dat'
+
+-- export
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
+ CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
+-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
+-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
+- AND ST_OrderingEquals(i.g, o.g);
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
+ 
+ CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
+ WHERE geometrytype(g) NOT LIKE '%CURVE%'
+   AND geometrytype(g) NOT LIKE '%CIRCULAR%'
+   AND geometrytype(g) NOT LIKE '%SURFACE%'
+   AND geometrytype(g) NOT LIKE 'TRIANGLE%'
+-  AND geometrytype(g) NOT LIKE 'TIN%'
+-;
+  AND geometrytype(g) NOT LIKE 'TIN%';
+ 
+-COPY tm.geogs TO :tmpfile WITH BINARY;
+-- export
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
+ CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
+-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
+-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
+- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+ 
+ DROP SCHEMA tm CASCADE;
+
+diff --git a/regress/core/tests.mk b/regress/core/tests.mk
+index 64a9254..94903c3 100644
+--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
+@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+ RUNTESTFLAGS_INTERNAL += \
+   --before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
+   --after-upgrade-script  $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
+-  --after-create-script   $(top_srcdir)/regress/hooks/hook-after-create.sql \
+   --before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
+ 
+ TESTS += \
+@@ -40,7 +39,6 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/dumppoints \
+ 	$(top_srcdir)/regress/core/dumpsegments \
+ 	$(top_srcdir)/regress/core/empty \
+-	$(top_srcdir)/regress/core/estimatedextent \
+ 	$(top_srcdir)/regress/core/forcecurve \
+ 	$(top_srcdir)/regress/core/flatgeobuf \
+ 	$(top_srcdir)/regress/core/geography \
+@@ -55,7 +53,6 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/out_marc21 \
+ 	$(top_srcdir)/regress/core/in_encodedpolyline \
+ 	$(top_srcdir)/regress/core/iscollection \
+-	$(top_srcdir)/regress/core/legacy \
+ 	$(top_srcdir)/regress/core/letters \
+ 	$(top_srcdir)/regress/core/long_xact \
+ 	$(top_srcdir)/regress/core/lwgeom_regress \
+@@ -112,7 +109,6 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/temporal_knn \
+ 	$(top_srcdir)/regress/core/tickets \
+ 	$(top_srcdir)/regress/core/twkb \
+-	$(top_srcdir)/regress/core/typmod \
+ 	$(top_srcdir)/regress/core/wkb \
+ 	$(top_srcdir)/regress/core/wkt \
+ 	$(top_srcdir)/regress/core/wmsservers \
+diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
+index 1fc77ac..c3cb9de 100644
+--- a/regress/loader/tests.mk
+++ b/regress/loader/tests.mk
+@@ -38,7 +38,5 @@ TESTS += \
+ 	$(top_srcdir)/regress/loader/Latin1 \
+ 	$(top_srcdir)/regress/loader/Latin1-implicit \
+ 	$(top_srcdir)/regress/loader/mfile \
+-	$(top_srcdir)/regress/loader/TestSkipANALYZE \
+-	$(top_srcdir)/regress/loader/TestANALYZE \
+ 	$(top_srcdir)/regress/loader/CharNoWidth
+ 
+diff --git a/regress/run_test.pl b/regress/run_test.pl
+index 0ec5b2d..1c331f4 100755
+--- a/regress/run_test.pl
+++ b/regress/run_test.pl
+@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C";
+ # Add locale info to the psql options
+ # Add pg12 precision suppression
+ my $PGOPTIONS = $ENV{"PGOPTIONS"};
+-$PGOPTIONS .= " -c lc_messages=C";
+ $PGOPTIONS .= " -c client_min_messages=NOTICE";
+ $PGOPTIONS .= " -c extra_float_digits=0";
+ $ENV{"PGOPTIONS"} = $PGOPTIONS;
--- a/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch
+++ b/docker-compose/ext-src/postgis-src/postgis-regular-v17.patch
@@ -0,0 +1,208 @@
+diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
+index 00918e1..7e2b6cd 100644
+--- a/raster/test/regress/tests.mk
+++ b/raster/test/regress/tests.mk
+@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
+   $(RUNTESTFLAGS_INTERNAL) \
+   --after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql
+ 
+-RASTER_TEST_FIRST = \
+-	$(top_srcdir)/raster/test/regress/check_gdal \
+-	$(top_srcdir)/raster/test/regress/loader/load_outdb
+RASTER_TEST_FIRST =
+ 
+ RASTER_TEST_LAST = \
+ 	$(top_srcdir)/raster/test/regress/clean
+@@ -33,9 +31,7 @@ RASTER_TEST_IO = \
+ 
+ RASTER_TEST_BASIC_FUNC = \
+ 	$(top_srcdir)/raster/test/regress/rt_bytea \
+-	$(top_srcdir)/raster/test/regress/rt_wkb \
+ 	$(top_srcdir)/raster/test/regress/box3d \
+-	$(top_srcdir)/raster/test/regress/rt_addband \
+ 	$(top_srcdir)/raster/test/regress/rt_band \
+ 	$(top_srcdir)/raster/test/regress/rt_tile
+ 
+@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
+ 	$(top_srcdir)/raster/test/regress/rt_neighborhood \
+ 	$(top_srcdir)/raster/test/regress/rt_nearestvalue \
+ 	$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
+-	$(top_srcdir)/raster/test/regress/rt_polygon \
+-	$(top_srcdir)/raster/test/regress/rt_setbandpath
+	$(top_srcdir)/raster/test/regress/rt_polygon
+ 
+ RASTER_TEST_UTILITY = \
+ 	$(top_srcdir)/raster/test/regress/rt_utility \
+-	$(top_srcdir)/raster/test/regress/rt_fromgdalraster \
+-	$(top_srcdir)/raster/test/regress/rt_asgdalraster \
+-	$(top_srcdir)/raster/test/regress/rt_astiff \
+-	$(top_srcdir)/raster/test/regress/rt_asjpeg \
+-	$(top_srcdir)/raster/test/regress/rt_aspng \
+ 	$(top_srcdir)/raster/test/regress/rt_reclass \
+ 	$(top_srcdir)/raster/test/regress/rt_gdalwarp \
+ 	$(top_srcdir)/raster/test/regress/rt_gdalcontour \
+@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \
+ 
+ RASTER_TEST_BUGS = \
+ 	$(top_srcdir)/raster/test/regress/bug_test_car5 \
+-	$(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
+ 	$(top_srcdir)/raster/test/regress/tickets
+ 
+ RASTER_TEST_LOADER = \
+ 	$(top_srcdir)/raster/test/regress/loader/Basic \
+ 	$(top_srcdir)/raster/test/regress/loader/Projected \
+ 	$(top_srcdir)/raster/test/regress/loader/BasicCopy \
+-	$(top_srcdir)/raster/test/regress/loader/BasicFilename \
+-	$(top_srcdir)/raster/test/regress/loader/BasicOutDB \
+-	$(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
+-	$(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
+-	$(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
+-	$(top_srcdir)/raster/test/regress/loader/TiledAuto \
+-	$(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
+-	$(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
+	$(top_srcdir)/raster/test/regress/loader/BasicFilename
+ 
+ RASTER_TESTS := $(RASTER_TEST_FIRST) \
+ 	$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
+diff --git a/regress/core/binary.sql b/regress/core/binary.sql
+index 7a36b65..ad78fc7 100644
+--- a/regress/core/binary.sql
+++ b/regress/core/binary.sql
+@@ -1,4 +1,5 @@
+ SET client_min_messages TO warning;
+
+ CREATE SCHEMA tm;
+ 
+ CREATE TABLE tm.geoms (id serial, g geometry);
+@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
+ INSERT INTO tm.geoms(g)
+ SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;
+ 
+-COPY tm.geoms TO :tmpfile WITH BINARY;
+-- define temp file path
+\set tmpfile '/tmp/postgis_binary_test.dat'
+
+-- export
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
+ CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
+-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
+-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
+- AND ST_OrderingEquals(i.g, o.g);
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);
+ 
+ CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
+ WHERE geometrytype(g) NOT LIKE '%CURVE%'
+   AND geometrytype(g) NOT LIKE '%CIRCULAR%'
+   AND geometrytype(g) NOT LIKE '%SURFACE%'
+   AND geometrytype(g) NOT LIKE 'TRIANGLE%'
+-  AND geometrytype(g) NOT LIKE 'TIN%'
+-;
+  AND geometrytype(g) NOT LIKE 'TIN%';
+ 
+-COPY tm.geogs TO :tmpfile WITH BINARY;
+-- export
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
+ CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
+-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
+-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
+- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+ 
+ DROP SCHEMA tm CASCADE;
+
+diff --git a/regress/core/tests.mk b/regress/core/tests.mk
+index 90987df..74fe3f1 100644
+--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
+@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
+ POSTGIS_GEOS_VERSION=31101
+ HAVE_JSON=yes
+ HAVE_SPGIST=yes
+-INTERRUPTTESTS=yes
+INTERRUPTTESTS=no
+ 
+ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+ 
+ RUNTESTFLAGS_INTERNAL += \
+   --before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
+   --after-upgrade-script  $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
+-  --after-create-script   $(top_srcdir)/regress/hooks/hook-after-create.sql \
+   --before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql
+ 
+ TESTS += \
+@@ -40,7 +39,6 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/dumppoints \
+ 	$(top_srcdir)/regress/core/dumpsegments \
+ 	$(top_srcdir)/regress/core/empty \
+-	$(top_srcdir)/regress/core/estimatedextent \
+ 	$(top_srcdir)/regress/core/forcecurve \
+ 	$(top_srcdir)/regress/core/flatgeobuf \
+ 	$(top_srcdir)/regress/core/frechet \
+@@ -60,7 +58,6 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/out_marc21 \
+ 	$(top_srcdir)/regress/core/in_encodedpolyline \
+ 	$(top_srcdir)/regress/core/iscollection \
+-	$(top_srcdir)/regress/core/legacy \
+ 	$(top_srcdir)/regress/core/letters \
+ 	$(top_srcdir)/regress/core/lwgeom_regress \
+ 	$(top_srcdir)/regress/core/measures \
+@@ -119,7 +116,6 @@ TESTS += \
+ 	$(top_srcdir)/regress/core/temporal_knn \
+ 	$(top_srcdir)/regress/core/tickets \
+ 	$(top_srcdir)/regress/core/twkb \
+-	$(top_srcdir)/regress/core/typmod \
+ 	$(top_srcdir)/regress/core/wkb \
+ 	$(top_srcdir)/regress/core/wkt \
+ 	$(top_srcdir)/regress/core/wmsservers \
+diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
+index ac4f8ad..4bad4fc 100644
+--- a/regress/loader/tests.mk
+++ b/regress/loader/tests.mk
+@@ -38,7 +38,5 @@ TESTS += \
+ 	$(top_srcdir)/regress/loader/Latin1 \
+ 	$(top_srcdir)/regress/loader/Latin1-implicit \
+ 	$(top_srcdir)/regress/loader/mfile \
+-	$(top_srcdir)/regress/loader/TestSkipANALYZE \
+-	$(top_srcdir)/regress/loader/TestANALYZE \
+ 	$(top_srcdir)/regress/loader/CharNoWidth \
+ 
+diff --git a/regress/run_test.pl b/regress/run_test.pl
+index cac4b2e..4c7c82b 100755
+--- a/regress/run_test.pl
+++ b/regress/run_test.pl
+@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C";
+ # Add locale info to the psql options
+ # Add pg12 precision suppression
+ my $PGOPTIONS = $ENV{"PGOPTIONS"};
+-$PGOPTIONS .= " -c lc_messages=C";
+ $PGOPTIONS .= " -c client_min_messages=NOTICE";
+ $PGOPTIONS .= " -c extra_float_digits=0";
+ $ENV{"PGOPTIONS"} = $PGOPTIONS;
+diff --git a/topology/test/tests.mk b/topology/test/tests.mk
+index cbe2633..2c7c18f 100644
+--- a/topology/test/tests.mk
+++ b/topology/test/tests.mk
+@@ -46,9 +46,7 @@ TESTS += \
+ 	$(top_srcdir)/topology/test/regress/legacy_query.sql \
+ 	$(top_srcdir)/topology/test/regress/legacy_validate.sql \
+ 	$(top_srcdir)/topology/test/regress/polygonize.sql \
+-	$(top_srcdir)/topology/test/regress/populate_topology_layer.sql \
+ 	$(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \
+-	$(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \
+ 	$(top_srcdir)/topology/test/regress/renametopology.sql \
+ 	$(top_srcdir)/topology/test/regress/share_sequences.sql \
+ 	$(top_srcdir)/topology/test/regress/sqlmm.sql \
--- a/docker-compose/ext-src/postgis-src/raster_outdb_template.sql
+++ b/docker-compose/ext-src/postgis-src/raster_outdb_template.sql
--- a/docker-compose/ext-src/postgis-src/regular-test.sh
+++ b/docker-compose/ext-src/postgis-src/regular-test.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Recreate the contrib_regression database with the PostGIS extensions, apply the
+# version-specific test patches and run the PostGIS `installcheck-base` target.
+set -ex
+cd "$(dirname "${0}")"
+dropdb --if-exists contrib_regression
+createdb contrib_regression
+psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \
+     -c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \
+     -c "CREATE EXTENSION postgis SCHEMA public" \
+     -c "CREATE EXTENSION postgis_topology" \
+     -c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
+     -c "CREATE EXTENSION postgis_raster SCHEMA public" \
+     -c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
+patch -p1 <"postgis-common-${PG_VERSION}.patch"
+patch -p1 <"postgis-regular-${PG_VERSION}.patch"
+# Revert both patches (in reverse order) on exit. The trap is installed as soon as
+# the patches are applied so that a failure in any later step still cleans up.
+trap 'patch -R -p1 <"postgis-regular-${PG_VERSION}.patch" && patch -R -p1 <"postgis-common-${PG_VERSION}.patch"' EXIT
+psql -d contrib_regression -f raster_outdb_template.sql
+POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -63,5 +63,9 @@ done
 for d in ${FAILED}; do
  cat "$(find $d -name regression.diffs)"
 done
+for postgis_diff in /tmp/pgis_reg/*_diff; do
+  echo "${postgis_diff}:"
+  cat "${postgis_diff}"
+done
 echo "${FAILED}"
 exit 1
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -82,7 +82,8 @@ EXTENSIONS='[
 {"extname": "pg_ivm", "extdir": "pg_ivm-src"},
 {"extname": "pgjwt", "extdir": "pgjwt-src"},
 {"extname": "pgtap", "extdir": "pgtap-src"},
-{"extname": "pg_repack", "extdir": "pg_repack-src"}
+{"extname": "pg_repack", "extdir": "pg_repack-src"},
+{"extname": "h3", "extdir": "h3-pg-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
 COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -178,9 +178,9 @@ pub struct ComputeSpec {
    /// JWT for authorizing requests to endpoint storage service
    pub endpoint_storage_token: Option<String>,

-    /// If true, download LFC state from endpoint_storage and pass it to Postgres on startup
+    /// Download LFC state from endpoint_storage and pass it to Postgres on startup
    #[serde(default)]
-    pub prewarm_lfc_on_startup: bool,
+    pub autoprewarm: bool,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -192,6 +192,9 @@ pub enum ComputeFeature {
    /// track short-lived connections as user activity.
    ActivityMonitorExperimental,

+    /// Enable TLS functionality.
+    TlsExperimental,
+
    /// This is a special feature flag that is used to represent unknown feature flags.
    /// Basically all unknown to enum flags are represented as this one. See unit test
    /// `parse_unknown_features()` for more details.
@@ -250,34 +253,44 @@ impl RemoteExtSpec {
        }

        match self.extension_data.get(real_ext_name) {
-            Some(_ext_data) => {
-                // We have decided to use the Go naming convention due to Kubernetes.
-
-                let arch = match std::env::consts::ARCH {
-                    "x86_64" => "amd64",
-                    "aarch64" => "arm64",
-                    arch => arch,
-                };
-
-                // Construct the path to the extension archive
-                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
-                //
-                // Keep it in sync with path generation in
-                // https://github.com/neondatabase/build-custom-extensions/tree/main
-                let archive_path_str = format!(
-                    "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
-                );
-                Ok((
-                    real_ext_name.to_string(),
-                    RemotePath::from_string(&archive_path_str)?,
-                ))
-            }
+            Some(_ext_data) => Ok((
+                real_ext_name.to_string(),
+                Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?,
+            )),
            None => Err(anyhow::anyhow!(
                "real_ext_name {} is not found",
                real_ext_name
            )),
        }
    }
+
+    /// Translate the Rust target architecture name into the name used in the
+    /// remote extension path. The Go naming convention is used due to
+    /// Kubernetes; architectures without a mapping are passed through as-is.
+    fn get_arch() -> &'static str {
+        let rust_arch = std::env::consts::ARCH;
+        if rust_arch == "x86_64" {
+            "amd64"
+        } else if rust_arch == "aarch64" {
+            "arm64"
+        } else {
+            rust_arch
+        }
+    }
+
+    /// Build the [`RemotePath`] of the archive for extension `ext_name`.
+    ///
+    /// The path has the form
+    /// `BUILD_TAG/ARCH/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst`.
+    ///
+    /// Keep it in sync with path generation in
+    /// <https://github.com/neondatabase/build-custom-extensions/tree/main>
+    fn build_remote_path(
+        build_tag: &str,
+        pg_major_version: &str,
+        ext_name: &str,
+    ) -> anyhow::Result<RemotePath> {
+        let arch = Self::get_arch();
+        let archive_path =
+            format!("{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst");
+
+        RemotePath::from_string(&archive_path)
+    }
 }

 #[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
@@ -518,6 +531,37 @@ mod tests {
            .expect("Library should be found");
    }

+    /// `get_ext` must return a relative path that matches `build_remote_path`.
+    #[test]
+    fn remote_extension_path() {
+        let spec_json = serde_json::json!({
+            "public_extensions": ["ext"],
+            "custom_extensions": [],
+            "library_index": {
+                "extlib": "ext",
+            },
+            "extension_data": {
+                "ext": {
+                    "control_data": {
+                        "ext.control": ""
+                    },
+                    "archive_path": ""
+                }
+            },
+        });
+        let rspec: RemoteExtSpec = serde_json::from_value(spec_json).unwrap();
+
+        let (_ext_name, ext_path) = rspec
+            .get_ext("ext", false, "latest", "v17")
+            .expect("Extension should be found");
+
+        // Starting with a forward slash would have consequences for the
+        // Url::join() that occurs when downloading a remote extension.
+        let rendered = ext_path.to_string();
+        assert!(!rendered.starts_with("/"));
+
+        let expected = RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap();
+        assert_eq!(ext_path, expected);
+    }
+
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
--- a/libs/compute_api/tests/cluster_spec.json
+++ b/libs/compute_api/tests/cluster_spec.json
@@ -85,7 +85,7 @@
                "vartype": "bool"
            },
            {
-                "name": "prewarm_lfc_on_startup",
+                "name": "autoprewarm",
                "value": "off",
                "vartype": "bool"
            },
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -107,7 +107,7 @@ impl<const N: usize> MetricType for HyperLogLogState<N> {
 }

 impl<const N: usize> HyperLogLogState<N> {
-    pub fn measure(&self, item: &impl Hash) {
+    pub fn measure(&self, item: &(impl Hash + ?Sized)) {
        // changing the hasher will break compatibility with previous measurements.
        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -27,6 +27,7 @@ pub use prometheus::{

 pub mod launch_timestamp;
 mod wrappers;
+pub use prometheus;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
 pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -6,8 +6,20 @@ license.workspace = true

 [dependencies]
 thiserror.workspace = true
-nix.workspace=true
+nix.workspace = true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+rustc-hash = { version = "2.1.1" }
+
+[dev-dependencies]
+criterion = { workspace = true, features = ["html_reports"] }
+rand = "0.9.1"
+rand_distr = "0.5.1"
+xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
+ahash.workspace = true

 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
+
+[[bench]]
+name = "hmap_resize"
+harness = false
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,438 @@
+//! Hash table implementation on top of 'shmem'
+//!
+//! Features required in the long run by the communicator project:
+//!
+//! [X] Accessible from both Postgres processes and rust threads in the communicator process
+//! [X] Low latency
+//! [ ] Scalable to lots of concurrent accesses (currently relies on caller for locking)
+//! [ ] Resizable
+
+use std::fmt::Debug;
+use std::hash::{Hash, Hasher, BuildHasher};
+use std::mem::MaybeUninit;
+
+use rustc_hash::FxBuildHasher;
+
+use crate::shmem::ShmemHandle;
+
+mod core;
+pub mod entry;
+
+#[cfg(test)]
+mod tests;
+
+mod optim;
+
+use core::{CoreHashMap, INVALID_POS};
+use entry::{Entry, OccupiedEntry};
+
+
+/// Builder describing where a shared-memory hash map lives and how it is set up.
+/// It is turned into a [`HashMapAccess`] by `attach_writer` / `attach_reader`.
+pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    // Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
+    // `None` when the map was created over a fixed area.
+    shmem_handle: Option<ShmemHandle>,
+    // Start of the memory area the map is placed in.
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+	// Size of the memory area, in bytes.
+	shared_size: usize,
+	// Hasher handed over to the resulting `HashMapAccess`.
+	hasher: S,
+	// Number of buckets (key/value slots) to carve out of the area.
+	num_buckets: u32,
+}
+
+/// Handle for reading and modifying a hash map that lives in a shared memory area.
+pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    // Backing shared memory, if the map was not created over a fixed area.
+    shmem_handle: Option<ShmemHandle>,
+    // Location of the `HashMapShared` header of the map.
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+	// Hasher used to hash keys for this map.
+	hasher: S,
+}
+
+// The raw `shared_ptr` prevents these traits from being derived automatically.
+// NOTE(review): no bound is placed on `S` although a value of type `S` is stored in
+// the struct - confirm that a `S: Sync` / `S: Send` bound is not needed here.
+unsafe impl<'a, K: Sync, V: Sync, S> Sync for HashMapAccess<'a, K, V, S> {}
+unsafe impl<'a, K: Send, V: Send, S> Send for HashMapAccess<'a, K, V, S> {}
+
+impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
+    /// Replace the hasher the map will be created with.
+    pub fn with_hasher(self, hasher: S) -> HashMapInit<'a, K, V, S> {
+        Self { hasher, ..self }
+    }
+
+    /// Estimate the size in bytes of the memory area needed for `num_buckets` buckets.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        // add some margin to cover alignment etc.
+        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
+    }
+
+    /// Initialize the hash table in the memory area and return a handle for accessing it.
+    ///
+    /// The area is carved up, in order, into the [`HashMapShared`] header, the key array,
+    /// the value array and the dictionary, each aligned for its element type. The
+    /// dictionary gets whatever space is left over.
+    ///
+    /// Panics if the area leaves no room for at least one dictionary slot.
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
+        let mut ptr: *mut u8 = self.shared_ptr.cast();
+        let end_ptr: *mut u8 = unsafe { ptr.add(self.shared_size) };
+
+        // carve out the header
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+
+        // carve out the buckets
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::LinkedKey<K>>())) };
+        let keys_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<core::LinkedKey<K>>() * self.num_buckets as usize) };
+
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Option<V>>())) };
+        let vals_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<Option<V>>() * self.num_buckets as usize) };
+
+        // use remaining space for the dictionary
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
+        assert!(ptr.addr() < end_ptr.addr());
+        let dictionary_ptr = ptr;
+        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
+        assert!(dictionary_size > 0);
+
+        let keys =
+            unsafe { std::slice::from_raw_parts_mut(keys_ptr.cast(), self.num_buckets as usize) };
+        let vals =
+            unsafe { std::slice::from_raw_parts_mut(vals_ptr.cast(), self.num_buckets as usize) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
+        };
+        let hashmap = CoreHashMap::new(keys, vals, dictionary);
+        unsafe {
+            std::ptr::write(shared_ptr, HashMapShared { inner: hashmap });
+        }
+
+        HashMapAccess {
+            shmem_handle: self.shmem_handle,
+            // Hand out the aligned address the header was actually written to. Using
+            // `self.shared_ptr` here would point at the wrong location whenever the
+            // start of the area is not already aligned for `HashMapShared`.
+            shared_ptr,
+            hasher: self.hasher,
+        }
+    }
+
+    /// Initialize the hash table and return a handle for accessing it.
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
+        // no difference to attach_writer currently
+        self.attach_writer()
+    }
+}
+
+/// This is stored in the shared memory area
+///
+/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
+/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
+/// area as follows:
+///
+/// HashMapShared
+/// [keys]
+/// [values]
+/// [dictionary]
+///
+/// In between the above parts, there can be padding bytes to align the parts correctly.
+struct HashMapShared<'a, K, V> {
+    inner: CoreHashMap<'a, K, V>	
+}
+
+impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Place the hash map in the caller-provided, fixed-size memory `area`.
+    pub fn with_fixed(
+        num_buckets: u32,
+        area: &'a mut [MaybeUninit<u8>],
+    ) -> HashMapInit<'a, K, V> {
+        Self {
+            num_buckets,
+            shmem_handle: None,
+            shared_ptr: area.as_mut_ptr().cast(),
+            shared_size: area.len(),
+            hasher: rustc_hash::FxBuildHasher::default(),
+        }
+    }
+
+    /// Initialize a new hash map in the given shared memory area
+    ///
+    /// The area is resized to the estimated size for `num_buckets`; panics if that fails.
+    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
+        let size = Self::estimate_size(num_buckets);
+        shmem
+            .set_size(size)
+            .expect("could not resize shared memory area");
+        Self {
+            num_buckets,
+            // read the pointer before `shmem` is moved into the handle field
+            shared_ptr: shmem.data_ptr.as_ptr().cast(),
+            shmem_handle: Some(shmem),
+            shared_size: size,
+            hasher: rustc_hash::FxBuildHasher::default(),
+        }
+    }
+
+    /// Create a new shared memory area called `name`, sized for `num_buckets` buckets and
+    /// with a maximum size estimated for `max_buckets`, and place the hash map in it.
+    ///
+    /// Panics if the shared memory area cannot be created.
+    pub fn new_resizeable_named(
+        num_buckets: u32,
+        max_buckets: u32,
+        name: &str,
+    ) -> HashMapInit<'a, K, V> {
+        let size = Self::estimate_size(num_buckets);
+        let max_size = Self::estimate_size(max_buckets);
+        let shmem = ShmemHandle::new(name, size, max_size)
+            .expect("failed to make shared memory area");
+
+        Self {
+            num_buckets,
+            // read the pointer before `shmem` is moved into the handle field
+            shared_ptr: shmem.data_ptr.as_ptr().cast(),
+            shmem_handle: Some(shmem),
+            shared_size: size,
+            hasher: rustc_hash::FxBuildHasher::default(),
+        }
+    }
+
+    /// Like [`Self::new_resizeable_named`], with a generated name of the form
+    /// `neon_shmem_hmap<N>`, where `N` comes from a counter shared by all calls.
+    pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> HashMapInit<'a, K, V> {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+        // This has to be a `static`: a `const` is instantiated afresh at every use, so
+        // `fetch_add` would operate on a temporary, always return 0 and give every map
+        // the same name.
+        static COUNTER: AtomicUsize = AtomicUsize::new(0);
+        let val = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let name = format!("neon_shmem_hmap{}", val);
+        Self::new_resizeable_named(num_buckets, max_buckets, &name)
+    }
+}
+
+impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Hash `key` with the hasher this map was created with.
+    pub fn get_hash_value(&self, key: &K) -> u64 {
+        BuildHasher::hash_one(&self.hasher, key)
+    }
+
+    /// Look up the value stored for `key`. `hash` selects the chain that is searched.
+    pub fn get_with_hash<'e>(&'e self, key: &K, hash: u64) -> Option<&'e V> {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let core_map = &shared.inner;
+        core_map.get_with_hash(key, hash)
+    }
+
+    /// Get the entry for `key`, occupied or vacant. `hash` selects the chain that is
+    /// searched.
+    pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
+        let shared = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        let core_map = &mut shared.inner;
+        core_map.entry_with_hash(key, hash)
+    }
+
+    /// Remove the entry for `key` if there is one; does nothing otherwise.
+    pub fn remove_with_hash(&mut self, key: &K, hash: u64) {
+        let shared = unsafe { self.shared_ptr.as_mut() }.unwrap();
+
+        if let Entry::Occupied(occupied) = shared.inner.entry_with_hash(key.clone(), hash) {
+            occupied.remove();
+        }
+    }
+
+    /// Get the occupied entry stored in bucket `pos`, or `None` if `pos` is out of range
+    /// or the bucket is unused.
+    pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
+        let shared = unsafe { self.shared_ptr.as_mut() }.unwrap();
+        let core_map = &mut shared.inner;
+        core_map.entry_at_bucket(pos)
+    }
+
+    /// Total number of buckets in the map, used or not.
+    pub fn get_num_buckets(&self) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let core_map = &shared.inner;
+        core_map.get_num_buckets()
+    }
+
+    /// Return the key and value stored in bucket with given index. This can be used to
+    /// iterate through the hash map. (An Iterator might be nicer. The communicator's
+    /// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
+    /// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
+    ///
+    /// Returns `None` if `pos` is out of range or the bucket is unused.
+    pub fn get_at_bucket(&self, pos: usize) -> Option<(&K, &V)> {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let core_map = &shared.inner;
+
+        let key = core_map.keys.get(pos)?.inner.as_ref()?;
+        let val = core_map.vals[pos].as_ref().unwrap();
+        Some((key, val))
+    }
+
+    /// Return the index of the bucket holding the value that `val_ptr` points to.
+    ///
+    /// Panics if the computed index is not a valid bucket index.
+    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
+
+        let origin = map.inner.vals.as_ptr();
+        // The values are stored as `Option<V>`, so that is the stride of the array;
+        // `Option<V>` can be larger than `V`. The `V` may sit at a non-zero offset
+        // inside its slot, which the truncating division takes care of.
+        let idx = (val_ptr as usize - origin as usize) / size_of::<Option<V>>();
+        assert!(idx < map.inner.vals.len());
+
+        idx
+    }
+
+    /// Number of buckets that currently hold an entry. Exposed for metrics.
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let shared = unsafe { self.shared_ptr.as_ref() }.unwrap();
+        let in_use: u32 = shared.inner.buckets_in_use;
+        in_use as usize
+    }
+
+	/// Remove all entries from the map.
+	pub fn clear(&mut self) {
+		let shared = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		shared.inner.clear()
+	}
+	
+	/// Helper function that abstracts the common logic between growing and shrinking.
+	/// The only significant difference in the rehashing step is how many buckets to rehash.
+	///
+	/// Recomputes where the dictionary lives for `num_buckets` buckets in the area ending
+	/// at `end_ptr`, resets it, and then walks the first `rehash_buckets` keys: unused
+	/// buckets are pushed onto the freelist, used ones are linked into the dictionary.
+	///
+	/// Panics if there is no room left for at least one dictionary slot.
+	fn rehash_dict(
+		&mut self,
+		inner: &mut CoreHashMap<'a, K, V>,
+		keys_ptr: *mut core::LinkedKey<K>,
+		end_ptr: *mut u8,
+		num_buckets: u32,
+		rehash_buckets: u32,
+	) {
+		inner.free_head = INVALID_POS;
+
+		// Recalculate the dictionary. The layout has to match the one used when the map
+		// was first carved out: keys, then the (aligned) values, then the (aligned)
+		// dictionary, which takes up the rest of the area.
+		let keys;
+		let dictionary;
+		unsafe {
+			let keys_end_ptr: *mut u8 = keys_ptr.add(num_buckets as usize).cast();
+			let vals_ptr = keys_end_ptr.add(keys_end_ptr.align_offset(align_of::<Option<V>>()));
+			let vals_end_ptr = vals_ptr.add(size_of::<Option<V>>() * num_buckets as usize);
+			let dictionary_ptr: *mut u32 = vals_end_ptr
+				.add(vals_end_ptr.align_offset(align_of::<u32>()))
+				.cast();
+			assert!(dictionary_ptr.addr() < end_ptr.addr());
+			// Measure from the aligned start of the dictionary; measuring from the
+			// unaligned end of the buckets could yield one slot too many.
+			let dictionary_size: usize =
+				end_ptr.byte_offset_from(dictionary_ptr) as usize / size_of::<u32>();
+			assert!(dictionary_size > 0);
+
+			keys = std::slice::from_raw_parts_mut(keys_ptr, num_buckets as usize);
+			dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+		}
+		dictionary.fill(INVALID_POS);
+
+		for i in 0..rehash_buckets as usize {
+			if keys[i].inner.is_none() {
+				// unused bucket: push it onto the freelist
+				keys[i].next = inner.free_head;
+				inner.free_head = i as u32;
+				continue;
+			}
+
+			let hash = self.hasher.hash_one(&keys[i].inner.as_ref().unwrap());
+			// Same slot computation as the lookup path in `CoreHashMap`, so that the
+			// rehashed entries are found again regardless of the width of `usize`.
+			let pos: usize = hash as usize % dictionary.len();
+			keys[i].next = dictionary[pos];
+			dictionary[pos] = i as u32;
+		}
+
+		// Finally, update the CoreHashMap struct
+		inner.dictionary = dictionary;
+		inner.keys = keys;
+	}
+
+	/// Rehash the map. Intended for benchmarking only.
+	///
+	/// Rebuilds the dictionary and freelist in place over all of the map's buckets.
+	pub fn shuffle(&mut self) {
+		let shared = unsafe { self.shared_ptr.as_mut() }.unwrap();
+		let inner = &mut shared.inner;
+		let keys_ptr = inner.keys.as_mut_ptr();
+		let num_buckets = inner.get_num_buckets() as u32;
+
+		// The area is taken to end at the estimated size for the current bucket count.
+		let area_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+		let end_ptr: *mut u8 = unsafe { (self.shared_ptr as *mut u8).add(area_bytes) };
+
+		self.rehash_dict(inner, keys_ptr, end_ptr, num_buckets, num_buckets);
+	}
+
+	
+    // /// Grow
+    // ///
+    // /// 1. grow the underlying shared memory area
+    // /// 2. Initialize new buckets. This overwrites the current dictionary
+    // /// 3. Recalculate the dictionary
+    // pub fn grow(&mut self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
+    //     let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+    //     let inner = &mut map.inner;
+    //     let old_num_buckets = inner.buckets.len() as u32;
+    //     if num_buckets < old_num_buckets {
+    //         panic!("grow called with a smaller number of buckets");
+    //     }
+    //     if num_buckets == old_num_buckets {
+    //         return Ok(());
+    //     }
+    //     let shmem_handle = self
+    //         .shmem_handle
+    //         .as_ref()
+    //         .expect("grow called on a fixed-size hash table");
+
+    //     let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+    //     shmem_handle.set_size(size_bytes)?;
+    //     let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+    //     // Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
+    //     // the dictionary!
+    //     let keys_ptr = inner.keys.as_mut_ptr();
+    //     unsafe {
+    //         for i in old_num_buckets..num_buckets {
+    //             let bucket_ptr = buckets_ptr.add(i as usize);
+    //             bucket_ptr.write(core::Bucket {
+    //                 next: if i < num_buckets-1 {
+    //                     i as u32 + 1
+    //                 } else {
+    //                     inner.free_head
+    //                 },
+	// 				prev: if i > 0 {
+	// 					PrevPos::Chained(i as u32 - 1)
+	// 				} else {
+	// 					PrevPos::First(INVALID_POS)
+	// 				},
+    //                 inner: None,
+    //             });
+    //         }
+    //     }
+	// 	self.rehash_dict(inner, keys_ptr, end_ptr, num_buckets, old_num_buckets);
+    //     inner.free_head = old_num_buckets;
+
+    //     Ok(())
+    // }
+
+	// /// Begin a shrink, limiting all new allocations to be in buckets with index less than `num_buckets`. 
+// 	pub fn begin_shrink(&mut self, num_buckets: u32) {
+// 		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+// 		if num_buckets > map.inner.get_num_buckets() as u32 {
+//             panic!("shrink called with a larger number of buckets");
+//         }
+// 		_ = self
+//             .shmem_handle
+//             .as_ref()
+//             .expect("shrink called on a fixed-size hash table");
+// 		map.inner.alloc_limit = num_buckets;
+// 	}
+
+// 	/// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
+// 	pub fn finish_shrink(&mut self) -> Result<(), crate::shmem::Error> {
+// 		let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
+// 		let inner = &mut map.inner;
+// 		if !inner.is_shrinking() {
+// 			panic!("called finish_shrink when no shrink is in progress");
+// 		}
+
+// 		let num_buckets = inner.alloc_limit; 
+
+// 		if inner.get_num_buckets() == num_buckets as usize {
+//             return Ok(());
+//         }
+		
+// 		for i in (num_buckets as usize)..inner.buckets.len() {
+// 			if inner.buckets[i].inner.is_some() {
+// 				// TODO(quantumish) Do we want to treat this as a violation of an invariant
+// 				// or a legitimate error the caller can run into? Originally I thought this
+// 				// could return something like a UnevictedError(index) as soon as it runs
+// 				// into something (that way a caller could clear their soon-to-be-shrinked 
+// 				// buckets by repeatedly trying to call `finish_shrink`). 
+// 				//
+// 				// Would require making a wider error type enum with this and shmem errors.
+// 				panic!("unevicted entries in shrinked space")
+// 			}
+// 			match inner.buckets[i].prev {
+// 				PrevPos::First(_) => {
+// 					let next_pos = inner.buckets[i].next;
+// 					inner.free_head = next_pos;
+// 					if next_pos != INVALID_POS {
+// 						inner.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
+// 					}
+// 				},
+// 				PrevPos::Chained(j) => {
+// 					let next_pos = inner.buckets[i].next;
+// 					inner.buckets[j as usize].next = next_pos;
+// 					if next_pos != INVALID_POS {
+// 						inner.buckets[next_pos as usize].prev = PrevPos::Chained(j);
+// 					}
+// 				}
+// 			}
+// 		}
+
+//         let shmem_handle = self
+//             .shmem_handle
+//             .as_ref()
+//             .expect("shrink called on a fixed-size hash table");
+
+// 		let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+//         shmem_handle.set_size(size_bytes)?;
+//         let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+// 		let buckets_ptr = inner.buckets.as_mut_ptr();
+// 		self.rehash_dict(inner, buckets_ptr, end_ptr, num_buckets, num_buckets);
+// 		inner.alloc_limit = INVALID_POS;
+		
+// 		Ok(())
+// 	}
+
+}
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,247 @@
+//! Simple hash table with chaining
+//!
+//! # Resizing
+//!
+
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+
+use crate::hash::entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
+
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+/// Key slot of one bucket, together with the link to the next bucket in its list.
+pub(crate) struct LinkedKey<K> {
+	/// The key stored in this bucket, or `None` if the bucket is unused.
+	pub(crate) inner: Option<K>,
+	/// Index of the next bucket in the same list (hash chain or freelist), or
+	/// `INVALID_POS` at the end of the list.
+	pub(crate) next: u32,	
+}
+
+/// Chained hash table over caller-provided key, value and dictionary arrays.
+pub(crate) struct CoreHashMap<'a, K, V> {
+	/// Dictionary used to map hashes to bucket indices. Each slot holds the index of the
+	/// first bucket of a chain, or `INVALID_POS` if the chain is empty.
+    pub(crate) dictionary: &'a mut [u32],
+	/// Keys and chain links, indexed by bucket.
+    pub(crate) keys: &'a mut [LinkedKey<K>],
+	/// Values, indexed by bucket in parallel with `keys`.
+	pub(crate) vals: &'a mut [Option<V>],
+	/// Head of the freelist.
+    pub(crate) free_head: u32,
+
+    pub(crate) _user_list_head: u32,
+	/// Exclusive upper bound on the index of buckets that `alloc_bucket` hands out:
+	/// buckets at or above it are skipped. INVALID_POS if no limit.
+	pub(crate) alloc_limit: u32,
+
+    // metrics
+    pub(crate) buckets_in_use: u32,
+}
+
+/// Error returned by `alloc_bucket` when the freelist has no bucket that may be used.
+#[derive(Debug)]
+pub struct FullError();
+
+impl<'a, K: Hash + Eq, V> CoreHashMap<'a, K, V>
+where
+    K: Clone + Hash + Eq,
+{
+    const FILL_FACTOR: f32 = 0.60;
+
+    /// Estimate the number of bytes needed for the key, value and dictionary arrays of a
+    /// map with `num_buckets` buckets. The dictionary is over-allocated by `FILL_FACTOR`.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let bucket_count = num_buckets as usize;
+
+        // buckets
+        let per_bucket = size_of::<LinkedKey<K>>() + size_of::<Option<V>>();
+        let buckets_bytes = per_bucket * bucket_count;
+
+        // dictionary
+        let dictionary_slots_bytes = (size_of::<u32>() * bucket_count) as f32;
+        let dictionary_bytes = f32::ceil(dictionary_slots_bytes / Self::FILL_FACTOR) as usize;
+
+        buckets_bytes + dictionary_bytes
+    }
+
+    /// Build an empty map over the given uninitialized arrays.
+    ///
+    /// All buckets are chained into the freelist in index order and every dictionary
+    /// slot is marked empty.
+    pub fn new(
+        keys: &'a mut [MaybeUninit<LinkedKey<K>>],
+        vals: &'a mut [MaybeUninit<Option<V>>],
+        dictionary: &'a mut [MaybeUninit<u32>],
+    ) -> CoreHashMap<'a, K, V> {
+        // Initialize the buckets
+        let num_keys = keys.len();
+        for (i, key_slot) in keys.iter_mut().enumerate() {
+            let next = if i + 1 < num_keys {
+                i as u32 + 1
+            } else {
+                INVALID_POS
+            };
+            key_slot.write(LinkedKey { next, inner: None });
+        }
+        for val_slot in vals.iter_mut() {
+            val_slot.write(None);
+        }
+
+        // Initialize the dictionary
+        for dict_slot in dictionary.iter_mut() {
+            dict_slot.write(INVALID_POS);
+        }
+
+        // TODO: use std::slice::assume_init_mut() once it stabilizes
+        let keys =
+            unsafe { std::slice::from_raw_parts_mut(keys.as_mut_ptr().cast(), keys.len()) };
+        let vals =
+            unsafe { std::slice::from_raw_parts_mut(vals.as_mut_ptr().cast(), vals.len()) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
+        };
+
+        CoreHashMap {
+            dictionary,
+            keys,
+            vals,
+            free_head: 0,
+            buckets_in_use: 0,
+            _user_list_head: INVALID_POS,
+            alloc_limit: INVALID_POS,
+        }
+    }
+
+    /// Look up the value stored for `key` by walking the chain that `hash` maps to.
+    pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
+        let mut pos = self.dictionary[hash as usize % self.dictionary.len()];
+        while pos != INVALID_POS {
+            let keylink = &self.keys[pos as usize];
+            let bucket_key = keylink.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(self.vals[pos as usize].as_ref().unwrap());
+            }
+            pos = keylink.next;
+        }
+        None
+    }
+
+    // all updates are done through Entry
+    /// Find the entry for `key` in the chain that `hash` maps to. Returns an occupied
+    /// entry if the key is present and a vacant entry for its dictionary slot otherwise.
+    pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
+        let dict_pos = hash as usize % self.dictionary.len();
+
+        let mut prev_pos = PrevPos::First(dict_pos as u32);
+        let mut pos = self.dictionary[dict_pos];
+        while pos != INVALID_POS {
+            let keylink = &self.keys[pos as usize];
+            let next = keylink.next;
+            let bucket_key = keylink.inner.as_ref().expect("entry is in use");
+            if *bucket_key == key {
+                // found existing entry
+                return Entry::Occupied(OccupiedEntry {
+                    map: self,
+                    _key: key,
+                    prev_pos,
+                    bucket_pos: pos,
+                });
+            }
+            prev_pos = PrevPos::Chained(pos);
+            pos = next;
+        }
+
+        // no existing entry
+        Entry::Vacant(VacantEntry {
+            map: self,
+            key,
+            dict_pos: dict_pos as u32,
+        })
+    }
+
+    /// Total number of buckets, i.e. the length of the key array.
+    pub fn get_num_buckets(&self) -> usize {
+        self.keys.len()
+    }
+
+	/// Whether an allocation limit is currently set (`alloc_limit` is not `INVALID_POS`).
+	pub fn is_shrinking(&self) -> bool {
+		self.alloc_limit != INVALID_POS
+	}
+
+	/// Clears all entries from the hashmap.
+	/// Does not reset any allocation limits, but does clear any entries beyond them.
+	pub fn clear(&mut self) {
+		// Drop all keys and chain every bucket into one list in index order.
+		let num_keys = self.keys.len();
+		for (i, keylink) in self.keys.iter_mut().enumerate() {
+			*keylink = LinkedKey {
+				next: if i + 1 < num_keys {
+					i as u32 + 1
+				} else {
+					INVALID_POS
+				},
+				inner: None,
+			};
+		}
+		for val in self.vals.iter_mut() {
+			*val = None;
+		}
+
+		self.dictionary.fill(INVALID_POS);
+
+		// The list built above starts at bucket 0, so the freelist has to start there
+		// too. Leaving the old head in place would lose every bucket in front of it, or
+		// make the emptied map report itself as full if the head was INVALID_POS.
+		self.free_head = 0;
+		self.buckets_in_use = 0;
+	}
+	
+    /// Get the occupied entry for bucket `pos`, or `None` if `pos` is out of range or
+    /// the bucket is unused. The entry's previous position in its chain is unknown.
+    pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
+        let key = self.keys.get(pos)?.inner.as_ref()?.clone();
+
+        Some(OccupiedEntry {
+            _key: key,
+            bucket_pos: pos as u32,
+            prev_pos: PrevPos::Unknown,
+            map: self,
+        })
+    }
+
+	/// Find the position of an unused bucket via the freelist and initialize it.
+	///
+	/// Buckets at or above `alloc_limit` are skipped. Returns the index of the bucket
+	/// that now holds `key` and `value`, or `FullError` if no usable bucket is free.
+	pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
+		// Find the first bucket we're *allowed* to use, remembering its predecessor
+		// in the freelist (`None` while we are still at the head).
+		let mut prev: Option<u32> = None;
+		let mut pos = self.free_head;
+		while pos != INVALID_POS && pos >= self.alloc_limit {
+			prev = Some(pos);
+			pos = self.keys[pos as usize].next;
+		}
+		if pos == INVALID_POS {
+			return Err(FullError());
+		}
+
+		// Repair the freelist by unlinking the chosen bucket.
+		let next_pos = self.keys[pos as usize].next;
+		match prev {
+			None => self.free_head = next_pos,
+			Some(p) => self.keys[p as usize].next = next_pos,
+		}
+
+		// Initialize the bucket.
+		self.buckets_in_use += 1;
+		let keylink = &mut self.keys[pos as usize];
+		keylink.next = INVALID_POS;
+		keylink.inner = Some(key);
+		self.vals[pos as usize] = Some(value);
+
+		Ok(pos)
+	}
+}
+
+	
--- a/libs/neon-shmem/src/hash/entry.rs
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -0,0 +1,107 @@
+//! Like std::collections::hash_map::Entry;
+
+use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
+
+use std::hash::Hash;
+use std::mem;
+
+/// Handle to the slot for one key in a `CoreHashMap`, modelled on
+/// `std::collections::hash_map::Entry`.
+pub enum Entry<'a, 'b, K, V> {
+    /// The key is present in the map.
+    Occupied(OccupiedEntry<'a, 'b, K, V>),
+    /// The key is absent; the handle can be used to insert it.
+    Vacant(VacantEntry<'a, 'b, K, V>),
+}
+
+/// Helper enum representing the previous position within a hashmap chain.
+///
+/// `OccupiedEntry::remove` uses it to decide which link has to be rewritten when a
+/// bucket is unlinked.
+#[derive(Clone, Copy)]
+pub(crate) enum PrevPos {
+	/// The entry is the first in its chain; holds the index of the dictionary slot
+	/// that points at it.
+    First(u32),
+	/// The entry follows another bucket; holds the index of that bucket.
+    Chained(u32),
+	/// Unknown - e.g. the associated entry was retrieved by index instead of chain.
+	/// Removing such an entry panics.
+	Unknown,
+}
+
+impl PrevPos {
+    /// Return the dictionary index carried by a `PrevPos::First`.
+    ///
+    /// # Panics
+    ///
+    /// Panics with "not first entry in chain" for any other variant.
+    pub fn unwrap_first(&self) -> u32 {
+        if let Self::First(dict_pos) = *self {
+            dict_pos
+        } else {
+            panic!("not first entry in chain")
+        }
+    }
+}
+
+/// Handle to a bucket that currently holds a key and a value.
+pub struct OccupiedEntry<'a, 'b, K, V> {
+	/// The map that owns the bucket; borrowed mutably for the lifetime of the entry.
+	pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
+	/// The key of the occupied entry
+    pub(crate) _key: K,
+	/// The index of the previous entry in the chain, or `PrevPos::Unknown` if the
+	/// entry was not found by walking a chain.
+    pub(crate) prev_pos: PrevPos,
+	/// The position of the bucket in the CoreHashMap's buckets array.
+    pub(crate) bucket_pos: u32, 
+}
+
+impl<'a, 'b, K, V> OccupiedEntry<'a, 'b, K, V> {
+    /// Borrow the value stored in this entry's bucket.
+    pub fn get(&self) -> &V {
+        let slot = &self.map.vals[self.bucket_pos as usize];
+        // An occupied entry always has a value in its slot.
+        slot.as_ref().unwrap()
+    }
+
+    /// Mutably borrow the value stored in this entry's bucket.
+    pub fn get_mut(&mut self) -> &mut V {
+        let slot = &mut self.map.vals[self.bucket_pos as usize];
+        slot.as_mut().unwrap()
+    }
+
+    /// Replace the stored value with `value` and return the previous one.
+    pub fn insert(&mut self, value: V) -> V {
+        mem::replace(self.get_mut(), value)
+    }
+
+    /// Remove the entry from the map and return its value.
+    ///
+    /// The bucket is unlinked from its chain and pushed onto the front of the freelist.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `prev_pos` is `PrevPos::Unknown` (for example, an entry obtained by
+    /// bucket index), because the chain cannot be relinked without the predecessor.
+    pub fn remove(self) -> V {
+        let pos = self.bucket_pos as usize;
+        let successor = self.map.keys[pos].next;
+
+        // Point whatever referenced this bucket at the bucket's successor.
+        match self.prev_pos {
+            PrevPos::First(dict_pos) => self.map.dictionary[dict_pos as usize] = successor,
+            PrevPos::Chained(prev_bucket) => self.map.keys[prev_bucket as usize].next = successor,
+            PrevPos::Unknown => panic!("can't safely remove entry with unknown previous entry"),
+        }
+
+        // Clear the bucket and make it the new head of the freelist.
+        let freed = &mut self.map.keys[pos];
+        freed.inner = None;
+        freed.next = self.map.free_head;
+        self.map.free_head = self.bucket_pos;
+        self.map.buckets_in_use -= 1;
+
+        self.map.vals[pos].take().unwrap()
+    }
+}
+
+/// Handle to a key that is not present in the map, ready to be inserted.
+pub struct VacantEntry<'a, 'b, K, V> {
+    /// The map the key would be inserted into.
+    pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
+    pub(crate) key: K, // The key to insert
+    /// Index of the dictionary slot whose chain the new bucket is linked into.
+    pub(crate) dict_pos: u32,
+}
+
+impl<'a, 'b, K: Clone + Hash + Eq, V> VacantEntry<'a, 'b, K, V> {
+    /// Insert `value` under this entry's key and return a mutable reference to it.
+    ///
+    /// A bucket is taken from the freelist and linked in at the front of the chain
+    /// for `dict_pos`. Returns `FullError` when no bucket can be allocated.
+    pub fn insert(self, value: V) -> Result<&'b mut V, FullError> {
+        let pos = self.map.alloc_bucket(self.key, value)?;
+        // `alloc_bucket` reports a missing bucket as `Err`, so an `Ok` position is
+        // always a real bucket; checked in debug builds only.
+        debug_assert_ne!(pos, INVALID_POS, "alloc_bucket returned an invalid position");
+
+        // Link the new bucket in front of the existing chain.
+        let chain_head = &mut self.map.dictionary[self.dict_pos as usize];
+        self.map.keys[pos as usize].next = *chain_head;
+        *chain_head = pos;
+
+        Ok(self.map.vals[pos as usize].as_mut().unwrap())
+    }
+}
--- a/libs/neon-shmem/src/hash/optim.rs
+++ b/libs/neon-shmem/src/hash/optim.rs
@@ -0,0 +1,85 @@
+//! Adapted from https://github.com/jsnell/parallel-xxhash (TODO: license?)
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+const PRIME32_1: u32 = 2654435761;
+const PRIME32_2: u32 = 2246822519;
+const PRIME32_3: u32 = 3266489917;
+const PRIME32_4: u32 =  668265263;
+const PRIME32_5: u32 =  374761393;
+
+/// Rotates each of the eight 32-bit lanes of `x` left by `R` bits.
+///
+/// `R` is meant to be in `0..32`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+fn mm256_rol32<const R: i32>(x: __m256i) -> __m256i {
+    // The immediate-shift intrinsics take their count as a const generic argument,
+    // and `32 - R` cannot be written there on stable Rust. Pass the counts in a
+    // vector register instead.
+    let left = _mm_cvtsi32_si128(R);
+    let right = _mm_cvtsi32_si128(32 - R);
+    _mm256_or_si256(_mm256_sll_epi32(x, left), _mm256_srl_epi32(x, right))
+}
+
+/// Applies the xor-shift / multiply finalization mix to each 32-bit lane of `h`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+fn mm256_fmix32(mut h: __m256i) -> __m256i {
+    // `_mm256_set1_epi32` takes an `i32`; the casts only reinterpret the bits of the
+    // `u32` constants.
+    h = _mm256_xor_si256(h, _mm256_srli_epi32::<15>(h));
+    h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_2 as i32));
+    h = _mm256_xor_si256(h, _mm256_srli_epi32::<13>(h));
+    h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_3 as i32));
+    h = _mm256_xor_si256(h, _mm256_srli_epi32::<16>(h));
+    h
+}
+
+/// One accumulator round, lane-wise: `rol13(seed + input * PRIME32_2) * PRIME32_1`.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+fn mm256_round(mut seed: __m256i, input: __m256i) -> __m256i {
+    // `_mm256_set1_epi32` takes an `i32`; the casts only reinterpret the bits of the
+    // `u32` constants.
+    seed = _mm256_add_epi32(
+        seed,
+        _mm256_mullo_epi32(input, _mm256_set1_epi32(PRIME32_2 as i32)),
+    );
+    seed = mm256_rol32::<13>(seed);
+    seed = _mm256_mullo_epi32(seed, _mm256_set1_epi32(PRIME32_1 as i32));
+    seed
+}
+
+/// Computes xxHash for 8 keys of size 4*N bytes in column-major order.
+///
+/// Word `i` of all eight keys is read from `keys[i * 8..i * 8 + 8]`, so lane `j` of
+/// every load belongs to key `j`. Element `j` of the result is the hash of key `j`.
+///
+/// # Safety
+///
+/// `keys` must point to at least `8 * N` readable `u32` values; no alignment is
+/// required.
+// NOTE(review): this dereferences a caller-supplied raw pointer, so it should
+// probably become an `unsafe fn` once callers exist.
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx2")]
+fn xxhash_many<const N: usize>(keys: *const u32, seed: u32) -> [u32; 8] {
+    // All arithmetic is modulo 2^32, hence the wrapping operations. The `as i32`
+    // casts only reinterpret bits for the intrinsics' signed parameters.
+    let mut h = _mm256_set1_epi32(seed.wrapping_add(PRIME32_5) as i32);
+    if N >= 4 {
+        let mut v1 =
+            _mm256_set1_epi32(seed.wrapping_add(PRIME32_1).wrapping_add(PRIME32_2) as i32);
+        let mut v2 = _mm256_set1_epi32(seed.wrapping_add(PRIME32_2) as i32);
+        let mut v3 = _mm256_set1_epi32(seed as i32);
+        let mut v4 = _mm256_set1_epi32(seed.wrapping_sub(PRIME32_1) as i32);
+        let mut i = 0;
+        while i < (N & !3) {
+            // SAFETY: `i + 3 < N`, so all four loads stay inside the `8 * N` words
+            // the caller is required to provide.
+            let (k1, k2, k3, k4) = unsafe {
+                (
+                    _mm256_loadu_si256(keys.add(i * 8).cast()),
+                    _mm256_loadu_si256(keys.add((i + 1) * 8).cast()),
+                    _mm256_loadu_si256(keys.add((i + 2) * 8).cast()),
+                    _mm256_loadu_si256(keys.add((i + 3) * 8).cast()),
+                )
+            };
+            v1 = mm256_round(v1, k1);
+            v2 = mm256_round(v2, k2);
+            v3 = mm256_round(v3, k3);
+            v4 = mm256_round(v4, k4);
+            i += 4;
+        }
+        h = _mm256_add_epi32(
+            _mm256_add_epi32(mm256_rol32::<1>(v1), mm256_rol32::<7>(v2)),
+            _mm256_add_epi32(mm256_rol32::<12>(v3), mm256_rol32::<18>(v4)),
+        );
+    }
+
+    // Unneeded, keeps bitwise parity with xxhash though.
+    h = _mm256_add_epi32(h, _mm256_set1_epi32((N * 4) as i32));
+
+    // Remaining `N % 4` words that did not fill a four-word stripe.
+    for i in (N & !3)..N {
+        // SAFETY: `i < N`, so the load stays inside the caller-provided words.
+        let v = unsafe { _mm256_loadu_si256(keys.add(i * 8).cast()) };
+        h = _mm256_add_epi32(
+            h,
+            _mm256_mullo_epi32(v, _mm256_set1_epi32(PRIME32_3 as i32)),
+        );
+        h = _mm256_mullo_epi32(
+            mm256_rol32::<17>(h),
+            _mm256_set1_epi32(PRIME32_4 as i32),
+        );
+    }
+
+    let mut res = [0u32; 8];
+    // SAFETY: `res` is exactly 32 bytes and the store has no alignment requirement.
+    unsafe { _mm256_storeu_si256(res.as_mut_ptr().cast(), mm256_fmix32(h)) };
+    res
+}
--- a/libs/neon-shmem/src/hash/tests.rs
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,382 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::{Debug, Formatter};
+use std::mem::uninitialized;
+use std::mem::MaybeUninit;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::Entry;
+use crate::shmem::ShmemHandle;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+/// Fixed-width key used by the hash map tests: 16 bytes, convertible to and from a
+/// big-endian `u128`.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl From<&TestKey> for u128 {
+    /// Interpret the key's bytes as a big-endian integer.
+    fn from(val: &TestKey) -> u128 {
+        let TestKey(raw) = *val;
+        u128::from_be_bytes(raw)
+    }
+}
+
+impl From<u128> for TestKey {
+    /// Store `val` as its big-endian byte representation.
+    fn from(val: u128) -> TestKey {
+        Self(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    /// Copy a slice of exactly `TEST_KEY_LEN` bytes; panics on any other length.
+    fn from(bytes: &'a [u8]) -> TestKey {
+        let raw: [u8; TEST_KEY_LEN] = bytes.try_into().unwrap();
+        TestKey(raw)
+    }
+}
+
+/// Insert every key with its slice index as the value, then check that each key
+/// reads back that index.
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts");
+    let mut w = init.attach_writer();
+
+    // Insert phase: overwrite when the key already exists, otherwise allocate.
+    for (idx, k) in keys.iter().enumerate() {
+        let key: TestKey = (*k).into();
+        let hash = w.get_hash_value(&key);
+        match w.entry_with_hash(key, hash) {
+            Entry::Occupied(mut e) => {
+                e.insert(idx);
+            }
+            Entry::Vacant(e) => assert!(e.insert(idx).is_ok()),
+        }
+    }
+
+    // Lookup phase.
+    for (idx, k) in keys.iter().enumerate() {
+        let key: TestKey = (*k).into();
+        let hash = w.get_hash_value(&key);
+        let found = w.get_with_hash(&key, hash);
+        assert_eq!(found.as_deref().copied(), Some(idx));
+    }
+}
+
+#[test]
+fn dense() {
+    // A handful of hand-picked keys first.
+    test_inserts::<u128>(&[0, 1, 2, 3, 256]);
+
+    // A dense run of consecutive keys, in ascending order.
+    let mut keys = Vec::from_iter(0..10000u128);
+    test_inserts(&keys);
+
+    // The same keys again, in nine different random orders.
+    let mut rng = rand::rng();
+    for _ in 0..9 {
+        keys.shuffle(&mut rng);
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // Collect 10000 distinct random 128-bit keys; duplicates are redrawn.
+    let mut used_keys = HashSet::new();
+    let mut keys: Vec<TestKey> = Vec::new();
+    while keys.len() < 10000 {
+        let candidate = rand::random::<u128>();
+        // `insert` returns false when the value was already present.
+        if used_keys.insert(candidate) {
+            keys.push(candidate.into());
+        }
+    }
+    test_inserts(&keys);
+}
+
+/// Test value wrapping an `AtomicUsize`.
+struct TestValue(AtomicUsize);
+
+impl TestValue {
+    /// Wrap `val` in a fresh atomic.
+    fn new(val: usize) -> TestValue {
+        Self(AtomicUsize::new(val))
+    }
+
+    /// Read the current value with relaxed ordering.
+    fn load(&self) -> usize {
+        let TestValue(inner) = self;
+        inner.load(Ordering::Relaxed)
+    }
+}
+
+impl Clone for TestValue {
+    /// Snapshot the current value into an independent atomic.
+    fn clone(&self) -> TestValue {
+        let snapshot = self.load();
+        TestValue::new(snapshot)
+    }
+}
+
+impl Debug for TestValue {
+    /// Print just the contained number.
+    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        let current = self.load();
+        fmt.write_fmt(format_args!("{:?}", current))
+    }
+}
+
+/// One operation on the key in field 0: `Some(v)` in field 1 stores `v` under the
+/// key, `None` removes the key.
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+/// Apply `op` to both the shadow `BTreeMap` and the hash map, and assert that both
+/// report the same previous value for the key.
+fn apply_op(
+    op: &TestOp,
+    map: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    let TestOp(key, new_value) = *op;
+
+    // Reference model first.
+    let shadow_existing = match new_value {
+        Some(v) => shadow.insert(key, v),
+        None => shadow.remove(&key),
+    };
+
+    // Then the map under test.
+    let hash = map.get_hash_value(&key);
+    let hash_existing = match (map.entry_with_hash(key, hash), new_value) {
+        (Entry::Occupied(mut e), Some(v)) => Some(e.insert(v)),
+        (Entry::Vacant(e), Some(v)) => {
+            e.insert(v).unwrap();
+            None
+        }
+        (Entry::Occupied(e), None) => Some(e.remove()),
+        (Entry::Vacant(_), None) => None,
+    };
+
+    assert_eq!(shadow_existing, hash_existing);
+}
+
+/// Run `num_ops` random operations on keys drawn from `0..size`, checking each one
+/// against `shadow`.
+///
+/// Despite its name, `del_prob` is the probability that an operation *stores* a
+/// value; the remaining operations remove the key.
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    del_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        // Draw the key before the coin flip so the RNG is consumed in a fixed order.
+        let raw = rng.next_u32() % size;
+        let key = TestKey::from(raw as u128);
+        let value = rng.random_bool(del_prob).then_some(i);
+        apply_op(&TestOp(key, value), writer, shadow);
+    }
+}
+
+/// Remove the `num_ops` smallest keys of `shadow` from both `shadow` and the map.
+///
+/// Panics if `shadow` runs out of keys first.
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    for _ in 0..num_ops {
+        let (key, _) = shadow.pop_first().unwrap();
+        let hash = writer.get_hash_value(&key);
+        writer.remove_with_hash(&key, hash);
+    }
+}
+
+/// Shrink the map to `to` buckets, removing the smallest shadow keys until at most
+/// `to` buckets are in use, and then finish the shrink.
+///
+/// `from` is not used by the body.
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    from: u32,
+    to: u32
+) {
+    writer.begin_shrink(to);
+    let target = to as usize;
+    loop {
+        if writer.get_num_buckets_in_use() <= target {
+            break;
+        }
+        let (key, _) = shadow.pop_first().unwrap();
+        let hash = writer.get_hash_value(&key);
+        match writer.entry_with_hash(key, hash) {
+            Entry::Occupied(e) => {
+                e.remove();
+            }
+            Entry::Vacant(_) => {}
+        }
+    }
+    writer.finish_shrink().unwrap();
+}
+
+/// 100000 operations on Zipf-distributed keys, 75% of them stores, each checked
+/// against a shadow `BTreeMap`.
+#[test]
+fn random_ops() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random");
+    let mut writer = init.attach_writer();
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        // Sample the key before the coin flip so the RNG is consumed in a fixed order.
+        let sampled = rng.sample(distribution) as u128;
+        let stored = rng.random_bool(0.75).then_some(i);
+        let op = TestOp(sampled.into(), stored);
+
+        apply_op(&op, &mut writer, &mut shadow);
+
+        if i % 1000 == 0 {
+            eprintln!("{i} ops processed");
+        }
+    }
+}
+
+
+/// The map must keep agreeing with the shadow map after a `shuffle()`.
+#[test]
+fn test_shuffle() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf");
+    let mut writer = init.attach_writer();
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.shuffle();
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Grow from 1000 to 1500 buckets between two rounds of random operations.
+#[test]
+fn test_grow() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow");
+    let mut writer = init.attach_writer();
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.grow(1500).unwrap();
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Shrink from 1500 to 1000 buckets, then keep operating on a smaller key range and
+/// check that the bucket count stays within the new size.
+#[test]
+fn test_shrink() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink");
+    let mut writer = init.attach_writer();
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    do_shrink(&mut writer, &mut shadow, 1500, 1000);
+    do_deletes(500, &mut writer, &mut shadow);
+    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+    assert!(writer.get_num_buckets_in_use() <= 1000);
+}
+
+/// Alternate shrinks and grows with random operations in between.
+#[test]
+fn test_shrink_grow_seq() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq");
+    let mut writer = init.attach_writer();
+    let mut shadow = BTreeMap::<TestKey, usize>::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Shrinking to 750");
+    do_shrink(&mut writer, &mut shadow, 1000, 750);
+    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Growing to 1500");
+    writer.grow(1500).unwrap();
+    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Shrinking to 200");
+    do_shrink(&mut writer, &mut shadow, 1500, 200);
+    do_deletes(100, &mut writer, &mut shadow);
+    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+
+    eprintln!("Growing to 10k");
+    writer.grow(10000).unwrap();
+    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+}
+
+/// Insert one key, then check that lookups by hash and by bucket index agree, and
+/// that removing the key makes it unreachable again.
+#[test]
+fn test_bucket_ops() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops");
+    let mut writer = init.attach_writer();
+
+    let key = TestKey::from(1u128);
+    let hash = writer.get_hash_value(&key);
+    match writer.entry_with_hash(key, hash) {
+        Entry::Occupied(mut e) => {
+            e.insert(2);
+        }
+        Entry::Vacant(e) => {
+            e.insert(2).unwrap();
+        }
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    assert_eq!(writer.get_with_hash(&key, hash), Some(&2));
+
+    match writer.entry_with_hash(key, hash) {
+        Entry::Vacant(_) => {
+            panic!("Insert didn't affect entry");
+        }
+        Entry::Occupied(e) => {
+            assert_eq!(e._key, key);
+            let pos = e.bucket_pos as usize;
+            assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, key);
+            assert_eq!(writer.get_at_bucket(pos), Some(&(key, 2)));
+        }
+    }
+
+    writer.remove_with_hash(&key, hash);
+    assert_eq!(writer.get_with_hash(&key, hash), None);
+}
+
+/// Shrink an empty map to zero buckets: inserts must fail until the map is grown.
+#[test]
+fn test_shrink_zero() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero");
+    let mut writer = init.attach_writer();
+
+    writer.begin_shrink(0);
+    for i in 0..1500 {
+        if let Some(entry) = writer.entry_at_bucket(i) {
+            entry.remove();
+        }
+    }
+    writer.finish_shrink().unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+
+    // With zero buckets there is nothing to allocate.
+    let key = TestKey::from(1u128);
+    let hash = writer.get_hash_value(&key);
+    match writer.entry_with_hash(key, hash) {
+        Entry::Vacant(v) => assert!(v.insert(2).is_err()),
+        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
+    }
+
+    // After growing, the same insert succeeds.
+    writer.grow(50).unwrap();
+    match writer.entry_with_hash(key, hash) {
+        Entry::Vacant(v) => assert!(v.insert(2).is_ok()),
+        Entry::Occupied(_) => panic!("Somehow got non-vacant entry in empty map."),
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+}
+
+/// Growing to 20000 buckets in a map created with 2000 as its second size argument
+/// is expected to panic.
+#[test]
+#[should_panic]
+fn test_grow_oom() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom");
+    let mut writer = init.attach_writer();
+    writer.grow(20000).unwrap();
+}
+
+/// "Shrinking" to more buckets than the map was created with is expected to panic.
+#[test]
+#[should_panic]
+fn test_shrink_bigger() {
+    let init = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger");
+    let mut writer = init.attach_writer();
+    writer.begin_shrink(2000);
+}
+
+/// Finishing a shrink that was never started is expected to panic.
+#[test]
+#[should_panic]
+fn test_shrink_early_finish() {
+    let init =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish");
+    let mut writer = init.attach_writer();
+    writer.finish_shrink().unwrap();
+}
+
+/// Starting a shrink on a map built over a fixed-size area is expected to panic.
+#[test]
+#[should_panic]
+fn test_shrink_fixed_size() {
+    let mut area = [MaybeUninit::uninit(); 10000];
+    let mut writer = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area).attach_writer();
+    writer.begin_shrink(1);
+}
+
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1,4 @@
 //! Shared memory utilities for neon communicator

-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {} too large", max_size);
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {}", i);
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod hash;
+pub mod shmem;
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,418 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with memfd_create(). The full address space for
+/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the ShmemHandle functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Error {
+        Error {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
+    ///
+    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    fn new_with_fd(
+        fd: OwnedFd,
+        initial_size: usize,
+        max_size: usize,
+    ) -> Result<ShmemHandle, Error> {
+        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        if max_size >= 1 << 48 {
+            panic!("max size {} too large", max_size);
+        }
+        if initial_size > max_size {
+            panic!("initial size {initial_size} larger than max size {max_size}");
+        }
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve space for the initial size *before* mapping, so that a failure here cannot
+        // leak a mapping: the only resource held so far is 'fd', which closes itself on drop.
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Reserve address space for the full 'max_size' with mmap.
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            })
+        };
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(ShmemHandle {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
+    /// when creating the area.
+    ///
+    /// This may only be called from one process/thread concurrently. We detect that case
+    /// and return an Error.
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        if new_size > self.max_size {
+            panic!(
+                "new size ({}) is greater than max size ({})",
+                new_size, self.max_size
+            );
+        }
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the RESIZE_IN_PROGRESS bit in 'current_size'
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                old_size | RESIZE_IN_PROGRESS,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock. 'old_size' is the size before the resize, without the flag bit.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
+                    Error::new("could not shrink shmem segment, ftruncate failed", e)
+                }),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock: storing a plain size (new on success, old on failure) clears the flag bit
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
+    /// responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
+/// anonymous in-memory file. On macOS, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
+            Error::new(
+                "could not grow shmem segment, posix_fallocate failed",
+                e,
+            )
+        })
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {}", i);
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                ptr.write(SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                })
+            }
+        }
+
+        pub fn wait(&self) {
+            // AcqRel/Acquire so plain writes made before the barrier are visible after it.
+            let old = self.count.fetch_add(1, Ordering::AcqRel);
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Acquire);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -8,6 +8,8 @@ pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
 pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
+// TODO: gRPC is disabled by default for now, but the port is used in neon_local.
+pub const DEFAULT_GRPC_LISTEN_PORT: u16 = 51051; // storage-broker already uses 50051

 use std::collections::HashMap;
 use std::num::{NonZeroU64, NonZeroUsize};
@@ -18,7 +20,6 @@ use postgres_backend::AuthType;
 use remote_storage::RemoteStorageConfig;
 use serde_with::serde_as;
 use utils::logging::LogFormat;
-use utils::postgres_client::PostgresClientProtocol;

 use crate::models::{ImageCompressionAlgorithm, LsnLease};

@@ -43,6 +44,21 @@ pub struct NodeMetadata {
    pub other: HashMap<String, serde_json::Value>,
 }

+/// PostHog integration config.
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct PostHogConfig {
+    /// PostHog project ID
+    pub project_id: String,
+    /// Server-side (private) API key
+    pub server_api_key: String,
+    /// Client-side (public) API key
+    pub client_api_key: String,
+    /// Private API URL
+    pub private_api_url: String,
+    /// Public API URL
+    pub public_api_url: String,
+}
+
 /// `pageserver.toml`
 ///
 /// We use serde derive with `#[serde(default)]` to generate a deserializer
@@ -104,6 +120,7 @@ pub struct ConfigToml {
    pub listen_pg_addr: String,
    pub listen_http_addr: String,
    pub listen_https_addr: Option<String>,
+    pub listen_grpc_addr: Option<String>,
    pub ssl_key_file: Utf8PathBuf,
    pub ssl_cert_file: Utf8PathBuf,
    #[serde(with = "humantime_serde")]
@@ -123,6 +140,7 @@ pub struct ConfigToml {
    pub http_auth_type: AuthType,
    #[serde_as(as = "serde_with::DisplayFromStr")]
    pub pg_auth_type: AuthType,
+    pub grpc_auth_type: AuthType,
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
    pub remote_storage: Option<RemoteStorageConfig>,
    pub tenant_config: TenantConfigToml,
@@ -162,6 +180,7 @@ pub struct ConfigToml {
    pub virtual_file_io_engine: Option<crate::models::virtual_file::IoEngineKind>,
    pub ingest_batch_size: u64,
    pub max_vectored_read_bytes: MaxVectoredReadBytes,
+    pub max_get_vectored_keys: MaxGetVectoredKeys,
    pub image_compression: ImageCompressionAlgorithm,
    pub timeline_offloading: bool,
    pub ephemeral_bytes_per_memory_kb: usize,
@@ -169,7 +188,6 @@ pub struct ConfigToml {
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub no_sync: Option<bool>,
-    pub wal_receiver_protocol: PostgresClientProtocol,
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
    pub enable_read_path_debugging: Option<bool>,
@@ -182,6 +200,8 @@ pub struct ConfigToml {
    pub tracing: Option<Tracing>,
    pub enable_tls_page_service_api: bool,
    pub dev_mode: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub posthog_config: Option<PostHogConfig>,
    pub timeline_import_config: TimelineImportConfig,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub basebackup_cache_config: Option<BasebackupCacheConfig>,
@@ -208,7 +228,7 @@ pub enum PageServicePipeliningConfig {
 }
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub struct PageServicePipeliningConfigPipelined {
-    /// Causes runtime errors if larger than max get_vectored batch size.
+    /// Config parsing and validation fail if this is larger than `max_get_vectored_keys`.
    pub max_batch_size: NonZeroUsize,
    pub execution: PageServiceProtocolPipelinedExecutionStrategy,
    // The default below is such that new versions of the software can start
@@ -308,6 +328,8 @@ pub struct TimelineImportConfig {
    pub import_job_concurrency: NonZeroUsize,
    pub import_job_soft_size_limit: NonZeroUsize,
    pub import_job_checkpoint_threshold: NonZeroUsize,
+    /// Max size of the remote storage partial read done by any job
+    pub import_job_max_byte_range_size: NonZeroUsize,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -382,6 +404,16 @@ impl Default for EvictionOrder {
 #[serde(transparent)]
 pub struct MaxVectoredReadBytes(pub NonZeroUsize);

+#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(transparent)]
+pub struct MaxGetVectoredKeys(NonZeroUsize);
+
+impl MaxGetVectoredKeys {
+    pub fn get(&self) -> usize {
+        self.0.get()
+    }
+}
+
 /// Tenant-level configuration values, used for various purposes.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(default)]
@@ -493,8 +525,6 @@ pub struct TenantConfigToml {
    /// (either this flag or the pageserver-global one need to be set)
    pub timeline_offloading: bool,

-    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
-
    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
    pub rel_size_v2_enabled: bool,
@@ -566,6 +596,8 @@ pub mod defaults {
    /// That is, slightly above 128 kB.
    pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB

+    pub const DEFAULT_MAX_GET_VECTORED_KEYS: usize = 32;
+
    pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm =
        ImageCompressionAlgorithm::Zstd { level: Some(1) };

@@ -573,9 +605,6 @@ pub mod defaults {

    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

-    pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
-        utils::postgres_client::PostgresClientProtocol::Vanilla;
-
    pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
    pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
 }
@@ -588,6 +617,7 @@ impl Default for ConfigToml {
            listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
            listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
            listen_https_addr: (None),
+            listen_grpc_addr: None, // TODO: default to 127.0.0.1:51051
            ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
            ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
            ssl_cert_reload_period: Duration::from_secs(60),
@@ -604,6 +634,7 @@ impl Default for ConfigToml {
            pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
            http_auth_type: (AuthType::Trust),
            pg_auth_type: (AuthType::Trust),
+            grpc_auth_type: (AuthType::Trust),
            auth_validation_public_key_path: (None),
            remote_storage: None,
            broker_endpoint: (storage_broker::DEFAULT_ENDPOINT
@@ -662,6 +693,9 @@ impl Default for ConfigToml {
            max_vectored_read_bytes: (MaxVectoredReadBytes(
                NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(),
            )),
+            max_get_vectored_keys: (MaxGetVectoredKeys(
+                NonZeroUsize::new(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap(),
+            )),
            image_compression: (DEFAULT_IMAGE_COMPRESSION),
            timeline_offloading: true,
            ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
@@ -669,7 +703,6 @@ impl Default for ConfigToml {
            virtual_file_io_mode: None,
            tenant_config: TenantConfigToml::default(),
            no_sync: None,
-            wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
            page_service_pipelining: PageServicePipeliningConfig::Pipelined(
                PageServicePipeliningConfigPipelined {
                    max_batch_size: NonZeroUsize::new(32).unwrap(),
@@ -690,11 +723,13 @@ impl Default for ConfigToml {
            enable_tls_page_service_api: false,
            dev_mode: false,
            timeline_import_config: TimelineImportConfig {
-                import_job_concurrency: NonZeroUsize::new(128).unwrap(),
-                import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(),
-                import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(),
+                import_job_concurrency: NonZeroUsize::new(32).unwrap(),
+                import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(),
+                import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(),
+                import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(),
            },
            basebackup_cache_config: None,
+            posthog_config: None,
        }
    }
 }
@@ -812,7 +847,6 @@ impl Default for TenantConfigToml {
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
            timeline_offloading: true,
-            wal_receiver_protocol_override: None,
            rel_size_v2_enabled: false,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -20,7 +20,6 @@ use serde_with::serde_as;
 pub use utilization::PageserverUtilization;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::postgres_client::PostgresClientProtocol;
 use utils::{completion, serde_system_time};

 use crate::config::Ratio;
@@ -354,6 +353,9 @@ pub struct ShardImportProgressV1 {
    pub completed: usize,
    /// Hash of the plan
    pub import_plan_hash: u64,
+    /// Soft limit for the job size
+    /// This needs to remain constant throughout the import
+    pub job_soft_size_limit: usize,
 }

 impl ShardImportStatus {
@@ -402,6 +404,8 @@ pub enum TimelineCreateRequestMode {
        // using a flattened enum, so, it was an accepted field, and
        // we continue to accept it by having it here.
        pg_version: Option<u32>,
+        #[serde(default, skip_serializing_if = "std::ops::Not::not")]
+        read_only: bool,
    },
    ImportPgdata {
        import_pgdata: TimelineCreateRequestModeImportPgdata,
@@ -617,8 +621,6 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub timeline_offloading: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
-    pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
-    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub rel_size_v2_enabled: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_enabled: FieldPatch<bool>,
@@ -743,9 +745,6 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_offloading: Option<bool>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rel_size_v2_enabled: Option<bool>,

@@ -807,7 +806,6 @@ impl TenantConfig {
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
-            mut wal_receiver_protocol_override,
            mut rel_size_v2_enabled,
            mut gc_compaction_enabled,
            mut gc_compaction_verification,
@@ -900,9 +898,6 @@ impl TenantConfig {
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut lsn_lease_length_for_ts);
        patch.timeline_offloading.apply(&mut timeline_offloading);
-        patch
-            .wal_receiver_protocol_override
-            .apply(&mut wal_receiver_protocol_override);
        patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
        patch
            .gc_compaction_enabled
@@ -955,7 +950,6 @@ impl TenantConfig {
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
-            wal_receiver_protocol_override,
            rel_size_v2_enabled,
            gc_compaction_enabled,
            gc_compaction_verification,
@@ -1053,9 +1047,6 @@ impl TenantConfig {
            timeline_offloading: self
                .timeline_offloading
                .unwrap_or(global_conf.timeline_offloading),
-            wal_receiver_protocol_override: self
-                .wal_receiver_protocol_override
-                .or(global_conf.wal_receiver_protocol_override),
            rel_size_v2_enabled: self
                .rel_size_v2_enabled
                .unwrap_or(global_conf.rel_size_v2_enabled),
@@ -1929,7 +1920,7 @@ pub enum PagestreamFeMessage {
 }

 // Wrapped in libpq CopyData
-#[derive(strum_macros::EnumProperty)]
+#[derive(Debug, strum_macros::EnumProperty)]
 pub enum PagestreamBeMessage {
    Exists(PagestreamExistsResponse),
    Nblocks(PagestreamNblocksResponse),
@@ -2040,7 +2031,7 @@ pub enum PagestreamProtocolVersion {

 pub type RequestId = u64;

-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
 pub struct PagestreamRequest {
    pub reqid: RequestId,
    pub request_lsn: Lsn,
@@ -2059,7 +2050,7 @@ pub struct PagestreamNblocksRequest {
    pub rel: RelTag,
 }

-#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)]
 pub struct PagestreamGetPageRequest {
    pub hdr: PagestreamRequest,
    pub rel: RelTag,
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize};
 // FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
 // Then we could replace the custom Ord and PartialOrd implementations below with
 // deriving them. This will require changes in walredoproc.c.
-#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
+#[derive(Debug, Default, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
 pub struct RelTag {
    pub forknum: u8,
    pub spcnode: Oid,
@@ -184,12 +184,12 @@ pub enum SlruKind {
    MultiXactOffsets,
 }

-impl SlruKind {
-    pub fn to_str(&self) -> &'static str {
+impl fmt::Display for SlruKind {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
-            Self::Clog => "pg_xact",
-            Self::MultiXactMembers => "pg_multixact/members",
-            Self::MultiXactOffsets => "pg_multixact/offsets",
+            Self::Clog => write!(f, "pg_xact"),
+            Self::MultiXactMembers => write!(f, "pg_multixact/members"),
+            Self::MultiXactOffsets => write!(f, "pg_multixact/offsets"),
        }
    }
 }
--- a/libs/posthog_client_lite/Cargo.toml
+++ b/libs/posthog_client_lite/Cargo.toml
@@ -6,9 +6,14 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 reqwest.workspace = true
-serde.workspace = true
 serde_json.workspace = true
+serde.workspace = true
 sha2.workspace = true
-workspace_hack.workspace = true
 thiserror.workspace = true
+tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
+tokio-util.workspace = true
+tracing-utils.workspace = true
+tracing.workspace = true
+workspace_hack.workspace = true
--- a/libs/posthog_client_lite/src/background_loop.rs
+++ b/libs/posthog_client_lite/src/background_loop.rs
@@ -0,0 +1,87 @@
+//! A background loop that fetches feature flags from PostHog and updates the feature store.
+
+use std::{sync::Arc, time::Duration};
+
+use arc_swap::ArcSwap;
+use tokio_util::sync::CancellationToken;
+use tracing::{Instrument, info_span};
+
+use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};
+
+/// A background loop that fetches feature flags from PostHog and updates the feature store.
+pub struct FeatureResolverBackgroundLoop {
+    posthog_client: PostHogClient,
+    feature_store: ArcSwap<FeatureStore>,
+    cancel: CancellationToken,
+}
+
+impl FeatureResolverBackgroundLoop {
+    pub fn new(config: PostHogClientConfig, shutdown_pageserver: CancellationToken) -> Self {
+        Self {
+            posthog_client: PostHogClient::new(config),
+            feature_store: ArcSwap::new(Arc::new(FeatureStore::new())),
+            cancel: shutdown_pageserver,
+        }
+    }
+
+    pub fn spawn(
+        self: Arc<Self>,
+        handle: &tokio::runtime::Handle,
+        refresh_period: Duration,
+        fake_tenants: Vec<CaptureEvent>,
+    ) {
+        let this = self.clone();
+        let cancel = self.cancel.clone();
+
+        // Main loop of updating the feature flags.
+        handle.spawn(
+            async move {
+                tracing::info!("Starting PostHog feature resolver");
+                let mut ticker = tokio::time::interval(refresh_period);
+                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
+                loop {
+                    tokio::select! {
+                        _ = ticker.tick() => {}
+                        _ = cancel.cancelled() => break
+                    }
+                    let resp = match this
+                        .posthog_client
+                        .get_feature_flags_local_evaluation()
+                        .await
+                    {
+                        Ok(resp) => resp,
+                        Err(e) => {
+                            tracing::warn!("Cannot get feature flags: {}", e);
+                            continue;
+                        }
+                    };
+                    let feature_store = FeatureStore::new_with_flags(resp.flags);
+                    this.feature_store.store(Arc::new(feature_store));
+                    tracing::info!("Feature flag updated");
+                }
+                tracing::info!("PostHog feature resolver stopped");
+            }
+            .instrument(info_span!("posthog_feature_resolver")),
+        );
+
+        // Report fake tenants to PostHog so that we have the combination of all the properties in the UI.
+        // Do one report per pageserver restart.
+        let this = self.clone();
+        handle.spawn(
+            async move {
+                tracing::info!("Starting PostHog feature reporter");
+                for tenant in &fake_tenants {
+                    tracing::info!("Reporting fake tenant: {:?}", tenant);
+                }
+                if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await {
+                    tracing::warn!("Cannot report fake tenants: {}", e);
+                }
+            }
+            .instrument(info_span!("posthog_feature_reporter")),
+        );
+    }
+
+    pub fn feature_store(&self) -> Arc<FeatureStore> {
+        self.feature_store.load_full()
+    }
+}
--- a/libs/posthog_client_lite/src/lib.rs
+++ b/libs/posthog_client_lite/src/lib.rs
@@ -1,5 +1,9 @@
 //! A lite version of the PostHog client that only supports local evaluation of feature flags.

+mod background_loop;
+
+pub use background_loop::FeatureResolverBackgroundLoop;
+
 use std::collections::HashMap;

 use serde::{Deserialize, Serialize};
@@ -18,10 +22,19 @@ pub enum PostHogEvaluationError {
    Internal(String),
 }

+impl PostHogEvaluationError {
+    pub fn as_variant_str(&self) -> &'static str {
+        match self {
+            PostHogEvaluationError::NotAvailable(_) => "not_available",
+            PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched",
+            PostHogEvaluationError::Internal(_) => "internal",
+        }
+    }
+}
+
 #[derive(Deserialize)]
 pub struct LocalEvaluationResponse {
-    #[allow(dead_code)]
-    flags: Vec<LocalEvaluationFlag>,
+    pub flags: Vec<LocalEvaluationFlag>,
 }

 #[derive(Deserialize)]
@@ -34,7 +47,7 @@ pub struct LocalEvaluationFlag {
 #[derive(Deserialize)]
 pub struct LocalEvaluationFlagFilters {
    groups: Vec<LocalEvaluationFlagFilterGroup>,
-    multivariate: LocalEvaluationFlagMultivariate,
+    multivariate: Option<LocalEvaluationFlagMultivariate>,
 }

 #[derive(Deserialize)]
@@ -51,7 +64,7 @@ pub struct LocalEvaluationFlagFilterProperty {
    operator: String,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Clone)]
 #[serde(untagged)]
 pub enum PostHogFlagFilterPropertyValue {
    String(String),
@@ -94,6 +107,12 @@ impl FeatureStore {
        }
    }

+    pub fn new_with_flags(flags: Vec<LocalEvaluationFlag>) -> Self {
+        let mut store = Self::new();
+        store.set_flags(flags);
+        store
+    }
+
    pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) {
        self.flags.clear();
        for flag in flags {
@@ -245,7 +264,7 @@ impl FeatureStore {
        }
    }

-    /// Evaluate a multivariate feature flag. Returns `None` if the flag is not available or if there are errors
+    /// Evaluate a multivariate feature flag. Returns an error if the flag is not available or if there are errors
    /// during the evaluation.
    ///
    /// The parsing logic is as follows:
@@ -263,10 +282,15 @@ impl FeatureStore {
    /// Example: we have a multivariate flag with 3 groups of the configured global rollout percentage: A (10%), B (20%), C (70%).
    /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override.
    /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C.
+    ///
+    /// Error handling: the caller should inspect the error and decide the behavior when a feature flag
+    /// cannot be evaluated (e.g., default to false if it cannot be resolved). The error should *not* be
+    /// propagated beyond where the feature flag gets resolved.
    pub fn evaluate_multivariate(
        &self,
        flag_key: &str,
        user_id: &str,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
    ) -> Result<String, PostHogEvaluationError> {
        let hash_on_global_rollout_percentage =
            Self::consistent_hash(user_id, flag_key, "multivariate");
@@ -276,10 +300,39 @@ impl FeatureStore {
            flag_key,
            hash_on_global_rollout_percentage,
            hash_on_group_rollout_percentage,
-            &HashMap::new(),
+            properties,
        )
    }

+    /// Evaluate a boolean feature flag. Returns an error if the flag is not available or if there are errors
+    /// during the evaluation.
+    ///
+    /// The parsing logic is as follows:
+    ///
+    /// * Generate a consistent hash for the tenant-feature.
+    /// * Match each filter group.
+    ///   - If a group is matched, it will first determine whether the user is in the range of the rollout
+    ///     percentage.
+    ///   - If the hash falls within the group's rollout percentage, return true.
+    /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the
+    ///   rollout percentage.
+    /// * If there are no matching groups, return an error.
+    ///
+    /// Returns `Ok(())` if the feature flag evaluates to true. In the future, it will return a payload.
+    ///
+    /// Error handling: the caller should inspect the error and decide the behavior when a feature flag
+    /// cannot be evaluated (e.g., default to false if it cannot be resolved). The error should *not* be
+    /// propagated beyond where the feature flag gets resolved.
+    pub fn evaluate_boolean(
+        &self,
+        flag_key: &str,
+        user_id: &str,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> Result<(), PostHogEvaluationError> {
+        let hash_on_global_rollout_percentage = Self::consistent_hash(user_id, flag_key, "boolean");
+        self.evaluate_boolean_inner(flag_key, hash_on_global_rollout_percentage, properties)
+    }
+
    /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID
    /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests
    /// and avoid duplicate computations.
@@ -306,6 +359,11 @@ impl FeatureStore {
                    flag_key
                )));
            }
+            let Some(ref multivariate) = flag_config.filters.multivariate else {
+                return Err(PostHogEvaluationError::Internal(format!(
+                    "No multivariate available, should use evaluate_boolean?: {flag_key}"
+                )));
+            };
            // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog
            // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it
            // does not matter.
@@ -314,7 +372,7 @@ impl FeatureStore {
                    GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant),
                    GroupEvaluationResult::MatchedAndEvaluate => {
                        let mut percentage = 0;
-                        for variant in &flag_config.filters.multivariate.variants {
+                        for variant in &multivariate.variants {
                            percentage += variant.rollout_percentage;
                            if self
                                .evaluate_percentage(hash_on_global_rollout_percentage, percentage)
@@ -342,6 +400,89 @@ impl FeatureStore {
            )))
        }
    }
+
+    /// Evaluate a boolean feature flag. Note that we directly take the mapped user ID
+    /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests
+    /// and avoid duplicate computations.
+    ///
+    /// Unlike the multivariate evaluation, a boolean flag has no variants, so only a single
+    /// hash is needed: the value passed in here is the one handed to the group evaluation
+    /// and checked against each group's rollout percentage. The flag evaluates to true
+    /// (`Ok(())`) as soon as one group matches.
+    ///
+    /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two
+    /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users
+    /// will be evaluated (versus 30% if group evaluation is done independently).
+    pub(crate) fn evaluate_boolean_inner(
+        &self,
+        flag_key: &str,
+        hash_on_global_rollout_percentage: f64,
+        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+    ) -> Result<(), PostHogEvaluationError> {
+        if let Some(flag_config) = self.flags.get(flag_key) {
+            if !flag_config.active {
+                return Err(PostHogEvaluationError::NotAvailable(format!(
+                    "The feature flag is not active: {}",
+                    flag_key
+                )));
+            }
+            if flag_config.filters.multivariate.is_some() {
+                return Err(PostHogEvaluationError::Internal(format!(
+                    "This looks like a multivariate flag, should use evaluate_multivariate?: {flag_key}"
+                )));
+            };
+            // Groups are evaluated in the order they appear in the spec and the first group that
+            // matches wins. Boolean flags cannot carry variant overrides, so a matched group with
+            // an override is reported as an internal error below.
+            for group in &flag_config.filters.groups {
+                match self.evaluate_group(group, hash_on_global_rollout_percentage, properties)? {
+                    GroupEvaluationResult::MatchedAndOverride(_) => {
+                        return Err(PostHogEvaluationError::Internal(format!(
+                            "Boolean flag cannot have overrides: {}",
+                            flag_key
+                        )));
+                    }
+                    GroupEvaluationResult::MatchedAndEvaluate => {
+                        return Ok(());
+                    }
+                    GroupEvaluationResult::Unmatched => continue,
+                }
+            }
+            // If no group is matched, the feature is not available, and up to the caller to decide what to do.
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        } else {
+            // The feature flag is not available yet
+            Err(PostHogEvaluationError::NotAvailable(format!(
+                "Not found in the local evaluation spec: {}",
+                flag_key
+            )))
+        }
+    }
+
+    /// Infer whether a feature flag is a boolean flag by checking if it has a multivariate filter.
+    pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
+        if let Some(flag_config) = self.flags.get(flag_key) {
+            Ok(flag_config.filters.multivariate.is_none())
+        } else {
+            Err(PostHogEvaluationError::NotAvailable(format!(
+                "Not found in the local evaluation spec: {}",
+                flag_key
+            )))
+        }
+    }
+}
+
+pub struct PostHogClientConfig {
+    /// The server API key.
+    pub server_api_key: String,
+    /// The client API key.
+    pub client_api_key: String,
+    /// The project ID.
+    pub project_id: String,
+    /// The private API URL.
+    pub private_api_url: String,
+    /// The public API URL.
+    pub public_api_url: String,
 }

 /// A lite PostHog client.
@@ -360,37 +501,23 @@ impl FeatureStore {
 /// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as an UI to
 /// configure feature flags so it is very likely that the client API will not be used.
 pub struct PostHogClient {
-    /// The server API key.
-    server_api_key: String,
-    /// The client API key.
-    client_api_key: String,
-    /// The project ID.
-    project_id: String,
-    /// The private API URL.
-    private_api_url: String,
-    /// The public API URL.
-    public_api_url: String,
+    /// The config.
+    config: PostHogClientConfig,
    /// The HTTP client.
    client: reqwest::Client,
 }

+#[derive(Serialize, Debug)]
+pub struct CaptureEvent {
+    pub event: String,
+    pub distinct_id: String,
+    pub properties: serde_json::Value,
+}
+
 impl PostHogClient {
-    pub fn new(
-        server_api_key: String,
-        client_api_key: String,
-        project_id: String,
-        private_api_url: String,
-        public_api_url: String,
-    ) -> Self {
+    pub fn new(config: PostHogClientConfig) -> Self {
        let client = reqwest::Client::new();
-        Self {
-            server_api_key,
-            client_api_key,
-            project_id,
-            private_api_url,
-            public_api_url,
-            client,
-        }
+        Self { config, client }
    }

    pub fn new_with_us_region(
@@ -398,13 +525,13 @@ impl PostHogClient {
        client_api_key: String,
        project_id: String,
    ) -> Self {
-        Self::new(
+        Self::new(PostHogClientConfig {
            server_api_key,
            client_api_key,
            project_id,
-            "https://us.posthog.com".to_string(),
-            "https://us.i.posthog.com".to_string(),
-        )
+            private_api_url: "https://us.posthog.com".to_string(),
+            public_api_url: "https://us.i.posthog.com".to_string(),
+        })
    }

    /// Fetch the feature flag specs from the server.
@@ -422,15 +549,23 @@ impl PostHogClient {
        // with bearer token of self.server_api_key
        let url = format!(
            "{}/api/projects/{}/feature_flags/local_evaluation",
-            self.private_api_url, self.project_id
+            self.config.private_api_url, self.config.project_id
        );
        let response = self
            .client
            .get(url)
-            .bearer_auth(&self.server_api_key)
+            .bearer_auth(&self.config.server_api_key)
            .send()
            .await?;
+        let status = response.status();
        let body = response.text().await?;
+        if !status.is_success() {
+            return Err(anyhow::anyhow!(
+                "Failed to get feature flags: {}, {}",
+                status,
+                body
+            ));
+        }
        Ok(serde_json::from_str(&body)?)
    }

@@ -442,21 +577,54 @@ impl PostHogClient {
        &self,
        event: &str,
        distinct_id: &str,
-        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
+        properties: &serde_json::Value,
    ) -> anyhow::Result<()> {
        // PUBLIC_URL/capture/
-        // with bearer token of self.client_api_key
-        let url = format!("{}/capture/", self.public_api_url);
-        self.client
+        let url = format!("{}/capture/", self.config.public_api_url);
+        let response = self
+            .client
            .post(url)
            .body(serde_json::to_string(&json!({
-                "api_key": self.client_api_key,
+                "api_key": self.config.client_api_key,
                "distinct_id": distinct_id,
                "event": event,
                "properties": properties,
            }))?)
            .send()
            .await?;
+        let status = response.status();
+        let body = response.text().await?;
+        if !status.is_success() {
+            return Err(anyhow::anyhow!(
+                "Failed to capture events: {}, {}",
+                status,
+                body
+            ));
+        }
+        Ok(())
+    }
+
+    pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> {
+        // PUBLIC_URL/batch/
+        let url = format!("{}/batch/", self.config.public_api_url);
+        let response = self
+            .client
+            .post(url)
+            .body(serde_json::to_string(&json!({
+                "api_key": self.config.client_api_key,
+                "batch": events,
+            }))?)
+            .send()
+            .await?;
+        let status = response.status();
+        let body = response.text().await?;
+        if !status.is_success() {
+            return Err(anyhow::anyhow!(
+                "Failed to capture events: {}, {}",
+                status,
+                body
+            ));
+        }
        Ok(())
    }
 }
@@ -467,95 +635,162 @@ mod tests {

    fn data() -> &'static str {
        r#"{
-            "flags": [
-                {
-                    "id": 132794,
-                    "team_id": 152860,
-                    "name": "",
-                    "key": "gc-compaction",
-                    "filters": {
-                        "groups": [
-                            {
-                                "variant": "enabled-stage-2",
-                                "properties": [
-                                    {
-                                        "key": "plan_type",
-                                        "type": "person",
-                                        "value": [
-                                            "free"
-                                        ],
-                                        "operator": "exact"
-                                    },
-                                    {
-                                        "key": "pageserver_remote_size",
-                                        "type": "person",
-                                        "value": "10000000",
-                                        "operator": "lt"
-                                    }
-                                ],
-                                "rollout_percentage": 50
-                            },
-                            {
-                                "properties": [
-                                    {
-                                        "key": "plan_type",
-                                        "type": "person",
-                                        "value": [
-                                            "free"
-                                        ],
-                                        "operator": "exact"
-                                    },
-                                    {
-                                        "key": "pageserver_remote_size",
-                                        "type": "person",
-                                        "value": "10000000",
-                                        "operator": "lt"
-                                    }
-                                ],
-                                "rollout_percentage": 80
-                            }
-                        ],
-                        "payloads": {},
-                        "multivariate": {
-                            "variants": [
-                                {
-                                    "key": "disabled",
-                                    "name": "",
-                                    "rollout_percentage": 90
-                                },
-                                {
-                                    "key": "enabled-stage-1",
-                                    "name": "",
-                                    "rollout_percentage": 10
-                                },
-                                {
-                                    "key": "enabled-stage-2",
-                                    "name": "",
-                                    "rollout_percentage": 0
-                                },
-                                {
-                                    "key": "enabled-stage-3",
-                                    "name": "",
-                                    "rollout_percentage": 0
-                                },
-                                {
-                                    "key": "enabled",
-                                    "name": "",
-                                    "rollout_percentage": 0
-                                }
-                            ]
-                        }
-                    },
-                    "deleted": false,
-                    "active": true,
-                    "ensure_experience_continuity": false,
-                    "has_encrypted_payloads": false,
-                    "version": 6
-                }
+  "flags": [
+    {
+      "id": 141807,
+      "team_id": 152860,
+      "name": "",
+      "key": "image-compaction-boundary",
+      "filters": {
+        "groups": [
+          {
+            "variant": null,
+            "properties": [
+              {
+                "key": "plan_type",
+                "type": "person",
+                "value": [
+                  "free"
+                ],
+                "operator": "exact"
+              }
            ],
-            "group_type_mapping": {},
-            "cohorts": {}
-        }"#
+            "rollout_percentage": 40
+          },
+          {
+            "variant": null,
+            "properties": [],
+            "rollout_percentage": 10
+          }
+        ],
+        "payloads": {},
+        "multivariate": null
+      },
+      "deleted": false,
+      "active": true,
+      "ensure_experience_continuity": false,
+      "has_encrypted_payloads": false,
+      "version": 1
+    },
+    {
+      "id": 135586,
+      "team_id": 152860,
+      "name": "",
+      "key": "boolean-flag",
+      "filters": {
+        "groups": [
+          {
+            "variant": null,
+            "properties": [
+              {
+                "key": "plan_type",
+                "type": "person",
+                "value": [
+                  "free"
+                ],
+                "operator": "exact"
+              }
+            ],
+            "rollout_percentage": 47
+          }
+        ],
+        "payloads": {},
+        "multivariate": null
+      },
+      "deleted": false,
+      "active": true,
+      "ensure_experience_continuity": false,
+      "has_encrypted_payloads": false,
+      "version": 1
+    },
+    {
+      "id": 132794,
+      "team_id": 152860,
+      "name": "",
+      "key": "gc-compaction",
+      "filters": {
+        "groups": [
+          {
+            "variant": "enabled-stage-2",
+            "properties": [
+              {
+                "key": "plan_type",
+                "type": "person",
+                "value": [
+                  "free"
+                ],
+                "operator": "exact"
+              },
+              {
+                "key": "pageserver_remote_size",
+                "type": "person",
+                "value": "10000000",
+                "operator": "lt"
+              }
+            ],
+             "rollout_percentage": 50
+          },
+          {
+            "properties": [
+              {
+                "key": "plan_type",
+                "type": "person",
+                "value": [
+                  "free"
+                ],
+                "operator": "exact"
+              },
+              {
+                "key": "pageserver_remote_size",
+                "type": "person",
+                "value": "10000000",
+                "operator": "lt"
+              }
+            ],
+            "rollout_percentage": 80
+          }
+        ],
+        "payloads": {},
+        "multivariate": {
+          "variants": [
+            {
+              "key": "disabled",
+              "name": "",
+              "rollout_percentage": 90
+            },
+            {
+              "key": "enabled-stage-1",
+              "name": "",
+              "rollout_percentage": 10
+            },
+            {
+              "key": "enabled-stage-2",
+              "name": "",
+              "rollout_percentage": 0
+            },
+            {
+              "key": "enabled-stage-3",
+              "name": "",
+              "rollout_percentage": 0
+            },
+            {
+              "key": "enabled",
+              "name": "",
+              "rollout_percentage": 0
+            }
+          ]
+        }
+      },
+      "deleted": false,
+      "active": true,
+      "ensure_experience_continuity": false,
+      "has_encrypted_payloads": false,
+      "version": 7
+    }
+  ],
+  "group_type_mapping": {},
+  "cohorts": {}
+}"#
    }

    #[test]
@@ -631,4 +866,125 @@ mod tests {
            Err(PostHogEvaluationError::NoConditionGroupMatched)
        ),);
    }
+
+    #[test]
+    fn evaluate_boolean_1() {
+        // The `boolean-flag` feature flag only has one group that matches on the free user.
+
+        let mut store = FeatureStore::new();
+        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
+        store.set_flags(response.flags);
+
+        // This lacks the required properties and cannot be evaluated.
+        let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &HashMap::new());
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NotAvailable(_))
+        ),);
+
+        let properties_unmatched = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("paid".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // This does not match any group so there will be an error.
+        let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &properties_unmatched);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+
+        let properties = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("free".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // It matches the only group as 0.10 <= 0.47 and the properties are matched, so the flag evaluates to true.
+        let variant = store.evaluate_boolean_inner("boolean-flag", 0.10, &properties);
+        assert!(variant.is_ok());
+
+        // It matches the group conditions but not the group rollout percentage.
+        let variant = store.evaluate_boolean_inner("boolean-flag", 1.00, &properties);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+    }
+
+    #[test]
+    fn evaluate_boolean_2() {
+        // The `image-compaction-boundary` feature flag has one group that matches on the free user and a group that matches on all users.
+
+        let mut store = FeatureStore::new();
+        let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap();
+        store.set_flags(response.flags);
+
+        // This lacks the required properties and cannot be evaluated.
+        let variant =
+            store.evaluate_boolean_inner("image-compaction-boundary", 1.00, &HashMap::new());
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NotAvailable(_))
+        ),);
+
+        let properties_unmatched = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("paid".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // The properties only match the all-users group, so the result depends on that group's 10% rollout.
+        let variant =
+            store.evaluate_boolean_inner("image-compaction-boundary", 1.00, &properties_unmatched);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+        let variant =
+            store.evaluate_boolean_inner("image-compaction-boundary", 0.05, &properties_unmatched);
+        assert!(variant.is_ok());
+
+        let properties = HashMap::from([
+            (
+                "plan_type".to_string(),
+                PostHogFlagFilterPropertyValue::String("free".to_string()),
+            ),
+            (
+                "pageserver_remote_size".to_string(),
+                PostHogFlagFilterPropertyValue::Number(1000.0),
+            ),
+        ]);
+
+        // It matches the first group as 0.30 <= 0.40 and the properties are matched, so the flag evaluates to true.
+        let variant = store.evaluate_boolean_inner("image-compaction-boundary", 0.30, &properties);
+        assert!(variant.is_ok());
+
+        // It matches the group conditions but not the group rollout percentage.
+        let variant = store.evaluate_boolean_inner("image-compaction-boundary", 1.00, &properties);
+        assert!(matches!(
+            variant,
+            Err(PostHogEvaluationError::NoConditionGroupMatched)
+        ),);
+
+        // 0.09 is within the rollout of both groups; the first (filtered) group already matches.
+        let variant = store.evaluate_boolean_inner("image-compaction-boundary", 0.09, &properties);
+        assert!(variant.is_ok());
+    }
 }
--- a/libs/proxy/postgres-protocol2/src/message/frontend.rs
+++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs
@@ -25,6 +25,7 @@ where
    Ok(())
 }

+#[derive(Debug)]
 pub enum BindError {
    Conversion(Box<dyn Error + marker::Sync + Send>),
    Serialization(io::Error),
@@ -288,6 +289,12 @@ pub fn sync(buf: &mut BytesMut) {
    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
 }

+#[inline]
+pub fn flush(buf: &mut BytesMut) {
+    buf.put_u8(b'H');
+    write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
+}
+
 #[inline]
 pub fn terminate(buf: &mut BytesMut) {
    buf.put_u8(b'X');
--- a/libs/proxy/postgres-types2/src/lib.rs
+++ b/libs/proxy/postgres-types2/src/lib.rs
@@ -9,7 +9,6 @@ use std::error::Error;
 use std::fmt;
 use std::sync::Arc;

-use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 #[doc(inline)]
 pub use postgres_protocol2::Oid;
@@ -27,41 +26,6 @@ macro_rules! accepts {
    )
 }

-/// Generates an implementation of `ToSql::to_sql_checked`.
-///
-/// All `ToSql` implementations should use this macro.
-macro_rules! to_sql_checked {
-    () => {
-        fn to_sql_checked(
-            &self,
-            ty: &$crate::Type,
-            out: &mut $crate::private::BytesMut,
-        ) -> ::std::result::Result<
-            $crate::IsNull,
-            Box<dyn ::std::error::Error + ::std::marker::Sync + ::std::marker::Send>,
-        > {
-            $crate::__to_sql_checked(self, ty, out)
-        }
-    };
-}
-
-// WARNING: this function is not considered part of this crate's public API.
-// It is subject to change at any time.
-#[doc(hidden)]
-pub fn __to_sql_checked<T>(
-    v: &T,
-    ty: &Type,
-    out: &mut BytesMut,
-) -> Result<IsNull, Box<dyn Error + Sync + Send>>
-where
-    T: ToSql,
-{
-    if !T::accepts(ty) {
-        return Err(Box::new(WrongType::new::<T>(ty.clone())));
-    }
-    v.to_sql(ty, out)
-}
-
 // mod pg_lsn;
 #[doc(hidden)]
 pub mod private;
@@ -142,7 +106,7 @@ pub enum Kind {
    /// An array type along with the type of its elements.
    Array(Type),
    /// A range type along with the type of its elements.
-    Range(Type),
+    Range(Oid),
    /// A multirange type along with the type of its elements.
    Multirange(Type),
    /// A domain type along with its underlying type.
@@ -377,43 +341,6 @@ pub enum IsNull {
    No,
 }

-/// A trait for types that can be converted into Postgres values.
-pub trait ToSql: fmt::Debug {
-    /// Converts the value of `self` into the binary format of the specified
-    /// Postgres `Type`, appending it to `out`.
-    ///
-    /// The caller of this method is responsible for ensuring that this type
-    /// is compatible with the Postgres `Type`.
-    ///
-    /// The return value indicates if this value should be represented as
-    /// `NULL`. If this is the case, implementations **must not** write
-    /// anything to `out`.
-    fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>>
-    where
-        Self: Sized;
-
-    /// Determines if a value of this type can be converted to the specified
-    /// Postgres `Type`.
-    fn accepts(ty: &Type) -> bool
-    where
-        Self: Sized;
-
-    /// An adaptor method used internally by Rust-Postgres.
-    ///
-    /// *All* implementations of this method should be generated by the
-    /// `to_sql_checked!()` macro.
-    fn to_sql_checked(
-        &self,
-        ty: &Type,
-        out: &mut BytesMut,
-    ) -> Result<IsNull, Box<dyn Error + Sync + Send>>;
-
-    /// Specify the encode format
-    fn encode_format(&self, _ty: &Type) -> Format {
-        Format::Binary
-    }
-}
-
 /// Supported Postgres message format types
 ///
 /// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8`
@@ -424,52 +351,3 @@ pub enum Format {
    /// Compact, typed binary format
    Binary,
 }
-
-impl ToSql for &str {
-    fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>> {
-        match *ty {
-            ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w),
-            ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w),
-            ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w),
-            _ => types::text_to_sql(self, w),
-        }
-        Ok(IsNull::No)
-    }
-
-    fn accepts(ty: &Type) -> bool {
-        match *ty {
-            Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true,
-            ref ty
-                if (ty.name() == "citext"
-                    || ty.name() == "ltree"
-                    || ty.name() == "lquery"
-                    || ty.name() == "ltxtquery") =>
-            {
-                true
-            }
-            _ => false,
-        }
-    }
-
-    to_sql_checked!();
-}
-
-macro_rules! simple_to {
-    ($t:ty, $f:ident, $($expected:ident),+) => {
-        impl ToSql for $t {
-            fn to_sql(&self,
-                      _: &Type,
-                      w: &mut BytesMut)
-                      -> Result<IsNull, Box<dyn Error + Sync + Send>> {
-                types::$f(*self, w);
-                Ok(IsNull::No)
-            }
-
-            accepts!($($expected),+);
-
-            to_sql_checked!();
-        }
-    }
-}
-
-simple_to!(u32, oid_to_sql, OID);
--- a/libs/proxy/postgres-types2/src/type_gen.rs
+++ b/libs/proxy/postgres-types2/src/type_gen.rs
@@ -393,7 +393,7 @@ impl Inner {
        }
    }

-    pub fn oid(&self) -> Oid {
+    pub const fn const_oid(&self) -> Oid {
        match *self {
            Inner::Bool => 16,
            Inner::Bytea => 17,
@@ -580,7 +580,14 @@ impl Inner {
            Inner::TstzmultiRangeArray => 6153,
            Inner::DatemultiRangeArray => 6155,
            Inner::Int8multiRangeArray => 6157,
+            Inner::Other(_) => u32::MAX,
+        }
+    }
+
+    pub fn oid(&self) -> Oid {
+        match *self {
            Inner::Other(ref u) => u.oid,
+            _ => self.const_oid(),
        }
    }

@@ -727,17 +734,17 @@ impl Inner {
            Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)),
            Inner::AnyRange => &Kind::Pseudo,
            Inner::EventTrigger => &Kind::Pseudo,
-            Inner::Int4Range => &Kind::Range(Type(Inner::Int4)),
+            Inner::Int4Range => &const { Kind::Range(Inner::Int4.const_oid()) },
            Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)),
-            Inner::NumRange => &Kind::Range(Type(Inner::Numeric)),
+            Inner::NumRange => &const { Kind::Range(Inner::Numeric.const_oid()) },
            Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)),
-            Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)),
+            Inner::TsRange => &const { Kind::Range(Inner::Timestamp.const_oid()) },
            Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)),
-            Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)),
+            Inner::TstzRange => &const { Kind::Range(Inner::Timestamptz.const_oid()) },
            Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)),
-            Inner::DateRange => &Kind::Range(Type(Inner::Date)),
+            Inner::DateRange => &const { Kind::Range(Inner::Date.const_oid()) },
            Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)),
-            Inner::Int8Range => &Kind::Range(Type(Inner::Int8)),
+            Inner::Int8Range => &const { Kind::Range(Inner::Int8.const_oid()) },
            Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)),
            Inner::Jsonpath => &Kind::Simple,
            Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)),
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -1,14 +1,12 @@
 use std::collections::HashMap;
 use std::fmt;
 use std::net::IpAddr;
-use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::time::Duration;

 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{TryStreamExt, future, ready};
-use parking_lot::Mutex;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use serde::{Deserialize, Serialize};
@@ -16,29 +14,52 @@ use tokio::sync::mpsc;

 use crate::codec::{BackendMessages, FrontendMessage};
 use crate::config::{Host, SslMode};
-use crate::connection::{Request, RequestMessages};
 use crate::query::RowStream;
 use crate::simple_query::SimpleQueryStream;
 use crate::types::{Oid, Type};
 use crate::{
-    CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Statement, Transaction,
-    TransactionBuilder, query, simple_query,
+    CancelToken, Error, ReadyForQueryStatus, SimpleQueryMessage, Transaction, TransactionBuilder,
+    query, simple_query,
 };

 pub struct Responses {
+    /// new messages from conn
    receiver: mpsc::Receiver<BackendMessages>,
+    /// current batch of messages
    cur: BackendMessages,
+    /// total number of queries sent.
+    waiting: usize,
+    /// number of ReadyForQuery messages received.
+    received: usize,
 }

 impl Responses {
    pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll<Result<Message, Error>> {
        loop {
-            match self.cur.next().map_err(Error::parse)? {
-                Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))),
-                Some(message) => return Poll::Ready(Ok(message)),
-                None => {}
+            // get the next saved message
+            if let Some(message) = self.cur.next().map_err(Error::parse)? {
+                let received = self.received;
+
+                // increase the query head if this is the last message.
+                if let Message::ReadyForQuery(_) = message {
+                    self.received += 1;
+                }
+
+                // check if the client has skipped this query.
+                if received + 1 < self.waiting {
+                    // grab the next message.
+                    continue;
+                }
+
+                // convenience: turn the error message into a proper error.
+                let res = match message {
+                    Message::ErrorResponse(body) => Err(Error::db(body)),
+                    message => Ok(message),
+                };
+                return Poll::Ready(res);
            }

+            // get the next batch of messages.
            match ready!(self.receiver.poll_recv(cx)) {
                Some(messages) => self.cur = messages,
                None => return Poll::Ready(Err(Error::closed())),
@@ -55,44 +76,87 @@ impl Responses {
 /// (corresponding to the queries in the [crate::prepare] module).
 #[derive(Default)]
 pub(crate) struct CachedTypeInfo {
-    /// A statement for basic information for a type from its
-    /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
-    /// fallback).
-    pub(crate) typeinfo: Option<Statement>,
-
    /// Cache of types already looked up.
    pub(crate) types: HashMap<Oid, Type>,
 }

 pub struct InnerClient {
-    sender: mpsc::UnboundedSender<Request>,
+    sender: mpsc::UnboundedSender<FrontendMessage>,
+    responses: Responses,

    /// A buffer to use when writing out postgres commands.
-    buffer: Mutex<BytesMut>,
+    buffer: BytesMut,
 }

 impl InnerClient {
-    pub fn send(&self, messages: RequestMessages) -> Result<Responses, Error> {
-        let (sender, receiver) = mpsc::channel(1);
-        let request = Request { messages, sender };
-        self.sender.send(request).map_err(|_| Error::closed())?;
-
-        Ok(Responses {
-            receiver,
-            cur: BackendMessages::empty(),
-        })
+    pub fn start(&mut self) -> Result<PartialQuery, Error> {
+        self.responses.waiting += 1;
+        Ok(PartialQuery(Some(self)))
    }

-    /// Call the given function with a buffer to be used when writing out
-    /// postgres commands.
-    pub fn with_buf<F, R>(&self, f: F) -> R
+    // pub fn send_with_sync<F>(&mut self, f: F) -> Result<&mut Responses, Error>
+    // where
+    //     F: FnOnce(&mut BytesMut) -> Result<(), Error>,
+    // {
+    //     self.start()?.send_with_sync(f)
+    // }
+
+    pub fn send_simple_query(&mut self, query: &str) -> Result<&mut Responses, Error> {
+        self.responses.waiting += 1;
+
+        self.buffer.clear();
+        // simple queries do not need sync.
+        frontend::query(query, &mut self.buffer).map_err(Error::encode)?;
+        let buf = self.buffer.split().freeze();
+        self.send_message(FrontendMessage::Raw(buf))
+    }
+
+    fn send_message(&mut self, messages: FrontendMessage) -> Result<&mut Responses, Error> {
+        self.sender.send(messages).map_err(|_| Error::closed())?;
+        Ok(&mut self.responses)
+    }
+}
+
+pub struct PartialQuery<'a>(Option<&'a mut InnerClient>);
+
+impl Drop for PartialQuery<'_> {
+    fn drop(&mut self) {
+        if let Some(client) = self.0.take() {
+            client.buffer.clear();
+            frontend::sync(&mut client.buffer);
+            let buf = client.buffer.split().freeze();
+            let _ = client.send_message(FrontendMessage::Raw(buf));
+        }
+    }
+}
+
+impl<'a> PartialQuery<'a> {
+    pub fn send_with_flush<F>(&mut self, f: F) -> Result<&mut Responses, Error>
    where
-        F: FnOnce(&mut BytesMut) -> R,
+        F: FnOnce(&mut BytesMut) -> Result<(), Error>,
    {
-        let mut buffer = self.buffer.lock();
-        let r = f(&mut buffer);
-        buffer.clear();
-        r
+        let client = self.0.as_deref_mut().unwrap();
+
+        client.buffer.clear();
+        f(&mut client.buffer)?;
+        frontend::flush(&mut client.buffer);
+        let buf = client.buffer.split().freeze();
+        client.send_message(FrontendMessage::Raw(buf))
+    }
+
+    pub fn send_with_sync<F>(mut self, f: F) -> Result<&'a mut Responses, Error>
+    where
+        F: FnOnce(&mut BytesMut) -> Result<(), Error>,
+    {
+        let client = self.0.as_deref_mut().unwrap();
+
+        client.buffer.clear();
+        f(&mut client.buffer)?;
+        frontend::sync(&mut client.buffer);
+        let buf = client.buffer.split().freeze();
+        let _ = client.send_message(FrontendMessage::Raw(buf));
+
+        Ok(&mut self.0.take().unwrap().responses)
    }
 }

@@ -109,7 +173,7 @@ pub struct SocketConfig {
 /// The client is one half of what is returned when a connection is established. Users interact with the database
 /// through this client object.
 pub struct Client {
-    inner: Arc<InnerClient>,
+    inner: InnerClient,
    cached_typeinfo: CachedTypeInfo,

    socket_config: SocketConfig,
@@ -120,17 +184,24 @@ pub struct Client {

 impl Client {
    pub(crate) fn new(
-        sender: mpsc::UnboundedSender<Request>,
+        sender: mpsc::UnboundedSender<FrontendMessage>,
+        receiver: mpsc::Receiver<BackendMessages>,
        socket_config: SocketConfig,
        ssl_mode: SslMode,
        process_id: i32,
        secret_key: i32,
    ) -> Client {
        Client {
-            inner: Arc::new(InnerClient {
+            inner: InnerClient {
                sender,
+                responses: Responses {
+                    receiver,
+                    cur: BackendMessages::empty(),
+                    waiting: 0,
+                    received: 0,
+                },
                buffer: Default::default(),
-            }),
+            },
            cached_typeinfo: Default::default(),

            socket_config,
@@ -145,19 +216,29 @@ impl Client {
        self.process_id
    }

-    pub(crate) fn inner(&self) -> &Arc<InnerClient> {
-        &self.inner
+    pub(crate) fn inner_mut(&mut self) -> &mut InnerClient {
+        &mut self.inner
    }

    /// Pass text directly to the Postgres backend to allow it to sort out typing itself and
    /// to save a roundtrip
-    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    pub async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
        I::IntoIter: ExactSizeIterator,
    {
-        query::query_txt(&self.inner, statement, params).await
+        query::query_txt(
+            &mut self.inner,
+            &mut self.cached_typeinfo,
+            statement,
+            params,
+        )
+        .await
    }

    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
@@ -173,12 +254,15 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn simple_query(&self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
+    pub async fn simple_query(&mut self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
        self.simple_query_raw(query).await?.try_collect().await
    }

-    pub(crate) async fn simple_query_raw(&self, query: &str) -> Result<SimpleQueryStream, Error> {
-        simple_query::simple_query(self.inner(), query).await
+    pub(crate) async fn simple_query_raw(
+        &mut self,
+        query: &str,
+    ) -> Result<SimpleQueryStream, Error> {
+        simple_query::simple_query(self.inner_mut(), query).await
    }

    /// Executes a sequence of SQL statements using the simple query protocol.
@@ -191,15 +275,11 @@ impl Client {
    /// Prepared statements should be use for any query which contains user-specified data, as they provided the
    /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
    /// them to this method!
-    pub async fn batch_execute(&self, query: &str) -> Result<ReadyForQueryStatus, Error> {
-        simple_query::batch_execute(self.inner(), query).await
+    pub async fn batch_execute(&mut self, query: &str) -> Result<ReadyForQueryStatus, Error> {
+        simple_query::batch_execute(self.inner_mut(), query).await
    }

    pub async fn discard_all(&mut self) -> Result<ReadyForQueryStatus, Error> {
-        // clear the prepared statements that are about to be nuked from the postgres session
-
-        self.cached_typeinfo.typeinfo = None;
-
        self.batch_execute("discard all").await
    }

@@ -208,7 +288,7 @@ impl Client {
    /// The transaction will roll back by default - use the `commit` method to commit it.
    pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
        struct RollbackIfNotDone<'me> {
-            client: &'me Client,
+            client: &'me mut Client,
            done: bool,
        }

@@ -218,14 +298,7 @@ impl Client {
                    return;
                }

-                let buf = self.client.inner().with_buf(|buf| {
-                    frontend::query("ROLLBACK", buf).unwrap();
-                    buf.split().freeze()
-                });
-                let _ = self
-                    .client
-                    .inner()
-                    .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+                let _ = self.client.inner.send_simple_query("ROLLBACK");
            }
        }

@@ -239,7 +312,7 @@ impl Client {
                client: self,
                done: false,
            };
-            self.batch_execute("BEGIN").await?;
+            cleaner.client.batch_execute("BEGIN").await?;
            cleaner.done = true;
        }

@@ -265,11 +338,6 @@ impl Client {
        }
    }

-    /// Query for type information
-    pub(crate) async fn get_type_inner(&mut self, oid: Oid) -> Result<Type, Error> {
-        crate::prepare::get_type(&self.inner, &mut self.cached_typeinfo, oid).await
-    }
-
    /// Determines if the connection to the server has already closed.
    ///
    /// In that case, all future queries will fail.
--- a/libs/proxy/tokio-postgres2/src/codec.rs
+++ b/libs/proxy/tokio-postgres2/src/codec.rs
@@ -1,21 +1,16 @@
 use std::io;

-use bytes::{Buf, Bytes, BytesMut};
+use bytes::{Bytes, BytesMut};
 use fallible_iterator::FallibleIterator;
 use postgres_protocol2::message::backend;
-use postgres_protocol2::message::frontend::CopyData;
 use tokio_util::codec::{Decoder, Encoder};

 pub enum FrontendMessage {
    Raw(Bytes),
-    CopyData(CopyData<Box<dyn Buf + Send>>),
 }

 pub enum BackendMessage {
-    Normal {
-        messages: BackendMessages,
-        request_complete: bool,
-    },
+    Normal { messages: BackendMessages },
    Async(backend::Message),
 }

@@ -44,7 +39,6 @@ impl Encoder<FrontendMessage> for PostgresCodec {
    fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> {
        match item {
            FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf),
-            FrontendMessage::CopyData(data) => data.write(dst),
        }

        Ok(())
@@ -57,7 +51,6 @@ impl Decoder for PostgresCodec {

    fn decode(&mut self, src: &mut BytesMut) -> Result<Option<BackendMessage>, io::Error> {
        let mut idx = 0;
-        let mut request_complete = false;

        while let Some(header) = backend::Header::parse(&src[idx..])? {
            let len = header.len() as usize + 1;
@@ -82,7 +75,6 @@ impl Decoder for PostgresCodec {
            idx += len;

            if header.tag() == backend::READY_FOR_QUERY_TAG {
-                request_complete = true;
                break;
            }
        }
@@ -92,7 +84,6 @@ impl Decoder for PostgresCodec {
        } else {
            Ok(Some(BackendMessage::Normal {
                messages: BackendMessages(src.split_to(idx)),
-                request_complete,
            }))
        }
    }
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -59,9 +59,11 @@ where
        connect_timeout: config.connect_timeout,
    };

-    let (sender, receiver) = mpsc::unbounded_channel();
+    let (client_tx, conn_rx) = mpsc::unbounded_channel();
+    let (conn_tx, client_rx) = mpsc::channel(4);
    let client = Client::new(
-        sender,
+        client_tx,
+        client_rx,
        socket_config,
        config.ssl_mode,
        process_id,
@@ -74,7 +76,7 @@ where
        .map(|m| BackendMessage::Async(Message::NoticeResponse(m)))
        .collect();

-    let connection = Connection::new(stream, delayed, parameters, receiver);
+    let connection = Connection::new(stream, delayed, parameters, conn_tx, conn_rx);

    Ok((client, connection))
 }
--- a/libs/proxy/tokio-postgres2/src/connection.rs
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -4,7 +4,6 @@ use std::pin::Pin;
 use std::task::{Context, Poll};

 use bytes::BytesMut;
-use fallible_iterator::FallibleIterator;
 use futures_util::{Sink, Stream, ready};
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
@@ -19,30 +18,12 @@ use crate::error::DbError;
 use crate::maybe_tls_stream::MaybeTlsStream;
 use crate::{AsyncMessage, Error, Notification};

-pub enum RequestMessages {
-    Single(FrontendMessage),
-}
-
-pub struct Request {
-    pub messages: RequestMessages,
-    pub sender: mpsc::Sender<BackendMessages>,
-}
-
-pub struct Response {
-    sender: PollSender<BackendMessages>,
-}
-
 #[derive(PartialEq, Debug)]
 enum State {
    Active,
    Closing,
 }

-enum WriteReady {
-    Terminating,
-    WaitingOnRead,
-}
-
 /// A connection to a PostgreSQL database.
 ///
 /// This is one half of what is returned when a new connection is established. It performs the actual IO with the
@@ -56,9 +37,11 @@ pub struct Connection<S, T> {
    pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
    /// HACK: we need this in the Neon Proxy to forward params.
    pub parameters: HashMap<String, String>,
-    receiver: mpsc::UnboundedReceiver<Request>,
+
+    sender: PollSender<BackendMessages>,
+    receiver: mpsc::UnboundedReceiver<FrontendMessage>,
+
    pending_responses: VecDeque<BackendMessage>,
-    responses: VecDeque<Response>,
    state: State,
 }

@@ -71,14 +54,15 @@ where
        stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
        pending_responses: VecDeque<BackendMessage>,
        parameters: HashMap<String, String>,
-        receiver: mpsc::UnboundedReceiver<Request>,
+        sender: mpsc::Sender<BackendMessages>,
+        receiver: mpsc::UnboundedReceiver<FrontendMessage>,
    ) -> Connection<S, T> {
        Connection {
            stream,
            parameters,
+            sender: PollSender::new(sender),
            receiver,
            pending_responses,
-            responses: VecDeque::new(),
            state: State::Active,
        }
    }
@@ -110,7 +94,7 @@ where
                }
            };

-            let (mut messages, request_complete) = match message {
+            let messages = match message {
                BackendMessage::Async(Message::NoticeResponse(body)) => {
                    let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?;
                    return Poll::Ready(Ok(AsyncMessage::Notice(error)));
@@ -131,41 +115,19 @@ where
                    continue;
                }
                BackendMessage::Async(_) => unreachable!(),
-                BackendMessage::Normal {
-                    messages,
-                    request_complete,
-                } => (messages, request_complete),
+                BackendMessage::Normal { messages } => messages,
            };

-            let mut response = match self.responses.pop_front() {
-                Some(response) => response,
-                None => match messages.next().map_err(Error::parse)? {
-                    Some(Message::ErrorResponse(error)) => {
-                        return Poll::Ready(Err(Error::db(error)));
-                    }
-                    _ => return Poll::Ready(Err(Error::unexpected_message())),
-                },
-            };
-
-            match response.sender.poll_reserve(cx) {
+            match self.sender.poll_reserve(cx) {
                Poll::Ready(Ok(())) => {
-                    let _ = response.sender.send_item(messages);
-                    if !request_complete {
-                        self.responses.push_front(response);
-                    }
+                    let _ = self.sender.send_item(messages);
                }
                Poll::Ready(Err(_)) => {
-                    // we need to keep paging through the rest of the messages even if the receiver's hung up
-                    if !request_complete {
-                        self.responses.push_front(response);
-                    }
+                    return Poll::Ready(Err(Error::closed()));
                }
                Poll::Pending => {
-                    self.responses.push_front(response);
-                    self.pending_responses.push_back(BackendMessage::Normal {
-                        messages,
-                        request_complete,
-                    });
+                    self.pending_responses
+                        .push_back(BackendMessage::Normal { messages });
                    trace!("poll_read: waiting on sender");
                    return Poll::Pending;
                }
@@ -174,7 +136,7 @@ where
    }

    /// Fetch the next client request and enqueue the response sender.
-    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<RequestMessages>> {
+    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<FrontendMessage>> {
        if self.receiver.is_closed() {
            return Poll::Ready(None);
        }
@@ -182,10 +144,7 @@ where
        match self.receiver.poll_recv(cx) {
            Poll::Ready(Some(request)) => {
                trace!("polled new request");
-                self.responses.push_back(Response {
-                    sender: PollSender::new(request.sender),
-                });
-                Poll::Ready(Some(request.messages))
+                Poll::Ready(Some(request))
            }
            Poll::Ready(None) => Poll::Ready(None),
            Poll::Pending => Poll::Pending,
@@ -194,7 +153,7 @@ where

    /// Process client requests and write them to the postgres connection, flushing if necessary.
    /// client -> postgres
-    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<WriteReady, Error>> {
+    fn poll_write(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
        loop {
            if Pin::new(&mut self.stream)
                .poll_ready(cx)
@@ -209,14 +168,14 @@ where

            match self.poll_request(cx) {
                // send the message to postgres
-                Poll::Ready(Some(RequestMessages::Single(request))) => {
+                Poll::Ready(Some(request)) => {
                    Pin::new(&mut self.stream)
                        .start_send(request)
                        .map_err(Error::io)?;
                }
                // No more messages from the client, and no more responses to wait for.
                // Send a terminate message to postgres
-                Poll::Ready(None) if self.responses.is_empty() => {
+                Poll::Ready(None) => {
                    trace!("poll_write: at eof, terminating");
                    let mut request = BytesMut::new();
                    frontend::terminate(&mut request);
@@ -228,16 +187,7 @@ where

                    trace!("poll_write: sent eof, closing");
                    trace!("poll_write: done");
-                    return Poll::Ready(Ok(WriteReady::Terminating));
-                }
-                // No more messages from the client, but there are still some responses to wait for.
-                Poll::Ready(None) => {
-                    trace!(
-                        "poll_write: at eof, pending responses {}",
-                        self.responses.len()
-                    );
-                    ready!(self.poll_flush(cx))?;
-                    return Poll::Ready(Ok(WriteReady::WaitingOnRead));
+                    return Poll::Ready(Ok(()));
                }
                // Still waiting for a message from the client.
                Poll::Pending => {
@@ -298,7 +248,7 @@ where
            // if the state is still active, try read from and write to postgres.
            let message = self.poll_read(cx)?;
            let closing = self.poll_write(cx)?;
-            if let Poll::Ready(WriteReady::Terminating) = closing {
+            if let Poll::Ready(()) = closing {
                self.state = State::Closing;
            }

--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -1,9 +1,6 @@
 #![allow(async_fn_in_trait)]

-use postgres_protocol2::Oid;
-
 use crate::query::RowStream;
-use crate::types::Type;
 use crate::{Client, Error, Transaction};

 mod private {
@@ -15,20 +12,17 @@ mod private {
 /// This trait is "sealed", and cannot be implemented outside of this crate.
 pub trait GenericClient: private::Sealed {
    /// Like `Client::query_raw_txt`.
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
        I::IntoIter: ExactSizeIterator + Sync + Send;
-
-    /// Query for type information
-    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
 }

 impl private::Sealed for Client {}

 impl GenericClient for Client {
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -36,17 +30,12 @@ impl GenericClient for Client {
    {
        self.query_raw_txt(statement, params).await
    }
-
-    /// Query for type information
-    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
-        self.get_type_inner(oid).await
-    }
 }

 impl private::Sealed for Transaction<'_> {}

 impl GenericClient for Transaction<'_> {
-    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
    where
        S: AsRef<str> + Sync + Send,
        I: IntoIterator<Item = Option<S>> + Sync + Send,
@@ -54,9 +43,4 @@ impl GenericClient for Transaction<'_> {
    {
        self.query_raw_txt(statement, params).await
    }
-
-    /// Query for type information
-    async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
-        self.client_mut().get_type(oid).await
-    }
 }
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -18,7 +18,6 @@ pub use crate::statement::{Column, Statement};
 pub use crate::tls::NoTls;
 pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
-use crate::types::ToSql;

 /// After executing a query, the connection will be in one of these states
 #[derive(Clone, Copy, Debug, PartialEq)]
@@ -120,9 +119,3 @@ pub enum SimpleQueryMessage {
    /// The number of rows modified or selected is returned.
    CommandComplete(u64),
 }
-
-fn slice_iter<'a>(
-    s: &'a [&'a (dyn ToSql + Sync)],
-) -> impl ExactSizeIterator<Item = &'a (dyn ToSql + Sync)> + 'a {
-    s.iter().map(|s| *s as _)
-}
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -1,19 +1,14 @@
-use std::future::Future;
-use std::pin::Pin;
-use std::sync::Arc;
-
-use bytes::Bytes;
+use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
-use futures_util::{TryStreamExt, pin_mut};
-use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::IsNull;
+use postgres_protocol2::message::backend::{Message, RowDescriptionBody};
 use postgres_protocol2::message::frontend;
-use tracing::debug;
+use postgres_protocol2::types::oid_to_sql;
+use postgres_types2::Format;

-use crate::client::{CachedTypeInfo, InnerClient};
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
+use crate::client::{CachedTypeInfo, PartialQuery, Responses};
 use crate::types::{Kind, Oid, Type};
-use crate::{Column, Error, Statement, query, slice_iter};
+use crate::{Column, Error, Row, Statement};

 pub(crate) const TYPEINFO_QUERY: &str = "\
 SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
@@ -23,22 +18,51 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
 WHERE t.oid = $1
 ";

+/// Guard that closes the named prepared statement, either via `close()` or when dropped.
+struct CloseStmt<'a, 'b> {
+    client: Option<&'a mut PartialQuery<'b>>,
+    name: &'static str,
+}
+
+impl<'a> CloseStmt<'a, '_> {
+    fn close(mut self) -> Result<&'a mut Responses, Error> {
+        let client = self.client.take().unwrap();
+        client.send_with_flush(|buf| {
+            frontend::close(b'S', self.name, buf).map_err(Error::encode)?;
+            Ok(())
+        })
+    }
+}
+
+impl Drop for CloseStmt<'_, '_> {
+    fn drop(&mut self) {
+        if let Some(client) = self.client.take() {
+            let _ = client.send_with_flush(|buf| {
+                frontend::close(b'S', self.name, buf).map_err(Error::encode)?;
+                Ok(())
+            });
+        }
+    }
+}
+
 async fn prepare_typecheck(
-    client: &Arc<InnerClient>,
+    client: &mut PartialQuery<'_>,
    name: &'static str,
    query: &str,
-    types: &[Type],
 ) -> Result<Statement, Error> {
-    let buf = encode(client, name, query, types)?;
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+    let responses = client.send_with_flush(|buf| {
+        frontend::parse(name, query, [], buf).map_err(Error::encode)?;
+        frontend::describe(b'S', name, buf).map_err(Error::encode)?;
+        Ok(())
+    })?;

    match responses.next().await? {
        Message::ParseComplete => {}
        _ => return Err(Error::unexpected_message()),
    }

-    let parameter_description = match responses.next().await? {
-        Message::ParameterDescription(body) => body,
+    match responses.next().await? {
+        Message::ParameterDescription(_) => {}
        _ => return Err(Error::unexpected_message()),
    };

@@ -48,13 +72,6 @@ async fn prepare_typecheck(
        _ => return Err(Error::unexpected_message()),
    };

-    let mut parameters = vec![];
-    let mut it = parameter_description.parameters();
-    while let Some(oid) = it.next().map_err(Error::parse)? {
-        let type_ = Type::from_oid(oid).ok_or_else(Error::unexpected_message)?;
-        parameters.push(type_);
-    }
-
    let mut columns = vec![];
    if let Some(row_description) = row_description {
        let mut it = row_description.fields();
@@ -65,98 +82,168 @@ async fn prepare_typecheck(
        }
    }

-    Ok(Statement::new(client, name, parameters, columns))
+    Ok(Statement::new(name, columns))
 }

-fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
-    if types.is_empty() {
-        debug!("preparing query {}: {}", name, query);
-    } else {
-        debug!("preparing query {} with types {:?}: {}", name, types, query);
-    }
-
-    client.with_buf(|buf| {
-        frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?;
-        frontend::describe(b'S', name, buf).map_err(Error::encode)?;
-        frontend::sync(buf);
-        Ok(buf.split().freeze())
-    })
-}
-
-pub async fn get_type(
-    client: &Arc<InnerClient>,
-    typecache: &mut CachedTypeInfo,
-    oid: Oid,
-) -> Result<Type, Error> {
+fn try_from_cache(typecache: &CachedTypeInfo, oid: Oid) -> Option<Type> {
    if let Some(type_) = Type::from_oid(oid) {
-        return Ok(type_);
+        return Some(type_);
    }

    if let Some(type_) = typecache.types.get(&oid) {
-        return Ok(type_.clone());
+        return Some(type_.clone());
    };

-    let stmt = typeinfo_statement(client, typecache).await?;
+    None
+}

-    let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
-    pin_mut!(rows);
+pub async fn parse_row_description(
+    client: &mut PartialQuery<'_>,
+    typecache: &mut CachedTypeInfo,
+    row_description: Option<RowDescriptionBody>,
+) -> Result<Vec<Column>, Error> {
+    let mut columns = vec![];

-    let row = match rows.try_next().await? {
-        Some(row) => row,
-        None => return Err(Error::unexpected_message()),
+    if let Some(row_description) = row_description {
+        let mut it = row_description.fields();
+        while let Some(field) = it.next().map_err(Error::parse)? {
+            let type_ = try_from_cache(typecache, field.type_oid()).unwrap_or(Type::UNKNOWN);
+            let column = Column::new(field.name().to_string(), type_, field);
+            columns.push(column);
+        }
+    }
+
+    let all_known = columns.iter().all(|c| c.type_ != Type::UNKNOWN);
+    if all_known {
+        // all known, return early.
+        return Ok(columns);
+    }
+
+    let typeinfo = "neon_proxy_typeinfo";
+
+    // make sure to close the typeinfo statement before exiting.
+    let mut guard = CloseStmt {
+        name: typeinfo,
+        client: None,
+    };
+    let client = guard.client.insert(client);
+
+    // get the typeinfo statement.
+    let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY).await?;
+
+    for column in &mut columns {
+        column.type_ = get_type(client, typecache, &stmt, column.type_oid()).await?;
+    }
+
+    // consume the guard: send the Close explicitly so we can await CloseComplete below.
+    let responses = guard.close()?;
+
+    match responses.next().await? {
+        Message::CloseComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    Ok(columns)
+}
+
+async fn get_type(
+    client: &mut PartialQuery<'_>,
+    typecache: &mut CachedTypeInfo,
+    stmt: &Statement,
+    mut oid: Oid,
+) -> Result<Type, Error> {
+    let mut stack = vec![];
+    let mut type_ = loop {
+        if let Some(type_) = try_from_cache(typecache, oid) {
+            break type_;
+        }
+
+        let row = exec(client, stmt, oid).await?;
+        if stack.len() > 8 {
+            return Err(Error::unexpected_message());
+        }
+
+        let name: String = row.try_get(0)?;
+        let type_: i8 = row.try_get(1)?;
+        let elem_oid: Oid = row.try_get(2)?;
+        let rngsubtype: Option<Oid> = row.try_get(3)?;
+        let basetype: Oid = row.try_get(4)?;
+        let schema: String = row.try_get(5)?;
+        let relid: Oid = row.try_get(6)?;
+
+        let kind = if type_ == b'e' as i8 {
+            Kind::Enum
+        } else if type_ == b'p' as i8 {
+            Kind::Pseudo
+        } else if basetype != 0 {
+            Kind::Domain(basetype)
+        } else if elem_oid != 0 {
+            stack.push((name, oid, schema));
+            oid = elem_oid;
+            continue;
+        } else if relid != 0 {
+            Kind::Composite(relid)
+        } else if let Some(rngsubtype) = rngsubtype {
+            Kind::Range(rngsubtype)
+        } else {
+            Kind::Simple
+        };
+
+        let type_ = Type::new(name, oid, kind, schema);
+        typecache.types.insert(oid, type_.clone());
+        break type_;
    };

-    let name: String = row.try_get(0)?;
-    let type_: i8 = row.try_get(1)?;
-    let elem_oid: Oid = row.try_get(2)?;
-    let rngsubtype: Option<Oid> = row.try_get(3)?;
-    let basetype: Oid = row.try_get(4)?;
-    let schema: String = row.try_get(5)?;
-    let relid: Oid = row.try_get(6)?;
-
-    let kind = if type_ == b'e' as i8 {
-        Kind::Enum
-    } else if type_ == b'p' as i8 {
-        Kind::Pseudo
-    } else if basetype != 0 {
-        Kind::Domain(basetype)
-    } else if elem_oid != 0 {
-        let type_ = get_type_rec(client, typecache, elem_oid).await?;
-        Kind::Array(type_)
-    } else if relid != 0 {
-        Kind::Composite(relid)
-    } else if let Some(rngsubtype) = rngsubtype {
-        let type_ = get_type_rec(client, typecache, rngsubtype).await?;
-        Kind::Range(type_)
-    } else {
-        Kind::Simple
-    };
-
-    let type_ = Type::new(name, oid, kind, schema);
-    typecache.types.insert(oid, type_.clone());
+    while let Some((name, oid, schema)) = stack.pop() {
+        type_ = Type::new(name, oid, Kind::Array(type_), schema);
+        typecache.types.insert(oid, type_.clone());
+    }

    Ok(type_)
 }

-fn get_type_rec<'a>(
-    client: &'a Arc<InnerClient>,
-    typecache: &'a mut CachedTypeInfo,
-    oid: Oid,
-) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
-    Box::pin(get_type(client, typecache, oid))
-}
+/// Binds `param` to the typeinfo statement, executes it, and returns the single resulting row.
+async fn exec(
+    client: &mut PartialQuery<'_>,
+    statement: &Statement,
+    param: Oid,
+) -> Result<Row, Error> {
+    let responses = client.send_with_flush(|buf| {
+        encode_bind(statement, param, "", buf);
+        frontend::execute("", 0, buf).map_err(Error::encode)?;
+        Ok(())
+    })?;

-async fn typeinfo_statement(
-    client: &Arc<InnerClient>,
-    typecache: &mut CachedTypeInfo,
-) -> Result<Statement, Error> {
-    if let Some(stmt) = &typecache.typeinfo {
-        return Ok(stmt.clone());
+    match responses.next().await? {
+        Message::BindComplete => {}
+        _ => return Err(Error::unexpected_message()),
    }

-    let typeinfo = "neon_proxy_typeinfo";
-    let stmt = prepare_typecheck(client, typeinfo, TYPEINFO_QUERY, &[]).await?;
+    let row = match responses.next().await? {
+        Message::DataRow(body) => Row::new(statement.clone(), body, Format::Binary)?,
+        _ => return Err(Error::unexpected_message()),
+    };

-    typecache.typeinfo = Some(stmt.clone());
-    Ok(stmt)
+    match responses.next().await? {
+        Message::CommandComplete(_) => {}
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    Ok(row)
+}
+
+fn encode_bind(statement: &Statement, param: Oid, portal: &str, buf: &mut BytesMut) {
+    frontend::bind(
+        portal,
+        statement.name(),
+        [Format::Binary as i16],
+        [param],
+        |param, buf| {
+            oid_to_sql(param, buf);
+            Ok(IsNull::No)
+        },
+        [Format::Binary as i16],
+        buf,
+    )
+    .unwrap();
 }
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -1,76 +1,43 @@
-use std::fmt;
-use std::marker::PhantomPinned;
 use std::pin::Pin;
-use std::sync::Arc;
 use std::task::{Context, Poll};

-use bytes::{BufMut, Bytes, BytesMut};
-use fallible_iterator::FallibleIterator;
+use bytes::BufMut;
 use futures_util::{Stream, ready};
-use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
-use postgres_types2::{Format, ToSql, Type};
-use tracing::debug;
+use postgres_types2::Format;

-use crate::client::{InnerClient, Responses};
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
-use crate::types::IsNull;
-use crate::{Column, Error, ReadyForQueryStatus, Row, Statement};
+use crate::client::{CachedTypeInfo, InnerClient, Responses};
+use crate::{Error, ReadyForQueryStatus, Row, Statement};

-struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]);
-
-impl fmt::Debug for BorrowToSqlParamsDebug<'_> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_list().entries(self.0.iter()).finish()
-    }
-}
-
-pub async fn query<'a, I>(
-    client: &InnerClient,
-    statement: Statement,
-    params: I,
-) -> Result<RowStream, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let buf = if tracing::enabled!(tracing::Level::DEBUG) {
-        let params = params.into_iter().collect::<Vec<_>>();
-        debug!(
-            "executing statement {} with parameters: {:?}",
-            statement.name(),
-            BorrowToSqlParamsDebug(params.as_slice()),
-        );
-        encode(client, &statement, params)?
-    } else {
-        encode(client, &statement, params)?
-    };
-    let responses = start(client, buf).await?;
-    Ok(RowStream {
-        statement,
-        responses,
-        command_tag: None,
-        status: ReadyForQueryStatus::Unknown,
-        output_format: Format::Binary,
-        _p: PhantomPinned,
-    })
-}
-
-pub async fn query_txt<S, I>(
-    client: &Arc<InnerClient>,
+pub async fn query_txt<'a, S, I>(
+    client: &'a mut InnerClient,
+    typecache: &mut CachedTypeInfo,
    query: &str,
    params: I,
-) -> Result<RowStream, Error>
+) -> Result<RowStream<'a>, Error>
 where
    S: AsRef<str>,
    I: IntoIterator<Item = Option<S>>,
    I::IntoIter: ExactSizeIterator,
 {
    let params = params.into_iter();
+    let mut client = client.start()?;

-    let buf = client.with_buf(|buf| {
+    // Flow:
+    // 1. Parse the query
+    // 2. Inspect the row description for OIDs
+    // 3. If there are any OIDs we don't already know about, perform the typeinfo routine
+    // 4. Execute the query
+    // 5. Sync.
+    //
+    // The typeinfo routine:
+    // 1. Parse the typeinfo query
+    // 2. Execute the query on each OID
+    // 3. If the result refers to an element OID we don't know yet, repeat 2 for that OID.
+
+    // parse the query and get type info
+    let responses = client.send_with_flush(|buf| {
        frontend::parse(
            "",                 // unnamed prepared statement
            query,              // query to parse
@@ -79,7 +46,30 @@ where
        )
        .map_err(Error::encode)?;
        frontend::describe(b'S', "", buf).map_err(Error::encode)?;
-        // Bind, pass params as text, retrieve as binary
+        Ok(())
+    })?;
+
+    match responses.next().await? {
+        Message::ParseComplete => {}
+        _ => return Err(Error::unexpected_message()),
+    }
+
+    match responses.next().await? {
+        Message::ParameterDescription(_) => {}
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let row_description = match responses.next().await? {
+        Message::RowDescription(body) => Some(body),
+        Message::NoData => None,
+        _ => return Err(Error::unexpected_message()),
+    };
+
+    let columns =
+        crate::prepare::parse_row_description(&mut client, typecache, row_description).await?;
+
+    let responses = client.send_with_sync(|buf| {
+        // Bind, pass params as text, retrieve as text
        match frontend::bind(
            "",                 // empty string selects the unnamed portal
            "",                 // unnamed prepared statement
@@ -102,173 +92,55 @@ where

        // Execute
        frontend::execute("", 0, buf).map_err(Error::encode)?;
-        // Sync
-        frontend::sync(buf);

-        Ok(buf.split().freeze())
+        Ok(())
    })?;

-    // now read the responses
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
-
-    match responses.next().await? {
-        Message::ParseComplete => {}
-        _ => return Err(Error::unexpected_message()),
-    }
-
-    let parameter_description = match responses.next().await? {
-        Message::ParameterDescription(body) => body,
-        _ => return Err(Error::unexpected_message()),
-    };
-
-    let row_description = match responses.next().await? {
-        Message::RowDescription(body) => Some(body),
-        Message::NoData => None,
-        _ => return Err(Error::unexpected_message()),
-    };
-
    match responses.next().await? {
        Message::BindComplete => {}
        _ => return Err(Error::unexpected_message()),
    }

-    let mut parameters = vec![];
-    let mut it = parameter_description.parameters();
-    while let Some(oid) = it.next().map_err(Error::parse)? {
-        let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN);
-        parameters.push(type_);
-    }
-
-    let mut columns = vec![];
-    if let Some(row_description) = row_description {
-        let mut it = row_description.fields();
-        while let Some(field) = it.next().map_err(Error::parse)? {
-            let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN);
-            let column = Column::new(field.name().to_string(), type_, field);
-            columns.push(column);
-        }
-    }
-
    Ok(RowStream {
-        statement: Statement::new_anonymous(parameters, columns),
        responses,
+        statement: Statement::new("", columns),
        command_tag: None,
        status: ReadyForQueryStatus::Unknown,
        output_format: Format::Text,
-        _p: PhantomPinned,
    })
 }

-async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
-
-    match responses.next().await? {
-        Message::BindComplete => {}
-        _ => return Err(Error::unexpected_message()),
-    }
-
-    Ok(responses)
+/// A stream of table rows.
+pub struct RowStream<'a> {
+    responses: &'a mut Responses,
+    output_format: Format,
+    pub statement: Statement,
+    pub command_tag: Option<String>,
+    pub status: ReadyForQueryStatus,
 }

-pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result<Bytes, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    client.with_buf(|buf| {
-        encode_bind(statement, params, "", buf)?;
-        frontend::execute("", 0, buf).map_err(Error::encode)?;
-        frontend::sync(buf);
-        Ok(buf.split().freeze())
-    })
-}
-
-pub fn encode_bind<'a, I>(
-    statement: &Statement,
-    params: I,
-    portal: &str,
-    buf: &mut BytesMut,
-) -> Result<(), Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let param_types = statement.params();
-    let params = params.into_iter();
-
-    assert!(
-        param_types.len() == params.len(),
-        "expected {} parameters but got {}",
-        param_types.len(),
-        params.len()
-    );
-
-    let (param_formats, params): (Vec<_>, Vec<_>) = params
-        .zip(param_types.iter())
-        .map(|(p, ty)| (p.encode_format(ty) as i16, p))
-        .unzip();
-
-    let params = params.into_iter();
-
-    let mut error_idx = 0;
-    let r = frontend::bind(
-        portal,
-        statement.name(),
-        param_formats,
-        params.zip(param_types).enumerate(),
-        |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) {
-            Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No),
-            Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes),
-            Err(e) => {
-                error_idx = idx;
-                Err(e)
-            }
-        },
-        Some(1),
-        buf,
-    );
-    match r {
-        Ok(()) => Ok(()),
-        Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)),
-        Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)),
-    }
-}
-
-pin_project! {
-    /// A stream of table rows.
-    pub struct RowStream {
-        statement: Statement,
-        responses: Responses,
-        command_tag: Option<String>,
-        output_format: Format,
-        status: ReadyForQueryStatus,
-        #[pin]
-        _p: PhantomPinned,
-    }
-}
-
-impl Stream for RowStream {
+impl Stream for RowStream<'_> {
    type Item = Result<Row, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
-        let this = self.project();
+        let this = self.get_mut();
        loop {
            match ready!(this.responses.poll_next(cx)?) {
                Message::DataRow(body) => {
                    return Poll::Ready(Some(Ok(Row::new(
                        this.statement.clone(),
                        body,
-                        *this.output_format,
+                        this.output_format,
                    )?)));
                }
                Message::EmptyQueryResponse | Message::PortalSuspended => {}
                Message::CommandComplete(body) => {
                    if let Ok(tag) = body.tag() {
-                        *this.command_tag = Some(tag.to_string());
+                        this.command_tag = Some(tag.to_string());
                    }
                }
                Message::ReadyForQuery(status) => {
-                    *this.status = status.into();
+                    this.status = status.into();
                    return Poll::Ready(None);
                }
                _ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
@@ -276,24 +148,3 @@ impl Stream for RowStream {
        }
    }
 }
-
-impl RowStream {
-    /// Returns information about the columns of data in the row.
-    pub fn columns(&self) -> &[Column] {
-        self.statement.columns()
-    }
-
-    /// Returns the command tag of this query.
-    ///
-    /// This is only available after the stream has been exhausted.
-    pub fn command_tag(&self) -> Option<String> {
-        self.command_tag.clone()
-    }
-
-    /// Returns if the connection is ready for querying, with the status of the connection.
-    ///
-    /// This might be available only after the stream has been exhausted.
-    pub fn ready_status(&self) -> ReadyForQueryStatus {
-        self.status
-    }
-}
--- a/libs/proxy/tokio-postgres2/src/simple_query.rs
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -1,19 +1,14 @@
-use std::marker::PhantomPinned;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{Context, Poll};

-use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
 use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
-use postgres_protocol2::message::frontend;
 use tracing::debug;

 use crate::client::{InnerClient, Responses};
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
 use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow};

 /// Information about a column of a single query row.
@@ -33,28 +28,28 @@ impl SimpleColumn {
    }
 }

-pub async fn simple_query(client: &InnerClient, query: &str) -> Result<SimpleQueryStream, Error> {
+pub async fn simple_query<'a>(
+    client: &'a mut InnerClient,
+    query: &str,
+) -> Result<SimpleQueryStream<'a>, Error> {
    debug!("executing simple query: {}", query);

-    let buf = encode(client, query)?;
-    let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+    let responses = client.send_simple_query(query)?;

    Ok(SimpleQueryStream {
        responses,
        columns: None,
        status: ReadyForQueryStatus::Unknown,
-        _p: PhantomPinned,
    })
 }

 pub async fn batch_execute(
-    client: &InnerClient,
+    client: &mut InnerClient,
    query: &str,
 ) -> Result<ReadyForQueryStatus, Error> {
    debug!("executing statement batch: {}", query);

-    let buf = encode(client, query)?;
-    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
+    let responses = client.send_simple_query(query)?;

    loop {
        match responses.next().await? {
@@ -68,25 +63,16 @@ pub async fn batch_execute(
    }
 }

-pub(crate) fn encode(client: &InnerClient, query: &str) -> Result<Bytes, Error> {
-    client.with_buf(|buf| {
-        frontend::query(query, buf).map_err(Error::encode)?;
-        Ok(buf.split().freeze())
-    })
-}
-
 pin_project! {
    /// A stream of simple query results.
-    pub struct SimpleQueryStream {
-        responses: Responses,
+    pub struct SimpleQueryStream<'a> {
+        responses: &'a mut Responses,
        columns: Option<Arc<[SimpleColumn]>>,
        status: ReadyForQueryStatus,
-        #[pin]
-        _p: PhantomPinned,
    }
 }

-impl SimpleQueryStream {
+impl SimpleQueryStream<'_> {
    /// Returns if the connection is ready for querying, with the status of the connection.
    ///
    /// This might be available only after the stream has been exhausted.
@@ -95,7 +81,7 @@ impl SimpleQueryStream {
    }
 }

-impl Stream for SimpleQueryStream {
+impl Stream for SimpleQueryStream<'_> {
    type Item = Result<SimpleQueryMessage, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
--- a/libs/proxy/tokio-postgres2/src/statement.rs
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -1,35 +1,15 @@
 use std::fmt;
-use std::sync::{Arc, Weak};
+use std::sync::Arc;

+use crate::types::Type;
 use postgres_protocol2::Oid;
 use postgres_protocol2::message::backend::Field;
-use postgres_protocol2::message::frontend;
-
-use crate::client::InnerClient;
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
-use crate::types::Type;

 struct StatementInner {
-    client: Weak<InnerClient>,
    name: &'static str,
-    params: Vec<Type>,
    columns: Vec<Column>,
 }

-impl Drop for StatementInner {
-    fn drop(&mut self) {
-        if let Some(client) = self.client.upgrade() {
-            let buf = client.with_buf(|buf| {
-                frontend::close(b'S', self.name, buf).unwrap();
-                frontend::sync(buf);
-                buf.split().freeze()
-            });
-            let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
-        }
-    }
-}
-
 /// A prepared statement.
 ///
 /// Prepared statements can only be used with the connection that created them.
@@ -37,38 +17,14 @@ impl Drop for StatementInner {
 pub struct Statement(Arc<StatementInner>);

 impl Statement {
-    pub(crate) fn new(
-        inner: &Arc<InnerClient>,
-        name: &'static str,
-        params: Vec<Type>,
-        columns: Vec<Column>,
-    ) -> Statement {
-        Statement(Arc::new(StatementInner {
-            client: Arc::downgrade(inner),
-            name,
-            params,
-            columns,
-        }))
-    }
-
-    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
-        Statement(Arc::new(StatementInner {
-            client: Weak::new(),
-            name: "<anonymous>",
-            params,
-            columns,
-        }))
+    pub(crate) fn new(name: &'static str, columns: Vec<Column>) -> Statement {
+        Statement(Arc::new(StatementInner { name, columns }))
    }

    pub(crate) fn name(&self) -> &str {
        self.0.name
    }

-    /// Returns the expected types of the statement's parameters.
-    pub fn params(&self) -> &[Type] {
-        &self.0.params
-    }
-
    /// Returns information about the columns returned when the statement is queried.
    pub fn columns(&self) -> &[Column] {
        &self.0.columns
@@ -78,7 +34,7 @@ impl Statement {
 /// Information about a column of a query.
 pub struct Column {
    name: String,
-    type_: Type,
+    pub(crate) type_: Type,

    // raw fields from RowDescription
    table_oid: Oid,
--- a/libs/proxy/tokio-postgres2/src/transaction.rs
+++ b/libs/proxy/tokio-postgres2/src/transaction.rs
@@ -1,7 +1,3 @@
-use postgres_protocol2::message::frontend;
-
-use crate::codec::FrontendMessage;
-use crate::connection::RequestMessages;
 use crate::query::RowStream;
 use crate::{CancelToken, Client, Error, ReadyForQueryStatus};

@@ -20,14 +16,7 @@ impl Drop for Transaction<'_> {
            return;
        }

-        let buf = self.client.inner().with_buf(|buf| {
-            frontend::query("ROLLBACK", buf).unwrap();
-            buf.split().freeze()
-        });
-        let _ = self
-            .client
-            .inner()
-            .send(RequestMessages::Single(FrontendMessage::Raw(buf)));
+        let _ = self.client.inner_mut().send_simple_query("ROLLBACK");
    }
 }

@@ -54,7 +43,11 @@ impl<'a> Transaction<'a> {
    }

    /// Like `Client::query_raw_txt`.
-    pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
+    pub async fn query_raw_txt<S, I>(
+        &mut self,
+        statement: &str,
+        params: I,
+    ) -> Result<RowStream, Error>
    where
        S: AsRef<str>,
        I: IntoIterator<Item = Option<S>>,
--- a/libs/utils/src/leaky_bucket.rs
+++ b/libs/utils/src/leaky_bucket.rs
@@ -28,6 +28,7 @@ use std::time::Duration;
 use tokio::sync::Notify;
 use tokio::time::Instant;

+#[derive(Clone, Copy)]
 pub struct LeakyBucketConfig {
    /// This is the "time cost" of a single request unit.
    /// Should loosely represent how long it takes to handle a request unit in active resource time.
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -73,6 +73,7 @@ pub mod error;
 /// async timeout helper
 pub mod timeout;

+pub mod span;
 pub mod sync;

 pub mod failpoint_support;
--- a/libs/utils/src/span.rs
+++ b/libs/utils/src/span.rs
@@ -0,0 +1,19 @@
+//! Tracing span helpers.
+
+/// Records the given fields in the current span, as a single call. The fields must already have
+/// been declared for the span (typically with empty values).
+#[macro_export]
+macro_rules! span_record {
+    ($($tokens:tt)*) => {$crate::span_record_in!(::tracing::Span::current(), $($tokens)*)};
+}
+
+/// Records the given fields in the given span, as a single call. The fields must already have been
+/// declared for the span (typically with empty values).
+#[macro_export]
+macro_rules! span_record_in {
+    ($span:expr, $($tokens:tt)*) => {
+        if let Some(meta) = $span.metadata() {
+            $span.record_all(&tracing::valueset!(meta.fields(), $($tokens)*));
+        }
+    };
+}
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
        shard_ps_feedback: [empty_feedback; 128],
        num_shards: 0,
+        replica_promote: false,
        min_ps_feedback: empty_feedback,
    }
 }
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -1,6 +1,7 @@
 #![allow(clippy::todo)]

 use std::ffi::CString;
+use std::str::FromStr;

 use postgres_ffi::WAL_SEGMENT_SIZE;
 use utils::id::TenantTimelineId;
@@ -173,6 +174,8 @@ pub struct Config {
    pub ttid: TenantTimelineId,
    /// List of safekeepers in format `host:port`
    pub safekeepers_list: Vec<String>,
+    /// libpq connection info options
+    pub safekeeper_conninfo_options: String,
    /// Safekeeper reconnect timeout in milliseconds
    pub safekeeper_reconnect_timeout: i32,
    /// Safekeeper connection timeout in milliseconds
@@ -202,6 +205,9 @@ impl Wrapper {
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
+        let safekeeper_conninfo_options = CString::from_str(&config.safekeeper_conninfo_options)
+            .unwrap()
+            .into_raw();

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

@@ -209,6 +215,7 @@ impl Wrapper {
            neon_tenant,
            neon_timeline,
            safekeepers_list,
+            safekeeper_conninfo_options,
            safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
            safekeeper_connection_timeout: config.safekeeper_connection_timeout,
            wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
@@ -576,6 +583,7 @@ mod tests {
        let config = crate::walproposer::Config {
            ttid,
            safekeepers_list: vec!["localhost:5000".to_string()],
+            safekeeper_conninfo_options: String::new(),
            safekeeper_reconnect_timeout: 1000,
            safekeeper_connection_timeout: 10000,
            sync_safekeepers: true,
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -17,50 +17,70 @@ anyhow.workspace = true
 arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
-bit_field.workspace = true
 bincode.workspace = true
+bit_field.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
-camino.workspace = true
 camino-tempfile.workspace = true
+camino.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
 consumption_metrics.workspace = true
 crc32c.workspace = true
 either.workspace = true
+enum-map.workspace = true
+enumset = { workspace = true, features = ["serde"]}
 fail.workspace = true
 futures.workspace = true
 hashlink.workspace = true
 hex.workspace = true
-humantime.workspace = true
+http.workspace = true
+http-utils.workspace = true
 humantime-serde.workspace = true
+humantime.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
 jsonwebtoken.workspace = true
 md5.workspace = true
+metrics.workspace = true
 nix.workspace = true
-# hack to get the number of worker threads tokio uses
-num_cpus.workspace = true
+num_cpus.workspace = true # hack to get the number of worker threads tokio uses
 num-traits.workspace = true
 once_cell.workspace = true
+pageserver_api.workspace = true
+pageserver_client.workspace = true # for ResponseErrorMessageExt TODO refactor that
+pageserver_compaction.workspace = true
+pageserver_page_api.workspace = true
+pem.workspace = true
 pin-project-lite.workspace = true
 postgres_backend.workspace = true
+postgres_connection.workspace = true
+postgres_ffi.workspace = true
+postgres_initdb.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
-postgres_initdb.workspace = true
+posthog_client_lite.workspace = true
 pprof.workspace = true
+pq_proto.workspace = true
 rand.workspace = true
 range-set-blaze = { version = "0.1.16", features = ["alloc"] }
 regex.workspace = true
+remote_storage.workspace = true
+reqwest.workspace = true
+rpds.workspace = true
 rustls.workspace = true
 scopeguard.workspace = true
 send-future.workspace = true
-serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 serde_path_to_error.workspace = true
 serde_with.workspace = true
+serde.workspace = true
+smallvec.workspace = true
+storage_broker.workspace = true
+strum_macros.workspace = true
+strum.workspace = true
 sysinfo.workspace = true
-tokio-tar.workspace = true
+tenant_size_model.workspace = true
 thiserror.workspace = true
 tikv-jemallocator.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -69,34 +89,19 @@ tokio-io-timeout.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
 tokio-stream.workspace = true
+tokio-tar.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
+tonic.workspace = true
+tonic-reflection.workspace = true
+tower.workspace = true
 tracing.workspace = true
 tracing-utils.workspace = true
 url.workspace = true
-walkdir.workspace = true
-metrics.workspace = true
-pageserver_api.workspace = true
-pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
-pageserver_compaction.workspace = true
-pem.workspace = true
-postgres_connection.workspace = true
-postgres_ffi.workspace = true
-pq_proto.workspace = true
-remote_storage.workspace = true
-storage_broker.workspace = true
-tenant_size_model.workspace = true
-http-utils.workspace = true
 utils.workspace = true
-workspace_hack.workspace = true
-reqwest.workspace = true
-rpds.workspace = true
-enum-map.workspace = true
-enumset = { workspace = true, features = ["serde"]}
-strum.workspace = true
-strum_macros.workspace = true
 wal_decoder.workspace = true
-smallvec.workspace = true
+walkdir.workspace = true
+workspace_hack.workspace = true
 twox-hash.workspace = true

 [target.'cfg(target_os = "linux")'.dependencies]
--- a/pageserver/benches/bench_metrics.rs
+++ b/pageserver/benches/bench_metrics.rs
@@ -264,10 +264,56 @@ mod propagation_of_cached_label_value {
    }
 }

+criterion_group!(histograms, histograms::bench_bucket_scalability);
+mod histograms {
+    use std::time::Instant;
+
+    use criterion::{BenchmarkId, Criterion};
+    use metrics::core::Collector;
+
+    pub fn bench_bucket_scalability(c: &mut Criterion) {
+        let mut g = c.benchmark_group("bucket_scalability");
+
+        for n in [1, 4, 8, 16, 32, 64, 128, 256] {
+            g.bench_with_input(BenchmarkId::new("nbuckets", n), &n, |b, n| {
+                b.iter_custom(|iters| {
+                    let buckets: Vec<f64> = (0..*n).map(|i| i as f64 * 100.0).collect();
+                    let histo = metrics::Histogram::with_opts(
+                        metrics::prometheus::HistogramOpts::new("name", "help")
+                            .buckets(buckets.clone()),
+                    )
+                    .unwrap();
+                    let start = Instant::now();
+                    for i in 0..usize::try_from(iters).unwrap() {
+                        histo.observe(buckets[i % buckets.len()]);
+                    }
+                    let elapsed = start.elapsed();
+                    // self-test
+                    let mfs = histo.collect();
+                    assert_eq!(mfs.len(), 1);
+                    let metrics = mfs[0].get_metric();
+                    assert_eq!(metrics.len(), 1);
+                    let histo = metrics[0].get_histogram();
+                    let buckets = histo.get_bucket();
+                    assert!(
+                        buckets
+                            .iter()
+                            .enumerate()
+                            .all(|(i, b)| b.get_cumulative_count()
+                                >= i as u64 * (iters / buckets.len() as u64))
+                    );
+                    elapsed
+                })
+            });
+        }
+    }
+}
+
 criterion_main!(
    label_values,
    single_metric_multicore_scalability,
-    propagation_of_cached_label_value
+    propagation_of_cached_label_value,
+    histograms,
 );

 /*
@@ -290,6 +336,14 @@ propagation_of_cached_label_value__naive/nthreads/8 time:   [211.50 ns 214.44 ns
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1 time:   [14.135 ns 14.147 ns 14.160 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4 time:   [14.243 ns 14.255 ns 14.268 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8 time:   [14.470 ns 14.682 ns 14.895 ns]
+bucket_scalability/nbuckets/1     time:   [30.352 ns 30.353 ns 30.354 ns]
+bucket_scalability/nbuckets/4     time:   [30.464 ns 30.465 ns 30.467 ns]
+bucket_scalability/nbuckets/8     time:   [30.569 ns 30.575 ns 30.584 ns]
+bucket_scalability/nbuckets/16      time:   [30.961 ns 30.965 ns 30.969 ns]
+bucket_scalability/nbuckets/32      time:   [35.691 ns 35.707 ns 35.722 ns]
+bucket_scalability/nbuckets/64      time:   [47.829 ns 47.898 ns 47.974 ns]
+bucket_scalability/nbuckets/128     time:   [73.479 ns 73.512 ns 73.545 ns]
+bucket_scalability/nbuckets/256     time:   [127.92 ns 127.94 ns 127.96 ns]

 Results on an i3en.3xlarge instance

@@ -344,6 +398,14 @@ propagation_of_cached_label_value__naive/nthreads/8     time:   [434.87 ns 456.4
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [3.3767 ns 3.3974 ns 3.4220 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [3.6105 ns 4.2355 ns 5.1463 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [4.0889 ns 4.9714 ns 6.0779 ns]
+bucket_scalability/nbuckets/1     time:   [4.8455 ns 4.8542 ns 4.8646 ns]
+bucket_scalability/nbuckets/4     time:   [4.5663 ns 4.5722 ns 4.5787 ns]
+bucket_scalability/nbuckets/8     time:   [4.5531 ns 4.5670 ns 4.5842 ns]
+bucket_scalability/nbuckets/16      time:   [4.6392 ns 4.6524 ns 4.6685 ns]
+bucket_scalability/nbuckets/32      time:   [6.0302 ns 6.0439 ns 6.0589 ns]
+bucket_scalability/nbuckets/64      time:   [10.608 ns 10.644 ns 10.691 ns]
+bucket_scalability/nbuckets/128     time:   [22.178 ns 22.316 ns 22.483 ns]
+bucket_scalability/nbuckets/256     time:   [42.190 ns 42.328 ns 42.492 ns]

 Results on a Hetzner AX102 AMD Ryzen 9 7950X3D 16-Core Processor

@@ -362,5 +424,13 @@ propagation_of_cached_label_value__naive/nthreads/8     time:   [164.24 ns 170.1
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/1     time:   [2.2915 ns 2.2960 ns 2.3012 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/4     time:   [2.5726 ns 2.6158 ns 2.6624 ns]
 propagation_of_cached_label_value__long_lived_reference_per_thread/nthreads/8     time:   [2.7068 ns 2.8243 ns 2.9824 ns]
+bucket_scalability/nbuckets/1     time:   [6.3998 ns 6.4288 ns 6.4684 ns]
+bucket_scalability/nbuckets/4     time:   [6.3603 ns 6.3620 ns 6.3637 ns]
+bucket_scalability/nbuckets/8     time:   [6.1646 ns 6.1654 ns 6.1667 ns]
+bucket_scalability/nbuckets/16      time:   [6.1341 ns 6.1391 ns 6.1454 ns]
+bucket_scalability/nbuckets/32      time:   [8.2206 ns 8.2254 ns 8.2301 ns]
+bucket_scalability/nbuckets/64      time:   [13.988 ns 13.994 ns 14.000 ns]
+bucket_scalability/nbuckets/128     time:   [28.180 ns 28.216 ns 28.251 ns]
+bucket_scalability/nbuckets/256     time:   [54.914 ns 54.931 ns 54.951 ns]

 */
--- a/pageserver/page_api/Cargo.toml
+++ b/pageserver/page_api/Cargo.toml
@@ -5,8 +5,13 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+bytes.workspace = true
+pageserver_api.workspace = true
+postgres_ffi.workspace = true
 prost.workspace = true
+thiserror.workspace = true
 tonic.workspace = true
+utils.workspace = true
 workspace_hack.workspace = true

 [build-dependencies]
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -54,9 +54,9 @@ service PageService {
  // RPCs use regular unary requests, since they are not as frequent and
  // performance-critical, and this simplifies implementation.
  //
-  // NB: a status response (e.g. errors) will terminate the stream. The stream
-  // may be shared by e.g. multiple Postgres backends, so we should avoid this.
-  // Most errors are therefore sent as GetPageResponse.status instead.
+  // NB: a gRPC status response (e.g. errors) will terminate the stream. The
+  // stream may be shared by multiple Postgres backends, so we avoid this by
+  // sending most errors as GetPageResponse.status_code instead.
  rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);

  // Returns the size of a relation, as # of blocks.
@@ -159,8 +159,8 @@ message GetPageRequest {
 // A GetPageRequest class. Primarily intended for observability, but may also be
 // used for prioritization in the future.
 enum GetPageClass {
-  // Unknown class. For forwards compatibility: used when the client sends a
-  // class that the server doesn't know about.
+  // Unknown class. For backwards compatibility: used when an older client version sends a class
+  // that a newer server version has removed.
  GET_PAGE_CLASS_UNKNOWN = 0;
  // A normal request. This is the default.
  GET_PAGE_CLASS_NORMAL = 1;
@@ -180,31 +180,37 @@ message GetPageResponse {
  // The original request's ID.
  uint64 request_id = 1;
  // The response status code.
-  GetPageStatus status = 2;
+  GetPageStatusCode status_code = 2;
  // A string describing the status, if any.
  string reason = 3;
-  // The 8KB page images, in the same order as the request. Empty if status != OK.
+  // The 8KB page images, in the same order as the request. Empty if status_code != OK.
  repeated bytes page_image = 4;
 }

-// A GetPageResponse status code. Since we use a bidirectional stream, we don't
-// want to send errors as gRPC statuses, since this would terminate the stream.
-enum GetPageStatus {
-  // Unknown status. For forwards compatibility: used when the server sends a
-  // status code that the client doesn't know about.
-  GET_PAGE_STATUS_UNKNOWN = 0;
+// A GetPageResponse status code.
+//
+// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
+// (potentially shared by many backends), and a gRPC status response would terminate the stream so
+// we send GetPageResponse messages with these codes instead.
+enum GetPageStatusCode {
+  // Unknown status. For forwards compatibility: used when an older client version receives a new
+  // status code from a newer server version.
+  GET_PAGE_STATUS_CODE_UNKNOWN = 0;
  // The request was successful.
-  GET_PAGE_STATUS_OK = 1;
+  GET_PAGE_STATUS_CODE_OK = 1;
  // The page did not exist. The tenant/timeline/shard has already been
  // validated during stream setup.
-  GET_PAGE_STATUS_NOT_FOUND = 2;
+  GET_PAGE_STATUS_CODE_NOT_FOUND = 2;
  // The request was invalid.
-  GET_PAGE_STATUS_INVALID = 3;
+  GET_PAGE_STATUS_CODE_INVALID_REQUEST = 3;
+  // The request failed due to an internal server error.
+  GET_PAGE_STATUS_CODE_INTERNAL_ERROR = 4;
  // The tenant is rate limited. Slow down and retry later.
-  GET_PAGE_STATUS_SLOW_DOWN = 4;
-  // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
-  // layer download. This could free up the server task to process other
-  // requests while the layer download is in progress.
+  GET_PAGE_STATUS_CODE_SLOW_DOWN = 5;
+  // NB: shutdown errors are emitted as a gRPC Unavailable status.
+  //
+  // TODO: consider adding a GET_PAGE_STATUS_CODE_LAYER_DOWNLOAD in the case of a layer download.
+  // This could free up the server task to process other requests while the download is in progress.
 }

 // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
--- a/pageserver/page_api/src/lib.rs
+++ b/pageserver/page_api/src/lib.rs
@@ -17,3 +17,7 @@ pub mod proto {
    pub use page_service_client::PageServiceClient;
    pub use page_service_server::{PageService, PageServiceServer};
 }
+
+mod model;
+
+pub use model::*;
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -0,0 +1,587 @@
+//! Structs representing the canonical page service API.
+//!
+//! These mirror the autogenerated Protobuf types. The differences are:
+//!
+//! - Types that are in fact required by the API are not Options. The protobuf "required"
+//!   attribute is deprecated and 'prost' marks a lot of members as optional because of that.
+//!   (See <https://github.com/tokio-rs/prost/issues/800> for a gripe on this)
+//!
+//! - Use more precise datatypes, e.g. Lsn and uints shorter than 32 bits.
+//!
+//! - Validate protocol invariants, via try_from() and try_into().
+//!
+//! Validation only happens on the receiver side, i.e. when converting from Protobuf to domain
+//! types. This is where it matters -- the Protobuf types are less strict than the domain types, and
+//! receivers should expect all sorts of junk from senders. This also allows the sender to use e.g.
+//! stream combinators without dealing with errors, and avoids validating the same message twice.
+
+use std::fmt::Display;
+
+use bytes::Bytes;
+use postgres_ffi::Oid;
+// TODO: split out Lsn, RelTag, SlruKind, Oid and other basic types to a separate crate, to avoid
+// pulling in all of their other crate dependencies when building the client.
+use utils::lsn::Lsn;
+
+use crate::proto;
+
+/// A protocol error. Typically returned via try_from() or try_into().
+#[derive(thiserror::Error, Debug)]
+pub enum ProtocolError {
+    #[error("field '{0}' has invalid value '{1}'")]
+    Invalid(&'static str, String),
+    #[error("required field '{0}' is missing")]
+    Missing(&'static str),
+}
+
+impl ProtocolError {
+    /// Helper to generate a new ProtocolError::Invalid for the given field and value.
+    pub fn invalid(field: &'static str, value: impl std::fmt::Debug) -> Self {
+        Self::Invalid(field, format!("{value:?}"))
+    }
+}
+
+impl From<ProtocolError> for tonic::Status {
+    fn from(err: ProtocolError) -> Self {
+        tonic::Status::invalid_argument(format!("{err}"))
+    }
+}
+
+/// The LSN a request should read at.
+#[derive(Clone, Copy, Debug)]
+pub struct ReadLsn {
+    /// The request's read LSN.
+    pub request_lsn: Lsn,
+    /// If given, the caller guarantees that the page has not been modified since this LSN. Must be
+    /// smaller than or equal to request_lsn. This allows the Pageserver to serve an old page
+    /// without waiting for the request LSN to arrive. If not given, the request will read at the
+    /// request_lsn and wait for it to arrive if necessary. Valid for all request types.
+    ///
+    /// It is undefined behaviour to make a request such that the page was, in fact, modified
+    /// between request_lsn and not_modified_since_lsn. The Pageserver might detect it and return an
+    /// error, or it might return the old page version or the new page version. Setting
+    /// not_modified_since_lsn equal to request_lsn is always safe, but can lead to unnecessary
+    /// waiting.
+    pub not_modified_since_lsn: Option<Lsn>,
+}
+
+impl Display for ReadLsn {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let req_lsn = self.request_lsn;
+        if let Some(mod_lsn) = self.not_modified_since_lsn {
+            write!(f, "{req_lsn}>={mod_lsn}")
+        } else {
+            req_lsn.fmt(f)
+        }
+    }
+}
+
+impl TryFrom<proto::ReadLsn> for ReadLsn {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::ReadLsn) -> Result<Self, Self::Error> {
+        if pb.request_lsn == 0 {
+            return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
+        }
+        if pb.not_modified_since_lsn > pb.request_lsn {
+            return Err(ProtocolError::invalid(
+                "not_modified_since_lsn",
+                pb.not_modified_since_lsn,
+            ));
+        }
+        Ok(Self {
+            request_lsn: Lsn(pb.request_lsn),
+            not_modified_since_lsn: match pb.not_modified_since_lsn {
+                0 => None,
+                lsn => Some(Lsn(lsn)),
+            },
+        })
+    }
+}
+
+impl From<ReadLsn> for proto::ReadLsn {
+    fn from(read_lsn: ReadLsn) -> Self {
+        Self {
+            request_lsn: read_lsn.request_lsn.0,
+            not_modified_since_lsn: read_lsn.not_modified_since_lsn.unwrap_or_default().0,
+        }
+    }
+}
+
+// RelTag is defined in pageserver_api::reltag.
+pub type RelTag = pageserver_api::reltag::RelTag;
+
+impl TryFrom<proto::RelTag> for RelTag {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::RelTag) -> Result<Self, Self::Error> {
+        Ok(Self {
+            spcnode: pb.spc_oid,
+            dbnode: pb.db_oid,
+            relnode: pb.rel_number,
+            forknum: pb
+                .fork_number
+                .try_into()
+                .map_err(|_| ProtocolError::invalid("fork_number", pb.fork_number))?,
+        })
+    }
+}
+
+impl From<RelTag> for proto::RelTag {
+    fn from(rel_tag: RelTag) -> Self {
+        Self {
+            spc_oid: rel_tag.spcnode,
+            db_oid: rel_tag.dbnode,
+            rel_number: rel_tag.relnode,
+            fork_number: rel_tag.forknum as u32,
+        }
+    }
+}
+
+/// Checks whether a relation exists, at the given LSN. Only valid on shard 0, other shards error.
+#[derive(Clone, Copy, Debug)]
+pub struct CheckRelExistsRequest {
+    pub read_lsn: ReadLsn,
+    pub rel: RelTag,
+}
+
+impl TryFrom<proto::CheckRelExistsRequest> for CheckRelExistsRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::CheckRelExistsRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: pb
+                .read_lsn
+                .ok_or(ProtocolError::Missing("read_lsn"))?
+                .try_into()?,
+            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+        })
+    }
+}
+
+impl From<CheckRelExistsRequest> for proto::CheckRelExistsRequest {
+    fn from(request: CheckRelExistsRequest) -> Self {
+        Self {
+            read_lsn: Some(request.read_lsn.into()),
+            rel: Some(request.rel.into()),
+        }
+    }
+}
+
+pub type CheckRelExistsResponse = bool;
+
+impl From<proto::CheckRelExistsResponse> for CheckRelExistsResponse {
+    fn from(pb: proto::CheckRelExistsResponse) -> Self {
+        pb.exists
+    }
+}
+
+impl From<CheckRelExistsResponse> for proto::CheckRelExistsResponse {
+    fn from(exists: CheckRelExistsResponse) -> Self {
+        Self { exists }
+    }
+}
+
+/// Requests a base backup at a given LSN.
+#[derive(Clone, Copy, Debug)]
+pub struct GetBaseBackupRequest {
+    /// The LSN to fetch a base backup at.
+    pub read_lsn: ReadLsn,
+    /// If true, logical replication slots will not be created.
+    pub replica: bool,
+}
+
+impl TryFrom<proto::GetBaseBackupRequest> for GetBaseBackupRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetBaseBackupRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: pb
+                .read_lsn
+                .ok_or(ProtocolError::Missing("read_lsn"))?
+                .try_into()?,
+            replica: pb.replica,
+        })
+    }
+}
+
+impl From<GetBaseBackupRequest> for proto::GetBaseBackupRequest {
+    fn from(request: GetBaseBackupRequest) -> Self {
+        Self {
+            read_lsn: Some(request.read_lsn.into()),
+            replica: request.replica,
+        }
+    }
+}
+
+pub type GetBaseBackupResponseChunk = Bytes;
+
+impl TryFrom<proto::GetBaseBackupResponseChunk> for GetBaseBackupResponseChunk {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetBaseBackupResponseChunk) -> Result<Self, Self::Error> {
+        if pb.chunk.is_empty() {
+            return Err(ProtocolError::Missing("chunk"));
+        }
+        Ok(pb.chunk)
+    }
+}
+
+impl From<GetBaseBackupResponseChunk> for proto::GetBaseBackupResponseChunk {
+    fn from(chunk: GetBaseBackupResponseChunk) -> Self {
+        Self { chunk }
+    }
+}
+
+/// Requests the size of a database, as # of bytes. Only valid on shard 0, other shards will error.
+#[derive(Clone, Copy, Debug)]
+pub struct GetDbSizeRequest {
+    pub read_lsn: ReadLsn,
+    pub db_oid: Oid,
+}
+
+impl TryFrom<proto::GetDbSizeRequest> for GetDbSizeRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetDbSizeRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: pb
+                .read_lsn
+                .ok_or(ProtocolError::Missing("read_lsn"))?
+                .try_into()?,
+            db_oid: pb.db_oid,
+        })
+    }
+}
+
+impl From<GetDbSizeRequest> for proto::GetDbSizeRequest {
+    fn from(request: GetDbSizeRequest) -> Self {
+        Self {
+            read_lsn: Some(request.read_lsn.into()),
+            db_oid: request.db_oid,
+        }
+    }
+}
+
+pub type GetDbSizeResponse = u64;
+
+impl From<proto::GetDbSizeResponse> for GetDbSizeResponse {
+    fn from(pb: proto::GetDbSizeResponse) -> Self {
+        pb.num_bytes
+    }
+}
+
+impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
+    fn from(num_bytes: GetDbSizeResponse) -> Self {
+        Self { num_bytes }
+    }
+}
+
+/// Requests one or more pages.
+#[derive(Clone, Debug)]
+pub struct GetPageRequest {
+    /// A request ID. Will be included in the response. Should be unique for in-flight requests on
+    /// the stream.
+    pub request_id: RequestID,
+    /// The request class.
+    pub request_class: GetPageClass,
+    /// The LSN to read at.
+    pub read_lsn: ReadLsn,
+    /// The relation to read from.
+    pub rel: RelTag,
+    /// Page numbers to read. Must belong to the remote shard.
+    ///
+    /// Multiple pages will be executed as a single batch by the Pageserver, amortizing layer access
+    /// costs and parallelizing them. This may increase the latency of any individual request, but
+    /// improves the overall latency and throughput of the batch as a whole.
+    pub block_numbers: Vec<u32>,
+}
+
+impl TryFrom<proto::GetPageRequest> for GetPageRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetPageRequest) -> Result<Self, Self::Error> {
+        if pb.block_number.is_empty() {
+            return Err(ProtocolError::Missing("block_number"));
+        }
+        Ok(Self {
+            request_id: pb.request_id,
+            request_class: pb.request_class.into(),
+            read_lsn: pb
+                .read_lsn
+                .ok_or(ProtocolError::Missing("read_lsn"))?
+                .try_into()?,
+            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+            block_numbers: pb.block_number,
+        })
+    }
+}
+
+impl From<GetPageRequest> for proto::GetPageRequest {
+    fn from(request: GetPageRequest) -> Self {
+        Self {
+            request_id: request.request_id,
+            request_class: request.request_class.into(),
+            read_lsn: Some(request.read_lsn.into()),
+            rel: Some(request.rel.into()),
+            block_number: request.block_numbers,
+        }
+    }
+}
+
+/// A GetPage request ID.
+pub type RequestID = u64;
+
+/// A GetPage request class.
+#[derive(Clone, Copy, Debug)]
+pub enum GetPageClass {
+    /// Unknown class. For backwards compatibility: used when an older client version sends a class
+    /// that a newer server version has removed.
+    Unknown,
+    /// A normal request. This is the default.
+    Normal,
+    /// A prefetch request. NB: can only be classified on pg < 18.
+    Prefetch,
+    /// A background request (e.g. vacuum).
+    Background,
+}
+
+impl From<proto::GetPageClass> for GetPageClass {
+    fn from(pb: proto::GetPageClass) -> Self {
+        match pb {
+            proto::GetPageClass::Unknown => Self::Unknown,
+            proto::GetPageClass::Normal => Self::Normal,
+            proto::GetPageClass::Prefetch => Self::Prefetch,
+            proto::GetPageClass::Background => Self::Background,
+        }
+    }
+}
+
+impl From<i32> for GetPageClass {
+    fn from(class: i32) -> Self {
+        proto::GetPageClass::try_from(class)
+            .unwrap_or(proto::GetPageClass::Unknown)
+            .into()
+    }
+}
+
+impl From<GetPageClass> for proto::GetPageClass {
+    fn from(class: GetPageClass) -> Self {
+        match class {
+            GetPageClass::Unknown => Self::Unknown,
+            GetPageClass::Normal => Self::Normal,
+            GetPageClass::Prefetch => Self::Prefetch,
+            GetPageClass::Background => Self::Background,
+        }
+    }
+}
+
+impl From<GetPageClass> for i32 {
+    fn from(class: GetPageClass) -> Self {
+        proto::GetPageClass::from(class).into()
+    }
+}
+
+/// A GetPage response.
+///
+/// A batch response will contain all of the requested pages. We could eagerly emit individual pages
+/// as soon as they are ready, but on a readv() Postgres holds buffer pool locks on all pages in the
+/// batch and we'll only return once the entire batch is ready, so no one can make use of the
+/// individual pages.
+#[derive(Clone, Debug)]
+pub struct GetPageResponse {
+    /// The original request's ID.
+    pub request_id: RequestID,
+    /// The response status code.
+    pub status_code: GetPageStatusCode,
+    /// A string describing the status, if any.
+    pub reason: Option<String>,
+    /// The 8KB page images, in the same order as the request. Empty if status_code != Ok.
+    pub page_images: Vec<Bytes>,
+}
+
+impl From<proto::GetPageResponse> for GetPageResponse {
+    fn from(pb: proto::GetPageResponse) -> Self {
+        Self {
+            request_id: pb.request_id,
+            status_code: pb.status_code.into(),
+            reason: Some(pb.reason).filter(|r| !r.is_empty()),
+            page_images: pb.page_image,
+        }
+    }
+}
+
+impl From<GetPageResponse> for proto::GetPageResponse {
+    fn from(response: GetPageResponse) -> Self {
+        Self {
+            request_id: response.request_id,
+            status_code: response.status_code.into(),
+            reason: response.reason.unwrap_or_default(),
+            page_image: response.page_images,
+        }
+    }
+}
+
+/// A GetPage response status code.
+///
+/// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
+/// (potentially shared by many backends), and a gRPC status response would terminate the stream so
+/// we send GetPageResponse messages with these codes instead.
+#[derive(Clone, Copy, Debug)]
+pub enum GetPageStatusCode {
+    /// Unknown status. For forwards compatibility: used when an older client version receives a new
+    /// status code from a newer server version.
+    Unknown,
+    /// The request was successful.
+    Ok,
+    /// The page did not exist. The tenant/timeline/shard has already been validated during stream
+    /// setup.
+    NotFound,
+    /// The request was invalid.
+    InvalidRequest,
+    /// The request failed due to an internal server error.
+    InternalError,
+    /// The tenant is rate limited. Slow down and retry later.
+    SlowDown,
+}
+
+impl From<proto::GetPageStatusCode> for GetPageStatusCode {
+    fn from(pb: proto::GetPageStatusCode) -> Self {
+        match pb {
+            proto::GetPageStatusCode::Unknown => Self::Unknown,
+            proto::GetPageStatusCode::Ok => Self::Ok,
+            proto::GetPageStatusCode::NotFound => Self::NotFound,
+            proto::GetPageStatusCode::InvalidRequest => Self::InvalidRequest,
+            proto::GetPageStatusCode::InternalError => Self::InternalError,
+            proto::GetPageStatusCode::SlowDown => Self::SlowDown,
+        }
+    }
+}
+
+impl From<i32> for GetPageStatusCode {
+    fn from(status_code: i32) -> Self {
+        let decoded = proto::GetPageStatusCode::try_from(status_code);
+        // Codes this build does not recognize are mapped to Unknown instead of failing.
+        decoded.unwrap_or(proto::GetPageStatusCode::Unknown).into()
+    }
+}
+
+impl From<GetPageStatusCode> for proto::GetPageStatusCode {
+    fn from(status_code: GetPageStatusCode) -> Self {
+        match status_code {
+            GetPageStatusCode::Unknown => Self::Unknown,
+            GetPageStatusCode::Ok => Self::Ok,
+            GetPageStatusCode::NotFound => Self::NotFound,
+            GetPageStatusCode::InvalidRequest => Self::InvalidRequest,
+            GetPageStatusCode::InternalError => Self::InternalError,
+            GetPageStatusCode::SlowDown => Self::SlowDown,
+        }
+    }
+}
+
+impl From<GetPageStatusCode> for i32 {
+    fn from(status_code: GetPageStatusCode) -> Self {
+        proto::GetPageStatusCode::from(status_code).into()
+    }
+}
+
+/// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
+/// shards will error.
+pub struct GetRelSizeRequest {
+    pub read_lsn: ReadLsn,
+    pub rel: RelTag,
+}
+
+impl TryFrom<proto::GetRelSizeRequest> for GetRelSizeRequest {
+    type Error = ProtocolError;
+
+    fn try_from(proto: proto::GetRelSizeRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: proto
+                .read_lsn
+                .ok_or(ProtocolError::Missing("read_lsn"))?
+                .try_into()?,
+            rel: proto.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+        })
+    }
+}
+
+impl From<GetRelSizeRequest> for proto::GetRelSizeRequest {
+    fn from(request: GetRelSizeRequest) -> Self {
+        Self {
+            read_lsn: Some(request.read_lsn.into()),
+            rel: Some(request.rel.into()),
+        }
+    }
+}
+
+pub type GetRelSizeResponse = u32;
+
+impl From<proto::GetRelSizeResponse> for GetRelSizeResponse {
+    fn from(proto: proto::GetRelSizeResponse) -> Self {
+        proto.num_blocks
+    }
+}
+
+impl From<GetRelSizeResponse> for proto::GetRelSizeResponse {
+    fn from(num_blocks: GetRelSizeResponse) -> Self {
+        Self { num_blocks }
+    }
+}
+
+/// Requests an SLRU segment. Only valid on shard 0, other shards will error.
+pub struct GetSlruSegmentRequest {
+    pub read_lsn: ReadLsn,
+    pub kind: SlruKind,
+    pub segno: u32,
+}
+
+impl TryFrom<proto::GetSlruSegmentRequest> for GetSlruSegmentRequest {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetSlruSegmentRequest) -> Result<Self, Self::Error> {
+        Ok(Self {
+            read_lsn: pb
+                .read_lsn
+                .ok_or(ProtocolError::Missing("read_lsn"))?
+                .try_into()?,
+            kind: u8::try_from(pb.kind)
+                .ok()
+                .and_then(SlruKind::from_repr)
+                .ok_or_else(|| ProtocolError::invalid("slru_kind", pb.kind))?,
+            segno: pb.segno,
+        })
+    }
+}
+
+impl From<GetSlruSegmentRequest> for proto::GetSlruSegmentRequest {
+    fn from(request: GetSlruSegmentRequest) -> Self {
+        Self {
+            read_lsn: Some(request.read_lsn.into()),
+            kind: request.kind as u32,
+            segno: request.segno,
+        }
+    }
+}
+
+pub type GetSlruSegmentResponse = Bytes;
+
+impl TryFrom<proto::GetSlruSegmentResponse> for GetSlruSegmentResponse {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetSlruSegmentResponse) -> Result<Self, Self::Error> {
+        if pb.segment.is_empty() {
+            return Err(ProtocolError::Missing("segment"));
+        }
+        Ok(pb.segment)
+    }
+}
+
+impl From<GetSlruSegmentResponse> for proto::GetSlruSegmentResponse {
+    fn from(segment: GetSlruSegmentResponse) -> Self {
+        Self { segment }
+    }
+}
+
+/// SLRU kind. Alias for the type defined in `pageserver_api::reltag`.
+pub type SlruKind = pageserver_api::reltag::SlruKind;
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -8,6 +8,8 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+async-trait.workspace = true
+bytes.workspace = true
 camino.workspace = true
 clap.workspace = true
 futures.workspace = true
@@ -15,14 +17,17 @@ hdrhistogram.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
 rand.workspace = true
-reqwest.workspace=true
+reqwest.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tracing.workspace = true
 tokio.workspace = true
+tokio-stream.workspace = true
 tokio-util.workspace = true
+tonic.workspace = true

 pageserver_client.workspace = true
 pageserver_api.workspace = true
+pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashSet, VecDeque};
+use std::collections::{HashMap, HashSet, VecDeque};
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -7,11 +7,15 @@ use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};

 use anyhow::Context;
+use async_trait::async_trait;
+use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
+use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
+use pageserver_page_api::proto;
 use rand::prelude::*;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
@@ -22,6 +26,12 @@ use utils::lsn::Lsn;
 use crate::util::tokio_thread_local_stats::AllThreadLocalStats;
 use crate::util::{request_stats, tokio_thread_local_stats};

+#[derive(clap::ValueEnum, Clone, Debug)]
+enum Protocol {
+    Libpq,
+    Grpc,
+}
+
 /// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
 #[derive(clap::Parser)]
 pub(crate) struct Args {
@@ -35,6 +45,8 @@ pub(crate) struct Args {
    num_clients: NonZeroUsize,
    #[clap(long)]
    runtime: Option<humantime::Duration>,
+    #[clap(long, value_enum, default_value = "libpq")]
+    protocol: Protocol,
    /// Each client sends requests at the given rate.
    ///
    /// If a request takes too long and we should be issuing a new request already,
@@ -65,6 +77,16 @@ pub(crate) struct Args {
    #[clap(long, default_value = "1")]
    queue_depth: NonZeroUsize,

+    /// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres
+    /// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and
+    /// 1 queue depth.
+    ///
+    /// The libpq protocol does not support client-side batching, and will submit batches as many
+    /// individual requests, in the hope that the server will batch them. Each batch still counts as
+    /// 1 RPS and 1 queue depth.
+    #[clap(long, default_value = "1")]
+    batch_size: NonZeroUsize,
+
    #[clap(long)]
    only_relnode: Option<u32>,

@@ -303,7 +325,20 @@ async fn main_impl(
                .unwrap();

        Box::pin(async move {
-            client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
+            let client: Box<dyn Client> = match args.protocol {
+                Protocol::Libpq => Box::new(
+                    LibpqClient::new(args.page_service_connstring.clone(), worker_id.timeline)
+                        .await
+                        .unwrap(),
+                ),
+
+                Protocol::Grpc => Box::new(
+                    GrpcClient::new(args.page_service_connstring.clone(), worker_id.timeline)
+                        .await
+                        .unwrap(),
+                ),
+            };
+            run_worker(args, client, ss, cancel, rps_period, ranges, weights).await
        })
    };

@@ -355,27 +390,28 @@ async fn main_impl(
    anyhow::Ok(())
 }

-async fn client_libpq(
+async fn run_worker(
    args: &Args,
-    worker_id: WorkerId,
+    mut client: Box<dyn Client>,
    shared_state: Arc<SharedState>,
    cancel: CancellationToken,
    rps_period: Option<Duration>,
    ranges: Vec<KeyRange>,
    weights: rand::distributions::weighted::WeightedIndex<i128>,
 ) {
-    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-        .await
-        .unwrap();
-    let mut client = client
-        .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
-        .await
-        .unwrap();
-
    shared_state.start_work_barrier.wait().await;
    let client_start = Instant::now();
    let mut ticks_processed = 0;
-    let mut inflight = VecDeque::new();
+    let mut req_id = 0;
+    let batch_size: usize = args.batch_size.into();
+
+    // Track inflight requests by request ID and start time. This times the request duration, and
+    // ensures responses match requests. We don't expect responses back in any particular order.
+    //
+    // NB: this does not check that all requests received a response, because we don't wait for the
+    // inflight requests to complete when the duration elapses.
+    let mut inflight: HashMap<u64, Instant> = HashMap::new();
+
    while !cancel.is_cancelled() {
        // Detect if a request took longer than the RPS rate
        if let Some(period) = &rps_period {
@@ -391,36 +427,72 @@ async fn client_libpq(
        }

        while inflight.len() < args.queue_depth.get() {
+            req_id += 1;
            let start = Instant::now();
-            let req = {
+            let (req_lsn, mod_lsn, rel, blks) = {
+                /// Converts a compact i128 key to a relation tag and block number.
+                fn key_to_block(key: i128) -> (RelTag, u32) {
+                    let key = Key::from_i128(key);
+                    assert!(key.is_rel_block_key());
+                    key.to_rel_block()
+                        .expect("we filter non-rel-block keys out above")
+                }
+
+                // Pick a random page from a random relation.
                let mut rng = rand::thread_rng();
                let r = &ranges[weights.sample(&mut rng)];
                let key: i128 = rng.gen_range(r.start..r.end);
-                let key = Key::from_i128(key);
-                assert!(key.is_rel_block_key());
-                let (rel_tag, block_no) = key
-                    .to_rel_block()
-                    .expect("we filter non-rel-block keys out above");
-                PagestreamGetPageRequest {
-                    hdr: PagestreamRequest {
-                        reqid: 0,
-                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                            Lsn::MAX
-                        } else {
-                            r.timeline_lsn
-                        },
-                        not_modified_since: r.timeline_lsn,
-                    },
-                    rel: rel_tag,
-                    blkno: block_no,
+                let (rel_tag, block_no) = key_to_block(key);
+
+                let mut blks = VecDeque::with_capacity(batch_size);
+                blks.push_back(block_no);
+
+                // If requested, populate a batch of sequential pages. This is how Postgres will
+                // request page batches (e.g. prefetches). If we hit the end of the relation, we
+                // grow the batch towards the start too.
+                for i in 1..batch_size {
+                    let (r, b) = key_to_block(key + i as i128);
+                    if r != rel_tag {
+                        break; // went outside relation
+                    }
+                    blks.push_back(b)
                }
+
+                if blks.len() < batch_size {
+                    // Grow batch backwards, but only by the number of blocks still missing.
+                    for i in 1..=(batch_size - blks.len()) {
+                        let (r, b) = key_to_block(key - i as i128);
+                        if r != rel_tag {
+                            break; // went outside relation
+                        }
+                        blks.push_front(b)
+                    }
+                }
+
+                // We assume that the entire batch can fit within the relation.
+                assert_eq!(blks.len(), batch_size, "incomplete batch");
+
+                let req_lsn = if rng.gen_bool(args.req_latest_probability) {
+                    Lsn::MAX
+                } else {
+                    r.timeline_lsn
+                };
+                (req_lsn, r.timeline_lsn, rel_tag, blks.into())
            };
-            client.getpage_send(req).await.unwrap();
-            inflight.push_back(start);
+            client
+                .send_get_page(req_id, req_lsn, mod_lsn, rel, blks)
+                .await
+                .unwrap();
+            let old = inflight.insert(req_id, start);
+            assert!(old.is_none(), "duplicate request ID {req_id}");
        }

-        let start = inflight.pop_front().unwrap();
-        client.getpage_recv().await.unwrap();
+        let (req_id, pages) = client.recv_get_page().await.unwrap();
+        assert_eq!(pages.len(), batch_size, "unexpected page count");
+        assert!(pages.iter().all(|p| !p.is_empty()), "empty page");
+        let start = inflight
+            .remove(&req_id)
+            .expect("response for unknown request ID");
        let end = Instant::now();
        shared_state.live_stats.request_done();
        ticks_processed += 1;
@@ -442,3 +514,154 @@ async fn client_libpq(
        }
    }
 }
+
+/// A benchmark client, to allow switching out the transport protocol.
+///
+/// For simplicity, this just uses separate asynchronous send/recv methods. The send method could
+/// return a future that resolves when the response is received, but we don't really need it.
+#[async_trait]
+trait Client: Send {
+    /// Sends an asynchronous GetPage request to the pageserver.
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()>;
+
+    /// Receives the next GetPage response from the pageserver.
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)>;
+}
+
+/// A libpq-based Pageserver client.
+struct LibpqClient {
+    inner: pageserver_client::page_service::PagestreamClient,
+    /// Sizes of sent batches, in send order, so we know how many responses to expect.
+    batch_sizes: VecDeque<usize>,
+}
+
+impl LibpqClient {
+    async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
+        let inner = pageserver_client::page_service::Client::new(connstring)
+            .await?
+            .pagestream(ttid.tenant_id, ttid.timeline_id)
+            .await?;
+        Ok(Self {
+            inner,
+            batch_sizes: VecDeque::new(),
+        })
+    }
+}
+
+#[async_trait]
+impl Client for LibpqClient {
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        // libpq doesn't support client-side batches, so we send a bunch of individual requests
+        // instead in the hope that the server will batch them for us. We use the same request ID
+        // for all, because we'll return a single batch response.
+        self.batch_sizes.push_back(blks.len());
+        for blkno in blks {
+            let req = PagestreamGetPageRequest {
+                hdr: PagestreamRequest {
+                    reqid: req_id,
+                    request_lsn: req_lsn,
+                    not_modified_since: mod_lsn,
+                },
+                rel,
+                blkno,
+            };
+            self.inner.getpage_send(req).await?;
+        }
+        Ok(())
+    }
+
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
+        let batch_size = self.batch_sizes.pop_front().unwrap();
+        let mut batch = Vec::with_capacity(batch_size);
+        let mut req_id = None;
+        for _ in 0..batch_size {
+            let resp = self.inner.getpage_recv().await?;
+            if req_id.is_none() {
+                req_id = Some(resp.req.hdr.reqid);
+            }
+            assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch");
+            batch.push(resp.page);
+        }
+        Ok((req_id.unwrap(), batch))
+    }
+}
+
+/// A gRPC client using the raw, no-frills gRPC client.
+struct GrpcClient {
+    req_tx: tokio::sync::mpsc::Sender<proto::GetPageRequest>,
+    resp_rx: tonic::Streaming<proto::GetPageResponse>,
+}
+
+impl GrpcClient {
+    async fn new(connstring: String, ttid: TenantTimelineId) -> anyhow::Result<Self> {
+        let mut client = pageserver_page_api::proto::PageServiceClient::connect(connstring).await?;
+
+        // The channel has a buffer size of 1, since 0 is not allowed. It does not matter, since the
+        // benchmark will control the queue depth (i.e. in-flight requests) anyway, and requests are
+        // buffered by Tonic and the OS too.
+        let (req_tx, req_rx) = tokio::sync::mpsc::channel(1);
+        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
+        let mut req = tonic::Request::new(req_stream);
+        let metadata = req.metadata_mut();
+        metadata.insert("neon-tenant-id", ttid.tenant_id.to_string().try_into()?);
+        metadata.insert("neon-timeline-id", ttid.timeline_id.to_string().try_into()?);
+        metadata.insert("neon-shard-id", "0000".try_into()?);
+
+        let resp = client.get_pages(req).await?;
+        let resp_stream = resp.into_inner();
+
+        Ok(Self {
+            req_tx,
+            resp_rx: resp_stream,
+        })
+    }
+}
+
+#[async_trait]
+impl Client for GrpcClient {
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        let req = proto::GetPageRequest {
+            request_id: req_id,
+            request_class: proto::GetPageClass::Normal as i32,
+            read_lsn: Some(proto::ReadLsn {
+                request_lsn: req_lsn.0,
+                not_modified_since_lsn: mod_lsn.0,
+            }),
+            rel: Some(rel.into()),
+            block_number: blks,
+        };
+        self.req_tx.send(req).await?;
+        Ok(())
+    }
+
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
+        let resp = self.resp_rx.message().await?.context("GetPage stream closed")?; // None = EOF
+        anyhow::ensure!(
+            resp.status_code == proto::GetPageStatusCode::Ok as i32,
+            "unexpected status code: {}",
+            resp.status_code
+        );
+        Ok((resp.request_id, resp.page_image))
+    }
+}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -65,6 +65,30 @@ impl From<GetVectoredError> for BasebackupError {
    }
 }

+impl From<BasebackupError> for postgres_backend::QueryError {
+    fn from(err: BasebackupError) -> Self {
+        use postgres_backend::QueryError;
+        use pq_proto::framed::ConnectionError;
+        match err {
+            BasebackupError::Client(err, _) => QueryError::Disconnected(ConnectionError::Io(err)),
+            BasebackupError::Server(err) => QueryError::Other(err),
+            BasebackupError::Shutdown => QueryError::Shutdown,
+        }
+    }
+}
+
+impl From<BasebackupError> for tonic::Status {
+    fn from(err: BasebackupError) -> Self {
+        use tonic::Code;
+        let code = match &err {
+            BasebackupError::Client(_, _) => Code::Cancelled,
+            BasebackupError::Server(_) => Code::Internal,
+            BasebackupError::Shutdown => Code::Unavailable,
+        };
+        tonic::Status::new(code, err.to_string())
+    }
+}
+
 /// Create basebackup with non-rel data in it.
 /// Only include relational data if 'full_backup' is true.
 ///
@@ -248,7 +272,7 @@ where
    async fn flush(&mut self) -> Result<(), BasebackupError> {
        let nblocks = self.buf.len() / BLCKSZ as usize;
        let (kind, segno) = self.current_segment.take().unwrap();
-        let segname = format!("{}/{:>04X}", kind.to_str(), segno);
+        let segname = format!("{kind}/{segno:>04X}");
        let header = new_tar_header(&segname, self.buf.len() as u64)?;
        self.ar
            .append(&header, self.buf.as_slice())
@@ -347,7 +371,7 @@ where
                .await?
                .partition(
                    self.timeline.get_shard_identity(),
-                    Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+                    self.timeline.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
                );

            let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -21,6 +21,7 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::feature_resolver::FeatureResolver;
 use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{
    BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
@@ -157,7 +158,6 @@ fn main() -> anyhow::Result<()> {
    // (maybe we should automate this with a visitor?).
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
-    info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
    info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
    info!(?conf.page_service_pipelining, "starting with page service pipelining config");
    info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
@@ -388,23 +388,30 @@ fn start_pageserver(
    // We need to release the lock file only when the process exits.
    std::mem::forget(lock_file);

-    // Bind the HTTP and libpq ports early, so that if they are in use by some other
-    // process, we error out early.
-    let http_addr = &conf.listen_http_addr;
-    info!("Starting pageserver http handler on {http_addr}");
-    let http_listener = tcp_listener::bind(http_addr)?;
+    // Bind the HTTP, libpq, and gRPC ports early, to error out if they are
+    // already in use.
+    info!(
+        "Starting pageserver http handler on {} with auth {:#?}",
+        conf.listen_http_addr, conf.http_auth_type
+    );
+    let http_listener = tcp_listener::bind(&conf.listen_http_addr)?;

    let https_listener = match conf.listen_https_addr.as_ref() {
        Some(https_addr) => {
-            info!("Starting pageserver https handler on {https_addr}");
+            info!(
+                "Starting pageserver https handler on {https_addr} with auth {:#?}",
+                conf.http_auth_type
+            );
            Some(tcp_listener::bind(https_addr)?)
        }
        None => None,
    };

-    let pg_addr = &conf.listen_pg_addr;
-    info!("Starting pageserver pg protocol handler on {pg_addr}");
-    let pageserver_listener = tcp_listener::bind(pg_addr)?;
+    info!(
+        "Starting pageserver pg protocol handler on {} with auth {:#?}",
+        conf.listen_pg_addr, conf.pg_auth_type,
+    );
+    let pageserver_listener = tcp_listener::bind(&conf.listen_pg_addr)?;

    // Enable SO_KEEPALIVE on the socket, to detect dead connections faster.
    // These are configured via net.ipv4.tcp_keepalive_* sysctls.
@@ -413,6 +420,15 @@ fn start_pageserver(
    // support enabling keepalives while using the default OS sysctls.
    setsockopt(&pageserver_listener, sockopt::KeepAlive, &true)?;

+    let mut grpc_listener = None;
+    if let Some(grpc_addr) = &conf.listen_grpc_addr {
+        info!(
+            "Starting pageserver gRPC handler on {grpc_addr} with auth {:#?}",
+            conf.grpc_auth_type
+        );
+        grpc_listener = Some(tcp_listener::bind(grpc_addr).map_err(|e| anyhow!("{e}"))?);
+    }
+
    // Launch broker client
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
@@ -440,7 +456,8 @@ fn start_pageserver(
    // Initialize authentication for incoming connections
    let http_auth;
    let pg_auth;
-    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
+    let grpc_auth;
+    if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type].contains(&AuthType::NeonJWT) {
        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
@@ -448,20 +465,23 @@ fn start_pageserver(
        let jwt_auth = JwtAuth::from_key_path(key_path)?;
        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

-        http_auth = match &conf.http_auth_type {
+        http_auth = match conf.http_auth_type {
            AuthType::Trust => None,
            AuthType::NeonJWT => Some(auth.clone()),
        };
-        pg_auth = match &conf.pg_auth_type {
+        pg_auth = match conf.pg_auth_type {
+            AuthType::Trust => None,
+            AuthType::NeonJWT => Some(auth.clone()),
+        };
+        grpc_auth = match conf.grpc_auth_type {
            AuthType::Trust => None,
            AuthType::NeonJWT => Some(auth),
        };
    } else {
        http_auth = None;
        pg_auth = None;
+        grpc_auth = None;
    }
-    info!("Using auth for http API: {:#?}", conf.http_auth_type);
-    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);

    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
    {
@@ -502,6 +522,12 @@ fn start_pageserver(
    // Set up remote storage client
    let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;

+    let feature_resolver = create_feature_resolver(
+        conf,
+        shutdown_pageserver.clone(),
+        BACKGROUND_RUNTIME.handle(),
+    )?;
+
    // Set up deletion queue
    let (deletion_queue, deletion_workers) = DeletionQueue::new(
        remote_storage.clone(),
@@ -555,6 +581,7 @@ fn start_pageserver(
            deletion_queue_client,
            l0_flush_global_state,
            basebackup_prepare_sender,
+            feature_resolver,
        },
        order,
        shutdown_pageserver.clone(),
@@ -779,6 +806,23 @@ fn start_pageserver(
        basebackup_cache,
    );

+    // Spawn a Pageserver gRPC server task. It will spawn separate tasks for
+    // each stream/request.
+    //
+    // TODO: this uses a separate Tokio runtime for the page service. If we want
+    // other gRPC services, they will need their own port and runtime. Is this
+    // necessary?
+    let mut page_service_grpc = None;
+    if let Some(grpc_listener) = grpc_listener {
+        page_service_grpc = Some(page_service::spawn_grpc(
+            tenant_manager.clone(),
+            grpc_auth,
+            otel_guard.as_ref().map(|g| g.dispatch.clone()),
+            conf.get_vectored_concurrent_io,
+            grpc_listener,
+        )?);
+    }
+
    // All started up! Now just sit and wait for shutdown signal.
    BACKGROUND_RUNTIME.block_on(async move {
        let signal_token = CancellationToken::new();
@@ -797,6 +841,7 @@ fn start_pageserver(
            http_endpoint_listener,
            https_endpoint_listener,
            page_service,
+            page_service_grpc,
            consumption_metrics_tasks,
            disk_usage_eviction_task,
            &tenant_manager,
@@ -810,6 +855,14 @@ fn start_pageserver(
    })
 }

+fn create_feature_resolver(
+    conf: &'static PageServerConf,
+    shutdown_pageserver: CancellationToken,
+    handle: &tokio::runtime::Handle,
+) -> anyhow::Result<FeatureResolver> {
+    FeatureResolver::spawn(conf, shutdown_pageserver, handle)
+}
+
 async fn create_remote_storage_client(
    conf: &'static PageServerConf,
 ) -> anyhow::Result<GenericRemoteStorage> {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -14,7 +14,10 @@ use std::time::Duration;
 use anyhow::{Context, bail, ensure};
 use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
-use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes};
+use pageserver_api::config::{
+    DiskUsageEvictionTaskConfig, MaxGetVectoredKeys, MaxVectoredReadBytes,
+    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, PostHogConfig,
+};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use pem::Pem;
@@ -24,7 +27,6 @@ use reqwest::Url;
 use storage_broker::Uri;
 use utils::id::{NodeId, TimelineId};
 use utils::logging::{LogFormat, SecretString};
-use utils::postgres_client::PostgresClientProtocol;

 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -58,11 +60,16 @@ pub struct PageServerConf {
    pub listen_http_addr: String,
    /// Example: 127.0.0.1:9899
    pub listen_https_addr: Option<String>,
+    /// If set, expose a gRPC API on this address.
+    /// Example: 127.0.0.1:51051
+    ///
+    /// EXPERIMENTAL: this protocol is unstable and under active development.
+    pub listen_grpc_addr: Option<String>,

-    /// Path to a file with certificate's private key for https API.
+    /// Path to a file with certificate's private key for https and gRPC API.
    /// Default: server.key
    pub ssl_key_file: Utf8PathBuf,
-    /// Path to a file with a X509 certificate for https API.
+    /// Path to a file with a X509 certificate for https and gRPC API.
    /// Default: server.crt
    pub ssl_cert_file: Utf8PathBuf,
    /// Period to reload certificate and private key from files.
@@ -100,6 +107,8 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
+    /// authentication method for gRPC connections from compute
+    pub grpc_auth_type: AuthType,
    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,
@@ -178,6 +187,9 @@ pub struct PageServerConf {

    pub max_vectored_read_bytes: MaxVectoredReadBytes,

+    /// Maximum number of keys to be read in a single get_vectored call.
+    pub max_get_vectored_keys: MaxGetVectoredKeys,
+
    pub image_compression: ImageCompressionAlgorithm,

    /// Whether to offload archived timelines automatically
@@ -198,8 +210,6 @@ pub struct PageServerConf {
    /// Optionally disable disk syncs (unsafe!)
    pub no_sync: bool,

-    pub wal_receiver_protocol: PostgresClientProtocol,
-
    pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,

    pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
@@ -231,6 +241,9 @@ pub struct PageServerConf {
    /// This is insecure and should only be used in development environments.
    pub dev_mode: bool,

+    /// PostHog integration config.
+    pub posthog_config: Option<PostHogConfig>,
+
    pub timeline_import_config: pageserver_api::config::TimelineImportConfig,

    pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
@@ -355,6 +368,7 @@ impl PageServerConf {
            listen_pg_addr,
            listen_http_addr,
            listen_https_addr,
+            listen_grpc_addr,
            ssl_key_file,
            ssl_cert_file,
            ssl_cert_reload_period,
@@ -369,6 +383,7 @@ impl PageServerConf {
            pg_distrib_dir,
            http_auth_type,
            pg_auth_type,
+            grpc_auth_type,
            auth_validation_public_key_path,
            remote_storage,
            broker_endpoint,
@@ -392,6 +407,7 @@ impl PageServerConf {
            secondary_download_concurrency,
            ingest_batch_size,
            max_vectored_read_bytes,
+            max_get_vectored_keys,
            image_compression,
            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
@@ -402,7 +418,6 @@ impl PageServerConf {
            virtual_file_io_engine,
            tenant_config,
            no_sync,
-            wal_receiver_protocol,
            page_service_pipelining,
            get_vectored_concurrent_io,
            enable_read_path_debugging,
@@ -412,6 +427,7 @@ impl PageServerConf {
            tracing,
            enable_tls_page_service_api,
            dev_mode,
+            posthog_config,
            timeline_import_config,
            basebackup_cache_config,
        } = config_toml;
@@ -423,6 +439,7 @@ impl PageServerConf {
            listen_pg_addr,
            listen_http_addr,
            listen_https_addr,
+            listen_grpc_addr,
            ssl_key_file,
            ssl_cert_file,
            ssl_cert_reload_period,
@@ -435,6 +452,7 @@ impl PageServerConf {
            max_file_descriptors,
            http_auth_type,
            pg_auth_type,
+            grpc_auth_type,
            auth_validation_public_key_path,
            remote_storage_config: remote_storage,
            broker_endpoint,
@@ -455,13 +473,13 @@ impl PageServerConf {
            secondary_download_concurrency,
            ingest_batch_size,
            max_vectored_read_bytes,
+            max_get_vectored_keys,
            image_compression,
            timeline_offloading,
            ephemeral_bytes_per_memory_kb,
            import_pgdata_upcall_api,
            import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
            import_pgdata_aws_endpoint_url,
-            wal_receiver_protocol,
            page_service_pipelining,
            get_vectored_concurrent_io,
            tracing,
@@ -525,13 +543,16 @@ impl PageServerConf {
                }
                None => Vec::new(),
            },
+            posthog_config,
        };

        // ------------------------------------------------------------
        // custom validation code that covers more than one field in isolation
        // ------------------------------------------------------------

-        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
+        if [conf.http_auth_type, conf.pg_auth_type, conf.grpc_auth_type]
+            .contains(&AuthType::NeonJWT)
+        {
            let auth_validation_public_key_path = conf
                .auth_validation_public_key_path
                .get_or_insert_with(|| workdir.join("auth_public_key.pem"));
@@ -580,6 +601,19 @@ impl PageServerConf {
                )
            })?;

+        if let PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
+            max_batch_size,
+            ..
+        }) = conf.page_service_pipelining
+        {
+            if max_batch_size.get() > conf.max_get_vectored_keys.get() {
+                return Err(anyhow::anyhow!(
+                    "`max_batch_size` ({max_batch_size}) must be less than or equal to `max_get_vectored_keys` ({})",
+                    conf.max_get_vectored_keys.get()
+                ));
+            }
+        };
+
        Ok(conf)
    }

@@ -667,6 +701,7 @@ impl ConfigurableSemaphore {
 mod tests {

    use camino::Utf8PathBuf;
+    use rstest::rstest;
    use utils::id::NodeId;

    use super::PageServerConf;
@@ -706,4 +741,28 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect_err("parse_and_validate should fail for endpoint without scheme");
    }
+
+    #[rstest]
+    #[case(32, 32, true)]
+    #[case(64, 32, false)]
+    #[case(64, 64, true)]
+    #[case(128, 128, true)]
+    fn test_config_max_batch_size_is_valid(
+        #[case] max_batch_size: usize,
+        #[case] max_get_vectored_keys: usize,
+        #[case] is_valid: bool,
+    ) {
+        let input = format!(
+            r#"
+            control_plane_api = "http://localhost:6666"
+            max_get_vectored_keys = {max_get_vectored_keys}
+            page_service_pipelining = {{ mode="pipelined", execution="concurrent-futures", max_batch_size={max_batch_size}, batching="uniform-lsn" }}
+        "#,
+        );
+        let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
+            .expect("config has valid fields");
+        let workdir = Utf8PathBuf::from("/nonexistent");
+        let result = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir);
+        assert_eq!(result.is_ok(), is_valid);
+    }
 }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -837,7 +837,30 @@ async fn collect_eviction_candidates(
                continue;
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
-            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+            debug!(
+                tenant_id=%tl.tenant_shard_id.tenant_id,
+                shard_id=%tl.tenant_shard_id.shard_slug(),
+                timeline_id=%tl.timeline_id,
+                "timeline resident layers count: {}", info.resident_layers.len()
+            );
+
+            tenant_candidates.extend(info.resident_layers.into_iter());
+            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
+
+            if cancel.is_cancelled() {
+                return Ok(EvictionCandidates::Cancelled);
+            }
+        }
+
+        // Also consider layers of timelines being imported for eviction
+        for tl in tenant.list_importing_timelines() {
+            let info = tl.timeline.get_local_layers_for_disk_usage_eviction().await;
+            debug!(
+                tenant_id=%tl.timeline.tenant_shard_id.tenant_id,
+                shard_id=%tl.timeline.tenant_shard_id.shard_slug(),
+                timeline_id=%tl.timeline.timeline_id,
+                "timeline resident layers count: {}", info.resident_layers.len()
+            );

            tenant_candidates.extend(info.resident_layers.into_iter());
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -0,0 +1,237 @@
+use std::{collections::HashMap, sync::Arc, time::Duration};
+
+use posthog_client_lite::{
+    CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
+    PostHogFlagFilterPropertyValue,
+};
+use remote_storage::RemoteStorageKind;
+use serde_json::json;
+use tokio_util::sync::CancellationToken;
+use utils::id::TenantId;
+
+use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};
+
+#[derive(Clone)]
+pub struct FeatureResolver {
+    inner: Option<Arc<FeatureResolverBackgroundLoop>>,
+    internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
+}
+
+impl FeatureResolver {
+    pub fn new_disabled() -> Self {
+        Self {
+            inner: None,
+            internal_properties: None,
+        }
+    }
+
+    pub fn spawn(
+        conf: &PageServerConf,
+        shutdown_pageserver: CancellationToken,
+        handle: &tokio::runtime::Handle,
+    ) -> anyhow::Result<Self> {
+        // DO NOT block in this function: make it return as fast as possible to avoid startup delays.
+        if let Some(posthog_config) = &conf.posthog_config {
+            let inner = FeatureResolverBackgroundLoop::new(
+                PostHogClientConfig {
+                    server_api_key: posthog_config.server_api_key.clone(),
+                    client_api_key: posthog_config.client_api_key.clone(),
+                    project_id: posthog_config.project_id.clone(),
+                    private_api_url: posthog_config.private_api_url.clone(),
+                    public_api_url: posthog_config.public_api_url.clone(),
+                },
+                shutdown_pageserver,
+            );
+            let inner = Arc::new(inner);
+
+            // The properties shared by all tenants on this pageserver.
+            let internal_properties = {
+                let mut properties = HashMap::new();
+                properties.insert(
+                    "pageserver_id".to_string(),
+                    PostHogFlagFilterPropertyValue::String(conf.id.to_string()),
+                );
+                if let Some(availability_zone) = &conf.availability_zone {
+                    properties.insert(
+                        "availability_zone".to_string(),
+                        PostHogFlagFilterPropertyValue::String(availability_zone.clone()),
+                    );
+                }
+                // Infer region based on the remote storage config.
+                if let Some(remote_storage) = &conf.remote_storage_config {
+                    match &remote_storage.storage {
+                        RemoteStorageKind::AwsS3(config) => {
+                            properties.insert(
+                                "region".to_string(),
+                                PostHogFlagFilterPropertyValue::String(format!(
+                                    "aws-{}",
+                                    config.bucket_region
+                                )),
+                            );
+                        }
+                        RemoteStorageKind::AzureContainer(config) => {
+                            properties.insert(
+                                "region".to_string(),
+                                PostHogFlagFilterPropertyValue::String(format!(
+                                    "azure-{}",
+                                    config.container_region
+                                )),
+                            );
+                        }
+                        RemoteStorageKind::LocalFs { .. } => {
+                            properties.insert(
+                                "region".to_string(),
+                                PostHogFlagFilterPropertyValue::String("local".to_string()),
+                            );
+                        }
+                    }
+                }
+                // TODO: add pageserver URL.
+                Arc::new(properties)
+            };
+            let fake_tenants = {
+                let mut tenants = Vec::new();
+                for i in 0..10 {
+                    let distinct_id = format!(
+                        "fake_tenant_{}_{}_{}",
+                        conf.availability_zone.as_deref().unwrap_or_default(),
+                        conf.id,
+                        i
+                    );
+                    let properties = Self::collect_properties_inner(
+                        distinct_id.clone(),
+                        Some(&internal_properties),
+                    );
+                    tenants.push(CaptureEvent {
+                        event: "initial_tenant_report".to_string(),
+                        distinct_id,
+                        properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties
+                    });
+                }
+                tenants
+            };
+            // TODO: make refresh period configurable
+            inner
+                .clone()
+                .spawn(handle, Duration::from_secs(60), fake_tenants);
+            Ok(FeatureResolver {
+                inner: Some(inner),
+                internal_properties: Some(internal_properties),
+            })
+        } else {
+            Ok(FeatureResolver {
+                inner: None,
+                internal_properties: None,
+            })
+        }
+    }
+
+    fn collect_properties_inner(
+        tenant_id: String,
+        internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
+    ) -> HashMap<String, PostHogFlagFilterPropertyValue> {
+        let mut properties = HashMap::new();
+        if let Some(internal_properties) = internal_properties {
+            for (key, value) in internal_properties.iter() {
+                properties.insert(key.clone(), value.clone());
+            }
+        }
+        properties.insert(
+            "tenant_id".to_string(),
+            PostHogFlagFilterPropertyValue::String(tenant_id),
+        );
+        properties
+    }
+
+    /// Collect all properties available for feature flag evaluation: the pageserver-wide ones plus `tenant_id`.
+    pub(crate) fn collect_properties(
+        &self,
+        tenant_id: TenantId,
+    ) -> HashMap<String, PostHogFlagFilterPropertyValue> {
+        Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
+    }
+
+    /// Evaluate a multivariate feature flag against the properties returned by `collect_properties`.
+    ///
+    /// Error handling: the caller should inspect the error and decide the behavior when a feature flag
+    /// cannot be evaluated (e.g., fall back to the default behavior if it cannot be resolved). The error
+    /// should *not* be propagated beyond where the feature flag gets resolved.
+    pub fn evaluate_multivariate(
+        &self,
+        flag_key: &str,
+        tenant_id: TenantId,
+    ) -> Result<String, PostHogEvaluationError> {
+        if let Some(inner) = &self.inner {
+            let res = inner.feature_store().evaluate_multivariate(
+                flag_key,
+                &tenant_id.to_string(),
+                &self.collect_properties(tenant_id),
+            );
+            match &res {
+                Ok(value) => {
+                    FEATURE_FLAG_EVALUATION
+                        .with_label_values(&[flag_key, "ok", value])
+                        .inc();
+                }
+                Err(e) => {
+                    FEATURE_FLAG_EVALUATION
+                        .with_label_values(&[flag_key, "error", e.as_variant_str()])
+                        .inc();
+                }
+            }
+            res
+        } else {
+            Err(PostHogEvaluationError::NotAvailable(
+                "PostHog integration is not enabled".to_string(),
+            ))
+        }
+    }
+
+    /// Evaluate a boolean feature flag against the properties returned by `collect_properties`.
+    ///
+    /// Returns `Ok(())` if the flag is evaluated to true, otherwise returns an error.
+    ///
+    /// Error handling: the caller should inspect the error and decide the behavior when a feature flag
+    /// cannot be evaluated (e.g., default to false if it cannot be resolved). The error should *not* be
+    /// propagated beyond where the feature flag gets resolved.
+    pub fn evaluate_boolean(
+        &self,
+        flag_key: &str,
+        tenant_id: TenantId,
+    ) -> Result<(), PostHogEvaluationError> {
+        if let Some(inner) = &self.inner {
+            let res = inner.feature_store().evaluate_boolean(
+                flag_key,
+                &tenant_id.to_string(),
+                &self.collect_properties(tenant_id),
+            );
+            match &res {
+                Ok(()) => {
+                    FEATURE_FLAG_EVALUATION
+                        .with_label_values(&[flag_key, "ok", "true"])
+                        .inc();
+                }
+                Err(e) => {
+                    FEATURE_FLAG_EVALUATION
+                        .with_label_values(&[flag_key, "error", e.as_variant_str()])
+                        .inc();
+                }
+            }
+            res
+        } else {
+            Err(PostHogEvaluationError::NotAvailable(
+                "PostHog integration is not enabled".to_string(),
+            ))
+        }
+    }
+
+    pub fn is_feature_flag_boolean(&self, flag_key: &str) -> Result<bool, PostHogEvaluationError> {
+        if let Some(inner) = &self.inner {
+            inner.feature_store().is_feature_flag_boolean(flag_key)
+        } else {
+            Err(PostHogEvaluationError::NotAvailable(
+                "PostHog integration is not enabled".to_string(),
+            ))
+        }
+    }
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -353,6 +353,33 @@ paths:
        "200":
          description: OK

+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/mark_invisible:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    put:
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                is_visible:
+                  type: boolean
+                  default: false
+      responses:
+        "200":
+          description: OK
+
  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
      - name: tenant_shard_id
@@ -626,6 +653,8 @@ paths:
                  format: hex
                pg_version:
                  type: integer
+                read_only:
+                  type: boolean
                existing_initdb_timeline_id:
                  type: string
                  format: hex
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -43,6 +43,7 @@ use pageserver_api::models::{
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
 use scopeguard::defer;
+use serde_json::json;
 use tenant_size_model::svg::SvgBranchKind;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio::time::Instant;
@@ -370,6 +371,18 @@ impl From<crate::tenant::secondary::SecondaryTenantError> for ApiError {
    }
 }

+impl From<crate::tenant::FinalizeTimelineImportError> for ApiError {
+    fn from(err: crate::tenant::FinalizeTimelineImportError) -> ApiError {
+        use crate::tenant::FinalizeTimelineImportError::*;
+        match err {
+            ImportTaskStillRunning => {
+                ApiError::ResourceUnavailable("Import task still running".into())
+            }
+            ShuttingDown => ApiError::ShuttingDown,
+        }
+    }
+}
+
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -572,6 +585,7 @@ async fn timeline_create_handler(
        TimelineCreateRequestMode::Branch {
            ancestor_timeline_id,
            ancestor_start_lsn,
+            read_only: _,
            pg_version: _,
        } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch {
            new_timeline_id,
@@ -3532,10 +3546,7 @@ async fn activate_post_import_handler(

        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

-        tenant
-            .finalize_importing_timeline(timeline_id)
-            .await
-            .map_err(ApiError::InternalServerError)?;
+        tenant.finalize_importing_timeline(timeline_id).await?;

        match tenant.get_timeline(timeline_id, false) {
            Ok(_timeline) => {
@@ -3653,6 +3664,47 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
    Ok(())
 }

+async fn tenant_evaluate_feature_flag(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let flag: String = must_parse_query_param(&request, "flag")?;
+    let as_type: String = must_parse_query_param(&request, "as")?;
+
+    let state = get_state(&request);
+
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+        let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
+        if as_type == "boolean" {
+            let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
+            let result = result.map(|_| true).map_err(|e| e.to_string());
+            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
+        } else if as_type == "multivariate" {
+            let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
+            json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
+        } else {
+            // Any other `as` value: ask the feature store whether the flag is boolean and evaluate accordingly.
+            let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
+            if is_boolean {
+                let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
+                let result = result.map(|_| true).map_err(|e| e.to_string());
+                json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
+            } else {
+                let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
+                json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
+            }
+        }
+    }
+    .instrument(info_span!("tenant_evaluate_feature_flag", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
+    .await
+}
+
 /// Common functionality of all the HTTP API handlers.
 ///
 /// - Adds a tracing span to each request (by `request_span`)
@@ -4029,5 +4081,8 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import",
            |r| api_handler(r, activate_post_import_handler),
        )
+        .get("/v1/tenant/:tenant_shard_id/feature_flag", |r| {
+            api_handler(r, tenant_evaluate_feature_flag)
+        })
        .any(handler_404))
 }
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -10,6 +10,7 @@ pub mod context;
 pub mod controller_upcall_client;
 pub mod deletion_queue;
 pub mod disk_usage_eviction_task;
+pub mod feature_resolver;
 pub mod http;
 pub mod import_datadir;
 pub mod l0_flush;
@@ -84,6 +85,7 @@ pub async fn shutdown_pageserver(
    http_listener: HttpEndpointListener,
    https_listener: Option<HttpsEndpointListener>,
    page_service: page_service::Listener,
+    grpc_task: Option<CancellableTask>,
    consumption_metrics_worker: ConsumptionMetricsTasks,
    disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
    tenant_manager: &TenantManager,
@@ -177,6 +179,16 @@ pub async fn shutdown_pageserver(
    )
    .await;

+    // Shut down the gRPC server task, including request handlers.
+    if let Some(grpc_task) = grpc_task {
+        timed(
+            grpc_task.shutdown(),
+            "shutdown gRPC PageRequestHandler",
+            Duration::from_secs(3),
+        )
+        .await;
+    }
+
    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
    timed(
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -15,6 +15,7 @@ use metrics::{
    register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;
+use pageserver_api::config::defaults::DEFAULT_MAX_GET_VECTORED_KEYS;
 use pageserver_api::config::{
    PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
@@ -32,7 +33,6 @@ use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
 use crate::pgdatadir_mapping::DatadirModificationStats;
 use crate::task_mgr::TaskKind;
-use crate::tenant::Timeline;
 use crate::tenant::layer_map::LayerMap;
 use crate::tenant::mgr::TenantSlot;
 use crate::tenant::storage_layer::{InMemoryLayer, PersistentLayerDesc};
@@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static FEATURE_FLAG_EVALUATION: Lazy<CounterVec> = Lazy::new(|| {
+    register_counter_vec!(
+        "pageserver_feature_flag_evaluation",
+        "Number of times a feature flag is evaluated",
+        &["flag_key", "status", "value"], // status: "ok"/"error"; value: evaluated value or error variant
+    )
+    .expect("failed to define a metric") // same panic message as the other metric definitions in this file
+});
+
 #[derive(IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum PageCacheErrorKind {
@@ -1312,11 +1321,44 @@ impl EvictionsWithLowResidenceDuration {
 //
 // Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.000030, // 30 usec
-    0.001000, // 1000 usec
-    0.030,    // 30 ms
-    1.000,    // 1000 ms
-    30.000,   // 30000 ms
+    0.00005,  // 50us
+    0.00006,  // 60us
+    0.00007,  // 70us
+    0.00008,  // 80us
+    0.00009,  // 90us
+    0.0001,   // 100us
+    0.000110, // 110us
+    0.000120, // 120us
+    0.000130, // 130us
+    0.000140, // 140us
+    0.000150, // 150us
+    0.000160, // 160us
+    0.000170, // 170us
+    0.000180, // 180us
+    0.000190, // 190us
+    0.000200, // 200us
+    0.000210, // 210us
+    0.000220, // 220us
+    0.000230, // 230us
+    0.000240, // 240us
+    0.000250, // 250us
+    0.000300, // 300us
+    0.000350, // 350us
+    0.000400, // 400us
+    0.000450, // 450us
+    0.000500, // 500us
+    0.000600, // 600us
+    0.000700, // 700us
+    0.000800, // 800us
+    0.000900, // 900us
+    0.001000, // 1ms
+    0.002000, // 2ms
+    0.003000, // 3ms
+    0.004000, // 4ms
+    0.005000, // 5ms
+    0.01000,  // 10ms
+    0.02000,  // 20ms
+    0.05000,  // 50ms
 ];

 /// VirtualFile fs operation variants.
@@ -1906,7 +1948,7 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
 });

 static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy<Vec<f64>> = Lazy::new(|| {
-    (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap())
+    (1..=u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap())
        .map(|v| v.into())
        .collect()
 });
@@ -1924,7 +1966,7 @@ static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy<Vec<f64>> = Lazy::new(
    let mut buckets = Vec::new();
    for i in 0.. {
        let bucket = 1 << i;
-        if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() {
+        if bucket > u32::try_from(DEFAULT_MAX_GET_VECTORED_KEYS).unwrap() {
            break;
        }
        buckets.push(bucket.into());
@@ -2234,8 +2276,10 @@ impl BasebackupQueryTimeOngoingRecording<'_> {
        // If you want to change categorize of a specific error, also change it in `log_query_error`.
        let metric = match res {
            Ok(_) => &self.parent.ok,
-            Err(QueryError::Shutdown) => {
-                // Do not observe ok/err for shutdown
+            Err(QueryError::Shutdown) | Err(QueryError::Reconnect) => {
+                // Do not observe ok/err for shutdown/reconnect.
+                // Reconnect error might be raised when the operation is waiting for LSN and the tenant shutdown interrupts
+                // the operation. A reconnect error will be issued and the client will retry.
                return;
            }
            Err(QueryError::Disconnected(ConnectionError::Io(io_error)))
@@ -2811,7 +2855,6 @@ pub(crate) struct WalIngestMetrics {
    pub(crate) records_received: IntCounter,
    pub(crate) records_observed: IntCounter,
    pub(crate) records_committed: IntCounter,
-    pub(crate) records_filtered: IntCounter,
    pub(crate) values_committed_metadata_images: IntCounter,
    pub(crate) values_committed_metadata_deltas: IntCounter,
    pub(crate) values_committed_data_images: IntCounter,
@@ -2867,11 +2910,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
        "Number of WAL records which resulted in writes to pageserver storage"
    )
    .expect("failed to define a metric"),
-    records_filtered: register_int_counter!(
-        "pageserver_wal_ingest_records_filtered",
-        "Number of WAL records filtered out due to sharding"
-    )
-    .expect("failed to define a metric"),
    values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
    values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
    values_committed_data_images: values_committed.with_label_values(&["data", "image"]),
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -431,10 +431,10 @@ impl Timeline {
                        GetVectoredError::InvalidLsn(e) => {
                            Err(anyhow::anyhow!("invalid LSN: {e:?}").into())
                        }
-                        // NB: this should never happen in practice because we limit MAX_GET_VECTORED_KEYS
+                        // NB: this should never happen in practice because config validation limits the batch size to at most max_get_vectored_keys
                        // TODO: we can prevent this error class by moving this check into the type system
-                        GetVectoredError::Oversized(err) => {
-                            Err(anyhow::anyhow!("batching oversized: {err:?}").into())
+                        GetVectoredError::Oversized(err, max) => {
+                            Err(anyhow::anyhow!("batching oversized: {err} > {max}").into())
                        }
                    };

@@ -471,8 +471,19 @@ impl Timeline {

        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

+        if rels.is_empty() {
+            return Ok(0);
+        }
+
+        // Pre-deserialize the rel directory to avoid duplicated work in `get_relsize_cached`.
+        let reldir_key = rel_dir_to_key(spcnode, dbnode);
+        let buf = version.get(self, reldir_key, ctx).await?;
+        let reldir = RelDirectory::des(&buf)?;
+
        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, version, ctx).await?;
+            let n_blocks = self
+                .get_rel_size_in_reldir(rel, version, Some((reldir_key, &reldir)), ctx)
+                .await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
@@ -487,6 +498,19 @@ impl Timeline {
        tag: RelTag,
        version: Version<'_>,
        ctx: &RequestContext,
+    ) -> Result<BlockNumber, PageReconstructError> {
+        self.get_rel_size_in_reldir(tag, version, None, ctx).await
+    }
+
+    /// Get size of a relation file. The relation must exist, otherwise an error is returned.
+    ///
+    /// See [`Self::get_rel_exists_in_reldir`] on why we need `deserialized_reldir_v1`.
+    pub(crate) async fn get_rel_size_in_reldir(
+        &self,
+        tag: RelTag,
+        version: Version<'_>,
+        deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
+        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        if tag.relnode == 0 {
            return Err(PageReconstructError::Other(
@@ -499,7 +523,9 @@ impl Timeline {
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, version, ctx).await?
+            && !self
+                .get_rel_exists_in_reldir(tag, version, deserialized_reldir_v1, ctx)
+                .await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -521,11 +547,28 @@ impl Timeline {
    ///
    /// Only shard 0 has a full view of the relations. Other shards only know about relations that
    /// the shard stores pages for.
+    ///
    pub(crate) async fn get_rel_exists(
        &self,
        tag: RelTag,
        version: Version<'_>,
        ctx: &RequestContext,
+    ) -> Result<bool, PageReconstructError> {
+        self.get_rel_exists_in_reldir(tag, version, None, ctx).await
+    }
+
+    /// Does the relation exist? With a cached deserialized `RelDirectory`.
+    ///
+    /// There are some cases where the caller loops across all relations. In that specific case,
+    /// the caller should obtain the deserialized `RelDirectory` first and then call this function
+    /// to avoid duplicated work of deserialization. This is a hack and should be removed by introducing
+    /// a new API (e.g., `get_rel_exists_batched`).
+    pub(crate) async fn get_rel_exists_in_reldir(
+        &self,
+        tag: RelTag,
+        version: Version<'_>,
+        deserialized_reldir_v1: Option<(Key, &RelDirectory)>,
+        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        if tag.relnode == 0 {
            return Err(PageReconstructError::Other(
@@ -568,6 +611,17 @@ impl Timeline {
        // fetch directory listing (old)

        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
+
+        if let Some((cached_key, dir)) = deserialized_reldir_v1 {
+            if cached_key == key {
+                return Ok(dir.rels.contains(&(tag.relnode, tag.forknum)));
+            } else if cfg!(test) || cfg!(feature = "testing") {
+                panic!("cached reldir key mismatch: {cached_key} != {key}");
+            } else {
+                warn!("cached reldir key mismatch: {cached_key} != {key}");
+            }
+            // Fall back to reading the directory from the datadir.
+        }
        let buf = version.get(self, key, ctx).await?;

        let dir = RelDirectory::des(&buf)?;
@@ -665,7 +719,7 @@ impl Timeline {

        let batches = keyspace.partition(
            self.get_shard_identity(),
-            Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+            self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
        );

        let io_concurrency = IoConcurrency::spawn_from_conf(
@@ -905,7 +959,7 @@ impl Timeline {

            let batches = keyspace.partition(
                self.get_shard_identity(),
-                Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+                self.conf.max_get_vectored_keys.get() as u64 * BLCKSZ as u64,
            );

            let io_concurrency = IoConcurrency::spawn_from_conf(
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -276,9 +276,10 @@ pub enum TaskKind {
    // HTTP endpoint listener.
    HttpEndpointListener,

-    // Task that handles a single connection. A PageRequestHandler task
-    // starts detached from any particular tenant or timeline, but it can be
-    // associated with one later, after receiving a command from the client.
+    /// Task that handles a single page service connection. A PageRequestHandler
+    /// task starts detached from any particular tenant or timeline, but it can
+    /// be associated with one later, after receiving a command from the client.
+    /// Also used for the gRPC page service API, including the main server task.
    PageRequestHandler,

    /// Manages the WAL receiver connection for one timeline.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -84,6 +84,7 @@ use crate::context;
 use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
+use crate::feature_resolver::FeatureResolver;
 use crate::l0_flush::L0FlushGlobalState;
 use crate::metrics::{
    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
@@ -159,6 +160,7 @@ pub struct TenantSharedResources {
    pub deletion_queue_client: DeletionQueueClient,
    pub l0_flush_global_state: L0FlushGlobalState,
    pub basebackup_prepare_sender: BasebackupPrepareSender,
+    pub feature_resolver: FeatureResolver,
 }

 /// A [`TenantShard`] is really an _attached_ tenant.  The configuration
@@ -298,7 +300,7 @@ pub struct TenantShard {
    ///   as in progress.
    /// * Imported timelines are removed when the storage controller calls the post timeline
    ///   import activation endpoint.
-    timelines_importing: std::sync::Mutex<HashMap<TimelineId, ImportingTimeline>>,
+    timelines_importing: std::sync::Mutex<HashMap<TimelineId, Arc<ImportingTimeline>>>,

    /// The last tenant manifest known to be in remote storage. None if the manifest has not yet
    /// been either downloaded or uploaded. Always Some after tenant attach.
@@ -380,6 +382,8 @@ pub struct TenantShard {
    pub(crate) gc_block: gc_block::GcBlock,

    l0_flush_global_state: L0FlushGlobalState,
+
+    pub(crate) feature_resolver: FeatureResolver,
 }
 impl std::fmt::Debug for TenantShard {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -668,6 +672,7 @@ pub enum MaybeOffloaded {
 pub enum TimelineOrOffloaded {
    Timeline(Arc<Timeline>),
    Offloaded(Arc<OffloadedTimeline>),
+    Importing(Arc<ImportingTimeline>),
 }

 impl TimelineOrOffloaded {
@@ -679,6 +684,9 @@ impl TimelineOrOffloaded {
            TimelineOrOffloaded::Offloaded(offloaded) => {
                TimelineOrOffloadedArcRef::Offloaded(offloaded)
            }
+            TimelineOrOffloaded::Importing(importing) => {
+                TimelineOrOffloadedArcRef::Importing(importing)
+            }
        }
    }
    pub fn tenant_shard_id(&self) -> TenantShardId {
@@ -691,12 +699,16 @@ impl TimelineOrOffloaded {
        match self {
            TimelineOrOffloaded::Timeline(timeline) => &timeline.delete_progress,
            TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
+            TimelineOrOffloaded::Importing(importing) => &importing.delete_progress,
        }
    }
    fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
        match self {
            TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
            TimelineOrOffloaded::Offloaded(_offloaded) => None,
+            TimelineOrOffloaded::Importing(importing) => {
+                Some(importing.timeline.remote_client.clone())
+            }
        }
    }
 }
@@ -704,6 +716,7 @@ impl TimelineOrOffloaded {
 pub enum TimelineOrOffloadedArcRef<'a> {
    Timeline(&'a Arc<Timeline>),
    Offloaded(&'a Arc<OffloadedTimeline>),
+    Importing(&'a Arc<ImportingTimeline>),
 }

 impl TimelineOrOffloadedArcRef<'_> {
@@ -711,12 +724,14 @@ impl TimelineOrOffloadedArcRef<'_> {
        match self {
            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id,
            TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id,
+            TimelineOrOffloadedArcRef::Importing(importing) => importing.timeline.tenant_shard_id,
        }
    }
    pub fn timeline_id(&self) -> TimelineId {
        match self {
            TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id,
            TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id,
+            TimelineOrOffloadedArcRef::Importing(importing) => importing.timeline.timeline_id,
        }
    }
 }
@@ -733,6 +748,12 @@ impl<'a> From<&'a Arc<OffloadedTimeline>> for TimelineOrOffloadedArcRef<'a> {
    }
 }

+impl<'a> From<&'a Arc<ImportingTimeline>> for TimelineOrOffloadedArcRef<'a> {
+    fn from(timeline: &'a Arc<ImportingTimeline>) -> Self {
+        Self::Importing(timeline)
+    }
+}
+
 #[derive(Debug, thiserror::Error, PartialEq, Eq)]
 pub enum GetTimelineError {
    #[error("Timeline is shutting down")]
@@ -860,6 +881,14 @@ impl Debug for SetStoppingError {
    }
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum FinalizeTimelineImportError {
+    #[error("Import task not done yet")]
+    ImportTaskStillRunning,
+    #[error("Shutting down")]
+    ShuttingDown,
+}
+
 /// Arguments to [`TenantShard::create_timeline`].
 ///
 /// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
@@ -1146,10 +1175,20 @@ impl TenantShard {
            ctx,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
-        anyhow::ensure!(
-            disk_consistent_lsn.is_valid(),
-            "Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
+
+        if !disk_consistent_lsn.is_valid() {
+            // As opposed to normal timelines which get initialised with a disk consistent LSN
+            // via initdb, imported timelines start from 0. If the import task stops before
+            // it advances the disk consistent LSN, allow it to resume.
+            let in_progress_import = import_pgdata
+                .as_ref()
+                .map(|import| !import.is_done())
+                .unwrap_or(false);
+            if !in_progress_import {
+                anyhow::bail!("Timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn");
+            }
+        }
+
        assert_eq!(
            disk_consistent_lsn,
            metadata.disk_consistent_lsn(),
@@ -1243,20 +1282,25 @@ impl TenantShard {
                    }
                }

-                // Sanity check: a timeline should have some content.
-                anyhow::ensure!(
-                    ancestor.is_some()
-                        || timeline
-                            .layers
-                            .read()
-                            .await
-                            .layer_map()
-                            .expect("currently loading, layer manager cannot be shutdown already")
-                            .iter_historic_layers()
-                            .next()
-                            .is_some(),
-                    "Timeline has no ancestor and no layer files"
-                );
+                if disk_consistent_lsn.is_valid() {
+                    // Sanity check: a timeline should have some content.
+                    // Exception: importing timelines might not yet have any content.
+                    anyhow::ensure!(
+                        ancestor.is_some()
+                            || timeline
+                                .layers
+                                .read()
+                                .await
+                                .layer_map()
+                                .expect(
+                                    "currently loading, layer manager cannot be shutdown already"
+                                )
+                                .iter_historic_layers()
+                                .next()
+                                .is_some(),
+                        "Timeline has no ancestor and no layer files"
+                    );
+                }

                Ok(TimelineInitAndSyncResult::ReadyToActivate)
            }
@@ -1292,6 +1336,7 @@ impl TenantShard {
            deletion_queue_client,
            l0_flush_global_state,
            basebackup_prepare_sender,
+            feature_resolver,
        } = resources;

        let attach_mode = attached_conf.location.attach_mode;
@@ -1308,6 +1353,7 @@ impl TenantShard {
            deletion_queue_client,
            l0_flush_global_state,
            basebackup_prepare_sender,
+            feature_resolver,
        ));

        // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
@@ -1760,20 +1806,25 @@ impl TenantShard {
                    },
                ) => {
                    let timeline_id = timeline.timeline_id;
+                    let import_task_gate = Gate::default();
+                    let import_task_guard = import_task_gate.enter().unwrap();
                    let import_task_handle =
                        tokio::task::spawn(self.clone().create_timeline_import_pgdata_task(
                            timeline.clone(),
                            import_pgdata,
                            guard,
+                            import_task_guard,
                            ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
                        ));

                    let prev = self.timelines_importing.lock().unwrap().insert(
                        timeline_id,
-                        ImportingTimeline {
+                        Arc::new(ImportingTimeline {
                            timeline: timeline.clone(),
                            import_task_handle,
-                        },
+                            import_task_gate,
+                            delete_progress: TimelineDeleteProgress::default(),
+                        }),
                    );

                    assert!(prev.is_none());
@@ -2391,6 +2442,17 @@ impl TenantShard {
            .collect()
    }

+    /// Lists the importing timelines tracked by this tenant (a snapshot of `timelines_importing`).
+    /// It's up to callers to omit certain timelines that are not considered ready for use.
+    pub fn list_importing_timelines(&self) -> Vec<Arc<ImportingTimeline>> {
+        self.timelines_importing
+            .lock()
+            .unwrap()
+            .values()
+            .map(Arc::clone)
+            .collect()
+    }
+
    /// Lists timelines the tenant manages, including offloaded ones.
    ///
    /// It's up to callers to omit certain timelines that are not considered ready for use.
@@ -2824,19 +2886,25 @@ impl TenantShard {

        let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself();

+        let import_task_gate = Gate::default();
+        let import_task_guard = import_task_gate.enter().unwrap();
+
        let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task(
            timeline.clone(),
            index_part,
            timeline_create_guard,
+            import_task_guard,
            timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn),
        ));

        let prev = self.timelines_importing.lock().unwrap().insert(
            timeline.timeline_id,
-            ImportingTimeline {
+            Arc::new(ImportingTimeline {
                timeline: timeline.clone(),
                import_task_handle,
-            },
+                import_task_gate,
+                delete_progress: TimelineDeleteProgress::default(),
+            }),
        );

        // Idempotency is enforced higher up the stack
@@ -2854,13 +2922,13 @@ impl TenantShard {
    pub(crate) async fn finalize_importing_timeline(
        &self,
        timeline_id: TimelineId,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), FinalizeTimelineImportError> {
        let timeline = {
            let locked = self.timelines_importing.lock().unwrap();
            match locked.get(&timeline_id) {
                Some(importing_timeline) => {
                    if !importing_timeline.import_task_handle.is_finished() {
-                        return Err(anyhow::anyhow!("Import task not done yet"));
+                        return Err(FinalizeTimelineImportError::ImportTaskStillRunning);
                    }

                    importing_timeline.timeline.clone()
@@ -2873,8 +2941,13 @@ impl TenantShard {

        timeline
            .remote_client
-            .schedule_index_upload_for_import_pgdata_finalize()?;
-        timeline.remote_client.wait_completion().await?;
+            .schedule_index_upload_for_import_pgdata_finalize()
+            .map_err(|_err| FinalizeTimelineImportError::ShuttingDown)?;
+        timeline
+            .remote_client
+            .wait_completion()
+            .await
+            .map_err(|_err| FinalizeTimelineImportError::ShuttingDown)?;

        self.timelines_importing
            .lock()
@@ -2890,6 +2963,7 @@ impl TenantShard {
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        timeline_create_guard: TimelineCreateGuard,
+        _import_task_guard: GateGuard,
        ctx: RequestContext,
    ) {
        debug_assert_current_span_has_tenant_and_timeline_id();
@@ -3135,11 +3209,18 @@ impl TenantShard {
                        .or_insert_with(|| Arc::new(GcCompactionQueue::new()))
                        .clone()
                };
+                let gc_compaction_strategy = self
+                    .feature_resolver
+                    .evaluate_multivariate("gc-comapction-strategy", self.tenant_shard_id.tenant_id)
+                    .ok();
+                let span = if let Some(gc_compaction_strategy) = gc_compaction_strategy {
+                    info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id, strategy = %gc_compaction_strategy)
+                } else {
+                    info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id)
+                };
                outcome = queue
                    .iteration(cancel, ctx, &self.gc_block, &timeline)
-                    .instrument(
-                        info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id),
-                    )
+                    .instrument(span)
                    .await?;
            }

@@ -3471,8 +3552,9 @@ impl TenantShard {
            let mut timelines_importing = self.timelines_importing.lock().unwrap();
            timelines_importing
                .drain()
-                .for_each(|(_timeline_id, importing_timeline)| {
-                    importing_timeline.shutdown();
+                .for_each(|(timeline_id, importing_timeline)| {
+                    let span = tracing::info_span!("importing_timeline_shutdown", %timeline_id);
+                    js.spawn(async move { importing_timeline.shutdown().instrument(span).await });
                });
        }
        // test_long_timeline_create_then_tenant_delete is leaning on this message
@@ -3793,6 +3875,9 @@ impl TenantShard {
                        .build_timeline_client(offloaded.timeline_id, self.remote_storage.clone());
                    Arc::new(remote_client)
                }
+                TimelineOrOffloadedArcRef::Importing(_) => {
+                    unreachable!("Importing timelines are not included in the iterator")
+                }
            };

            // Shut down the timeline's remote client: this means that the indices we write
@@ -4247,6 +4332,7 @@ impl TenantShard {
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
        basebackup_prepare_sender: BasebackupPrepareSender,
+        feature_resolver: FeatureResolver,
    ) -> TenantShard {
        assert!(!attached_conf.location.generation.is_none());

@@ -4351,6 +4437,7 @@ impl TenantShard {
            gc_block: Default::default(),
            l0_flush_global_state,
            basebackup_prepare_sender,
+            feature_resolver,
        }
    }

@@ -5000,6 +5087,14 @@ impl TenantShard {
                info!("timeline already exists but is offloaded");
                Err(CreateTimelineError::Conflict)
            }
+            Err(TimelineExclusionError::AlreadyExists {
+                existing: TimelineOrOffloaded::Importing(_existing),
+                ..
+            }) => {
+                // If there's a timeline already importing, then we would hit
+                // the [`TimelineExclusionError::AlreadyCreating`] branch above.
+                unreachable!("Importing timelines hold the creation guard")
+            }
            Err(TimelineExclusionError::AlreadyExists {
                existing: TimelineOrOffloaded::Timeline(existing),
                arg,
@@ -5271,6 +5366,7 @@ impl TenantShard {
            l0_compaction_trigger: self.l0_compaction_trigger.clone(),
            l0_flush_global_state: self.l0_flush_global_state.clone(),
            basebackup_prepare_sender: self.basebackup_prepare_sender.clone(),
+            feature_resolver: self.feature_resolver.clone(),
        }
    }

@@ -5736,6 +5832,7 @@ pub(crate) mod harness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: pageserver_api::models::TenantConfig,
        pub tenant_shard_id: TenantShardId,
+        pub shard_identity: ShardIdentity,
        pub generation: Generation,
        pub shard: ShardIndex,
        pub remote_storage: GenericRemoteStorage,
@@ -5803,6 +5900,7 @@ pub(crate) mod harness {
                conf,
                tenant_conf,
                tenant_shard_id,
+                shard_identity,
                generation,
                shard,
                remote_storage,
@@ -5864,8 +5962,7 @@ pub(crate) mod harness {
                    &ShardParameters::default(),
                ))
                .unwrap(),
-                // This is a legacy/test code path: sharding isn't supported here.
-                ShardIdentity::unsharded(),
+                self.shard_identity,
                Some(walredo_mgr),
                self.tenant_shard_id,
                self.remote_storage.clone(),
@@ -5873,6 +5970,7 @@ pub(crate) mod harness {
                // TODO: ideally we should run all unit tests with both configs
                L0FlushGlobalState::new(L0FlushConfig::default()),
                basebackup_requst_sender,
+                FeatureResolver::new_disabled(),
            ));

            let preload = tenant
@@ -5986,6 +6084,7 @@ mod tests {
    use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
    use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
    use utils::id::TenantId;
+    use utils::shard::{ShardCount, ShardNumber};

    use super::*;
    use crate::DEFAULT_PG_VERSION;
@@ -7098,7 +7197,7 @@ mod tests {
            let end = desc
                .key_range
                .start
-                .add(Timeline::MAX_GET_VECTORED_KEYS.try_into().unwrap());
+                .add(tenant.conf.max_get_vectored_keys.get() as u32);
            reads.push(KeySpace {
                ranges: vec![start..end],
            });
@@ -8314,10 +8413,24 @@ mod tests {
            }

            tline.freeze_and_flush().await?;
+            // Force layers to L1
+            tline
+                .compact(
+                    &cancel,
+                    {
+                        let mut flags = EnumSet::new();
+                        flags.insert(CompactFlags::ForceL0Compaction);
+                        flags
+                    },
+                    &ctx,
+                )
+                .await?;

            if iter % 5 == 0 {
+                let scan_lsn = Lsn(lsn.0 + 1);
+                info!("scanning at {}", scan_lsn);
                let (_, before_delta_file_accessed) =
-                    scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
+                    scan_with_statistics(&tline, &keyspace, scan_lsn, &ctx, io_concurrency.clone())
                        .await?;
                tline
                    .compact(
@@ -8326,13 +8439,14 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
+                            flags.insert(CompactFlags::ForceL0Compaction);
                            flags
                        },
                        &ctx,
                    )
                    .await?;
                let (_, after_delta_file_accessed) =
-                    scan_with_statistics(&tline, &keyspace, lsn, &ctx, io_concurrency.clone())
+                    scan_with_statistics(&tline, &keyspace, scan_lsn, &ctx, io_concurrency.clone())
                        .await?;
                assert!(
                    after_delta_file_accessed < before_delta_file_accessed,
@@ -8773,6 +8887,8 @@ mod tests {

        let cancel = CancellationToken::new();

+        // Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
+        tline.force_set_disk_consistent_lsn(Lsn(0x40));
        tline
            .compact(
                &cancel,
@@ -8786,8 +8902,7 @@ mod tests {
            )
            .await
            .unwrap();
-
-        // Image layers are created at last_record_lsn
+        // Image layers are created at repartition LSN
        let images = tline
            .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
            .await
@@ -9305,6 +9420,77 @@ mod tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn test_failed_flush_should_not_update_disk_consistent_lsn() -> anyhow::Result<()> {
+        //
+        // Setup: shard 0 of a 4-shard tenant with one write at LSN 0x20, flushed, with remote uploads awaited
+        //
+        let harness = TenantHarness::create_custom(
+            "test_failed_flush_should_not_upload_disk_consistent_lsn",
+            pageserver_api::models::TenantConfig::default(),
+            TenantId::generate(),
+            ShardIdentity::new(ShardNumber(0), ShardCount(4), ShardStripeSize(128)).unwrap(),
+            Generation::new(1),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let timeline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        assert_eq!(timeline.get_shard_identity().count, ShardCount(4));
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                *TEST_KEY,
+                Lsn(0x20),
+                &Value::Image(test_img("foo at 0x20")),
+                &ctx,
+            )
+            .await?;
+        writer.finish_write(Lsn(0x20));
+        drop(writer);
+        timeline.freeze_and_flush().await.unwrap();
+
+        timeline.remote_client.wait_completion().await.unwrap();
+        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
+        let remote_consistent_lsn = timeline.get_remote_consistent_lsn_projected();
+        assert_eq!(Some(disk_consistent_lsn), remote_consistent_lsn);
+
+        //
+        // Test: write at LSN 0x30, then arm a failpoint so that the next flush is expected to fail
+        //
+
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                *TEST_KEY,
+                Lsn(0x30),
+                &Value::Image(test_img("foo at 0x30")),
+                &ctx,
+            )
+            .await?;
+        writer.finish_write(Lsn(0x30));
+        drop(writer);
+
+        fail::cfg(
+            "flush-layer-before-update-remote-consistent-lsn",
+            "return()",
+        )
+        .unwrap();
+
+        let flush_res = timeline.freeze_and_flush().await;
+        // If the flush failed, neither the disk nor the remote consistent LSN should have advanced.
+        assert!(flush_res.is_err());
+        assert_eq!(disk_consistent_lsn, timeline.get_disk_consistent_lsn());
+        assert_eq!(
+            remote_consistent_lsn,
+            timeline.get_remote_consistent_lsn_projected()
+        );
+
+        Ok(())
+    }
+
    #[cfg(feature = "testing")]
    #[tokio::test]
    async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> {
@@ -11074,11 +11260,11 @@ mod tests {
                let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
                let mut used_keys: HashSet<Key> = HashSet::default();

-                while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
                    let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));

-                    while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                    while used_keys.len() < tenant.conf.max_get_vectored_keys.get() {
                        if used_keys.contains(&selected_key)
                            || selected_key >= start_key.add(KEY_DIMENSION_SIZE)
                        {
--- a/Show More
+++ b/Show More