wip: deserialize and serialize nice bytes

this is nice, but fails in tests. looks like: { "Finished": { "before": { "config": { "evict_bytes": "92.23MiB" }, "freed_bytes": "0B" }, "planned": { "respecting_tenant_min_resident_size": { "config": { "evict_bytes": "92.23MiB" }, "freed_bytes": "95.28MiB" }, "fallback_to_global_lru": null }, "assumed": { "projected_after": { "config": { "evict_bytes": "92.23MiB" }, "freed_bytes": "95.28MiB" }, "failed": { "file_sizes": 0, "count": 0 } } } }
refactor: ubyte has From<usize>
2026-01-25 06:10:37 +00:00 · 2023-03-21 12:10:34 +02:00 · 2023-03-21 12:10:03 +02:00 · 2023-03-20 17:34:07 +01:00 · 2023-03-20 17:24:46 +01:00 · 2023-03-20 17:18:15 +01:00
196 changed files with 8928 additions and 4285 deletions
--- a/.github/ansible/deploy.yaml
+++ b/.github/ansible/deploy.yaml
@@ -91,6 +91,15 @@
      tags:
      - pageserver

+    # used in `pageserver.service` template
+    - name: learn current availability_zone
+      shell:
+        cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone"
+      register: ec2_availability_zone
+
+    - set_fact:
+        ec2_availability_zone: "{{ ec2_availability_zone.stdout }}"
+
    - name: upload systemd service definition
      ansible.builtin.template:
        src: systemd/pageserver.service
@@ -118,7 +127,7 @@
        cmd: |
          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
-          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
+          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers
      tags:
      - pageserver

@@ -153,6 +162,15 @@
      tags:
      - safekeeper

+    # used in `safekeeper.service` template
+    - name: learn current availability_zone
+      shell:
+        cmd: "curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone"
+      register: ec2_availability_zone
+
+    - set_fact:
+        ec2_availability_zone: "{{ ec2_availability_zone.stdout }}"
+
    # in the future safekeepers should discover pageservers byself
    # but currently use first pageserver that was discovered
    - name: set first pageserver var for safekeepers
@@ -188,6 +206,6 @@
        cmd: |
          INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version
-          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
+          curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers
      tags:
      - safekeeper
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -27,6 +27,8 @@ storage:
          ansible_host:  i-0cd8d316ecbb715be
        pageserver-1.eu-central-1.aws.neon.tech:
          ansible_host:  i-090044ed3d383fef0
+        pageserver-2.eu-central-1.aws.neon.tech:
+          ansible_host:  i-033584edf3f4b6742

    safekeepers:
      hosts:
--- a/.github/ansible/scripts/init_pageserver.sh
+++ b/.github/ansible/scripts/init_pageserver.sh
@@ -26,7 +26,7 @@ EOF
 if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/${INSTANCE_ID} -o /dev/null; then

    # not registered, so register it now
-    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')
+    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" {{ console_mgmt_base_url }}/management/api/v2/pageservers -d@/tmp/payload | jq -r '.id')

    # init pageserver
    sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
--- a/.github/ansible/scripts/init_safekeeper.sh
+++ b/.github/ansible/scripts/init_safekeeper.sh
@@ -25,7 +25,7 @@ EOF
 if ! curl -sf -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/${INSTANCE_ID} -o /dev/null; then

    # not registered, so register it now
-    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
+    ID=$(curl -sf -X POST -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -H "Content-Type: application/json" {{ console_mgmt_base_url }}/management/api/v2/safekeepers -d@/tmp/payload | jq -r '.id')
    # init safekeeper
    sudo -u safekeeper /usr/local/bin/safekeeper --id ${ID} --init -D /storage/safekeeper/data
 fi
--- a/.github/ansible/systemd/pageserver.service
+++ b/.github/ansible/systemd/pageserver.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=pageserver
 Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
-ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -D /storage/pageserver/data
+ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoint='{{ broker_endpoint }}'" -c "availability_zone='{{ ec2_availability_zone }}'" -D /storage/pageserver/data
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.github/ansible/systemd/safekeeper.service
+++ b/.github/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }} SENTRY_ENVIRONMENT={{ sentry_environment }}
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoint={{ broker_endpoint }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}' --availability-zone={{ ec2_availability_zone }}
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
@@ -1,6 +1,22 @@
 # Helm chart values for neon-proxy-scram.
 # This is a YAML-formatted file.

+deploymentStrategy:
+  type: RollingUpdate
+  rollingUpdate:
+    maxSurge: 100%
+    maxUnavailable: 50%
+
+# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
+# The pod(s) will stay in Terminating, keeping the existing connections
+# but not receiving new ones
+containerLifecycle:
+  preStop:
+    exec:
+      command: ["/bin/sh", "-c", "sleep 604800"]
+terminationGracePeriodSeconds: 604800
+
+
 image:
  repository: neondatabase/neon

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -74,15 +74,12 @@ jobs:
      - name: Install Python deps
        run: ./scripts/pysync

-      - name: Run isort to ensure code format
-        run: poetry run isort --diff --check .
+      - name: Run ruff to ensure code format
+        run: poetry run ruff .

      - name: Run black to ensure code format
        run: poetry run black --diff --check .

-      - name: Run flake8 to ensure code format
-        run: poetry run flake8 .
-
      - name: Run mypy to check types
        run: poetry run mypy .

@@ -551,6 +548,48 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

+
+  neon-image-depot:
+    # For testing this will run side-by-side for a few merges.
+    # This action is not really optimized yet, but gets the job done
+    runs-on: [ self-hosted, gen3, small ]
+    needs: [ tag ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    permissions:
+      contents: read
+      id-token: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Setup go
+        uses: actions/setup-go@v3
+        with:
+          go-version: '1.19'
+
+      - name: Set up Depot CLI
+        uses: depot/setup-action@v1
+
+      - name: Install Crane & ECR helper
+        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
+
+      - name: Configure ECR login
+        run: |
+          mkdir -p /github/home/.docker/
+          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
+
+      - name: Build and push
+        uses: depot/build-push-action@v1
+        with:
+          # if no depot.json file is at the root of your repo, you must specify the project id
+          project: nrdv0s4kcs
+          push: true
+          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
+
  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -31,3 +31,4 @@ jobs:
        head: releases/${{ steps.date.outputs.date }}
        base: release
        title: Release ${{ steps.date.outputs.date }}
+        team_reviewers: release
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -851,6 +851,7 @@ dependencies = [
 "futures",
 "hyper",
 "notify",
+ "num_cpus",
 "opentelemetry",
 "postgres",
 "regex",
@@ -913,6 +914,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "postgres",
+ "postgres_backend",
 "postgres_connection",
 "regex",
 "reqwest",
@@ -2454,6 +2456,7 @@ dependencies = [
 "postgres",
 "postgres-protocol",
 "postgres-types",
+ "postgres_backend",
 "postgres_connection",
 "postgres_ffi",
 "pq_proto",
@@ -2471,6 +2474,7 @@ dependencies = [
 "strum",
 "strum_macros",
 "svg_fmt",
+ "sync_wrapper",
 "tempfile",
 "tenant_size_model",
 "thiserror",
@@ -2480,6 +2484,7 @@ dependencies = [
 "tokio-util",
 "toml_edit",
 "tracing",
+ "ubyte",
 "url",
 "utils",
 "walkdir",
@@ -2676,6 +2681,28 @@ dependencies = [
 "postgres-protocol",
 ]

+[[package]]
+name = "postgres_backend"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "bytes",
+ "futures",
+ "once_cell",
+ "pq_proto",
+ "rustls",
+ "rustls-pemfile",
+ "serde",
+ "thiserror",
+ "tokio",
+ "tokio-postgres",
+ "tokio-postgres-rustls",
+ "tokio-rustls",
+ "tracing",
+ "workspace_hack",
+]
+
 [[package]]
 name = "postgres_connection"
 version = "0.1.0"
@@ -2723,7 +2750,7 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
 name = "pq_proto"
 version = "0.1.0"
 dependencies = [
- "anyhow",
+ "byteorder",
 "bytes",
 "pin-project-lite",
 "postgres-protocol",
@@ -2898,6 +2925,7 @@ dependencies = [
 "opentelemetry",
 "parking_lot",
 "pin-project-lite",
+ "postgres_backend",
 "pq_proto",
 "prometheus",
 "rand",
@@ -3277,15 +3305,6 @@ dependencies = [
 "base64 0.21.0",
 ]

-[[package]]
-name = "rustls-split"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "78802c9612b4689d207acff746f38132ca1b12dadb55d471aa5f10fd580f47d3"
-dependencies = [
- "rustls",
-]
-
 [[package]]
 name = "rustversion"
 version = "1.0.11"
@@ -3307,6 +3326,7 @@ dependencies = [
 "async-trait",
 "byteorder",
 "bytes",
+ "chrono",
 "clap 4.1.4",
 "const_format",
 "crc32c",
@@ -3316,11 +3336,11 @@ dependencies = [
 "humantime",
 "hyper",
 "metrics",
- "nix",
 "once_cell",
 "parking_lot",
 "postgres",
 "postgres-protocol",
+ "postgres_backend",
 "postgres_ffi",
 "pq_proto",
 "regex",
@@ -4398,6 +4418,15 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"

+[[package]]
+name = "ubyte"
+version = "0.10.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c81f0dae7d286ad0d9366d7679a77934cfc3cf3a8d67e82669794412b2368fe6"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "uname"
 version = "0.1.1"
@@ -4505,7 +4534,7 @@ dependencies = [
 "byteorder",
 "bytes",
 "criterion",
- "git-version",
+ "futures",
 "heapless",
 "hex",
 "hex-literal",
@@ -4514,12 +4543,9 @@ dependencies = [
 "metrics",
 "nix",
 "once_cell",
- "pq_proto",
+ "pin-project-lite",
 "rand",
 "routerify",
- "rustls",
- "rustls-pemfile",
- "rustls-split",
 "sentry",
 "serde",
 "serde_json",
@@ -4530,10 +4556,10 @@ dependencies = [
 "tempfile",
 "thiserror",
 "tokio",
- "tokio-rustls",
 "tracing",
 "tracing-subscriber",
 "url",
+ "uuid",
 "workspace_hack",
 ]

@@ -4833,15 +4859,19 @@ name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "byteorder",
 "bytes",
 "chrono",
 "clap 4.1.4",
 "crossbeam-utils",
+ "digest",
 "either",
 "fail",
 "futures",
 "futures-channel",
+ "futures-core",
 "futures-executor",
+ "futures-sink",
 "futures-util",
 "hashbrown 0.12.3",
 "indexmap",
@@ -4866,6 +4896,7 @@ dependencies = [
 "socket2",
 "syn",
 "tokio",
+ "tokio-rustls",
 "tokio-util",
 "tonic",
 "tower",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -64,6 +64,7 @@ md5 = "0.7.0"
 memoffset = "0.8"
 nix = "0.26"
 notify = "5.0.0"
+num_cpus = "1.15"
 num-traits = "0.2.15"
 once_cell = "1.13"
 opentelemetry = "0.18.0"
@@ -133,6 +134,7 @@ heapless = { default-features=false, features=[], git = "https://github.com/japa
 consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
+postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
 postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
 postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
 pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
--- a/2
+++ b/2
@@ -39,7 +39,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev

 COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
-COPY . .
+COPY --chown=nonroot . .

 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -225,6 +225,81 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control

+#########################################################################################
+#
+# Layer "rum-pg-build"
+# compile rum extension
+#
+#########################################################################################
+FROM build-deps AS rum-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
+    mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
+
+#########################################################################################
+#
+# Layer "pgtap-pg-build"
+# compile pgTAP extension
+#
+#########################################################################################
+FROM build-deps AS pgtap-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
+    mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
+
+#########################################################################################
+#
+# Layer "prefix-pg-build"
+# compile Prefix extension
+#
+#########################################################################################
+FROM build-deps AS prefix-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
+    mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
+
+#########################################################################################
+#
+# Layer "hll-pg-build"
+# compile hll extension
+#
+#########################################################################################
+FROM build-deps AS hll-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
+    mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
+
+#########################################################################################
+#
+# Layer "plpgsql-check-pg-build"
+# compile plpgsql_check extension
+#
+#########################################################################################
+FROM build-deps AS plpgsql-check-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
+    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
+
 #########################################################################################
 # 
 # Layer "rust extensions"
@@ -248,7 +323,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
-    cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \
+    cargo install --locked --version 0.7.3 cargo-pgx && \
    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root
@@ -262,11 +337,11 @@ USER root

 FROM rust-extensions-build AS pg-jsonschema-pg-build

-RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \
-    cd pg_jsonschema && \
+# there is no release tag yet, but we need it due to the superuser fix in the control file
+RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
+    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
+    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    cargo pgx install --release && \
-    # it's needed to enable extension because it uses untrusted C language
-    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control

 #########################################################################################
@@ -278,13 +353,32 @@ RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.

 FROM rust-extensions-build AS pg-graphql-pg-build

-RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \
-    cd pg_graphql && \  
+# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compilation errors in
+# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
+# same 1.1 version we've used before.
+RUN git clone -b remove-pgx-contrib-spiext --single-branch https://github.com/yrashk/pg_graphql && \
+    cd pg_graphql && \
+    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
    cargo pgx install --release && \
    # it's needed to enable extension because it uses untrusted C language
    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control

+#########################################################################################
+#
+# Layer "pg-tiktoken-build"
+# Compile "pg_tiktoken" extension
+#
+#########################################################################################
+
+FROM rust-extensions-build AS pg-tiktoken-pg-build
+
+RUN git clone --depth=1 --single-branch https://github.com/kelvich/pg_tiktoken && \
+    cd pg_tiktoken && \
+    cargo pgx install --release && \
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -302,13 +396,23 @@ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon \
+        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_utils \
        -s install

 #########################################################################################
@@ -363,7 +467,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb

 # Install:
 # libreadline8 for psql
-# libicu67, locales for collations (including ICU)
+# libicu67, locales for collations (including ICU and plpgsql_check)
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
 # libxml2, libxslt1.1 for xml2
--- a/Dockerfile.vm-compute-node
+++ b/Dockerfile.vm-compute-node
@@ -1,32 +1,70 @@
 # Note: this file *mostly* just builds on Dockerfile.compute-node

 ARG SRC_IMAGE
-ARG VM_INFORMANT_VERSION=v0.1.6
+ARG VM_INFORMANT_VERSION=v0.1.14
+# on libcgroup update, make sure to check bootstrap.sh for changes
+ARG LIBCGROUP_VERSION=v2.0.3

-# Pull VM informant and set up inittab
+# Pull VM informant, to copy from later
 FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant

+# Build cgroup-tools
+#
+# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
+# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-informant
+# requires cgroup v2, so we'll build cgroup-tools ourselves.
+FROM debian:bullseye-slim as libcgroup-builder
+ARG LIBCGROUP_VERSION
+
+RUN set -exu \
+	&& apt update \
+	&& apt install --no-install-recommends -y \
+		git \
+		ca-certificates \
+		automake \
+		cmake \
+		make \
+		gcc \
+		byacc \
+		flex \
+		libtool \
+		libpam0g-dev \
+	&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
+	&& INSTALL_DIR="/libcgroup-install" \
+	&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
+	&& cd libcgroup \
+	# extracted from bootstrap.sh, with modified flags:
+	&& (test -d m4 || mkdir m4) \
+	&& autoreconf -fi \
+	&& rm -rf autom4te.cache \
+	&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
+	# actually build the thing...
+	&& make install
+
+# Combine, starting from non-VM compute node image.
+FROM $SRC_IMAGE as base
+
+# Temporarily set user back to root so we can run adduser, set inittab
+USER root
+RUN adduser vm-informant --disabled-password --no-create-home
+
 RUN set -e \
 	&& rm -f /etc/inittab \
 	&& touch /etc/inittab

-ADD vm-cgconfig.conf /etc/cgconfig.conf
 RUN set -e \
 	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
-	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab
+	&& CONNSTR="dbname=neondb user=cloud_admin sslmode=disable" \
+	&& ARGS="--auto-restart --cgroup=neon-postgres --pgconnstr=\"$CONNSTR\"" \
+	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant $ARGS'" >> /etc/inittab

-# Combine, starting from non-VM compute node image.
-FROM $SRC_IMAGE as base
-
-# Temporarily set user back to root so we can run apt update and adduser
-USER root
-RUN apt update && \
-	apt install --no-install-recommends -y \
-        cgroup-tools
-RUN adduser vm-informant --disabled-password --no-create-home
 USER postgres

-COPY --from=informant /etc/inittab /etc/inittab
+ADD vm-cgconfig.conf /etc/cgconfig.conf
 COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant

+COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
+COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
+COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
+
 ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
--- a/8
+++ b/8
@@ -133,6 +133,11 @@ neon-pg-ext-%: postgres-%
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
+	+@echo "Compiling neon_utils $*"
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install

 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
@@ -145,6 +150,9 @@ neon-pg-ext-clean-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

 .PHONY: neon-pg-ext
 neon-pg-ext: \
--- a/README.md
+++ b/README.md
@@ -46,11 +46,14 @@ postgresql-libs cmake postgresql protobuf
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```

-#### Installing dependencies on OSX (12.3.1)
+#### Installing dependencies on macOS (12.3.1)
 1. Install XCode and dependencies
 ```
 xcode-select --install
 brew install protobuf openssl flex bison
+
+# add openssl to PATH, required for ed25519 keys generation in neon_local
+echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -11,6 +11,7 @@ clap.workspace = true
 futures.workspace = true
 hyper = { workspace = true, features = ["full"] }
 notify.workspace = true
+num_cpus.workspace = true
 opentelemetry.workspace = true
 postgres.workspace = true
 regex.workspace = true
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -25,6 +25,7 @@ use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
 use serde::{Serialize, Serializer};
+use tokio_postgres;
 use tracing::{info, instrument, warn};

 use crate::checker::create_writability_check_data;
@@ -284,6 +285,7 @@ impl ComputeNode {
        handle_role_deletions(self, &mut client)?;
        handle_grants(self, &mut client)?;
        create_writability_check_data(&mut client)?;
+        handle_extensions(&self.spec, &mut client)?;

        // 'Close' connection
        drop(client);
@@ -400,4 +402,43 @@ impl ComputeNode {

        Ok(())
    }
+
+    /// Select `pg_stat_statements` data and return it as a stringified JSON
+    pub async fn collect_insights(&self) -> String {
+        let mut result_rows: Vec<String> = Vec::new();
+        let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await;
+        let (client, connection) = connect_result.unwrap();
+        tokio::spawn(async move {
+            if let Err(e) = connection.await {
+                eprintln!("connection error: {}", e);
+            }
+        });
+        let result = client
+            .simple_query(
+                "SELECT
+    row_to_json(pg_stat_statements)
+FROM
+    pg_stat_statements
+WHERE
+    userid != 'cloud_admin'::regrole::oid
+ORDER BY
+    (mean_exec_time + mean_plan_time) DESC
+LIMIT 100",
+            )
+            .await;
+
+        if let Ok(raw_rows) = result {
+            for message in raw_rows.iter() {
+                if let postgres::SimpleQueryMessage::Row(row) = message {
+                    if let Some(json) = row.get(0) {
+                        result_rows.push(json.to_string());
+                    }
+                }
+            }
+
+            format!("{{\"pg_stat_statements\": [{}]}}", result_rows.join(","))
+        } else {
+            "{\"pg_stat_statements\": []}".to_string() // plain literal, not format!: single braces
+        }
+    }
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -7,6 +7,7 @@ use crate::compute::ComputeNode;
 use anyhow::Result;
 use hyper::service::{make_service_fn, service_fn};
 use hyper::{Body, Method, Request, Response, Server, StatusCode};
+use num_cpus;
 use serde_json;
 use tracing::{error, info};
 use tracing_utils::http::OtelName;
@@ -33,6 +34,13 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
        }

+        // Collect Postgres current usage insights
+        (&Method::GET, "/insights") => {
+            info!("serving /insights GET request");
+            let insights = compute.collect_insights().await;
+            Response::new(Body::from(insights))
+        }
+
        (&Method::POST, "/check_writability") => {
            info!("serving /check_writability POST request");
            let res = crate::checker::check_writability(compute).await;
@@ -42,6 +50,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        (&Method::GET, "/info") => {
+            let num_cpus = num_cpus::get_physical();
+            info!("serving /info GET request. num_cpus: {}", num_cpus);
+            Response::new(Body::from(
+                serde_json::json!({
+                    "num_cpus": num_cpus,
+                })
+                .to_string(),
+            ))
+        }
+
        // Return the `404 Not Found` for any other routes.
        _ => {
            let mut not_found = Response::new(Body::from("404 Not Found"));
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -10,12 +10,12 @@ paths:
  /status:
    get:
      tags:
-      - "info"
+      - Info
      summary: Get compute node internal status
      description: ""
      operationId: getComputeStatus
      responses:
-        "200":
+        200:
          description: ComputeState
          content:
            application/json:
@@ -25,27 +25,58 @@ paths:
  /metrics.json:
    get:
      tags:
-      - "info"
+      - Info
      summary: Get compute node startup metrics in JSON format
      description: ""
      operationId: getComputeMetricsJSON
      responses:
-        "200":
+        200:
          description: ComputeMetrics
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ComputeMetrics"

+  /insights:
+    get:
+      tags:
+      - Info
+      summary: Get current compute insights in JSON format
+      description: |
+        Note that this doesn't include any historical data.
+      operationId: getComputeInsights
+      responses:
+        200:
+          description: Compute insights
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ComputeInsights"
+
+  /info:
+    get:
+      tags:
+      - Info  # same tag as the other informational endpoints in this spec
+      summary: Get info about the compute Pod/VM
+      description: ""
+      operationId: getInfo
+      responses:
+        "200":
+          description: Info
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Info"
+
  /check_writability:
    post:
      tags:
-      - "check"
+      - Check
      summary: Check that we can write new data on this compute
      description: ""
      operationId: checkComputeWritability
      responses:
-        "200":
+        200:
          description: Check result
          content:
            text/plain:
@@ -80,6 +111,15 @@ components:
        total_startup_ms:
          type: integer

+    Info:
+      type: object
+      description: Information about VM/Pod
+      required:
+        - num_cpus
+      properties:
+        num_cpus:
+          type: integer
+
    ComputeState:
      type: object
      required:
@@ -96,6 +136,15 @@ components:
          type: string
          description: Text of the error during compute startup, if any

+    ComputeInsights:
+      type: object
+      properties:
+        pg_stat_statements:
+          description: Contains raw output from pg_stat_statements in JSON format
+          type: array
+          items:
+            type: object
+
    ComputeStatus:
      type: string
      enum:
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -47,12 +47,23 @@ pub struct GenericOption {
 /// declare a `trait` on it.
 pub type GenericOptions = Option<Vec<GenericOption>>;

+/// Double every backslash and single quote so `s` can be embedded in a SQL literal
+fn escape_literal(s: &str) -> String {
+    s.replace('\\', "\\\\").replace('\'', "''")
+}
+
+/// Escape `s` for use as a quoted value in postgresql.conf.
+/// The rules currently coincide with `escape_literal`, so this delegates to it.
+fn escape_conf_value(s: &str) -> String {
+    escape_literal(s)
+}
+
 impl GenericOption {
    /// Represent `GenericOption` as SQL statement parameter.
    pub fn to_pg_option(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
-                "string" => format!("{} '{}'", self.name, val),
+                "string" => format!("{} '{}'", self.name, escape_literal(val)),
                _ => format!("{} {}", self.name, val),
            }
        } else {
@@ -63,6 +74,8 @@ impl GenericOption {
    /// Represent `GenericOption` as configuration option.
    pub fn to_pg_setting(&self) -> String {
        if let Some(val) = &self.value {
+            // TODO: check in the console DB that we don't have these settings
+            // set for any non-deleted project and drop this override.
            let name = match self.name.as_str() {
                "safekeepers" => "neon.safekeepers",
                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
@@ -71,7 +84,7 @@ impl GenericOption {
            };

            match self.vartype.as_ref() {
-                "string" => format!("{} = '{}'", name, val),
+                "string" => format!("{} = '{}'", name, escape_conf_value(val)),
                _ => format!("{} = {}", name, val),
            }
        } else {
@@ -107,6 +120,7 @@ impl PgOptionsSerialize for GenericOptions {
                .map(|op| op.to_pg_setting())
                .collect::<Vec<String>>()
                .join("\n")
+                + "\n" // newline after last setting
        } else {
            "".to_string()
        }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -515,3 +515,18 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {

    Ok(())
 }
+
+/// Install the system extensions that this compute's settings call for
+#[instrument(skip_all)]
+pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+    // `pg_stat_statements` is created only when it is listed in
+    // `shared_preload_libraries`, i.e. when this compute really needs it.
+    let preloaded = spec.cluster.settings.find("shared_preload_libraries");
+    if preloaded.map_or(false, |libs| libs.contains("pg_stat_statements")) {
+        let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements";
+        info!("creating system extensions with query: {}", query);
+        client.simple_query(query)?;
+    }
+
+    Ok(())
+}
--- a/compute_tools/tests/cluster_spec.json
+++ b/compute_tools/tests/cluster_spec.json
@@ -178,6 +178,11 @@
                "name": "neon.pageserver_connstring",
                "value": "host=127.0.0.1 port=6400",
                "vartype": "string"
+            },
+            {
+                "name": "test.escaping",
+                "value": "here's a backslash \\ and a quote ' and a double-quote \" hooray",
+                "vartype": "string"
            }
        ]
    },
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -28,7 +28,30 @@ mod pg_helpers_tests {

        assert_eq!(
            spec.cluster.settings.as_pg_settings(),
-            "fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
+            r#"fsync = off
+wal_level = replica
+hot_standby = on
+neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
+wal_log_hints = on
+log_connections = on
+shared_buffers = 32768
+port = 55432
+max_connections = 100
+max_wal_senders = 10
+listen_addresses = '0.0.0.0'
+wal_sender_timeout = 0
+password_encryption = md5
+maintenance_work_mem = 65536
+max_parallel_workers = 8
+max_worker_processes = 8
+neon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'
+max_replication_slots = 10
+neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'
+shared_preload_libraries = 'neon'
+synchronous_standby_names = 'walproposer'
+neon.pageserver_connstring = 'host=127.0.0.1 port=6400'
+test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray'
+"#
        );
    }

--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -24,6 +24,7 @@ url.workspace = true
 # Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
 # instead, so that recompile times are better.
 pageserver_api.workspace = true
+postgres_backend.workspace = true
 safekeeper_api.workspace = true
 postgres_connection.workspace = true
 storage_broker.workspace = true
--- a/control_plane/safekeepers.conf
+++ b/control_plane/safekeepers.conf
@@ -2,7 +2,8 @@
 [pageserver]
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
-auth_type = 'Trust'
+pg_auth_type = 'Trust'
+http_auth_type = 'Trust'

 [[safekeepers]]
 id = 1
--- a/control_plane/simple.conf
+++ b/control_plane/simple.conf
@@ -3,7 +3,8 @@
 [pageserver]
 listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'
-auth_type = 'Trust'
+pg_auth_type = 'Trust'
+http_auth_type = 'Trust'

 [[safekeepers]]
 id = 1
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -17,6 +17,7 @@ use pageserver_api::{
    DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
    DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
 };
+use postgres_backend::AuthType;
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -30,7 +31,6 @@ use utils::{
    auth::{Claims, Scope},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
    lsn::Lsn,
-    postgres_backend::AuthType,
    project_git_version,
 };

@@ -53,14 +53,15 @@ listen_addr = '{DEFAULT_BROKER_ADDR}'
 id = {DEFAULT_PAGESERVER_ID}
 listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
 listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
-auth_type = '{pageserver_auth_type}'
+pg_auth_type = '{trust_auth}'
+http_auth_type = '{trust_auth}'

 [[safekeepers]]
 id = {DEFAULT_SAFEKEEPER_ID}
 pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
 http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
 "#,
-        pageserver_auth_type = AuthType::Trust,
+        trust_auth = AuthType::Trust,
    )
 }

@@ -627,7 +628,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {

            let node = cplane.nodes.get(&(tenant_id, node_name.to_string()));

-            let auth_token = if matches!(env.pageserver.auth_type, AuthType::NeonJWT) {
+            let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
--- a/control_plane/src/compute.rs
+++ b/control_plane/src/compute.rs
@@ -11,10 +11,10 @@ use std::sync::Arc;
 use std::time::Duration;

 use anyhow::{Context, Result};
+use postgres_backend::AuthType;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
-    postgres_backend::AuthType,
 };

 use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
@@ -97,7 +97,7 @@ impl ComputeControlPlane {
        });

        node.create_pgdata()?;
-        node.setup_pg_conf(self.env.pageserver.auth_type)?;
+        node.setup_pg_conf(self.env.pageserver.pg_auth_type)?;

        self.nodes
            .insert((tenant_id, node.name.clone()), Arc::clone(&node));
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -5,6 +5,7 @@

 use anyhow::{bail, ensure, Context};

+use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
@@ -17,9 +18,8 @@ use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use utils::{
-    auth::{encode_from_key_file, Claims, Scope},
+    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
-    postgres_backend::AuthType,
 };

 use crate::safekeeper::SafekeeperNode;
@@ -110,15 +110,14 @@ impl NeonBroker {
 pub struct PageServerConf {
    // node id
    pub id: NodeId,
+
    // Pageserver connection settings
    pub listen_pg_addr: String,
    pub listen_http_addr: String,

-    // used to determine which auth type is used
-    pub auth_type: AuthType,
-
-    // jwt auth token used for communication with pageserver
-    pub auth_token: String,
+    // auth type used for the PG and HTTP ports
+    pub pg_auth_type: AuthType,
+    pub http_auth_type: AuthType,
 }

 impl Default for PageServerConf {
@@ -127,8 +126,8 @@ impl Default for PageServerConf {
            id: NodeId(0),
            listen_pg_addr: String::new(),
            listen_http_addr: String::new(),
-            auth_type: AuthType::Trust,
-            auth_token: String::new(),
+            pg_auth_type: AuthType::Trust,
+            http_auth_type: AuthType::Trust,
        }
    }
 }
@@ -401,48 +400,33 @@ impl LocalEnv {

        fs::create_dir(base_path)?;

-        // generate keys for jwt
-        // openssl genrsa -out private_key.pem 2048
-        let private_key_path;
+        // Generate keypair for JWT.
+        //
+        // The keypair is only needed if authentication is enabled in any of the
+        // components. For convenience, we generate the keypair even if authentication
+        // is not enabled, so that you can easily enable it after the initialization
+        // step. However, if the key generation fails, we treat it as non-fatal if
+        // authentication was not enabled.
        if self.private_key_path == PathBuf::new() {
-            private_key_path = base_path.join("auth_private_key.pem");
-            let keygen_output = Command::new("openssl")
-                .arg("genrsa")
-                .args(["-out", private_key_path.to_str().unwrap()])
-                .arg("2048")
-                .stdout(Stdio::null())
-                .output()
-                .context("failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
-            }
-            self.private_key_path = PathBuf::from("auth_private_key.pem");
-
-            let public_key_path = base_path.join("auth_public_key.pem");
-            // openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
-            let keygen_output = Command::new("openssl")
-                .arg("rsa")
-                .args(["-in", private_key_path.to_str().unwrap()])
-                .arg("-pubout")
-                .args(["-outform", "PEM"])
-                .args(["-out", public_key_path.to_str().unwrap()])
-                .stdout(Stdio::null())
-                .output()
-                .context("failed to generate auth private key")?;
-            if !keygen_output.status.success() {
-                bail!(
-                    "openssl failed: '{}'",
-                    String::from_utf8_lossy(&keygen_output.stderr)
-                );
+            match generate_auth_keys(
+                base_path.join("auth_private_key.pem").as_path(),
+                base_path.join("auth_public_key.pem").as_path(),
+            ) {
+                Ok(()) => {
+                    self.private_key_path = PathBuf::from("auth_private_key.pem");
+                }
+                Err(e) => {
+                    if !self.auth_keys_needed() {
+                        eprintln!("Could not generate keypair for JWT authentication: {e}");
+                        eprintln!("Continuing anyway because authentication was not enabled");
+                        self.private_key_path = PathBuf::from("auth_private_key.pem");
+                    } else {
+                        return Err(e);
+                    }
+                }
            }
        }

-        self.pageserver.auth_token =
-            self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
-
        fs::create_dir_all(self.pg_data_dirs_path())?;

        for safekeeper in &self.safekeepers {
@@ -451,6 +435,12 @@ impl LocalEnv {

        self.persist_config(base_path)
    }
+
+    fn auth_keys_needed(&self) -> bool {
+        let jwt = AuthType::NeonJWT; // keys matter once any component checked below uses auth
+        (self.pageserver.pg_auth_type == jwt || self.pageserver.http_auth_type == jwt)
+            || self.safekeepers.iter().any(|sk| sk.auth_enabled)
+    }
 }

 fn base_path() -> PathBuf {
@@ -460,6 +450,43 @@ fn base_path() -> PathBuf {
    }
 }

+/// Generate an ed25519 key pair for JWT authentication by running `openssl`: the private key
+/// is written to `private_key_path`, the matching public key to `public_key_path`.
+/// Fails if `openssl` cannot be started or exits unsuccessfully.
+fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow::Result<()> {
+    // Generate the key pair
+    //
+    // openssl genpkey -algorithm ed25519 -out auth_private_key.pem
+    // (paths are passed as OS strings, so a non-UTF-8 path cannot cause a panic)
+    let keygen_output = Command::new("openssl")
+        .arg("genpkey")
+        .args(["-algorithm", "ed25519"])
+        .arg("-out")
+        .arg(private_key_path)
+        .stdout(Stdio::null())
+        .output()
+        .context("failed to generate auth private key")?;
+    if !keygen_output.status.success() {
+        bail!("openssl failed: '{}'", String::from_utf8_lossy(&keygen_output.stderr));
+    }
+    // Extract the public key from the private key file
+    //
+    // openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
+    let keygen_output = Command::new("openssl")
+        .arg("pkey")
+        .arg("-in")
+        .arg(private_key_path)
+        .arg("-pubout")
+        .arg("-out")
+        .arg(public_key_path)
+        .output()
+        .context("failed to extract public key from private key")?;
+    if !keygen_output.status.success() {
+        bail!("openssl failed: '{}'", String::from_utf8_lossy(&keygen_output.stderr));
+    }
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,6 +11,7 @@ use anyhow::{bail, Context};
 use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
 };
+use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
@@ -20,7 +21,6 @@ use utils::{
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
    lsn::Lsn,
-    postgres_backend::AuthType,
 };

 use crate::{background_process, local_env::LocalEnv};
@@ -82,15 +82,8 @@ impl PageServerNode {
        let (host, port) = parse_host_port(&env.pageserver.listen_pg_addr)
            .expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);
-        let password = if env.pageserver.auth_type == AuthType::NeonJWT {
-            Some(env.pageserver.auth_token.clone())
-        } else {
-            None
-        };
-
        Self {
-            pg_connection_config: PgConnectionConfig::new_host_port(host, port)
-                .set_password(password),
+            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            env: env.clone(),
            http_client: Client::new(),
            http_base_url: format!("http://{}/v1", env.pageserver.listen_http_addr),
@@ -106,25 +99,32 @@ impl PageServerNode {
            self.env.pg_distrib_dir_raw().display()
        );

-        let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
+        let http_auth_type_param =
+            format!("http_auth_type='{}'", self.env.pageserver.http_auth_type);
        let listen_http_addr_param = format!(
            "listen_http_addr='{}'",
            self.env.pageserver.listen_http_addr
        );
+
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.env.pageserver.pg_auth_type);
        let listen_pg_addr_param =
            format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
+
        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

        let mut overrides = vec![
            id,
            pg_distrib_dir_param,
-            authg_type_param,
+            http_auth_type_param,
+            pg_auth_type_param,
            listen_http_addr_param,
            listen_pg_addr_param,
            broker_endpoint_param,
        ];

-        if self.env.pageserver.auth_type != AuthType::Trust {
+        if self.env.pageserver.http_auth_type != AuthType::Trust
+            || self.env.pageserver.pg_auth_type != AuthType::Trust
+        {
            overrides.push("auth_validation_public_key_path='auth_public_key.pem'".to_owned());
        }
        overrides
@@ -247,7 +247,10 @@ impl PageServerNode {
    }

    fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
-        Ok(if self.env.pageserver.auth_type != AuthType::Trust {
+        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
+        // needs a token, and how to generate that token, seems independent to whether
+        // the pageserver requires a token in incoming requests.
+        Ok(if self.env.pageserver.http_auth_type != AuthType::Trust {
            // Generate a token to connect from the pageserver to a safekeeper
            let token = self
                .env
@@ -270,27 +273,30 @@ impl PageServerNode {
        background_process::stop_process(immediate, "pageserver", &self.pid_file())
    }

-    pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
-
-        println!("Pageserver query: '{sql}'");
-        client.simple_query(sql).unwrap()
-    }
-
-    pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
-        self.pg_connection_config.connect_no_tls()
-    }
-
-    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
-        let mut builder = self.http_client.request(method, url);
-        if self.env.pageserver.auth_type == AuthType::NeonJWT {
-            builder = builder.bearer_auth(&self.env.pageserver.auth_token)
+    pub fn page_server_psql_client(&self) -> anyhow::Result<postgres::Client> {
+        let mut config = self.pg_connection_config.clone();
+        if self.env.pageserver.pg_auth_type == AuthType::NeonJWT {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            config = config.set_password(Some(token));
        }
-        builder
+        Ok(config.connect_no_tls()?)
+    }
+
+    fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> anyhow::Result<RequestBuilder> {
+        let mut builder = self.http_client.request(method, url);
+        if self.env.pageserver.http_auth_type == AuthType::NeonJWT {
+            let token = self
+                .env
+                .generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
+            builder = builder.bearer_auth(token)
+        }
+        Ok(builder)
    }

    pub fn check_status(&self) -> Result<()> {
-        self.http_request(Method::GET, format!("{}/status", self.http_base_url))
+        self.http_request(Method::GET, format!("{}/status", self.http_base_url))?
            .send()?
            .error_from_body()?;
        Ok(())
@@ -298,7 +304,7 @@ impl PageServerNode {

    pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
        Ok(self
-            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))
+            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))?
            .send()?
            .error_from_body()?
            .json()?)
@@ -352,11 +358,21 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'trace_read_requests' as bool")?,
+            eviction_policy: settings
+                .remove("eviction_policy") // consumed so the leftover-settings check below accepts it
+                .map(|x| serde_json::from_str(x))
+                .transpose()
+                .context("Failed to parse 'eviction_policy' json")?,
+            min_resident_size_override: settings
+                .remove("min_resident_size_override")
+                .map(|x| x.parse::<u64>())
+                .transpose()
+                .context("Failed to parse 'min_resident_size_override' as integer")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
        }
-        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
+        self.http_request(Method::POST, format!("{}/tenant", self.http_base_url))?
            .json(&request)
            .send()?
            .error_from_body()?
@@ -373,7 +389,7 @@ impl PageServerNode {
    }

    pub fn tenant_config(&self, tenant_id: TenantId, settings: HashMap<&str, &str>) -> Result<()> {
-        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
+        self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))?
            .json(&TenantConfigRequest {
                tenant_id,
                checkpoint_distance: settings
@@ -424,6 +440,11 @@ impl PageServerNode {
                    .map(|x| serde_json::from_str(x))
                    .transpose()
                    .context("Failed to parse 'eviction_policy' json")?,
+                min_resident_size_override: settings
+                    .get("min_resident_size_override")
+                    .map(|x| x.parse::<u64>())
+                    .transpose()
+                    .context("Failed to parse 'min_resident_size_override' as an integer")?,
            })
            .send()?
            .error_from_body()?;
@@ -436,7 +457,7 @@ impl PageServerNode {
            .http_request(
                Method::GET,
                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-            )
+            )?
            .send()?
            .error_from_body()?
            .json()?;
@@ -455,7 +476,7 @@ impl PageServerNode {
        self.http_request(
            Method::POST,
            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
-        )
+        )?
        .json(&TimelineCreateRequest {
            new_timeline_id,
            ancestor_start_lsn,
@@ -492,7 +513,7 @@ impl PageServerNode {
        pg_wal: Option<(Lsn, PathBuf)>,
        pg_version: u32,
    ) -> anyhow::Result<()> {
-        let mut client = self.pg_connection_config.connect_no_tls().unwrap();
+        let mut client = self.page_server_psql_client()?;

        // Init base reader
        let (start_lsn, base_tarfile_path) = base;
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -1,7 +1,6 @@
 use std::io::Write;
 use std::path::PathBuf;
 use std::process::Child;
-use std::sync::Arc;
 use std::{io, result};

 use anyhow::Context;
@@ -11,7 +10,6 @@ use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::{http::error::HttpErrorBody, id::NodeId};

-use crate::pageserver::PageServerNode;
 use crate::{
    background_process,
    local_env::{LocalEnv, SafekeeperConf},
@@ -65,14 +63,10 @@ pub struct SafekeeperNode {
    pub env: LocalEnv,
    pub http_client: Client,
    pub http_base_url: String,
-
-    pub pageserver: Arc<PageServerNode>,
 }

 impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
-        let pageserver = Arc::new(PageServerNode::from_env(env));
-
        SafekeeperNode {
            id: conf.id,
            conf: conf.clone(),
@@ -80,7 +74,6 @@ impl SafekeeperNode {
            env: env.clone(),
            http_client: Client::new(),
            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
-            pageserver,
        }
    }

@@ -115,6 +108,10 @@ impl SafekeeperNode {
        let datadir = self.datadir_path();

        let id_string = id.to_string();
+        // TODO: add availability_zone to the config.
+        // Right now we just specify any value here and use it to check metrics in tests.
+        let availability_zone = format!("sk-{}", id_string);
+
        let mut args = vec![
            "-D",
            datadir.to_str().with_context(|| {
@@ -126,6 +123,8 @@ impl SafekeeperNode {
            &listen_pg,
            "--listen-http",
            &listen_http,
+            "--availability-zone",
+            &availability_zone,
        ];
        if !self.conf.sync {
            args.push("--no-sync");
--- a/docs/authentication.md
+++ b/docs/authentication.md
@@ -29,12 +29,54 @@ These components should not have access to the private key and may only get toke
 The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
 There is currently no way to rotate the key without bringing down all components.

+### Best practices
+
+See [RFC 8725: JSON Web Token Best Current Practices](https://www.rfc-editor.org/rfc/rfc8725)
+
+
+### Token format
+
+The JWT tokens in Neon use "EdDSA" as the algorithm (defined in [RFC8037](https://www.rfc-editor.org/rfc/rfc8037)).
+
+Example:
+
+Header:
+
+```
+{
+  "alg": "EdDSA",
+  "typ": "JWT"
+}
+```
+
+Payload:
+
+```
+{
+  "scope": "tenant",  # "tenant", "pageserverapi", or "safekeeperdata"
+  "tenant_id": "5204921ff44f09de8094a1390a6a50f6",
+}
+```
+
+
+Meanings of scope:
+
+"tenant": Provides access to all data for a specific tenant
+
+"pageserverapi": Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
+Should only be used e.g. for status check/tenant creation/list.
+
+"safekeeperdata": Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
+Should only be used e.g. for status check.
+Currently also used for connection from any pageserver to any safekeeper.
+
+
 ### CLI
 CLI generates a key pair during call to `neon_local init` with the following commands:

 ```bash
-openssl genrsa -out auth_private_key.pem 2048
-openssl rsa -in auth_private_key.pem -pubout -outform PEM -out auth_public_key.pem
+openssl genpkey -algorithm ed25519 -out auth_private_key.pem
+openssl pkey -in auth_private_key.pem -pubout -out auth_public_key.pem
 ```

 Configuration files for all components point to `public_key.pem` for JWT validation.
@@ -102,10 +144,12 @@ Each compute should present a token valid for the timeline's tenant.
 Pageserver also has HTTP API: some parts are per-tenant,
 some parts are server-wide, these are different scopes.

-The `auth_type` configuration variable in Pageserver's config may have
-either of three values:
+Authentication can be enabled separately for the HTTP mgmt API, and
+for the libpq connections from compute. The `http_auth_type` and
+`pg_auth_type` configuration variables in Pageserver's config may
+have one of these values:

-* `Trust` removes all authentication. The outdated `MD5` value does likewise
+* `Trust` removes all authentication.
 * `NeonJWT` enables JWT validation.
   Tokens are validated using the public key which lies in a PEM file
   specified in the `auth_validation_public_key_path` config.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,12 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `black`, `isort` and type hints via `mypy`.
+We enforce code formatting via `black`, linting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run isort .  # Imports are reformatted
 poetry run black .  # All code is reformatted
-poetry run flake8 .  # Python linter
+poetry run ruff .  # Python linter
 poetry run mypy .  # Ensure there are no typing errors
 ```

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -115,6 +115,12 @@ pub struct TenantCreateRequest {
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub trace_read_requests: Option<bool>,
+    // We defer the parsing of the eviction_policy field to the request handler.
+    // Otherwise we'd have to move the types for eviction policy into this package.
+    // We might do that once the eviction feature has stabilized.
+    // For now, this field is not even documented in the openapi_spec.yml.
+    pub eviction_policy: Option<serde_json::Value>,
+    pub min_resident_size_override: Option<u64>,
 }

 #[serde_as]
@@ -160,6 +166,7 @@ pub struct TenantConfigRequest {
    // We might do that once the eviction feature has stabilizied.
    // For now, this field is not even documented in the openapi_spec.yml.
    pub eviction_policy: Option<serde_json::Value>,
+    pub min_resident_size_override: Option<u64>,
 }

 impl TenantConfigRequest {
@@ -180,6 +187,7 @@ impl TenantConfigRequest {
            max_lsn_wal_lag: None,
            trace_read_requests: None,
            eviction_policy: None,
+            min_resident_size_override: None,
        }
    }
 }
--- a/libs/postgres_backend/Cargo.toml
+++ b/libs/postgres_backend/Cargo.toml
@@ -0,0 +1,26 @@
+[package]
+name = "postgres_backend"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+async-trait.workspace = true
+anyhow.workspace = true
+bytes.workspace = true
+futures.workspace = true
+rustls.workspace = true
+serde.workspace = true
+thiserror.workspace = true
+tokio.workspace = true
+tokio-rustls.workspace = true
+tracing.workspace = true
+
+pq_proto.workspace = true
+workspace_hack.workspace = true
+
+[dev-dependencies]
+once_cell.workspace = true
+rustls-pemfile.workspace = true
+tokio-postgres.workspace = true
+tokio-postgres-rustls.workspace = true
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -0,0 +1,931 @@
+//! Server-side asynchronous Postgres connection, as limited as we need.
+//! To use, create PostgresBackend and run() it, passing the Handler
+//! implementation determining how to process the queries. Currently its API
+//! is rather narrow, but we can extend it once required.
+use anyhow::Context;
+use bytes::Bytes;
+use futures::pin_mut;
+use serde::{Deserialize, Serialize};
+use std::io::ErrorKind;
+use std::net::SocketAddr;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{ready, Poll};
+use std::{fmt, io};
+use std::{future::Future, str::FromStr};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_rustls::TlsAcceptor;
+use tracing::{debug, error, info, trace};
+
+use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
+use pq_proto::{
+    BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
+    SQLSTATE_SUCCESSFUL_COMPLETION,
+};
+
+/// An error that occurred during query processing:
+/// either on the connection itself ([`ConnectionError`]) or before/after it.
+#[derive(thiserror::Error, Debug)]
+pub enum QueryError {
+    /// The connection was lost while processing the query.
+    /// A bare [`io::Error`] is also mapped to this variant.
+    #[error(transparent)]
+    Disconnected(#[from] ConnectionError),
+    /// Any other error, carried as an opaque [`anyhow::Error`].
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<io::Error> for QueryError {
+    /// Treats a raw I/O failure as a lost connection by wrapping it in
+    /// [`ConnectionError::Io`].
+    fn from(io_err: io::Error) -> Self {
+        let conn_err = ConnectionError::Io(io_err);
+        QueryError::Disconnected(conn_err)
+    }
+}
+
+impl QueryError {
+    /// SQLSTATE code to report to the client in the ErrorResponse for this
+    /// error.
+    pub fn pg_error_code(&self) -> &'static [u8; 5] {
+        // connection failure
+        const SQLSTATE_CONNECTION_FAILURE: &[u8; 5] = b"08006";
+
+        match self {
+            QueryError::Disconnected(_) => SQLSTATE_CONNECTION_FAILURE,
+            // internal error
+            QueryError::Other(_) => SQLSTATE_INTERNAL_ERROR,
+        }
+    }
+}
+
+pub fn is_expected_io_error(e: &io::Error) -> bool {
+    use io::ErrorKind::*;
+    matches!(
+        e.kind(),
+        ConnectionRefused | ConnectionAborted | ConnectionReset
+    )
+}
+
+#[async_trait::async_trait]
+pub trait Handler<IO> {
+    /// Handle a single query.
+    ///
+    /// Invoked for both simple `Query` messages and extended-protocol
+    /// `Execute` messages. After a `Query`, postgres_backend queues
+    /// ReadyForQuery (this might be not what we want after CopyData streaming,
+    /// but currently we don't care); after an `Execute` it does not. In both
+    /// cases the output buffer is flushed once the message has been processed.
+    /// If an error is returned, it is logged and reported to the client as an
+    /// ErrorResponse.
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+        query_string: &str,
+    ) -> Result<(), QueryError>;
+
+    /// Called when the startup packet is received, before the authentication
+    /// exchange begins; allows the implementation to process the startup
+    /// params. Returning an error aborts the handshake. The default
+    /// implementation accepts everything.
+    ///
+    /// NOTE(review): the previous wording of this comment described an
+    /// `Ok(false)` return value that made postgres_backend skip auth (needed
+    /// for new users creation in the proxy code). The signature returns
+    /// `Result<(), QueryError>`, so no such value exists; the call site comment
+    /// says implementations may change the backend's auth type instead --
+    /// confirm how, since that field is not public. Either way this is a quite
+    /// hacky and ad-hoc solution; maybe we could allow implementations to
+    /// override the whole init logic.
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend<IO>,
+        _sm: &FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        Ok(())
+    }
+
+    /// Check the JWT presented by the client in its password message
+    /// (`_jwt_response` is the message payload without its final byte).
+    /// The default implementation rejects every token, so handlers used with
+    /// `AuthType::NeonJWT` need to override it.
+    fn check_auth_jwt(
+        &mut self,
+        _pgb: &mut PostgresBackend<IO>,
+        _jwt_response: &[u8],
+    ) -> Result<(), QueryError> {
+        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
+    }
+}
+
+/// PostgresBackend protocol state.
+/// XXX: The order of the constructors matters: the derived `PartialOrd`
+/// follows declaration order, and the handshake code compares states with `<`
+/// (e.g. `state < ProtoState::Authentication`).
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
+pub enum ProtoState {
+    /// Nothing happened yet.
+    Initialization,
+    /// Encryption handshake is done; waiting for encrypted Startup message.
+    Encrypted,
+    /// Waiting for password (auth token).
+    Authentication,
+    /// Performed handshake and auth, ReadyForQuery is issued.
+    Established,
+    /// The connection is finished: set on EOF during the handshake/auth or
+    /// when a COPY stream ends with Terminate. `read_message` returns
+    /// `Ok(None)` in this state.
+    Closed,
+}
+
+/// Outcome of processing one frontend message; tells the message loop what to
+/// do next.
+#[derive(Clone, Copy)]
+pub enum ProcessMsgResult {
+    /// Keep reading and processing messages.
+    Continue,
+    /// Leave the message loop (returned when the client sends Terminate).
+    Break,
+}
+
+/// Either plain stream or encrypted one, implementing AsyncRead + AsyncWrite.
+pub enum MaybeTlsStream<IO> {
+    /// The raw underlying stream, before (or without) a TLS upgrade.
+    Unencrypted(IO),
+    /// The underlying stream wrapped in a server-side TLS session.
+    Tls(Box<tokio_rustls::server::TlsStream<IO>>),
+}
+
+// Write side: every call is forwarded to whichever stream variant is active.
+impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for MaybeTlsStream<IO> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<io::Result<usize>> {
+        let this = self.get_mut();
+        match this {
+            MaybeTlsStream::Unencrypted(plain) => Pin::new(plain).poll_write(cx, buf),
+            MaybeTlsStream::Tls(tls) => Pin::new(tls).poll_write(cx, buf),
+        }
+    }
+
+    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        match this {
+            MaybeTlsStream::Unencrypted(plain) => Pin::new(plain).poll_flush(cx),
+            MaybeTlsStream::Tls(tls) => Pin::new(tls).poll_flush(cx),
+        }
+    }
+
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        match this {
+            MaybeTlsStream::Unencrypted(plain) => Pin::new(plain).poll_shutdown(cx),
+            MaybeTlsStream::Tls(tls) => Pin::new(tls).poll_shutdown(cx),
+        }
+    }
+}
+// Read side: forwarded to whichever stream variant is active.
+impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncRead for MaybeTlsStream<IO> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> Poll<io::Result<()>> {
+        let this = self.get_mut();
+        match this {
+            MaybeTlsStream::Unencrypted(plain) => Pin::new(plain).poll_read(cx, buf),
+            MaybeTlsStream::Tls(tls) => Pin::new(tls).poll_read(cx, buf),
+        }
+    }
+}
+
+/// How a connecting client is authenticated.
+#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
+pub enum AuthType {
+    // No authentication: the connection is established right after the
+    // startup message.
+    Trust,
+    // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
+    NeonJWT,
+}
+
+impl FromStr for AuthType {
+    type Err = anyhow::Error;
+
+    /// Parses the exact (case-sensitive) variant name; anything else is an
+    /// error.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let parsed = match s {
+            "Trust" => AuthType::Trust,
+            "NeonJWT" => AuthType::NeonJWT,
+            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
+        };
+        Ok(parsed)
+    }
+}
+
+impl fmt::Display for AuthType {
+    /// Writes the variant name, i.e. the same spelling `from_str` accepts.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            Self::Trust => "Trust",
+            Self::NeonJWT => "NeonJWT",
+        };
+        f.write_str(name)
+    }
+}
+
+/// Either full duplex Framed or write only half; the latter is left in
+/// PostgresBackend after call to `split`. In principle we could always store a
+/// pair of split handles, but that would force us to pay the splitting price
+/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver).
+enum MaybeWriteOnly<IO> {
+    // Both directions available.
+    Full(Framed<MaybeTlsStream<IO>>),
+    // Only the write half; reads fail with an error in this state.
+    WriteOnly(FramedWriter<MaybeTlsStream<IO>>),
+    // Temporary value palmed off while the real one is being transformed
+    // (split / unsplit / TLS upgrade). Any IO attempted on it panics.
+    Broken,
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
+    /// Reads the next startup packet. Fails if only the write half is held.
+    async fn read_startup_message(&mut self) -> Result<Option<FeStartupPacket>, ConnectionError> {
+        match self {
+            Self::Full(duplex) => duplex.read_startup_message().await,
+            Self::WriteOnly(_) => {
+                let err = io::Error::new(ErrorKind::Other, "reading from write only half");
+                Err(err.into())
+            }
+            Self::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    /// Reads the next regular message. Fails if only the write half is held.
+    async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        match self {
+            Self::Full(duplex) => duplex.read_message().await,
+            Self::WriteOnly(_) => {
+                let err = io::Error::new(ErrorKind::Other, "reading from write only half");
+                Err(err.into())
+            }
+            Self::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    /// Queues `msg` in the output buffer without flushing it.
+    fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
+        match self {
+            Self::Full(duplex) => duplex.write_message(msg),
+            Self::WriteOnly(writer) => writer.write_message_noflush(msg),
+            Self::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    /// Flushes the output buffer of whichever half is held.
+    async fn flush(&mut self) -> io::Result<()> {
+        match self {
+            Self::Full(duplex) => duplex.flush().await,
+            Self::WriteOnly(writer) => writer.flush().await,
+            Self::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+
+    /// Shuts down whichever half is held.
+    async fn shutdown(&mut self) -> io::Result<()> {
+        match self {
+            Self::Full(duplex) => duplex.shutdown().await,
+            Self::WriteOnly(writer) => writer.shutdown().await,
+            Self::Broken => panic!("IO on invalid MaybeWriteOnly"),
+        }
+    }
+}
+
+/// Server side of a single Postgres-protocol connection over `IO`.
+pub struct PostgresBackend<IO> {
+    // Message framing over the (possibly TLS) stream; holds only the write
+    // half while the backend is split.
+    framed: MaybeWriteOnly<IO>,
+
+    /// Current protocol state of the connection.
+    pub state: ProtoState,
+
+    // Authentication method applied once the startup message is received.
+    auth_type: AuthType,
+
+    // Address of the remote peer, exposed via `get_peer_addr`.
+    peer_addr: SocketAddr,
+    /// TLS server configuration. When set, a startup message arriving on a
+    /// connection that has not been upgraded to TLS is rejected.
+    pub tls_config: Option<Arc<rustls::ServerConfig>>,
+}
+
+/// [`PostgresBackend`] over a tokio TCP socket.
+pub type PostgresBackendTCP = PostgresBackend<tokio::net::TcpStream>;
+
+/// Copies the query bytes into an owned buffer, dropping a single trailing
+/// null terminator if there is one.
+pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
+    let mut buf = query_string.to_vec();
+    if buf.last() == Some(&0) {
+        buf.pop();
+    }
+    buf
+}
+
+/// View a byte slice as UTF-8 text, ignoring one trailing null terminator if
+/// present. Fails if the remaining bytes are not valid UTF-8.
+fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
+    let trimmed = match bytes.split_last() {
+        Some((&0, head)) => head,
+        _ => bytes,
+    };
+    Ok(std::str::from_utf8(trimmed)?)
+}
+
+impl PostgresBackend<tokio::net::TcpStream> {
+    /// Creates a backend for an accepted TCP connection, starting unencrypted
+    /// in the `Initialization` state. Fails only if the peer address cannot be
+    /// obtained from the socket.
+    pub fn new(
+        socket: tokio::net::TcpStream,
+        auth_type: AuthType,
+        tls_config: Option<Arc<rustls::ServerConfig>>,
+    ) -> io::Result<Self> {
+        // Must be queried before the socket is moved into the framed stream.
+        let peer_addr = socket.peer_addr()?;
+        let framed = MaybeWriteOnly::Full(Framed::new(MaybeTlsStream::Unencrypted(socket)));
+
+        Ok(Self {
+            framed,
+            state: ProtoState::Initialization,
+            auth_type,
+            tls_config,
+            peer_addr,
+        })
+    }
+}
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
+    /// Creates a backend over an arbitrary stream, starting unencrypted in the
+    /// `Initialization` state. The peer address has to be supplied by the
+    /// caller, as it cannot be derived from a generic `IO`.
+    pub fn new_from_io(
+        socket: IO,
+        peer_addr: SocketAddr,
+        auth_type: AuthType,
+        tls_config: Option<Arc<rustls::ServerConfig>>,
+    ) -> io::Result<Self> {
+        let framed = MaybeWriteOnly::Full(Framed::new(MaybeTlsStream::Unencrypted(socket)));
+
+        Ok(Self {
+            framed,
+            state: ProtoState::Initialization,
+            auth_type,
+            tls_config,
+            peer_addr,
+        })
+    }
+
+    /// Address of the remote peer, as recorded when the backend was created.
+    pub fn get_peer_addr(&self) -> &SocketAddr {
+        &self.peer_addr
+    }
+
+    /// Read full message or return None if connection is cleanly closed with no
+    /// unprocessed data. Once the backend is in the `Closed` state, this
+    /// returns None without touching the stream.
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        if self.state == ProtoState::Closed {
+            return Ok(None);
+        }
+
+        let msg = self.framed.read_message().await?;
+        trace!("read msg {:?}", msg);
+        Ok(msg)
+    }
+
+    /// Write message into internal output buffer, doesn't flush it. Technically
+    /// error type can be only ProtocolError here (if, unlikely, serialization
+    /// fails), but callers typically wrap it anyway.
+    ///
+    /// Returns `&mut Self` on success so that several writes can be chained.
+    pub fn write_message_noflush(
+        &mut self,
+        message: &BeMessage<'_>,
+    ) -> Result<&mut Self, ConnectionError> {
+        // `?` converts the ProtocolError into ConnectionError.
+        self.framed.write_message_noflush(message)?;
+        trace!("wrote msg {:?}", message);
+        Ok(self)
+    }
+
+    /// Flush output buffer into the socket. Works both in the full and in the
+    /// split (write only) state.
+    pub async fn flush(&mut self) -> io::Result<()> {
+        self.framed.flush().await
+    }
+
+    /// Polling version of `flush()`, saves the caller need to pin.
+    ///
+    /// Each call builds a fresh `flush()` future, pins it on the stack and
+    /// polls it once; the future is dropped when this function returns, so a
+    /// `Pending` result means the caller has to call `poll_flush` again.
+    pub fn poll_flush(
+        &mut self,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let flush_fut = self.flush();
+        pin_mut!(flush_fut);
+        flush_fut.poll(cx)
+    }
+
+    /// Write message into internal output buffer and flush it to the stream.
+    /// Anything queued earlier with `write_message_noflush` is sent as well.
+    /// Returns `&mut Self` on success, like `write_message_noflush`.
+    pub async fn write_message(
+        &mut self,
+        message: &BeMessage<'_>,
+    ) -> Result<&mut Self, ConnectionError> {
+        self.write_message_noflush(message)?;
+        self.flush().await?;
+        Ok(self)
+    }
+
+    /// Returns an AsyncWrite implementation that wraps all the data written
+    /// to it in CopyData messages, and writes them to the connection
+    ///
+    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
+    /// The returned writer mutably borrows this backend.
+    pub fn copyout_writer(&mut self) -> CopyDataWriter<IO> {
+        CopyDataWriter { pgb: self }
+    }
+
+    /// Wrapper for run_message_loop() that shuts down socket when we are done.
+    ///
+    /// Consumes the backend. `shutdown_watcher` is called to obtain a future
+    /// whose completion requests termination of the connection. Returns
+    /// whatever the message loop returned; errors from the final socket
+    /// shutdown are ignored.
+    pub async fn run<F, S>(
+        mut self,
+        handler: &mut impl Handler<IO>,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
+        let ret = self.run_message_loop(handler, shutdown_watcher).await;
+        // A failed TLS upgrade consumes the stream and leaves `framed` as the
+        // `Broken` placeholder. Shutting that down would panic, and there is
+        // no stream left to close anyway, so just report the error.
+        if !matches!(self.framed, MaybeWriteOnly::Broken) {
+            // socket might be already closed, e.g. if previously received error,
+            // so ignore result.
+            self.framed.shutdown().await.ok();
+        }
+        ret
+    }
+
+    /// Drives the connection: performs the handshake, then reads and processes
+    /// messages until the client terminates, the connection is closed, an
+    /// error occurs, or a shutdown is requested.
+    ///
+    /// `shutdown_watcher` is called to obtain a fresh future every time we
+    /// wait; it is polled with priority over socket activity (`biased`). A
+    /// shutdown request ends the loop with `Ok(())`.
+    async fn run_message_loop<F, S>(
+        &mut self,
+        handler: &mut impl Handler<IO>,
+        shutdown_watcher: F,
+    ) -> Result<(), QueryError>
+    where
+        F: Fn() -> S,
+        S: Future,
+    {
+        trace!("postgres backend to {:?} started", self.peer_addr);
+
+        tokio::select!(
+            biased;
+
+            _ = shutdown_watcher() => {
+                // We were requested to shut down.
+                tracing::info!("shutdown request received during handshake");
+                return Ok(())
+            },
+
+            result = self.handshake(handler) => {
+                // Handshake complete.
+                result?;
+                if self.state == ProtoState::Closed {
+                    return Ok(()); // EOF during handshake
+                }
+            }
+        );
+
+        // Authentication completed
+        let mut query_string = Bytes::new();
+        while let Some(msg) = tokio::select!(
+            biased;
+            _ = shutdown_watcher() => {
+                // We were requested to shut down.
+                tracing::info!("shutdown request received in run_message_loop");
+                Ok(None)
+            },
+            msg = self.read_message() => { msg },
+        )? {
+            trace!("got message {:?}", msg);
+
+            let result = self.process_message(handler, msg, &mut query_string).await;
+            // Flush exactly once per message, and do it before inspecting
+            // `result`, so that whatever has been queued so far is sent even
+            // if processing failed.
+            self.flush().await?;
+            match result? {
+                ProcessMsgResult::Continue => continue,
+                ProcessMsgResult::Break => break,
+            }
+        }
+
+        trace!("postgres backend to {:?} exited", self.peer_addr);
+        Ok(())
+    }
+
+    /// Performs the server-side TLS handshake on a still unencrypted stream
+    /// and returns the encrypted one. Fails if the stream is already TLS or
+    /// if the handshake itself fails.
+    async fn tls_upgrade(
+        src: MaybeTlsStream<IO>,
+        tls_config: Arc<rustls::ServerConfig>,
+    ) -> anyhow::Result<MaybeTlsStream<IO>> {
+        let plain = match src {
+            MaybeTlsStream::Unencrypted(plain) => plain,
+            MaybeTlsStream::Tls(_) => anyhow::bail!("TLS already started"),
+        };
+
+        let encrypted = TlsAcceptor::from(tls_config).accept(plain).await?;
+        Ok(MaybeTlsStream::Tls(Box::new(encrypted)))
+    }
+
+    /// Upgrade the connection to TLS in place, using `self.tls_config`.
+    ///
+    /// Fails if no TLS config is set, if the backend is split, or if the TLS
+    /// handshake fails. In the first two cases the backend is left untouched.
+    /// A failed handshake, however, consumes the stream: `framed` then stays
+    /// in the `Broken` state and the backend must not be used for IO anymore.
+    async fn start_tls(&mut self) -> anyhow::Result<()> {
+        // temporary replace stream with fake to cook TLS one, Indiana Jones style
+        match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
+            MaybeWriteOnly::Full(framed) => {
+                let tls_config = match self.tls_config.as_ref() {
+                    Some(conf) => conf.clone(),
+                    None => {
+                        // Nothing has been done to the stream yet; put it back
+                        // instead of leaving the backend Broken.
+                        self.framed = MaybeWriteOnly::Full(framed);
+                        anyhow::bail!("start_tls called without conf");
+                    }
+                };
+                // NB: from here on the stream is owned by the upgrade; on
+                // failure there is nothing left to restore.
+                let tls_framed = framed
+                    .map_stream(|s| PostgresBackend::tls_upgrade(s, tls_config))
+                    .await?;
+                // push back ready TLS stream
+                self.framed = MaybeWriteOnly::Full(tls_framed);
+                Ok(())
+            }
+            MaybeWriteOnly::WriteOnly(writer) => {
+                // Refusing the upgrade must not destroy the write half.
+                self.framed = MaybeWriteOnly::WriteOnly(writer);
+                anyhow::bail!("TLS upgrade attempt in split state")
+            }
+            MaybeWriteOnly::Broken => panic!("TLS upgrade on framed in invalid state"),
+        }
+    }
+
+    /// Split off owned read part from which messages can be read in different
+    /// task/thread. The backend keeps the write half; use `unsplit` to join
+    /// the halves again.
+    ///
+    /// Returns an error, leaving the backend unchanged, if it is already
+    /// split.
+    pub fn split(&mut self) -> anyhow::Result<PostgresBackendReader<IO>> {
+        // temporary replace stream with fake to cook split one, Indiana Jones style
+        match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
+            MaybeWriteOnly::Full(framed) => {
+                let (reader, writer) = framed.split();
+                self.framed = MaybeWriteOnly::WriteOnly(writer);
+                Ok(PostgresBackendReader(reader))
+            }
+            MaybeWriteOnly::WriteOnly(writer) => {
+                // Put the writer back: a redundant split() must not leave the
+                // backend in the Broken state.
+                self.framed = MaybeWriteOnly::WriteOnly(writer);
+                anyhow::bail!("PostgresBackend is already split")
+            }
+            MaybeWriteOnly::Broken => panic!("split on framed in invalid state"),
+        }
+    }
+
+    /// Join read part back, returning the backend to the full duplex state.
+    ///
+    /// Returns an error, leaving the backend unchanged, if it is not split;
+    /// the passed `reader` is dropped in that case.
+    pub fn unsplit(&mut self, reader: PostgresBackendReader<IO>) -> anyhow::Result<()> {
+        // temporary replace stream with fake to cook joined one, Indiana Jones style
+        match std::mem::replace(&mut self.framed, MaybeWriteOnly::Broken) {
+            MaybeWriteOnly::Full(framed) => {
+                // Put the stream back: a stray unsplit() must not leave the
+                // backend in the Broken state.
+                self.framed = MaybeWriteOnly::Full(framed);
+                anyhow::bail!("PostgresBackend is not split")
+            }
+            MaybeWriteOnly::WriteOnly(writer) => {
+                let joined = Framed::unsplit(reader.0, writer);
+                self.framed = MaybeWriteOnly::Full(joined);
+                Ok(())
+            }
+            MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"),
+        }
+    }
+
+    /// Perform handshake with the client, transitioning to Established.
+    /// In case of EOF during handshake logs this, sets state to Closed and returns Ok(()).
+    ///
+    /// Two phases: first startup packets are processed until the state reaches
+    /// at least Authentication; then, if the state is exactly Authentication,
+    /// a single password message carrying the JWT is read and verified through
+    /// `Handler::check_auth_jwt`.
+    async fn handshake(&mut self, handler: &mut impl Handler<IO>) -> Result<(), QueryError> {
+        // Relies on the declaration order of ProtoState: loops while in
+        // Initialization or Encrypted.
+        while self.state < ProtoState::Authentication {
+            match self.framed.read_startup_message().await? {
+                Some(msg) => {
+                    self.process_startup_message(handler, msg).await?;
+                }
+                None => {
+                    trace!(
+                        "postgres backend to {:?} received EOF during handshake",
+                        self.peer_addr
+                    );
+                    self.state = ProtoState::Closed;
+                    return Ok(());
+                }
+            }
+        }
+
+        // Perform auth, if needed.
+        if self.state == ProtoState::Authentication {
+            match self.framed.read_message().await? {
+                Some(FeMessage::PasswordMessage(m)) => {
+                    // Authentication state is only entered for NeonJWT.
+                    assert!(self.auth_type == AuthType::NeonJWT);
+
+                    // Drop the last byte of the payload; an empty payload is
+                    // a protocol violation.
+                    let (_, jwt_response) = m.split_last().context("protocol violation")?;
+
+                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
+                        // The error response is only queued here, not flushed.
+                        self.write_message_noflush(&BeMessage::ErrorResponse(
+                            &e.to_string(),
+                            Some(e.pg_error_code()),
+                        ))?;
+                        return Err(e);
+                    }
+
+                    self.write_message_noflush(&BeMessage::AuthenticationOk)?
+                        .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
+                        .write_message(&BeMessage::ReadyForQuery)
+                        .await?;
+                    self.state = ProtoState::Established;
+                }
+                Some(m) => {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "Unexpected message {:?} while waiting for handshake",
+                        m
+                    )));
+                }
+                None => {
+                    trace!(
+                        "postgres backend to {:?} received EOF during auth",
+                        self.peer_addr
+                    );
+                    self.state = ProtoState::Closed;
+                    return Ok(());
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Process startup packet:
+    /// - transition to Established if auth type is trust
+    /// - transition to Authentication if auth type is NeonJWT.
+    /// - or perform TLS handshake -- then need to call this again to receive
+    ///   actual startup packet.
+    ///
+    /// A GSS encryption request is always declined, and a CancelRequest is
+    /// treated as an error. Must only be called before authentication has
+    /// started (asserted).
+    async fn process_startup_message(
+        &mut self,
+        handler: &mut impl Handler<IO>,
+        msg: FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        assert!(self.state < ProtoState::Authentication);
+        let have_tls = self.tls_config.is_some();
+        match msg {
+            FeStartupPacket::SslRequest => {
+                debug!("SSL requested");
+
+                // Tell the client whether we are willing to do TLS, and flush
+                // the answer before starting the TLS handshake.
+                self.write_message(&BeMessage::EncryptionResponse(have_tls))
+                    .await?;
+
+                if have_tls {
+                    self.start_tls().await?;
+                    self.state = ProtoState::Encrypted;
+                }
+            }
+            FeStartupPacket::GssEncRequest => {
+                debug!("GSS requested");
+                self.write_message(&BeMessage::EncryptionResponse(false))
+                    .await?;
+            }
+            FeStartupPacket::StartupMessage { .. } => {
+                // When TLS is configured it is mandatory: refuse a startup
+                // message that arrives on a connection which was not upgraded.
+                if have_tls && !matches!(self.state, ProtoState::Encrypted) {
+                    self.write_message(&BeMessage::ErrorResponse("must connect with TLS", None))
+                        .await?;
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "client did not connect with TLS"
+                    )));
+                }
+
+                // NB: startup() may change self.auth_type -- we are using that in proxy code
+                // to bypass auth for new users.
+                handler.startup(self, &msg)?;
+
+                match self.auth_type {
+                    AuthType::Trust => {
+                        self.write_message_noflush(&BeMessage::AuthenticationOk)?
+                            .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
+                            .write_message_noflush(&BeMessage::INTEGER_DATETIMES)?
+                            // The async python driver requires a valid server_version
+                            .write_message_noflush(&BeMessage::server_version("14.1"))?
+                            .write_message(&BeMessage::ReadyForQuery)
+                            .await?;
+                        self.state = ProtoState::Established;
+                    }
+                    AuthType::NeonJWT => {
+                        // Ask for the "password"; the JWT itself is checked
+                        // later, in handshake().
+                        self.write_message(&BeMessage::AuthenticationCleartextPassword)
+                            .await?;
+                        self.state = ProtoState::Authentication;
+                    }
+                }
+            }
+            FeStartupPacket::CancelRequest { .. } => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "Unexpected CancelRequest message during handshake"
+                )));
+            }
+        }
+        Ok(())
+    }
+
+    /// Process a single message received after the handshake, queueing the
+    /// responses in the output buffer (nothing is flushed here).
+    ///
+    /// `unnamed_query_string` holds the query text of the most recent Parse
+    /// message; it is what a subsequent Execute runs. Returns `Break` on
+    /// Terminate and `Continue` otherwise. Errors returned by the handler's
+    /// `process_query` are logged and reported to the client rather than
+    /// propagated; the errors returned from here are an invalid query string,
+    /// a failure to write a response, or an unexpected message type.
+    async fn process_message(
+        &mut self,
+        handler: &mut impl Handler<IO>,
+        msg: FeMessage,
+        unnamed_query_string: &mut Bytes,
+    ) -> Result<ProcessMsgResult, QueryError> {
+        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
+        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
+        assert!(self.state == ProtoState::Established);
+
+        match msg {
+            FeMessage::Query(body) => {
+                // remove null terminator
+                let query_string = cstr_to_str(&body)?;
+
+                trace!("got query {query_string:?}");
+                if let Err(e) = handler.process_query(self, query_string).await {
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
+                }
+                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
+            }
+
+            FeMessage::Parse(m) => {
+                // Only remember the query text; it is run on Execute.
+                *unnamed_query_string = m.query_string;
+                self.write_message_noflush(&BeMessage::ParseComplete)?;
+            }
+
+            FeMessage::Describe(_) => {
+                self.write_message_noflush(&BeMessage::ParameterDescription)?
+                    .write_message_noflush(&BeMessage::NoData)?;
+            }
+
+            FeMessage::Bind(_) => {
+                self.write_message_noflush(&BeMessage::BindComplete)?;
+            }
+
+            FeMessage::Close(_) => {
+                self.write_message_noflush(&BeMessage::CloseComplete)?;
+            }
+
+            FeMessage::Execute(_) => {
+                let query_string = cstr_to_str(unnamed_query_string)?;
+                trace!("got execute {query_string:?}");
+                if let Err(e) = handler.process_query(self, query_string).await {
+                    log_query_error(query_string, &e);
+                    // NOTE(review): unlike the Query branch, the full error
+                    // text is sent here instead of `short_error(&e)` --
+                    // confirm whether that difference is intended.
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &e.to_string(),
+                        Some(e.pg_error_code()),
+                    ))?;
+                }
+                // NOTE there is no ReadyForQuery message. This handler is used
+                // for basebackup and it uses CopyOut which doesn't require
+                // ReadyForQuery message and backend just switches back to
+                // processing mode after sending CopyDone or ErrorResponse.
+            }
+
+            FeMessage::Sync => {
+                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
+            }
+
+            FeMessage::Terminate => {
+                return Ok(ProcessMsgResult::Break);
+            }
+
+            // We prefer explicit pattern matching to wildcards, because
+            // this helps us spot the places where new variants are missing
+            FeMessage::CopyData(_)
+            | FeMessage::CopyDone
+            | FeMessage::CopyFail
+            | FeMessage::PasswordMessage(_) => {
+                return Err(QueryError::Other(anyhow::anyhow!(
+                    "unexpected message type: {msg:?}",
+                )));
+            }
+        }
+
+        Ok(ProcessMsgResult::Continue)
+    }
+
+    /// Log as info/error result of handling COPY stream and send back
+    /// ErrorResponse if that makes sense. Shutdown the stream if we got
+    /// Terminate. TODO: transition into waiting for Sync msg if we initiate the
+    /// close.
+    ///
+    /// Never fails: problems with sending the final messages are only logged.
+    pub async fn handle_copy_stream_end(&mut self, end: CopyStreamHandlerEnd) {
+        use CopyStreamHandlerEnd::*;
+
+        // Decide the log level: regular ways of ending the stream and
+        // disconnects caused by expected IO errors are logged as info,
+        // everything else as error.
+        let expected_end = match &end {
+            ServerInitiated(_) | CopyDone | CopyFail | Terminate | EOF => true,
+            CopyStreamHandlerEnd::Disconnected(ConnectionError::Io(io_error))
+                if is_expected_io_error(io_error) =>
+            {
+                true
+            }
+            _ => false,
+        };
+        if expected_end {
+            info!("terminated: {:#}", end);
+        } else {
+            error!("terminated: {:?}", end);
+        }
+
+        // Note: no current usages ever send this
+        if let CopyDone = &end {
+            if let Err(e) = self.write_message(&BeMessage::CopyDone).await {
+                error!("failed to send CopyDone: {}", e);
+            }
+        }
+
+        // After Terminate, mark the connection closed so that further reads
+        // return None.
+        if let Terminate = &end {
+            self.state = ProtoState::Closed;
+        }
+
+        // Pick the message and SQLSTATE to report back, if any.
+        let err_to_send_and_errcode = match &end {
+            ServerInitiated(_) => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
+            Other(_) => Some((end.to_string(), SQLSTATE_INTERNAL_ERROR)),
+            // Note: CopyFail in duplex copy is somewhat unexpected (at least to
+            // PG walsender; evidently and per my docs reading client should
+            // finish it with CopyDone). It is not a problem to recover from it
+            // finishing the stream in both directions like we do, but note that
+            // sync rust-postgres client (which we don't use anymore) hangs if
+            // socket is not closed here.
+            // https://github.com/sfackler/rust-postgres/issues/755
+            // https://github.com/neondatabase/neon/issues/935
+            //
+            // Currently, the version of tokio_postgres replication patch we use
+            // sends this when it closes the stream (e.g. pageserver decided to
+            // switch conn to another safekeeper and client gets dropped).
+            // Moreover, seems like 'connection' task errors with 'unexpected
+            // message from server' when it receives ErrorResponse (anything but
+            // CopyData/CopyDone) back.
+            CopyFail => Some((end.to_string(), SQLSTATE_SUCCESSFUL_COMPLETION)),
+            _ => None,
+        };
+        if let Some((err, errcode)) = err_to_send_and_errcode {
+            if let Err(ee) = self
+                .write_message(&BeMessage::ErrorResponse(&err, Some(errcode)))
+                .await
+            {
+                error!("failed to send ErrorResponse: {}", ee);
+            }
+        }
+    }
+}
+
+pub struct PostgresBackendReader<IO>(FramedReader<MaybeTlsStream<IO>>);
+
+impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackendReader<IO> {
+    /// Reads the next complete frontend message. `Ok(None)` means the peer
+    /// closed the connection cleanly, leaving no unprocessed data behind.
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        let m = self.0.read_message().await?;
+        trace!("read msg {:?}", m);
+        Ok(m)
+    }
+
+    /// Returns the payload of the next CopyData message in the COPY stream, or
+    /// the reason the stream ended. `CopyStreamHandlerEnd` is wider than what
+    /// can actually happen here ('Other' and 'ServerInitiated' are never
+    /// produced), but that's ok for current callers.
+    pub async fn read_copy_message(&mut self) -> Result<Bytes, CopyStreamHandlerEnd> {
+        // `?` turns a ConnectionError into CopyStreamHandlerEnd::Disconnected.
+        match self.read_message().await? {
+            Some(FeMessage::CopyData(payload)) => Ok(payload),
+            Some(FeMessage::CopyDone) => Err(CopyStreamHandlerEnd::CopyDone),
+            Some(FeMessage::CopyFail) => Err(CopyStreamHandlerEnd::CopyFail),
+            Some(FeMessage::Terminate) => Err(CopyStreamHandlerEnd::Terminate),
+            // Anything else is a protocol violation while in COPY mode.
+            Some(msg) => Err(CopyStreamHandlerEnd::from(ConnectionError::Protocol(
+                ProtocolError::Protocol(format!("unexpected message in COPY stream {:?}", msg)),
+            ))),
+            None => Err(CopyStreamHandlerEnd::EOF),
+        }
+    }
+}
+
+///
+/// A tokio::io::AsyncWrite implementation that wraps all data written to it in
+/// CopyData messages.
+///
+
+pub struct CopyDataWriter<'a, IO> {
+    pgb: &'a mut PostgresBackend<IO>,
+}
+
+impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> {
+    fn poll_write(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> Poll<Result<usize, std::io::Error>> {
+        let this = self.get_mut();
+
+        // It's not strictly required to flush between each message, but makes it easier
+        // to view in wireshark, and usually the messages that the callers write are
+        // decently-sized anyway.
+        if let Err(err) = ready!(this.pgb.poll_flush(cx)) {
+            return Poll::Ready(Err(err));
+        }
+
+        // CopyData
+        // XXX: if the input is large, we should split it into multiple messages.
+        // Not sure what the threshold should be, but the ultimate hard limit is that
+        // the length cannot exceed u32.
+        this.pgb
+            .write_message_noflush(&BeMessage::CopyData(buf))
+            // write_message_noflush only writes to the buffer, so it can fail iff
+            // the message is invalid, but CopyData can't be invalid.
+            .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
+
+        Poll::Ready(Ok(buf.len()))
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        this.pgb.poll_flush(cx)
+    }
+
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Result<(), std::io::Error>> {
+        let this = self.get_mut();
+        this.pgb.poll_flush(cx)
+    }
+}
+
+pub fn short_error(e: &QueryError) -> String {
+    match e {
+        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Other(e) => format!("{e:#}"),
+    }
+}
+
+fn log_query_error(query: &str, e: &QueryError) {
+    match e {
+        // Expected io errors (see is_expected_io_error) only get an info message.
+        QueryError::Disconnected(ConnectionError::Io(io_error))
+            if is_expected_io_error(io_error) =>
+        {
+            info!("query handler for '{query}' failed with expected io error: {io_error}")
+        }
+        QueryError::Disconnected(ConnectionError::Io(io_error)) => {
+            error!("query handler for '{query}' failed with io error: {io_error}")
+        }
+        QueryError::Disconnected(other_connection_error) => {
+            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
+        }
+        QueryError::Other(e) => error!("query handler for '{query}' failed: {e:?}"),
+    }
+}
+
+/// Something finishing handling of COPY stream, see handle_copy_stream_end.
+/// This is not always a real error, but it allows to use ? and thiserror impls.
+#[derive(thiserror::Error, Debug)]
+pub enum CopyStreamHandlerEnd {
+    /// Handler initiates the end of streaming.
+    #[error("{0}")]
+    ServerInitiated(String),
+    #[error("received CopyDone")]
+    CopyDone,
+    #[error("received CopyFail")]
+    CopyFail,
+    #[error("received Terminate")]
+    Terminate,
+    #[error("EOF on COPY stream")]
+    EOF,
+    /// The connection was lost
+    #[error(transparent)]
+    Disconnected(#[from] ConnectionError),
+    /// Some other error
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
--- a/libs/postgres_backend/tests/cert.pem
+++ b/libs/postgres_backend/tests/cert.pem
--- a/libs/postgres_backend/tests/key.pem
+++ b/libs/postgres_backend/tests/key.pem
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -0,0 +1,140 @@
+/// Test postgres_backend with tokio_postgres
+use once_cell::sync::Lazy;
+use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
+use pq_proto::{BeMessage, RowDescriptor};
+use std::io::Cursor;
+use std::{future, sync::Arc};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::net::{TcpListener, TcpStream};
+use tokio_postgres::config::SslMode;
+use tokio_postgres::tls::MakeTlsConnect;
+use tokio_postgres::{Config, NoTls, SimpleQueryMessage};
+use tokio_postgres_rustls::MakeRustlsConnect;
+
+// generate client, server test streams
+async fn make_tcp_pair() -> (TcpStream, TcpStream) {
+    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+    let addr = listener.local_addr().unwrap();
+    let client_stream = TcpStream::connect(addr).await.unwrap();
+    let (server_stream, _) = listener.accept().await.unwrap();
+    (client_stream, server_stream)
+}
+
+struct TestHandler {}
+
+#[async_trait::async_trait]
+impl<IO: AsyncRead + AsyncWrite + Unpin + Send> Handler<IO> for TestHandler {
+    // return single col 'hey' for any query
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+        _query_string: &str,
+    ) -> Result<(), QueryError> {
+        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
+            b"hey",
+        )]))?
+        .write_message_noflush(&BeMessage::DataRow(&[Some("hey".as_bytes())]))?
+        .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        Ok(())
+    }
+}
+
+// test that basic select works
+#[tokio::test]
+async fn simple_select() {
+    let (client_sock, server_sock) = make_tcp_pair().await;
+
+    // create and run pgbackend
+    let pgbackend =
+        PostgresBackend::new(server_sock, AuthType::Trust, None).expect("pgbackend creation");
+
+    tokio::spawn(async move {
+        let mut handler = TestHandler {};
+        pgbackend.run(&mut handler, future::pending::<()>).await
+    });
+
+    let conf = Config::new();
+    let (client, connection) = conf.connect_raw(client_sock, NoTls).await.expect("connect");
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0];
+    if let SimpleQueryMessage::Row(row) = first_val {
+        let first_col = row.get(0).expect("first column");
+        assert_eq!(first_col, "hey");
+    } else {
+        panic!("expected SimpleQueryMessage::Row");
+    }
+}
+
+static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
+    let mut cursor = Cursor::new(include_bytes!("key.pem"));
+    rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
+});
+
+static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
+    let mut cursor = Cursor::new(include_bytes!("cert.pem"));
+    rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
+});
+
+// test that basic select with ssl works
+#[tokio::test]
+async fn simple_select_ssl() {
+    let (client_sock, server_sock) = make_tcp_pair().await;
+
+    let server_cfg = rustls::ServerConfig::builder()
+        .with_safe_defaults()
+        .with_no_client_auth()
+        .with_single_cert(vec![CERT.clone()], KEY.clone())
+        .unwrap();
+    let tls_config = Some(Arc::new(server_cfg));
+    let pgbackend =
+        PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation");
+
+    tokio::spawn(async move {
+        let mut handler = TestHandler {};
+        pgbackend.run(&mut handler, future::pending::<()>).await
+    });
+
+    let client_cfg = rustls::ClientConfig::builder()
+        .with_safe_defaults()
+        .with_root_certificates({
+            let mut store = rustls::RootCertStore::empty();
+            store.add(&CERT).unwrap();
+            store
+        })
+        .with_no_client_auth();
+    let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg);
+    let tls_connect = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::make_tls_connect(
+        &mut make_tls_connect,
+        "localhost",
+    )
+    .expect("make_tls_connect");
+
+    let mut conf = Config::new();
+    conf.ssl_mode(SslMode::Require);
+    let (client, connection) = conf
+        .connect_raw(client_sock, tls_connect)
+        .await
+        .expect("connect");
+    // The connection object performs the actual communication with the database,
+    // so spawn it off to run on its own.
+    tokio::spawn(async move {
+        if let Err(e) = connection.await {
+            eprintln!("connection error: {}", e);
+        }
+    });
+
+    let first_val = &(client.simple_query("SELECT 42;").await.expect("select"))[0];
+    if let SimpleQueryMessage::Row(row) = first_val {
+        let first_col = row.get(0).expect("first column");
+        assert_eq!(first_col, "hey");
+    } else {
+        panic!("expected SimpleQueryMessage::Row");
+    }
+}
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -63,10 +63,7 @@ fn main() -> anyhow::Result<()> {
            pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
        }

-        let pg_config_bin = pg_install_dir_versioned
-            .join(pg_version)
-            .join("bin")
-            .join("pg_config");
+        let pg_config_bin = pg_install_dir_versioned.join("bin").join("pg_config");
        let inc_server_path: String = if pg_config_bin.exists() {
            let output = Command::new(pg_config_bin)
                .arg("--includedir-server")
--- a/libs/pq_proto/Cargo.toml
+++ b/libs/pq_proto/Cargo.toml
@@ -5,8 +5,8 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-anyhow.workspace = true
 bytes.workspace = true
+byteorder.workspace = true
 pin-project-lite.workspace = true
 postgres-protocol.workspace = true
 rand.workspace = true
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -0,0 +1,244 @@
+//! Provides `Framed` -- writing/flushing and reading Postgres messages to/from
+//! the async stream based on (and buffered with) BytesMut. All functions are
+//! cancellation safe.
+//!
+//! It is similar to what tokio_util::codec::Framed with appropriate codec
+//! provides, but `FramedReader` and `FramedWriter` read/write parts can be used
+//! separately without using split from futures::stream::StreamExt (which
+//! allocates box[1] in polling internally). tokio::io::split is used for splitting
+//! instead. Plus we customize error messages more than a single type for all io
+//! calls.
+//!
+//! [1] https://docs.rs/futures-util/0.3.26/src/futures_util/lock/bilock.rs.html#107
+use bytes::{Buf, BytesMut};
+use std::{
+    future::Future,
+    io::{self, ErrorKind},
+};
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadHalf, WriteHalf};
+
+use crate::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
+
+const INITIAL_CAPACITY: usize = 8 * 1024;
+
+/// Error on postgres connection: either IO (physical transport error) or
+/// protocol violation.
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionError {
+    #[error(transparent)]
+    Io(#[from] io::Error),
+    #[error(transparent)]
+    Protocol(#[from] ProtocolError),
+}
+
+impl ConnectionError {
+    /// Proxy stream.rs uses only io::Error; provide it.
+    pub fn into_io_error(self) -> io::Error {
+        match self {
+            ConnectionError::Io(io) => io,
+            ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
+        }
+    }
+}
+
+/// Wraps async io `stream`, providing messages to write/flush + read Postgres
+/// messages.
+pub struct Framed<S> {
+    stream: S,
+    read_buf: BytesMut,
+    write_buf: BytesMut,
+}
+
+impl<S> Framed<S> {
+    pub fn new(stream: S) -> Self {
+        Self {
+            stream,
+            read_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
+            write_buf: BytesMut::with_capacity(INITIAL_CAPACITY),
+        }
+    }
+
+    /// Get a shared reference to the underlying stream.
+    pub fn get_ref(&self) -> &S {
+        &self.stream
+    }
+
+    /// Deconstruct into the underlying stream and read buffer.
+    pub fn into_inner(self) -> (S, BytesMut) {
+        (self.stream, self.read_buf)
+    }
+
+    /// Return new Framed with stream type transformed by async f, for TLS
+    /// upgrade.
+    pub async fn map_stream<S2, E, F, Fut>(self, f: F) -> Result<Framed<S2>, E>
+    where
+        F: FnOnce(S) -> Fut,
+        Fut: Future<Output = Result<S2, E>>,
+    {
+        let stream = f(self.stream).await?;
+        Ok(Framed {
+            stream,
+            read_buf: self.read_buf,
+            write_buf: self.write_buf,
+        })
+    }
+}
+
+impl<S: AsyncRead + Unpin> Framed<S> {
+    pub async fn read_startup_message(
+        &mut self,
+    ) -> Result<Option<FeStartupPacket>, ConnectionError> {
+        read_message(&mut self.stream, &mut self.read_buf, FeStartupPacket::parse).await
+    }
+
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await
+    }
+}
+
+impl<S: AsyncWrite + Unpin> Framed<S> {
+    /// Write next message to the output buffer; doesn't flush.
+    pub fn write_message(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
+        BeMessage::write(&mut self.write_buf, msg)
+    }
+
+    /// Flush out the buffer. This function is cancellation safe: it can be
+    /// interrupted and flushing will be continued in the next call.
+    pub async fn flush(&mut self) -> Result<(), io::Error> {
+        flush(&mut self.stream, &mut self.write_buf).await
+    }
+
+    /// Flush out the buffer and shutdown the stream.
+    pub async fn shutdown(&mut self) -> Result<(), io::Error> {
+        shutdown(&mut self.stream, &mut self.write_buf).await
+    }
+}
+
+impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
+    /// Split into owned read and write parts. Beware of potential issues with
+    /// using halves in different tasks on TLS stream:
+    /// https://github.com/tokio-rs/tls/issues/40
+    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
+        let (read_half, write_half) = tokio::io::split(self.stream);
+        let reader = FramedReader {
+            stream: read_half,
+            read_buf: self.read_buf,
+        };
+        let writer = FramedWriter {
+            stream: write_half,
+            write_buf: self.write_buf,
+        };
+        (reader, writer)
+    }
+
+    /// Join read and write parts back.
+    pub fn unsplit(reader: FramedReader<S>, writer: FramedWriter<S>) -> Self {
+        Self {
+            stream: reader.stream.unsplit(writer.stream),
+            read_buf: reader.read_buf,
+            write_buf: writer.write_buf,
+        }
+    }
+}
+
+/// Read-only version of `Framed`.
+pub struct FramedReader<S> {
+    stream: ReadHalf<S>,
+    read_buf: BytesMut,
+}
+
+impl<S: AsyncRead + Unpin> FramedReader<S> {
+    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, ConnectionError> {
+        read_message(&mut self.stream, &mut self.read_buf, FeMessage::parse).await
+    }
+}
+
+/// Write-only version of `Framed`.
+pub struct FramedWriter<S> {
+    stream: WriteHalf<S>,
+    write_buf: BytesMut,
+}
+
+impl<S: AsyncWrite + Unpin> FramedWriter<S> {
+    /// Write next message to the output buffer; doesn't flush.
+    pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
+        BeMessage::write(&mut self.write_buf, msg)
+    }
+
+    /// Flush out the buffer. This function is cancellation safe: it can be
+    /// interrupted and flushing will be continued in the next call.
+    pub async fn flush(&mut self) -> Result<(), io::Error> {
+        flush(&mut self.stream, &mut self.write_buf).await
+    }
+
+    /// Flush out the buffer and shutdown the stream.
+    pub async fn shutdown(&mut self) -> Result<(), io::Error> {
+        shutdown(&mut self.stream, &mut self.write_buf).await
+    }
+}
+
+/// Read next message from the stream. Returns Ok(None), if EOF happened and we
+/// don't have remaining data in the buffer. This function is cancellation safe:
+/// you can drop future which is not yet complete and finalize reading message
+/// with the next call.
+///
+/// Parametrized to allow reading startup or usual message, having different
+/// format.
+async fn read_message<S: AsyncRead + Unpin, M, P>(
+    stream: &mut S,
+    read_buf: &mut BytesMut,
+    parse: P,
+) -> Result<Option<M>, ConnectionError>
+where
+    P: Fn(&mut BytesMut) -> Result<Option<M>, ProtocolError>,
+{
+    loop {
+        if let Some(msg) = parse(read_buf)? {
+            return Ok(Some(msg));
+        }
+        // If we can't build a frame yet, try to read more data and try again.
+        // Make sure we've got room for at least one byte to read to ensure
+        // that we don't get a spurious 0 that looks like EOF.
+        read_buf.reserve(1);
+        if stream.read_buf(read_buf).await? == 0 {
+            if read_buf.has_remaining() {
+                return Err(io::Error::new(
+                    ErrorKind::UnexpectedEof,
+                    "EOF with unprocessed data in the buffer",
+                )
+                .into());
+            } else {
+                return Ok(None); // clean EOF
+            }
+        }
+    }
+}
+
+async fn flush<S: AsyncWrite + Unpin>(
+    stream: &mut S,
+    write_buf: &mut BytesMut,
+) -> Result<(), io::Error> {
+    while write_buf.has_remaining() {
+        let bytes_written = stream.write(write_buf.chunk()).await?;
+        if bytes_written == 0 {
+            return Err(io::Error::new(
+                ErrorKind::WriteZero,
+                "failed to write message",
+            ));
+        }
+        // The advanced part will be garbage collected, likely during shifting
+        // data left on next attempt to write to buffer when free space is not
+        // enough.
+        write_buf.advance(bytes_written);
+    }
+    write_buf.clear();
+    stream.flush().await
+}
+
+async fn shutdown<S: AsyncWrite + Unpin>(
+    stream: &mut S,
+    write_buf: &mut BytesMut,
+) -> Result<(), io::Error> {
+    flush(stream, write_buf).await?;
+    stream.shutdown().await
+}
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -2,24 +2,18 @@
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.

-// Tools for calling certain async methods in sync contexts.
-pub mod sync;
+pub mod framed;

-use anyhow::{ensure, Context, Result};
+use byteorder::{BigEndian, ReadBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use postgres_protocol::PG_EPOCH;
 use serde::{Deserialize, Serialize};
 use std::{
    borrow::Cow,
    collections::HashMap,
-    fmt,
-    future::Future,
-    io::{self, Cursor},
-    str,
+    fmt, io, str,
    time::{Duration, SystemTime},
 };
-use sync::{AsyncishRead, SyncFuture};
-use tokio::io::AsyncReadExt;
 use tracing::{trace, warn};

 pub type Oid = u32;
@@ -31,7 +25,6 @@ pub const TEXT_OID: Oid = 25;

 #[derive(Debug)]
 pub enum FeMessage {
-    StartupPacket(FeStartupPacket),
    // Simple query.
    Query(Bytes),
    // Extended query protocol.
@@ -191,260 +184,205 @@ pub struct FeExecuteMessage {
 #[derive(Debug)]
 pub struct FeCloseMessage;

-/// Retry a read on EINTR
-///
-/// This runs the enclosed expression, and if it returns
-/// Err(io::ErrorKind::Interrupted), retries it.
-macro_rules! retry_read {
-    ( $x:expr ) => {
-        loop {
-            match $x {
-                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
-                res => break res,
-            }
-        }
-    };
-}
-
-/// An error occured during connection being open.
+/// An error occurred while parsing raw stream into Postgres messages or
+/// serializing them.
 #[derive(thiserror::Error, Debug)]
-pub enum ConnectionError {
-    /// IO error during writing to or reading from the connection socket.
-    #[error("Socket IO error: {0}")]
-    Socket(std::io::Error),
-    /// Invalid packet was received from client
+pub enum ProtocolError {
+    /// Invalid packet was received from the client (e.g. unexpected message
+    /// type or broken len).
    #[error("Protocol error: {0}")]
    Protocol(String),
-    /// Failed to parse a protocol mesage
+    /// Failed to parse or, (unlikely), serialize a protocol message.
    #[error("Message parse error: {0}")]
-    MessageParse(anyhow::Error),
+    BadMessage(String),
 }

-impl From<anyhow::Error> for ConnectionError {
-    fn from(e: anyhow::Error) -> Self {
-        Self::MessageParse(e)
-    }
-}
-
-impl ConnectionError {
+impl ProtocolError {
+    /// Proxy stream.rs uses only io::Error; provide it.
    pub fn into_io_error(self) -> io::Error {
-        match self {
-            ConnectionError::Socket(io) => io,
-            other => io::Error::new(io::ErrorKind::Other, other.to_string()),
-        }
+        io::Error::new(io::ErrorKind::Other, self.to_string())
    }
 }

 impl FeMessage {
-    /// Read one message from the stream.
-    /// This function returns `Ok(None)` in case of EOF.
-    /// One way to handle this properly:
+    /// Read and parse one message from the `buf` input buffer. If there is at
+    /// least one valid message, returns it, advancing `buf`; redundant copies
+    /// are avoided, as thanks to `bytes` crate ptrs in parsed message point
+    /// directly into the `buf` (processed data is garbage collected after
+    /// parsed message is dropped).
    ///
-    /// ```
-    /// # use std::io;
-    /// # use pq_proto::FeMessage;
-    /// #
-    /// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
-    /// #     Ok(())
-    /// # };
-    /// #
-    /// fn do_the_job(stream: &mut (impl io::Read + Unpin)) -> anyhow::Result<()> {
-    ///     while let Some(msg) = FeMessage::read(stream)? {
-    ///         process_message(msg)?;
-    ///     }
+    /// Returns None if `buf` doesn't contain enough data for a single message.
+    /// For efficiency, tries to reserve large enough space in `buf` for the
+    /// next message in this case to save the repeated calls.
    ///
-    ///     Ok(())
-    /// }
-    /// ```
-    #[inline(never)]
-    pub fn read(
-        stream: &mut (impl io::Read + Unpin),
-    ) -> Result<Option<FeMessage>, ConnectionError> {
-        Self::read_fut(&mut AsyncishRead(stream)).wait()
-    }
+    /// Returns `ProtocolError` if the message is malformed, e.g. it has an
+    /// invalid length or an unknown message tag.
+    //
+    // Inspired by rust-postgres Message::parse.
+    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>, ProtocolError> {
+        // Every message contains message type byte and 4 bytes len; can't do
+        // much without them.
+        if buf.len() < 5 {
+            let to_read = 5 - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }

-    /// Read one message from the stream.
-    /// See documentation for `Self::read`.
-    pub fn read_fut<Reader>(
-        stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
-    where
-        Reader: tokio::io::AsyncRead + Unpin,
-    {
-        // We return a Future that's sync (has a `wait` method) if and only if the provided stream is SyncProof.
-        // SyncFuture contract: we are only allowed to await on sync-proof futures, the AsyncRead and
-        // AsyncReadExt methods of the stream.
-        SyncFuture::new(async move {
-            // Each libpq message begins with a message type byte, followed by message length
-            // If the client closes the connection, return None. But if the client closes the
-            // connection in the middle of a message, we will return an error.
-            let tag = match retry_read!(stream.read_u8().await) {
-                Ok(b) => b,
-                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(ConnectionError::Socket(e)),
-            };
+        // We shouldn't advance `buf` as probably full message is not there yet,
+        // so can't directly use Bytes::get_u32 etc.
+        let tag = buf[0];
+        let len = (&buf[1..5]).read_u32::<BigEndian>().unwrap();
+        if len < 4 {
+            return Err(ProtocolError::Protocol(format!(
+                "invalid message length {}",
+                len
+            )));
+        }

-            // The message length includes itself, so it better be at least 4.
-            let len = retry_read!(stream.read_u32().await)
-                .map_err(ConnectionError::Socket)?
-                .checked_sub(4)
-                .ok_or_else(|| ConnectionError::Protocol("invalid message length".to_string()))?;
+        // length field includes itself, but not message type.
+        let total_len = len as usize + 1;
+        if buf.len() < total_len {
+            // Don't have full message yet.
+            let to_read = total_len - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }

-            let body = {
-                let mut buffer = vec![0u8; len as usize];
-                stream
-                    .read_exact(&mut buffer)
-                    .await
-                    .map_err(ConnectionError::Socket)?;
-                Bytes::from(buffer)
-            };
+        // got the message, advance buffer
+        let mut msg = buf.split_to(total_len).freeze();
+        msg.advance(5); // consume message type and len

-            match tag {
-                b'Q' => Ok(Some(FeMessage::Query(body))),
-                b'P' => Ok(Some(FeParseMessage::parse(body)?)),
-                b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
-                b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
-                b'B' => Ok(Some(FeBindMessage::parse(body)?)),
-                b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
-                b'S' => Ok(Some(FeMessage::Sync)),
-                b'X' => Ok(Some(FeMessage::Terminate)),
-                b'd' => Ok(Some(FeMessage::CopyData(body))),
-                b'c' => Ok(Some(FeMessage::CopyDone)),
-                b'f' => Ok(Some(FeMessage::CopyFail)),
-                b'p' => Ok(Some(FeMessage::PasswordMessage(body))),
-                tag => {
-                    return Err(ConnectionError::Protocol(format!(
-                        "unknown message tag: {tag},'{body:?}'"
-                    )))
-                }
-            }
-        })
+        match tag {
+            b'Q' => Ok(Some(FeMessage::Query(msg))),
+            b'P' => Ok(Some(FeParseMessage::parse(msg)?)),
+            b'D' => Ok(Some(FeDescribeMessage::parse(msg)?)),
+            b'E' => Ok(Some(FeExecuteMessage::parse(msg)?)),
+            b'B' => Ok(Some(FeBindMessage::parse(msg)?)),
+            b'C' => Ok(Some(FeCloseMessage::parse(msg)?)),
+            b'S' => Ok(Some(FeMessage::Sync)),
+            b'X' => Ok(Some(FeMessage::Terminate)),
+            b'd' => Ok(Some(FeMessage::CopyData(msg))),
+            b'c' => Ok(Some(FeMessage::CopyDone)),
+            b'f' => Ok(Some(FeMessage::CopyFail)),
+            b'p' => Ok(Some(FeMessage::PasswordMessage(msg))),
+            tag => Err(ProtocolError::Protocol(format!(
+                "unknown message tag: {tag},'{msg:?}'"
+            ))),
+        }
    }
 }

 impl FeStartupPacket {
-    /// Read startup message from the stream.
-    // XXX: It's tempting yet undesirable to accept `stream` by value,
-    // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read(
-        stream: &mut (impl io::Read + Unpin),
-    ) -> Result<Option<FeMessage>, ConnectionError> {
-        Self::read_fut(&mut AsyncishRead(stream)).wait()
-    }
-
-    /// Read startup message from the stream.
-    // XXX: It's tempting yet undesirable to accept `stream` by value,
-    // since such a change will cause user-supplied &mut references to be consumed
-    pub fn read_fut<Reader>(
-        stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = Result<Option<FeMessage>, ConnectionError>> + '_>
-    where
-        Reader: tokio::io::AsyncRead + Unpin,
-    {
+    /// Read and parse startup message from the `buf` input buffer. It is
+    /// different from [`FeMessage::parse`] because startup messages don't have
+    /// message type byte; otherwise, its comments apply.
+    pub fn parse(buf: &mut BytesMut) -> Result<Option<FeStartupPacket>, ProtocolError> {
        const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
        const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234;
        const CANCEL_REQUEST_CODE: u32 = 5678;
        const NEGOTIATE_SSL_CODE: u32 = 5679;
        const NEGOTIATE_GSS_CODE: u32 = 5680;

-        SyncFuture::new(async move {
-            // Read length. If the connection is closed before reading anything (or before
-            // reading 4 bytes, to be precise), return None to indicate that the connection
-            // was closed. This matches the PostgreSQL server's behavior, which avoids noise
-            // in the log if the client opens connection but closes it immediately.
-            let len = match retry_read!(stream.read_u32().await) {
-                Ok(len) => len as usize,
-                Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
-                Err(e) => return Err(ConnectionError::Socket(e)),
-            };
+        // need at least 4 bytes with packet len
+        if buf.len() < 4 {
+            let to_read = 4 - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }

-            #[allow(clippy::manual_range_contains)]
-            if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
-                return Err(ConnectionError::Protocol(format!(
-                    "invalid message length {len}"
+        // Peek at the length without advancing `buf`, as the full message may
+        // not be there yet. A valid packet holds at least the 4-byte length and
+        // the 4-byte request code; a shorter `len` would make `get_u32` panic.
+        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
+        if !(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len) {
+            return Err(ProtocolError::Protocol(format!(
+                "invalid startup packet message length {len}"
+            )));
+        }
+
+        if buf.len() < len {
+            // Don't have full message yet.
+            let to_read = len - buf.len();
+            buf.reserve(to_read);
+            return Ok(None);
+        }
+
+        // got the message, advance buffer
+        let mut msg = buf.split_to(len).freeze();
+        msg.advance(4); // consume len
+
+        let request_code = msg.get_u32();
+        let req_hi = request_code >> 16;
+        let req_lo = request_code & ((1 << 16) - 1);
+        // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code.
+        let message = match (req_hi, req_lo) {
+            (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
+                if msg.remaining() != 8 {
+                    return Err(ProtocolError::BadMessage(
+                        "CancelRequest message is malformed, backend PID / secret key missing"
+                            .to_owned(),
+                    ));
+                }
+                FeStartupPacket::CancelRequest(CancelKeyData {
+                    backend_pid: msg.get_i32(),
+                    cancel_key: msg.get_i32(),
+                })
+            }
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
+                // Requested upgrade to SSL (aka TLS)
+                FeStartupPacket::SslRequest
+            }
+            (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
+                // Requested upgrade to GSSAPI
+                FeStartupPacket::GssEncRequest
+            }
+            (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
+                return Err(ProtocolError::Protocol(format!(
+                    "Unrecognized request code {unrecognized_code}"
                )));
            }
+            // TODO bail if protocol major_version is not 3?
+            (major_version, minor_version) => {
+                // StartupMessage

-            let request_code =
-                retry_read!(stream.read_u32().await).map_err(ConnectionError::Socket)?;
+                // Parse pairs of null-terminated strings (key, value).
+                // See `postgres: ProcessStartupPacket, build_startup_packet`.
+                let mut tokens = str::from_utf8(&msg)
+                    .map_err(|_e| {
+                        ProtocolError::BadMessage("StartupMessage params: invalid utf-8".to_owned())
+                    })?
+                    .strip_suffix('\0') // drop packet's own null
+                    .ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: missing null terminator".to_string(),
+                        )
+                    })?
+                    .split_terminator('\0');

-            // the rest of startup packet are params
-            let params_len = len - 8;
-            let mut params_bytes = vec![0u8; params_len];
-            stream
-                .read_exact(params_bytes.as_mut())
-                .await
-                .map_err(ConnectionError::Socket)?;
+                let mut params = HashMap::new();
+                while let Some(name) = tokens.next() {
+                    let value = tokens.next().ok_or_else(|| {
+                        ProtocolError::Protocol(
+                            "StartupMessage params: key without value".to_string(),
+                        )
+                    })?;

-            // Parse params depending on request code
-            let req_hi = request_code >> 16;
-            let req_lo = request_code & ((1 << 16) - 1);
-            let message = match (req_hi, req_lo) {
-                (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
-                    if params_len != 8 {
-                        return Err(ConnectionError::Protocol(
-                            "expected 8 bytes for CancelRequest params".to_string(),
-                        ));
-                    }
-                    let mut cursor = Cursor::new(params_bytes);
-                    FeStartupPacket::CancelRequest(CancelKeyData {
-                        backend_pid: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
-                        cancel_key: cursor.read_i32().await.map_err(ConnectionError::Socket)?,
-                    })
+                    params.insert(name.to_owned(), value.to_owned());
                }
-                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
-                    // Requested upgrade to SSL (aka TLS)
-                    FeStartupPacket::SslRequest
-                }
-                (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
-                    // Requested upgrade to GSSAPI
-                    FeStartupPacket::GssEncRequest
-                }
-                (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
-                    return Err(ConnectionError::Protocol(format!(
-                        "Unrecognized request code {unrecognized_code}"
-                    )));
-                }
-                // TODO bail if protocol major_version is not 3?
-                (major_version, minor_version) => {
-                    // Parse pairs of null-terminated strings (key, value).
-                    // See `postgres: ProcessStartupPacket, build_startup_packet`.
-                    let mut tokens = str::from_utf8(&params_bytes)
-                        .context("StartupMessage params: invalid utf-8")?
-                        .strip_suffix('\0') // drop packet's own null
-                        .ok_or_else(|| {
-                            ConnectionError::Protocol(
-                                "StartupMessage params: missing null terminator".to_string(),
-                            )
-                        })?
-                        .split_terminator('\0');

-                    let mut params = HashMap::new();
-                    while let Some(name) = tokens.next() {
-                        let value = tokens.next().ok_or_else(|| {
-                            ConnectionError::Protocol(
-                                "StartupMessage params: key without value".to_string(),
-                            )
-                        })?;
-
-                        params.insert(name.to_owned(), value.to_owned());
-                    }
-
-                    FeStartupPacket::StartupMessage {
-                        major_version,
-                        minor_version,
-                        params: StartupMessageParams { params },
-                    }
+                FeStartupPacket::StartupMessage {
+                    major_version,
+                    minor_version,
+                    params: StartupMessageParams { params },
                }
-            };
-
-            Ok(Some(FeMessage::StartupPacket(message)))
-        })
+            }
+        };
+        Ok(Some(message))
    }
 }

 impl FeParseMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        // FIXME: the rust-postgres driver uses a named prepared statement
        // for copy_out(). We're not prepared to handle that correctly. For
        // now, just ignore the statement name, assuming that the client never
@@ -452,55 +390,82 @@ impl FeParseMessage {

        let _pstmt_name = read_cstr(&mut buf)?;
        let query_string = read_cstr(&mut buf)?;
+        if buf.remaining() < 2 {
+            return Err(ProtocolError::BadMessage(
+                "Parse message is malformed, nparams missing".to_string(),
+            ));
+        }
        let nparams = buf.get_i16();

-        ensure!(nparams == 0, "query params not implemented");
+        if nparams != 0 {
+            return Err(ProtocolError::BadMessage(
+                "query params not implemented".to_string(),
+            ));
+        }

        Ok(FeMessage::Parse(FeParseMessage { query_string }))
    }
 }

 impl FeDescribeMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        let kind = buf.get_u8();
        let _pstmt_name = read_cstr(&mut buf)?;

        // FIXME: see FeParseMessage::parse
-        ensure!(
-            kind == b'S',
-            "only prepared statemement Describe is implemented"
-        );
+        if kind != b'S' {
+            return Err(ProtocolError::BadMessage(
+                "only prepared statemement Describe is implemented".to_string(),
+            ));
+        }

        Ok(FeMessage::Describe(FeDescribeMessage { kind }))
    }
 }

 impl FeExecuteMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        let portal_name = read_cstr(&mut buf)?;
+        if buf.remaining() < 4 {
+            return Err(ProtocolError::BadMessage(
+                "FeExecuteMessage message is malformed, maxrows missing".to_string(),
+            ));
+        }
        let maxrows = buf.get_i32();

-        ensure!(portal_name.is_empty(), "named portals not implemented");
-        ensure!(maxrows == 0, "row limit in Execute message not implemented");
+        if !portal_name.is_empty() {
+            return Err(ProtocolError::BadMessage(
+                "named portals not implemented".to_string(),
+            ));
+        }
+        if maxrows != 0 {
+            return Err(ProtocolError::BadMessage(
+                "row limit in Execute message not implemented".to_string(),
+            ));
+        }

        Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
    }
 }

 impl FeBindMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        let portal_name = read_cstr(&mut buf)?;
        let _pstmt_name = read_cstr(&mut buf)?;

        // FIXME: see FeParseMessage::parse
-        ensure!(portal_name.is_empty(), "named portals not implemented");
+        if !portal_name.is_empty() {
+            return Err(ProtocolError::BadMessage(
+                "named portals not implemented".to_string(),
+            ));
+        }

        Ok(FeMessage::Bind(FeBindMessage))
    }
 }

 impl FeCloseMessage {
-    fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
+    fn parse(mut buf: Bytes) -> Result<FeMessage, ProtocolError> {
        let _kind = buf.get_u8();
        let _pstmt_or_portal_name = read_cstr(&mut buf)?;

@@ -529,6 +494,7 @@ pub enum BeMessage<'a> {
    CloseComplete,
    // None means column is NULL
    DataRow(&'a [Option<&'a [u8]>]),
+    // None errcode means internal_error will be sent.
    ErrorResponse(&'a str, Option<&'a [u8; 5]>),
    /// Single byte - used in response to SSLRequest/GSSENCRequest.
    EncryptionResponse(bool),
@@ -559,6 +525,11 @@ impl<'a> BeMessage<'a> {
        value: b"UTF8",
    };

+    pub const INTEGER_DATETIMES: Self = Self::ParameterStatus {
+        name: b"integer_datetimes",
+        value: b"on",
+    };
+
    /// Build a [`BeMessage::ParameterStatus`] holding the server version.
    pub fn server_version(version: &'a str) -> Self {
        Self::ParameterStatus {
@@ -637,7 +608,7 @@ impl RowDescriptor<'_> {
 #[derive(Debug)]
 pub struct XLogDataBody<'a> {
    pub wal_start: u64,
-    pub wal_end: u64,
+    pub wal_end: u64, // current end of WAL on the server
    pub timestamp: i64,
    pub data: &'a [u8],
 }
@@ -677,12 +648,11 @@ fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
 }

 /// Safe write of s into buf as cstring (String in the protocol).
-fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
+fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> Result<(), ProtocolError> {
    let bytes = s.as_ref();
    if bytes.contains(&0) {
-        return Err(io::Error::new(
-            io::ErrorKind::InvalidInput,
-            "string contains embedded null",
+        return Err(ProtocolError::BadMessage(
+            "string contains embedded null".to_owned(),
        ));
    }
    buf.put_slice(bytes);
@@ -690,22 +660,27 @@ fn write_cstr(s: impl AsRef<[u8]>, buf: &mut BytesMut) -> io::Result<()> {
    Ok(())
 }

-fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
-    let pos = buf.iter().position(|x| *x == 0);
-    let result = buf.split_to(pos.context("missing terminator")?);
+/// Read cstring from buf, advancing it.
+fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
+    let pos = buf
+        .iter()
+        .position(|x| *x == 0)
+        .ok_or_else(|| ProtocolError::BadMessage("missing cstring terminator".to_owned()))?;
+    let result = buf.split_to(pos);
    buf.advance(1); // drop the null terminator
    Ok(result)
 }

 pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
+pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";

 impl<'a> BeMessage<'a> {
-    /// Write message to the given buf.
-    // Unlike the reading side, we use BytesMut
-    // here as msg len precedes its body and it is handy to write it down first
-    // and then fill the length. With Write we would have to either calc it
-    // manually or have one more buffer.
-    pub fn write(buf: &mut BytesMut, message: &BeMessage) -> io::Result<()> {
+    /// Serialize `message` to the given `buf`.
+    /// Apart from smart memory management, BytesMut is good here as msg len
+    /// precedes its body and it is handy to write it down first and then fill
+    /// the length. With Write we would have to either calc it manually or have
+    /// one more buffer.
+    pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> {
        match message {
            BeMessage::AuthenticationOk => {
                buf.put_u8(b'R');
@@ -750,7 +725,7 @@ impl<'a> BeMessage<'a> {
                            buf.put_slice(extra);
                        }
                    }
-                    Ok::<_, io::Error>(())
+                    Ok(())
                })?;
            }

@@ -854,7 +829,7 @@ impl<'a> BeMessage<'a> {
                    write_cstr(error_msg, buf)?;

                    buf.put_u8(0); // terminator
-                    Ok::<_, io::Error>(())
+                    Ok(())
                })?;
            }

@@ -877,7 +852,7 @@ impl<'a> BeMessage<'a> {
                    write_cstr(error_msg.as_bytes(), buf)?;

                    buf.put_u8(0); // terminator
-                    Ok::<_, io::Error>(())
+                    Ok(())
                })?;
            }

@@ -932,7 +907,7 @@ impl<'a> BeMessage<'a> {
                        buf.put_i32(-1); /* typmod */
                        buf.put_i16(0); /* format code */
                    }
-                    Ok::<_, io::Error>(())
+                    Ok(())
                })?;
            }

@@ -999,7 +974,7 @@ impl ReplicationFeedback {
    // null-terminated string - key,
    // uint32 - value length in bytes
    // value itself
-    pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
+    pub fn serialize(&self, buf: &mut BytesMut) {
        buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
@@ -1024,7 +999,6 @@ impl ReplicationFeedback {
        buf.put_slice(b"ps_replytime\0");
        buf.put_i32(8);
        buf.put_i64(timestamp);
-        Ok(())
    }

    // Deserialize ReplicationFeedback message
@@ -1092,7 +1066,7 @@ mod tests {
        // because it is rounded up to microseconds during serialization.
        rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        let mut data = BytesMut::new();
-        rf.serialize(&mut data).unwrap();
+        rf.serialize(&mut data);

        let rf_parsed = ReplicationFeedback::parse(data.freeze());
        assert_eq!(rf, rf_parsed);
@@ -1107,7 +1081,7 @@ mod tests {
        // because it is rounded up to microseconds during serialization.
        rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
        let mut data = BytesMut::new();
-        rf.serialize(&mut data).unwrap();
+        rf.serialize(&mut data);

        // Add an extra field to the buffer and adjust number of keys
        if let Some(first) = data.first_mut() {
@@ -1149,15 +1123,6 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
-
-    // Make sure that `read` is sync/async callable
-    async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) {
-        let _ = FeMessage::read(&mut [].as_ref());
-        let _ = FeMessage::read_fut(stream).await;
-
-        let _ = FeStartupPacket::read(&mut [].as_ref());
-        let _ = FeStartupPacket::read_fut(stream).await;
-    }
 }

 fn terminate_code(code: &[u8; 5]) -> [u8; 6] {
--- a/libs/pq_proto/src/sync.rs
+++ b/libs/pq_proto/src/sync.rs
@@ -1,179 +0,0 @@
-use pin_project_lite::pin_project;
-use std::future::Future;
-use std::marker::PhantomData;
-use std::pin::Pin;
-use std::{io, task};
-
-pin_project! {
-    /// We use this future to mark certain methods
-    /// as callable in both sync and async modes.
-    #[repr(transparent)]
-    pub struct SyncFuture<S, T: Future> {
-        #[pin]
-        inner: T,
-        _marker: PhantomData<S>,
-    }
-}
-
-/// This wrapper lets us synchronously wait for inner future's completion
-/// (see [`SyncFuture::wait`]) **provided that `S` implements [`SyncProof`]**.
-/// For instance, `S` may be substituted with types implementing
-/// [`tokio::io::AsyncRead`], but it's not the only viable option.
-impl<S, T: Future> SyncFuture<S, T> {
-    /// NOTE: caller should carefully pick a type for `S`,
-    /// because we don't want to enable [`SyncFuture::wait`] when
-    /// it's in fact impossible to run the future synchronously.
-    /// Violation of this contract will not cause UB, but
-    /// panics and async event loop freezes won't please you.
-    ///
-    /// Example:
-    ///
-    /// ```
-    /// # use pq_proto::sync::SyncFuture;
-    /// # use std::future::Future;
-    /// # use tokio::io::AsyncReadExt;
-    /// #
-    /// // Parse a pair of numbers from a stream
-    /// pub fn parse_pair<Reader>(
-    ///     stream: &mut Reader,
-    /// ) -> SyncFuture<Reader, impl Future<Output = anyhow::Result<(u32, u64)>> + '_>
-    /// where
-    ///     Reader: tokio::io::AsyncRead + Unpin,
-    /// {
-    ///     // If `Reader` is a `SyncProof`, this will give caller
-    ///     // an opportunity to use `SyncFuture::wait`, because
-    ///     // `.await` will always result in `Poll::Ready`.
-    ///     SyncFuture::new(async move {
-    ///         let x = stream.read_u32().await?;
-    ///         let y = stream.read_u64().await?;
-    ///         Ok((x, y))
-    ///     })
-    /// }
-    /// ```
-    pub fn new(inner: T) -> Self {
-        Self {
-            inner,
-            _marker: PhantomData,
-        }
-    }
-}
-
-impl<S, T: Future> Future for SyncFuture<S, T> {
-    type Output = T::Output;
-
-    /// In async code, [`SyncFuture`] behaves like a regular wrapper.
-    #[inline(always)]
-    fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll<Self::Output> {
-        self.project().inner.poll(cx)
-    }
-}
-
-/// Postulates that we can call [`SyncFuture::wait`].
-/// If implementer is also a [`Future`], it should always
-/// return [`task::Poll::Ready`] from [`Future::poll`].
-///
-/// Each implementation should document which futures
-/// specifically are being declared sync-proof.
-pub trait SyncPostulate {}
-
-impl<T: SyncPostulate> SyncPostulate for &T {}
-impl<T: SyncPostulate> SyncPostulate for &mut T {}
-
-impl<P: SyncPostulate, T: Future> SyncFuture<P, T> {
-    /// Synchronously wait for future completion.
-    pub fn wait(mut self) -> T::Output {
-        const RAW_WAKER: task::RawWaker = task::RawWaker::new(
-            std::ptr::null(),
-            &task::RawWakerVTable::new(
-                |_| RAW_WAKER,
-                |_| panic!("SyncFuture: failed to wake"),
-                |_| panic!("SyncFuture: failed to wake by ref"),
-                |_| { /* drop is no-op */ },
-            ),
-        );
-
-        // SAFETY: We never move `self` during this call;
-        // furthermore, it will be dropped in the end regardless of panics
-        let this = unsafe { Pin::new_unchecked(&mut self) };
-
-        // SAFETY: This waker doesn't do anything apart from panicking
-        let waker = unsafe { task::Waker::from_raw(RAW_WAKER) };
-        let context = &mut task::Context::from_waker(&waker);
-
-        match this.poll(context) {
-            task::Poll::Ready(res) => res,
-            _ => panic!("SyncFuture: unexpected pending!"),
-        }
-    }
-}
-
-/// This wrapper turns any [`std::io::Read`] into a blocking [`tokio::io::AsyncRead`],
-/// which lets us abstract over sync & async readers in methods returning [`SyncFuture`].
-/// NOTE: you **should not** use this in async code.
-#[repr(transparent)]
-pub struct AsyncishRead<T: io::Read + Unpin>(pub T);
-
-/// This lets us call [`SyncFuture<AsyncishRead<_>, _>::wait`],
-/// and allows the future to await on any of the [`AsyncRead`]
-/// and [`AsyncReadExt`] methods on `AsyncishRead`.
-impl<T: io::Read + Unpin> SyncPostulate for AsyncishRead<T> {}
-
-impl<T: io::Read + Unpin> tokio::io::AsyncRead for AsyncishRead<T> {
-    #[inline(always)]
-    fn poll_read(
-        mut self: Pin<&mut Self>,
-        _cx: &mut task::Context<'_>,
-        buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> task::Poll<io::Result<()>> {
-        task::Poll::Ready(
-            // `Read::read` will block, meaning we don't need a real event loop!
-            self.0
-                .read(buf.initialize_unfilled())
-                .map(|sz| buf.advance(sz)),
-        )
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use tokio::io::{AsyncReadExt, AsyncWriteExt};
-
-    // async helper(stream: &mut impl AsyncRead) -> io::Result<u32>
-    fn bytes_add<Reader>(
-        stream: &mut Reader,
-    ) -> SyncFuture<Reader, impl Future<Output = io::Result<u32>> + '_>
-    where
-        Reader: tokio::io::AsyncRead + Unpin,
-    {
-        SyncFuture::new(async move {
-            let a = stream.read_u32().await?;
-            let b = stream.read_u32().await?;
-            Ok(a + b)
-        })
-    }
-
-    #[test]
-    fn test_sync() {
-        let bytes = [100u32.to_be_bytes(), 200u32.to_be_bytes()].concat();
-        let res = bytes_add(&mut AsyncishRead(&mut &bytes[..]))
-            .wait()
-            .unwrap();
-        assert_eq!(res, 300);
-    }
-
-    // We need a single-threaded executor for this test
-    #[tokio::test(flavor = "current_thread")]
-    async fn test_async() {
-        let (mut tx, mut rx) = tokio::net::UnixStream::pair().unwrap();
-
-        let write = async move {
-            tx.write_u32(100).await?;
-            tx.write_u32(200).await?;
-            Ok(())
-        };
-
-        let (res, ()) = tokio::try_join!(bytes_add(&mut rx), write).unwrap();
-        assert_eq!(res, 300);
-    }
-}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -111,7 +111,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }

 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
+    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -12,41 +12,37 @@ anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
 heapless.workspace = true
+hex = { workspace = true, features = ["serde"] }
 hyper = { workspace = true, features = ["full"] }
+futures = { workspace = true}
+jsonwebtoken.workspace = true
+nix.workspace = true
+once_cell.workspace = true
+pin-project-lite.workspace = true
 routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
+signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
-tokio-rustls.workspace = true
 tracing.workspace = true
 tracing-subscriber = { workspace = true, features = ["json"] }
-nix.workspace = true
-signal-hook.workspace = true
 rand.workspace = true
-jsonwebtoken.workspace = true
-hex = { workspace = true, features = ["serde"] }
-rustls.workspace = true
-rustls-split.workspace = true
-git-version.workspace = true
 serde_with.workspace = true
-once_cell.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+url.workspace = true
+uuid = { version = "1.2", features = ["v4", "serde"] }

 metrics.workspace = true
-pq_proto.workspace = true
-
 workspace_hack.workspace = true
-url.workspace = true

 [dev-dependencies]
 byteorder.workspace = true
 bytes.workspace = true
+criterion.workspace = true
 hex-literal.workspace = true
 tempfile.workspace = true
-criterion.workspace = true
-rustls-pemfile.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/approx_accurate.rs
+++ b/libs/utils/src/approx_accurate.rs
@@ -0,0 +1,51 @@
+/// Three-state `max` accumulator.
+///
+/// Accumulating over 0 or more `Some(T)` values yields the `Accurate` maximum of those values.
+/// Once a single `None` value is merged, it becomes the `Approximate` variant and stays so.
+///
+/// Remove when `Layer::file_size` is no longer an `Option`.
+#[derive(Default, Debug)]
+pub enum ApproxAccurate<T> {
+    Approximate(T),
+    Accurate(T),
+    #[default]
+    Empty,
+}
+
+impl<T: Ord + Copy + Default> ApproxAccurate<T> {
+    /// `max(a, b)`, except that receiving a `None` makes the result `Approximate` from then on.
+    #[must_use]
+    pub fn max(self, next: Option<T>) -> ApproxAccurate<T> {
+        use ApproxAccurate::*;
+        match (self, next) {
+            (Accurate(a) | Approximate(a), None) => Approximate(a),
+            (Empty, None) => Approximate(T::default()),
+            (Accurate(a), Some(b)) => Accurate(a.max(b)),
+            (Approximate(a), Some(b)) => Approximate(a.max(b)),
+            (Empty, Some(b)) => Accurate(b),
+        }
+    }
+
+    pub fn is_approximate(&self) -> bool {
+        matches!(self, ApproxAccurate::Approximate(_))
+    }
+
+    pub fn accurate(self) -> Option<T> {
+        use ApproxAccurate::*;
+        match self {
+            Accurate(a) => Some(a),
+            Empty => Some(T::default()),
+            Approximate(_) => None,
+        }
+    }
+
+    pub fn unwrap_accurate_or(self, default: T) -> T {
+        use ApproxAccurate::*;
+        match self {
+            Accurate(a) => a,
+            Approximate(_) => default,
+            // Empty is still accurate, just special case for above `max`
+            Empty => T::default(),
+        }
+    }
+}
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,4 @@
 // For details about authentication see docs/authentication.md
-//
-// TODO: use ed25519 keys
-// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162

 use serde;
 use std::fs;
@@ -16,9 +13,10 @@ use serde_with::{serde_as, DisplayFromStr};

 use crate::id::TenantId;

-const JWT_ALGORITHM: Algorithm = Algorithm::RS256;
+/// Algorithm to use. We require EdDSA.
+const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;

-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 #[serde(rename_all = "lowercase")]
 pub enum Scope {
    // Provides access to all data for a specific tenant (specified in `struct Claims` below)
@@ -33,8 +31,9 @@ pub enum Scope {
    SafekeeperData,
 }

+/// JWT payload. See docs/authentication.md for the format
 #[serde_as]
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
    #[serde(default)]
    #[serde_as(as = "Option<DisplayFromStr>")]
@@ -55,7 +54,8 @@ pub struct JwtAuth {

 impl JwtAuth {
    pub fn new(decoding_key: DecodingKey) -> Self {
-        let mut validation = Validation::new(JWT_ALGORITHM);
+        let mut validation = Validation::default();
+        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
@@ -67,7 +67,7 @@ impl JwtAuth {

    pub fn from_key_path(key_path: &Path) -> Result<Self> {
        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_rsa_pem(&public_key)?))
+        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }

    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
@@ -85,6 +85,75 @@ impl std::fmt::Debug for JwtAuth {

 // this function is used only for testing purposes in CLI e g generate tokens during init
 pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
-    let key = EncodingKey::from_rsa_pem(key_data)?;
-    Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
+    let key = EncodingKey::from_ed_pem(key_data)?;
+    Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::str::FromStr;
+
+    // Generated with:
+    //
+    // openssl genpkey -algorithm ed25519 -out ed25519-priv.pem
+    // openssl pkey -in ed25519-priv.pem -pubout -out ed25519-pub.pem
+    const TEST_PUB_KEY_ED25519: &[u8] = br#"
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
+-----END PUBLIC KEY-----
+"#;
+
+    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
+-----END PRIVATE KEY-----
+"#;
+
+    #[test]
+    fn test_decode() -> Result<(), anyhow::Error> {
+        let expected_claims = Claims {
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            scope: Scope::Tenant,
+        };
+
+        // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
+        //
+        // ```
+        // {
+        //   "scope": "tenant",
+        //   "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
+        //   "iss": "neon.controlplane",
+        //   "exp": 1709200879,
+        //   "iat": 1678442479
+        // }
+        // ```
+        //
+        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";
+
+        // Check it can be validated with the public key
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        assert_eq!(claims_from_token, expected_claims);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_encode() -> Result<(), anyhow::Error> {
+        let claims = Claims {
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            scope: Scope::Tenant,
+        };
+
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+
+        // decode it back
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let decoded = auth.decode(&encoded)?;
+
+        assert_eq!(decoded.claims, claims);
+
+        Ok(())
+    }
 }
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -11,7 +11,7 @@ where
    P: AsRef<Path>,
 {
    fn is_empty_dir(&self) -> io::Result<bool> {
-        Ok(fs::read_dir(self)?.into_iter().next().is_none())
+        Ok(fs::read_dir(self)?.next().is_none())
    }
 }

--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -3,15 +3,14 @@ use crate::http::error;
 use anyhow::{anyhow, Context};
 use hyper::header::{HeaderName, AUTHORIZATION};
 use hyper::http::HeaderValue;
+use hyper::Method;
 use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
-use hyper::{Method, StatusCode};
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
 use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
-use routerify::RequestInfo;
-use routerify::{Middleware, Router, RouterBuilder, RouterService};
+use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
 use tokio::task::JoinError;
-use tracing;
+use tracing::{self, debug, info, info_span, warn, Instrument};

 use std::future::Future;
 use std::net::TcpListener;
@@ -27,16 +26,83 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
-    // cannot factor out the Level to avoid the repetition
-    // because tracing can only work with const Level
-    // which is not the case here
-    if info.method() == Method::GET && res.status() == StatusCode::OK {
-        tracing::debug!("{} {} {}", info.method(), info.uri().path(), res.status());
-    } else {
-        tracing::info!("{} {} {}", info.method(), info.uri().path(), res.status());
+static X_REQUEST_ID_HEADER_STR: &str = "x-request-id";
+
+static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HEADER_STR);
+#[derive(Debug, Default, Clone)]
+struct RequestId(String);
+
+/// Adds a tracing `info_span!` instrumentation around the handler events and logs the request start and end events:
+/// at `info` level for non-GET requests and non-successful responses, at `debug` level otherwise.
+///
+/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
+/// in this type will get request info logged in the wrapping span, including the unique request ID.
+///
+/// There could be other ways to implement similar functionality:
+///
+/// * procmacros placed on top of all handler methods
+/// With all the drawbacks of procmacros, brings no difference implementation-wise,
+/// and little code reduction compared to the existing approach.
+///
+/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
+/// implemented for [`RouterBuilder`].
+/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
+///
+/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
+/// later, in a post-response middleware.
+/// Due to the suspendable nature of futures, this would give contradictory results, which is exactly the opposite of what `tracing-futures`
+/// tries to achieve with its `.instrument` used in the current approach.
+///
+/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
+pub struct RequestSpan<E, R, H>(pub H)
+where
+    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
+    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
+    H: Fn(Request<Body>) -> R + Send + Sync + 'static;
+
+impl<E, R, H> RequestSpan<E, R, H>
+where
+    E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
+    R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
+    H: Fn(Request<Body>) -> R + Send + Sync + 'static,
+{
+    /// Creates a tracing span around inner request handler and executes the request handler in the context of that span.
+    /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
+    pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
+        let request_id = request.context::<RequestId>().unwrap_or_default().0;
+        let method = request.method();
+        let path = request.uri().path();
+        let request_span = info_span!("request", %method, %path, %request_id);
+
+        let log_quietly = method == Method::GET;
+        async move {
+            if log_quietly {
+                debug!("Handling request");
+            } else {
+                info!("Handling request");
+            }
+
+            // Note that we reuse `error::handler` here and do not return an error at all,
+            // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
+            // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
+            //
+            // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
+            match (self.0)(request).await {
+                Ok(response) => {
+                    let response_status = response.status();
+                    if log_quietly && response_status.is_success() {
+                        debug!("Request handled, status: {response_status}");
+                    } else {
+                        info!("Request handled, status: {response_status}");
+                    }
+                    Ok(response)
+                }
+                Err(e) => Ok(error::handler(e.into()).await),
+            }
+        }
+        .instrument(request_span)
+        .await
     }
-    Ok(res)
 }

 async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -63,10 +129,48 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
    Ok(response)
 }

+/// Pre-request middleware: stores a `RequestId` in the request context.
+pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
+) -> Middleware<B, ApiError> {
+    Middleware::pre(move |req| async move {
+        // The header is client-controlled and `to_str` rejects values that are not
+        // visible ASCII, so such a value must not panic the handler: treat it like
+        // a missing header and generate a fresh UUID instead.
+        let request_id = req
+            .headers()
+            .get(&X_REQUEST_ID_HEADER)
+            .and_then(|value| value.to_str().ok())
+            .map(str::to_owned)
+            .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
+        req.set_context(RequestId(request_id));
+
+        Ok(req)
+    })
+}
+
+/// Post-response middleware: sets the `x-request-id` response header from the `RequestId` in the request context, if any.
+async fn add_request_id_header_to_response(
+    mut res: Response<Body>,
+    req_info: RequestInfo,
+) -> Result<Response<Body>, ApiError> {
+    let header_value = req_info
+        .context::<RequestId>()
+        .and_then(|request_id| HeaderValue::from_str(&request_id.0).ok());
+    if let Some(header_value) = header_value {
+        res.headers_mut().insert(&X_REQUEST_ID_HEADER, header_value);
+    }
+    Ok(res)
+}
+
 pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
    Router::builder()
-        .middleware(Middleware::post_with_info(logger))
-        .get("/metrics", prometheus_metrics_handler)
+        .middleware(add_request_id_middleware())
+        .middleware(Middleware::post_with_info(
+            add_request_id_header_to_response,
+        ))
+        .get("/metrics", |r| {
+            RequestSpan(prometheus_metrics_handler).handle(r)
+        })
        .err_handler(error::handler)
 }

@@ -76,40 +180,43 @@ pub fn attach_openapi_ui(
    spec_mount_path: &'static str,
    ui_mount_path: &'static str,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    router_builder.get(spec_mount_path, move |_| async move {
-        Ok(Response::builder().body(Body::from(spec)).unwrap())
-    }).get(ui_mount_path, move |_| async move {
-        Ok(Response::builder().body(Body::from(format!(r#"
-            <!DOCTYPE html>
-            <html lang="en">
-            <head>
-            <title>rweb</title>
-            <link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
-            </head>
-            <body>
-                <div id="swagger-ui"></div>
-                <script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
-                <script>
-                    window.onload = function() {{
-                    const ui = SwaggerUIBundle({{
-                        "dom_id": "\#swagger-ui",
-                        presets: [
-                        SwaggerUIBundle.presets.apis,
-                        SwaggerUIBundle.SwaggerUIStandalonePreset
-                        ],
-                        layout: "BaseLayout",
-                        deepLinking: true,
-                        showExtensions: true,
-                        showCommonExtensions: true,
-                        url: "{}",
-                    }})
-                    window.ui = ui;
-                }};
-            </script>
-            </body>
-            </html>
-        "#, spec_mount_path))).unwrap())
-    })
+    router_builder
+        .get(spec_mount_path, move |r| {
+            RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
+                .handle(r)
+        })
+        .get(ui_mount_path, move |r| RequestSpan( move |_| async move {
+            Ok(Response::builder().body(Body::from(format!(r#"
+                <!DOCTYPE html>
+                <html lang="en">
+                <head>
+                <title>rweb</title>
+                <link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
+                </head>
+                <body>
+                    <div id="swagger-ui"></div>
+                    <script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
+                    <script>
+                        window.onload = function() {{
+                        const ui = SwaggerUIBundle({{
+                            "dom_id": "\#swagger-ui",
+                            presets: [
+                            SwaggerUIBundle.presets.apis,
+                            SwaggerUIBundle.SwaggerUIStandalonePreset
+                            ],
+                            layout: "BaseLayout",
+                            deepLinking: true,
+                            showExtensions: true,
+                            showCommonExtensions: true,
+                            url: "{}",
+                        }})
+                        window.ui = ui;
+                    }};
+                </script>
+                </body>
+                </html>
+            "#, spec_mount_path))).unwrap())
+        }).handle(r))
 }

 fn parse_token(header_value: &str) -> Result<&str, ApiError> {
@@ -171,7 +278,7 @@ where
            async move {
                let headers = response.headers_mut();
                if headers.contains_key(&name) {
-                    tracing::warn!(
+                    warn!(
                        "{} response already contains header {:?}",
                        request_info.uri(),
                        &name,
@@ -211,7 +318,7 @@ pub fn serve_thread_main<S>(
 where
    S: Future<Output = ()> + Send + Sync,
 {
-    tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
+    info!("Starting an HTTP endpoint at {}", listener.local_addr()?);

    // Create a Service from the router above to handle incoming requests.
    let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
@@ -231,3 +338,48 @@ where

    Ok(())
 }
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use futures::future::poll_fn;
+    use hyper::service::Service;
+    use routerify::RequestServiceBuilder;
+    use std::net::{IpAddr, SocketAddr};
+
+    /// Builds a request service from `make_router()`, waits for it to become ready
+    /// and returns its response to `req`. Panics if any of these steps fails.
+    async fn send(req: Request<Body>) -> Response<hyper::body::Body> {
+        let router = make_router().build().unwrap();
+        let builder = RequestServiceBuilder::new(router).unwrap();
+        let peer = SocketAddr::new(IpAddr::from_str("127.0.0.1").unwrap(), 80);
+        let mut service = builder.build(peer);
+        if let Err(e) = poll_fn(|ctx| service.poll_ready(ctx)).await {
+            panic!("request service is not ready: {:?}", e);
+        }
+
+        service.call(req).await.unwrap()
+    }
+
+    /// An id supplied by the client in the request header is echoed in the response.
+    #[tokio::test]
+    async fn test_request_id_returned() {
+        let mut req: Request<Body> = Request::default();
+        req.headers_mut()
+            .append(&X_REQUEST_ID_HEADER, HeaderValue::from_str("42").unwrap());
+
+        let resp = send(req).await;
+
+        let header_val = resp.headers().get(&X_REQUEST_ID_HEADER).unwrap();
+        assert!(header_val == "42", "response header mismatch");
+    }
+
+    /// A request without the header still gets one in the response.
+    #[tokio::test]
+    async fn test_request_id_empty() {
+        let req: Request<Body> = Request::default();
+        let resp = send(req).await;
+
+        let header_val = resp.headers().get(&X_REQUEST_ID_HEADER);
+        assert_ne!(header_val, None, "response header should NOT be empty");
+    }
+}
--- a/libs/utils/src/http/json.rs
+++ b/libs/utils/src/http/json.rs
@@ -1,7 +1,9 @@
+use std::fmt::Display;
+
 use anyhow::Context;
 use bytes::Buf;
 use hyper::{header, Body, Request, Response, StatusCode};
-use serde::{Deserialize, Serialize};
+use serde::{Deserialize, Serialize, Serializer};

 use super::error::ApiError;

@@ -31,3 +33,12 @@ pub fn json_response<T: Serialize>(
        .map_err(|e| ApiError::InternalServerError(e.into()))?;
    Ok(response)
 }
+
+/// Serialize a value as a string, formatted through its [`Display`] implementation.
+///
+/// The signature has the shape serde expects from a `#[serde(serialize_with = "...")]`
+/// function. `collect_str` is used so that serializers which can stream `Display`
+/// output are not forced through an intermediate `String`.
+pub fn display_serialize<S: Serializer, F: Display>(z: &F, s: S) -> Result<S::Ok, S::Error> {
+    s.collect_str(z)
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -13,8 +13,6 @@ pub mod simple_rcu;
 pub mod vec_map;

 pub mod bin_ser;
-pub mod postgres_backend;
-pub mod postgres_backend_async;

 // helper functions for creating and fsyncing
 pub mod crashsafe;
@@ -27,9 +25,6 @@ pub mod id;
 // http endpoint utils
 pub mod http;

-// socket splitting utils
-pub mod sock_split;
-
 // common log initialisation routine
 pub mod logging;

@@ -38,6 +33,7 @@ pub mod pid_file;

 // Misc
 pub mod accum;
+pub mod approx_accurate;
 pub mod shutdown;

 // Utility for binding TcpListeners with proper socket options.
@@ -54,6 +50,8 @@ pub mod fs_ext;

 pub mod history_buffer;

+pub mod measured_stream;
+
 /// use with fail::cfg("$name", "return(2000)")
 #[macro_export]
 macro_rules! failpoint_sleep_millis_async {
--- a/libs/utils/src/measured_stream.rs
+++ b/libs/utils/src/measured_stream.rs
@@ -0,0 +1,77 @@
+use pin_project_lite::pin_project;
+use std::pin::Pin;
+use std::{io, task};
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+
+pin_project! {
+    /// Wraps a stream: passes the size of every successful read to `inc_read_count`, and accumulates
+    /// the bytes written, passing the total to `inc_write_count` on each successful flush.
+    pub struct MeasuredStream<S, R, W> {
+        #[pin]
+        stream: S,
+        write_count: usize,
+        inc_read_count: R,
+        inc_write_count: W,
+    }
+}
+
+impl<S, R, W> MeasuredStream<S, R, W> {
+    pub fn new(stream: S, inc_read_count: R, inc_write_count: W) -> Self {
+        Self {
+            stream,
+            write_count: 0,
+            inc_read_count,
+            inc_write_count,
+        }
+    }
+}
+
+/// Reading is forwarded to the inner stream; the number of bytes each successful
+/// read appended to `buf` is passed to `inc_read_count`. `S` need not be `Unpin`.
+impl<S: AsyncRead, R: FnMut(usize), W> AsyncRead for MeasuredStream<S, R, W> {
+    fn poll_read(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        let this = self.project();
+        let filled_before = buf.filled().len();
+        let poll = this.stream.poll_read(context, buf);
+        // Only a `Ready(Ok(()))` read is counted; pending and failed polls report nothing.
+        poll.map_ok(|()| (this.inc_read_count)(buf.filled().len() - filled_before))
+    }
+}
+
+impl<S: AsyncWrite, R, W: FnMut(usize)> AsyncWrite for MeasuredStream<S, R, W> {
+    /// Forwards the write and adds the number of bytes accepted to the pending write count.
+    fn poll_write(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+        buf: &[u8],
+    ) -> task::Poll<io::Result<usize>> {
+        let this = self.project();
+        this.stream.poll_write(context, buf).map_ok(|accepted| {
+            *this.write_count += accepted;
+            accepted
+        })
+    }
+
+    fn poll_flush(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        let this = self.project();
+        this.stream.poll_flush(context).map_ok(|()| {
+            // Flushed: hand the pending count to the callback and start again from zero.
+            (this.inc_write_count)(std::mem::take(this.write_count));
+        })
+    }
+
+    /// Forwards the shutdown as is; a pending write count is not reported from here.
+    fn poll_shutdown(
+        self: Pin<&mut Self>,
+        context: &mut task::Context<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        self.project().stream.poll_shutdown(context)
+    }
+}
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -1,485 +0,0 @@
-//! Server-side synchronous Postgres connection, as limited as we need.
-//! To use, create PostgresBackend and run() it, passing the Handler
-//! implementation determining how to process the queries. Currently its API
-//! is rather narrow, but we can extend it once required.
-
-use crate::postgres_backend_async::{log_query_error, short_error, QueryError};
-use crate::sock_split::{BidiStream, ReadStream, WriteStream};
-use anyhow::Context;
-use bytes::{Bytes, BytesMut};
-use pq_proto::{BeMessage, FeMessage, FeStartupPacket};
-use serde::{Deserialize, Serialize};
-use std::fmt;
-use std::io::{self, Write};
-use std::net::{Shutdown, SocketAddr, TcpStream};
-use std::str::FromStr;
-use std::sync::Arc;
-use std::time::Duration;
-use tracing::*;
-
-pub trait Handler {
-    /// Handle single query.
-    /// postgres_backend will issue ReadyForQuery after calling this (this
-    /// might be not what we want after CopyData streaming, but currently we don't
-    /// care).
-    fn process_query(
-        &mut self,
-        pgb: &mut PostgresBackend,
-        query_string: &str,
-    ) -> Result<(), QueryError>;
-
-    /// Called on startup packet receival, allows to process params.
-    ///
-    /// If Ok(false) is returned postgres_backend will skip auth -- that is needed for new users
-    /// creation is the proxy code. That is quite hacky and ad-hoc solution, may be we could allow
-    /// to override whole init logic in implementations.
-    fn startup(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _sm: &FeStartupPacket,
-    ) -> Result<(), QueryError> {
-        Ok(())
-    }
-
-    /// Check auth jwt
-    fn check_auth_jwt(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _jwt_response: &[u8],
-    ) -> Result<(), QueryError> {
-        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
-    }
-
-    fn is_shutdown_requested(&self) -> bool {
-        false
-    }
-}
-
-/// PostgresBackend protocol state.
-/// XXX: The order of the constructors matters.
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
-pub enum ProtoState {
-    Initialization,
-    Encrypted,
-    Authentication,
-    Established,
-}
-
-#[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
-pub enum AuthType {
-    Trust,
-    // This mimics postgres's AuthenticationCleartextPassword but instead of password expects JWT
-    NeonJWT,
-}
-
-impl FromStr for AuthType {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "Trust" => Ok(Self::Trust),
-            "NeonJWT" => Ok(Self::NeonJWT),
-            _ => anyhow::bail!("invalid value \"{s}\" for auth type"),
-        }
-    }
-}
-
-impl fmt::Display for AuthType {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.write_str(match self {
-            AuthType::Trust => "Trust",
-            AuthType::NeonJWT => "NeonJWT",
-        })
-    }
-}
-
-#[derive(Clone, Copy)]
-pub enum ProcessMsgResult {
-    Continue,
-    Break,
-}
-
-/// Always-writeable sock_split stream.
-/// May not be readable. See [`PostgresBackend::take_stream_in`]
-pub enum Stream {
-    Bidirectional(BidiStream),
-    WriteOnly(WriteStream),
-}
-
-impl Stream {
-    fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            Self::Bidirectional(bidi_stream) => bidi_stream.shutdown(how),
-            Self::WriteOnly(write_stream) => write_stream.shutdown(how),
-        }
-    }
-}
-
-impl io::Write for Stream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        match self {
-            Self::Bidirectional(bidi_stream) => bidi_stream.write(buf),
-            Self::WriteOnly(write_stream) => write_stream.write(buf),
-        }
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            Self::Bidirectional(bidi_stream) => bidi_stream.flush(),
-            Self::WriteOnly(write_stream) => write_stream.flush(),
-        }
-    }
-}
-
-pub struct PostgresBackend {
-    stream: Option<Stream>,
-    // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
-    buf_out: BytesMut,
-
-    pub state: ProtoState,
-
-    auth_type: AuthType,
-
-    peer_addr: SocketAddr,
-    pub tls_config: Option<Arc<rustls::ServerConfig>>,
-}
-
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut query_string = query_string.to_vec();
-    if let Some(ch) = query_string.last() {
-        if *ch == 0 {
-            query_string.pop();
-        }
-    }
-    query_string
-}
-
-// Helper function for socket read loops
-pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
-    for cause in error.chain() {
-        if let Some(io_error) = cause.downcast_ref::<io::Error>() {
-            if io_error.kind() == std::io::ErrorKind::WouldBlock {
-                return true;
-            }
-        }
-    }
-    false
-}
-
-// Cast a byte slice to a string slice, dropping null terminator if there's one.
-fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
-    let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
-    std::str::from_utf8(without_null).map_err(|e| e.into())
-}
-
-impl PostgresBackend {
-    pub fn new(
-        socket: TcpStream,
-        auth_type: AuthType,
-        tls_config: Option<Arc<rustls::ServerConfig>>,
-        set_read_timeout: bool,
-    ) -> io::Result<Self> {
-        let peer_addr = socket.peer_addr()?;
-        if set_read_timeout {
-            socket
-                .set_read_timeout(Some(Duration::from_secs(5)))
-                .unwrap();
-        }
-
-        Ok(Self {
-            stream: Some(Stream::Bidirectional(BidiStream::from_tcp(socket))),
-            buf_out: BytesMut::with_capacity(10 * 1024),
-            state: ProtoState::Initialization,
-            auth_type,
-            tls_config,
-            peer_addr,
-        })
-    }
-
-    pub fn into_stream(self) -> Stream {
-        self.stream.unwrap()
-    }
-
-    /// Get direct reference (into the Option) to the read stream.
-    fn get_stream_in(&mut self) -> anyhow::Result<&mut BidiStream> {
-        match &mut self.stream {
-            Some(Stream::Bidirectional(stream)) => Ok(stream),
-            _ => anyhow::bail!("reader taken"),
-        }
-    }
-
-    pub fn get_peer_addr(&self) -> &SocketAddr {
-        &self.peer_addr
-    }
-
-    pub fn take_stream_in(&mut self) -> Option<ReadStream> {
-        let stream = self.stream.take();
-        match stream {
-            Some(Stream::Bidirectional(bidi_stream)) => {
-                let (read, write) = bidi_stream.split();
-                self.stream = Some(Stream::WriteOnly(write));
-                Some(read)
-            }
-            stream => {
-                self.stream = stream;
-                None
-            }
-        }
-    }
-
-    /// Read full message or return None if connection is closed.
-    pub fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
-        let (state, stream) = (self.state, self.get_stream_in()?);
-
-        use ProtoState::*;
-        match state {
-            Initialization | Encrypted => FeStartupPacket::read(stream),
-            Authentication | Established => FeMessage::read(stream),
-        }
-        .map_err(QueryError::from)
-    }
-
-    /// Write message into internal output buffer.
-    pub fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<&mut Self> {
-        BeMessage::write(&mut self.buf_out, message)?;
-        Ok(self)
-    }
-
-    /// Flush output buffer into the socket.
-    pub fn flush(&mut self) -> io::Result<&mut Self> {
-        let stream = self.stream.as_mut().unwrap();
-        stream.write_all(&self.buf_out)?;
-        self.buf_out.clear();
-        Ok(self)
-    }
-
-    /// Write message into internal buffer and flush it.
-    pub fn write_message(&mut self, message: &BeMessage) -> io::Result<&mut Self> {
-        self.write_message_noflush(message)?;
-        self.flush()
-    }
-
-    // Wrapper for run_message_loop() that shuts down socket when we are done
-    pub fn run(mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
-        let ret = self.run_message_loop(handler);
-        if let Some(stream) = self.stream.as_mut() {
-            let _ = stream.shutdown(Shutdown::Both);
-        }
-        ret
-    }
-
-    fn run_message_loop(&mut self, handler: &mut impl Handler) -> Result<(), QueryError> {
-        trace!("postgres backend to {:?} started", self.peer_addr);
-
-        let mut unnamed_query_string = Bytes::new();
-
-        while !handler.is_shutdown_requested() {
-            match self.read_message() {
-                Ok(message) => {
-                    if let Some(msg) = message {
-                        trace!("got message {msg:?}");
-
-                        match self.process_message(handler, msg, &mut unnamed_query_string)? {
-                            ProcessMsgResult::Continue => continue,
-                            ProcessMsgResult::Break => break,
-                        }
-                    } else {
-                        break;
-                    }
-                }
-                Err(e) => {
-                    if let QueryError::Other(e) = &e {
-                        if is_socket_read_timed_out(e) {
-                            continue;
-                        }
-                    }
-                    return Err(e);
-                }
-            }
-        }
-
-        trace!("postgres backend to {:?} exited", self.peer_addr);
-        Ok(())
-    }
-
-    pub fn start_tls(&mut self) -> anyhow::Result<()> {
-        match self.stream.take() {
-            Some(Stream::Bidirectional(bidi_stream)) => {
-                let conn = rustls::ServerConnection::new(self.tls_config.clone().unwrap())?;
-                self.stream = Some(Stream::Bidirectional(bidi_stream.start_tls(conn)?));
-                Ok(())
-            }
-            stream => {
-                self.stream = stream;
-                anyhow::bail!("can't start TLs without bidi stream");
-            }
-        }
-    }
-
-    fn process_message(
-        &mut self,
-        handler: &mut impl Handler,
-        msg: FeMessage,
-        unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult, QueryError> {
-        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
-        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        if self.state < ProtoState::Established
-            && !matches!(
-                msg,
-                FeMessage::PasswordMessage(_) | FeMessage::StartupPacket(_)
-            )
-        {
-            return Err(QueryError::Other(anyhow::anyhow!("protocol violation")));
-        }
-
-        let have_tls = self.tls_config.is_some();
-        match msg {
-            FeMessage::StartupPacket(m) => {
-                trace!("got startup message {m:?}");
-
-                match m {
-                    FeStartupPacket::SslRequest => {
-                        debug!("SSL requested");
-
-                        self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
-                        if have_tls {
-                            self.start_tls()?;
-                            self.state = ProtoState::Encrypted;
-                        }
-                    }
-                    FeStartupPacket::GssEncRequest => {
-                        debug!("GSS requested");
-                        self.write_message(&BeMessage::EncryptionResponse(false))?;
-                    }
-                    FeStartupPacket::StartupMessage { .. } => {
-                        if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                "must connect with TLS",
-                                None,
-                            ))?;
-                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "client did not connect with TLS"
-                            )));
-                        }
-
-                        // NB: startup() may change self.auth_type -- we are using that in proxy code
-                        // to bypass auth for new users.
-                        handler.startup(self, &m)?;
-
-                        match self.auth_type {
-                            AuthType::Trust => {
-                                self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
-                                    // The async python driver requires a valid server_version
-                                    .write_message_noflush(&BeMessage::server_version("14.1"))?
-                                    .write_message(&BeMessage::ReadyForQuery)?;
-                                self.state = ProtoState::Established;
-                            }
-                            AuthType::NeonJWT => {
-                                self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
-                                self.state = ProtoState::Authentication;
-                            }
-                        }
-                    }
-                    FeStartupPacket::CancelRequest { .. } => {
-                        return Ok(ProcessMsgResult::Break);
-                    }
-                }
-            }
-
-            FeMessage::PasswordMessage(m) => {
-                trace!("got password message '{:?}'", m);
-
-                assert!(self.state == ProtoState::Authentication);
-
-                match self.auth_type {
-                    AuthType::Trust => unreachable!(),
-                    AuthType::NeonJWT => {
-                        let (_, jwt_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                &e.to_string(),
-                                Some(e.pg_error_code()),
-                            ))?;
-                            return Err(e);
-                        }
-                    }
-                }
-                self.write_message_noflush(&BeMessage::AuthenticationOk)?
-                    .write_message_noflush(&BeMessage::CLIENT_ENCODING)?
-                    .write_message(&BeMessage::ReadyForQuery)?;
-                self.state = ProtoState::Established;
-            }
-
-            FeMessage::Query(body) => {
-                // remove null terminator
-                let query_string = cstr_to_str(&body)?;
-
-                trace!("got query {query_string:?}");
-                if let Err(e) = handler.process_query(self, query_string) {
-                    log_query_error(query_string, &e);
-                    let short_error = short_error(&e);
-                    self.write_message_noflush(&BeMessage::ErrorResponse(
-                        &short_error,
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Parse(m) => {
-                *unnamed_query_string = m.query_string;
-                self.write_message(&BeMessage::ParseComplete)?;
-            }
-
-            FeMessage::Describe(_) => {
-                self.write_message_noflush(&BeMessage::ParameterDescription)?
-                    .write_message(&BeMessage::NoData)?;
-            }
-
-            FeMessage::Bind(_) => {
-                self.write_message(&BeMessage::BindComplete)?;
-            }
-
-            FeMessage::Close(_) => {
-                self.write_message(&BeMessage::CloseComplete)?;
-            }
-
-            FeMessage::Execute(_) => {
-                let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {query_string:?}");
-                if let Err(e) = handler.process_query(self, query_string) {
-                    log_query_error(query_string, &e);
-                    self.write_message(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                // NOTE there is no ReadyForQuery message. This handler is used
-                // for basebackup and it uses CopyOut which doesn't require
-                // ReadyForQuery message and backend just switches back to
-                // processing mode after sending CopyDone or ErrorResponse.
-            }
-
-            FeMessage::Sync => {
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Terminate => {
-                return Ok(ProcessMsgResult::Break);
-            }
-
-            // We prefer explicit pattern matching to wildcards, because
-            // this helps us spot the places where new variants are missing
-            FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                return Err(QueryError::Other(anyhow::anyhow!(
-                    "unexpected message type: {msg:?}"
-                )));
-            }
-        }
-
-        Ok(ProcessMsgResult::Continue)
-    }
-}
--- a/libs/utils/src/postgres_backend_async.rs
+++ b/libs/utils/src/postgres_backend_async.rs
@@ -1,634 +0,0 @@
-//! Server-side asynchronous Postgres connection, as limited as we need.
-//! To use, create PostgresBackend and run() it, passing the Handler
-//! implementation determining how to process the queries. Currently its API
-//! is rather narrow, but we can extend it once required.
-
-use crate::postgres_backend::AuthType;
-use anyhow::Context;
-use bytes::{Buf, Bytes, BytesMut};
-use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
-use std::io;
-use std::net::SocketAddr;
-use std::pin::Pin;
-use std::sync::Arc;
-use std::task::Poll;
-use std::{future::Future, task::ready};
-use tracing::{debug, error, info, trace};
-
-use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
-use tokio_rustls::TlsAcceptor;
-
-/// Tells whether `e` is one of the connection-level I/O errors that
-/// `log_query_error` reports at info level instead of error level:
-/// connection refused, aborted or reset.
-pub fn is_expected_io_error(e: &io::Error) -> bool {
-    const EXPECTED_KINDS: [io::ErrorKind; 3] = [
-        io::ErrorKind::ConnectionRefused,
-        io::ErrorKind::ConnectionAborted,
-        io::ErrorKind::ConnectionReset,
-    ];
-    EXPECTED_KINDS.contains(&e.kind())
-}
-
-/// An error that occurred during query processing:
-/// either on the connection itself ([`ConnectionError`]) or before/after it.
-///
-/// Both variants are `#[error(transparent)]`, so `Display` shows the wrapped
-/// error's own message.
-#[derive(thiserror::Error, Debug)]
-pub enum QueryError {
-    /// The connection was lost while processing the query.
-    #[error(transparent)]
-    Disconnected(#[from] ConnectionError),
-    /// Some other error
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-impl From<io::Error> for QueryError {
-    fn from(e: io::Error) -> Self {
-        Self::Disconnected(ConnectionError::Socket(e))
-    }
-}
-
-impl QueryError {
-    /// SQLSTATE code sent to the client together with this error.
-    pub fn pg_error_code(&self) -> &'static [u8; 5] {
-        // SQLSTATE 08006: connection failure
-        const CONNECTION_FAILURE: &[u8; 5] = b"08006";
-
-        match self {
-            QueryError::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
-            QueryError::Disconnected(_) => CONNECTION_FAILURE,
-        }
-    }
-}
-
-/// Callbacks through which a [`PostgresBackend`] hands startup processing,
-/// JWT authentication and query execution over to its user.
-#[async_trait::async_trait]
-pub trait Handler {
-    /// Handle single query.
-    /// postgres_backend will issue ReadyForQuery after calling this for a
-    /// simple Query message (this might be not what we want after CopyData
-    /// streaming, but currently we don't care). No ReadyForQuery is sent
-    /// after the call made for an Execute message.
-    ///
-    /// An error returned from here is logged and sent to the client as an
-    /// ErrorResponse; the connection keeps running.
-    async fn process_query(
-        &mut self,
-        pgb: &mut PostgresBackend,
-        query_string: &str,
-    ) -> Result<(), QueryError>;
-
-    /// Called when the startup packet is received; allows processing its
-    /// params. The default implementation accepts the packet and does nothing.
-    /// An error returned from here aborts the handshake.
-    ///
-    /// NOTE(review): this comment used to say that returning Ok(false) makes
-    /// postgres_backend skip auth, which was needed for new user creation in
-    /// the proxy code. The method returns `Result<(), QueryError>`, so that
-    /// cannot be expressed here -- confirm how the proxy bypasses auth now.
-    /// That is quite a hacky and ad-hoc solution; maybe we could allow
-    /// implementations to override the whole init logic.
-    fn startup(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _sm: &FeStartupPacket,
-    ) -> Result<(), QueryError> {
-        Ok(())
-    }
-
-    /// Check auth jwt.
-    ///
-    /// `_jwt_response` is the password message payload without its last byte.
-    /// The default implementation rejects every token, so handlers used with
-    /// `AuthType::NeonJWT` have to override it.
-    fn check_auth_jwt(
-        &mut self,
-        _pgb: &mut PostgresBackend,
-        _jwt_response: &[u8],
-    ) -> Result<(), QueryError> {
-        Err(QueryError::Other(anyhow::anyhow!("JWT auth failed")))
-    }
-}
-
-/// PostgresBackend protocol state.
-/// XXX: The order of the constructors matters: the derived `PartialOrd`
-/// follows declaration order, and the handshake code relies on comparisons
-/// such as `state < ProtoState::Established`.
-#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
-pub enum ProtoState {
-    /// Nothing negotiated yet; startup packets are expected.
-    Initialization,
-    /// TLS has been started; startup packets are still expected.
-    Encrypted,
-    /// Startup message accepted, waiting for the password (JWT) message.
-    Authentication,
-    /// Handshake done, regular messages are processed.
-    Established,
-    /// No more messages are read; `read_message` returns `None`.
-    Closed,
-}
-
-/// Tells the message loop what to do after a message has been processed.
-#[derive(Clone, Copy)]
-pub enum ProcessMsgResult {
-    /// Keep reading messages.
-    Continue,
-    /// Stop the loop (e.g. Terminate or a cancel request was received).
-    Break,
-}
-
-/// The connection's transport: a plain TCP stream or a TLS stream on top of it.
-///
-/// `Broken` is only a placeholder: `PostgresBackend::start_tls` puts it in
-/// place while it moves the plain stream into the TLS acceptor. The
-/// `AsyncRead`/`AsyncWrite` impls treat polling it as unreachable, so it must
-/// never be used for I/O.
-pub enum Stream {
-    Unencrypted(BufReader<tokio::net::TcpStream>),
-    Tls(Box<tokio_rustls::server::TlsStream<BufReader<tokio::net::TcpStream>>>),
-    Broken,
-}
-
-/// Every operation is forwarded to whichever transport is currently active.
-/// Polling a `Broken` stream is a bug and hits `unreachable!()`.
-impl AsyncWrite for Stream {
-    fn poll_write(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<io::Result<usize>> {
-        let this = self.get_mut();
-        match this {
-            Stream::Tls(tls) => Pin::new(tls).poll_write(cx, buf),
-            Stream::Unencrypted(plain) => Pin::new(plain).poll_write(cx, buf),
-            Stream::Broken => unreachable!(),
-        }
-    }
-
-    fn poll_flush(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll<io::Result<()>> {
-        let this = self.get_mut();
-        match this {
-            Stream::Tls(tls) => Pin::new(tls).poll_flush(cx),
-            Stream::Unencrypted(plain) => Pin::new(plain).poll_flush(cx),
-            Stream::Broken => unreachable!(),
-        }
-    }
-
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<io::Result<()>> {
-        let this = self.get_mut();
-        match this {
-            Stream::Tls(tls) => Pin::new(tls).poll_shutdown(cx),
-            Stream::Unencrypted(plain) => Pin::new(plain).poll_shutdown(cx),
-            Stream::Broken => unreachable!(),
-        }
-    }
-}
-/// Reads are forwarded to whichever transport is currently active.
-/// Polling a `Broken` stream is a bug and hits `unreachable!()`.
-impl AsyncRead for Stream {
-    fn poll_read(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut tokio::io::ReadBuf<'_>,
-    ) -> Poll<io::Result<()>> {
-        let this = self.get_mut();
-        match this {
-            Stream::Tls(tls) => Pin::new(tls).poll_read(cx, buf),
-            Stream::Unencrypted(plain) => Pin::new(plain).poll_read(cx, buf),
-            Stream::Broken => unreachable!(),
-        }
-    }
-}
-
-/// Server side of a single Postgres protocol connection.
-/// Created with [`PostgresBackend::new`] and driven by [`PostgresBackend::run`].
-pub struct PostgresBackend {
-    // Transport; switched from plain TCP to TLS by `start_tls`.
-    stream: Stream,
-
-    // Output buffer. c.f. BeMessage::write why we are using BytesMut here.
-    // The data between 0 and "current position" as tracked by the bytes::Buf
-    // implementation of BytesMut, have already been written.
-    buf_out: BytesMut,
-
-    // Current protocol state; decides which kind of message is read next.
-    pub state: ProtoState,
-
-    // How the client has to authenticate once the startup message arrived.
-    auth_type: AuthType,
-
-    // Address of the peer, captured when the backend was created.
-    peer_addr: SocketAddr,
-    // When set, TLS is offered on SSLRequest and a startup message on an
-    // unencrypted connection is rejected.
-    pub tls_config: Option<Arc<rustls::ServerConfig>>,
-}
-
-/// Copies the query into an owned byte vector, dropping one trailing null
-/// terminator if there is one.
-pub fn query_from_cstring(query_string: Bytes) -> Vec<u8> {
-    let mut owned = query_string.to_vec();
-    if owned.last() == Some(&0) {
-        owned.pop();
-    }
-    owned
-}
-
-// View a byte slice as a string slice, leaving out the null terminator if
-// there's one. Fails if the remaining bytes are not valid UTF-8.
-fn cstr_to_str(bytes: &[u8]) -> anyhow::Result<&str> {
-    let payload = match bytes.split_last() {
-        Some((&0, head)) => head,
-        _ => bytes,
-    };
-    Ok(std::str::from_utf8(payload)?)
-}
-
-impl PostgresBackend {
-    /// Creates a backend for an accepted TCP connection. The connection starts
-    /// unencrypted, in the `Initialization` state.
-    ///
-    /// Fails if the peer address of `socket` cannot be determined.
-    pub fn new(
-        socket: tokio::net::TcpStream,
-        auth_type: AuthType,
-        tls_config: Option<Arc<rustls::ServerConfig>>,
-    ) -> io::Result<Self> {
-        // Has to be queried before the socket is moved into the stream.
-        let peer_addr = socket.peer_addr()?;
-        let stream = Stream::Unencrypted(BufReader::new(socket));
-        let buf_out = BytesMut::with_capacity(10 * 1024);
-
-        Ok(PostgresBackend {
-            stream,
-            buf_out,
-            state: ProtoState::Initialization,
-            auth_type,
-            peer_addr,
-            tls_config,
-        })
-    }
-
-    /// Address of the connected client, as captured in [`PostgresBackend::new`].
-    pub fn get_peer_addr(&self) -> &SocketAddr {
-        &self.peer_addr
-    }
-
-    /// Read the next full message; `None` means there is nothing more to read.
-    ///
-    /// Startup packets are read until the startup message has been accepted,
-    /// regular messages afterwards. In the `Closed` state nothing is read and
-    /// `None` is returned right away.
-    pub async fn read_message(&mut self) -> Result<Option<FeMessage>, QueryError> {
-        let stream = &mut self.stream;
-        let received = match self.state {
-            ProtoState::Initialization | ProtoState::Encrypted => {
-                FeStartupPacket::read_fut(stream).await
-            }
-            ProtoState::Authentication | ProtoState::Established => {
-                FeMessage::read_fut(stream).await
-            }
-            ProtoState::Closed => Ok(None),
-        };
-        received.map_err(QueryError::from)
-    }
-
-    /// Flush output buffer into the socket.
-    ///
-    /// Writes everything queued in `buf_out` to the stream and then flushes
-    /// the stream itself, so that data buffered inside the transport (e.g. by
-    /// the TLS layer) is pushed out as well.
-    ///
-    /// Returns `WriteZero` if the stream stops accepting data while there is
-    /// still something left to send, instead of retrying forever.
-    pub async fn flush(&mut self) -> io::Result<()> {
-        while self.buf_out.has_remaining() {
-            let bytes_written = self.stream.write(self.buf_out.chunk()).await?;
-            if bytes_written == 0 {
-                return Err(io::Error::new(
-                    io::ErrorKind::WriteZero,
-                    "failed to write buffered messages",
-                ));
-            }
-            self.buf_out.advance(bytes_written);
-        }
-        self.buf_out.clear();
-        self.stream.flush().await
-    }
-
-    /// Write message into internal output buffer.
-    ///
-    /// Nothing is sent to the client by this call: the message stays in
-    /// `buf_out` until `flush` (or the `CopyDataWriter` poll functions) writes
-    /// the buffer out. Returns `self` so that calls can be chained.
-    pub fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
-        BeMessage::write(&mut self.buf_out, message)?;
-        Ok(self)
-    }
-
-    /// Returns an AsyncWrite implementation that wraps all the data written
-    /// to it in CopyData messages, and writes them to the connection
-    ///
-    /// The caller is responsible for sending CopyOutResponse and CopyDone messages.
-    ///
-    /// The writer borrows this backend mutably for as long as it lives.
-    pub fn copyout_writer(&mut self) -> CopyDataWriter {
-        CopyDataWriter { pgb: self }
-    }
-
-    /// A polling function that tries to write all the data from 'buf_out' to the
-    /// underlying stream.
-    ///
-    /// Returns `Pending` when the stream cannot take more data right now, and
-    /// a `WriteZero` error if the stream reports that it accepted zero bytes
-    /// while data is still queued (looping on that would spin forever).
-    /// The stream itself is not flushed here, see `poll_flush`.
-    fn poll_write_buf(
-        &mut self,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
-        while self.buf_out.has_remaining() {
-            let bytes_written =
-                ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()))?;
-            if bytes_written == 0 {
-                return Poll::Ready(Err(io::Error::new(
-                    io::ErrorKind::WriteZero,
-                    "failed to write buffered messages",
-                )));
-            }
-            self.buf_out.advance(bytes_written);
-        }
-        Poll::Ready(Ok(()))
-    }
-
-    /// Polls a flush of the underlying stream. This does not touch `buf_out`;
-    /// callers write it out with `poll_write_buf` first.
-    fn poll_flush(&mut self, cx: &mut std::task::Context<'_>) -> Poll<Result<(), std::io::Error>> {
-        Pin::new(&mut self.stream).poll_flush(cx)
-    }
-
-    // Wrapper for run_message_loop() that shuts down socket when we are done.
-    //
-    // `shutdown_watcher` is called to obtain a future that completes when the
-    // backend should stop; it is polled alongside the reads from the client.
-    // The result of the message loop is returned as is; a failure to shut the
-    // socket down is ignored.
-    pub async fn run<F, S>(
-        mut self,
-        handler: &mut impl Handler,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
-        let ret = self.run_message_loop(handler, shutdown_watcher).await;
-        // `shutdown()` only creates a future, it has to be awaited to have any
-        // effect. A failed TLS upgrade leaves `Stream::Broken` behind, which
-        // must not be polled, so there is nothing to shut down in that case.
-        if !matches!(self.stream, Stream::Broken) {
-            // The peer might be gone already, so the result is not interesting.
-            let _ = self.stream.shutdown().await;
-        }
-        ret
-    }
-
-    /// Runs the connection: first the handshake (until the state reaches
-    /// `Established`), then the regular message loop.
-    ///
-    /// Both phases race against the future returned by `shutdown_watcher`;
-    /// when that completes, the function returns `Ok(())`. It also returns
-    /// `Ok(())` when the client terminates or no more messages can be read.
-    async fn run_message_loop<F, S>(
-        &mut self,
-        handler: &mut impl Handler,
-        shutdown_watcher: F,
-    ) -> Result<(), QueryError>
-    where
-        F: Fn() -> S,
-        S: Future,
-    {
-        trace!("postgres backend to {:?} started", self.peer_addr);
-
-        tokio::select!(
-            biased;
-
-            _ = shutdown_watcher() => {
-                // We were requested to shut down.
-                tracing::info!("shutdown request received during handshake");
-                return Ok(())
-            },
-
-            result = async {
-                while self.state < ProtoState::Established {
-                    if let Some(msg) = self.read_message().await? {
-                        trace!("got message {msg:?} during handshake");
-
-                        match self.process_handshake_message(handler, msg).await? {
-                            ProcessMsgResult::Continue => {
-                                self.flush().await?;
-                                continue;
-                            }
-                            ProcessMsgResult::Break => {
-                                trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
-                                return Ok(());
-                            }
-                        }
-                    } else {
-                        trace!("postgres backend to {:?} exited during handshake", self.peer_addr);
-                        return Ok(());
-                    }
-                }
-                Ok::<(), QueryError>(())
-            } => {
-                // Handshake complete.
-                result?;
-            }
-        );
-
-        // Authentication completed
-        let mut query_string = Bytes::new();
-        while let Some(msg) = tokio::select!(
-            biased;
-            _ = shutdown_watcher() => {
-                // We were requested to shut down.
-                tracing::info!("shutdown request received in run_message_loop");
-                Ok(None)
-            },
-            msg = self.read_message() => { msg },
-        )? {
-            trace!("got message {:?}", msg);
-
-            let result = self.process_message(handler, msg, &mut query_string).await;
-            // Flush before looking at the result, so that whatever has been
-            // queued is sent even if processing the message failed. The buffer
-            // is empty afterwards, no second flush is needed.
-            self.flush().await?;
-            match result? {
-                ProcessMsgResult::Continue => continue,
-                ProcessMsgResult::Break => break,
-            }
-        }
-
-        trace!("postgres backend to {:?} exited", self.peer_addr);
-        Ok(())
-    }
-
-    /// Upgrades the plain connection to TLS by running the server side of the
-    /// TLS handshake on it.
-    ///
-    /// Fails if no TLS config is set, if the stream is not a plain one (the
-    /// stream is left as it was in that case), or if the handshake fails. After
-    /// a failed handshake the plain stream is gone and `self.stream` stays
-    /// `Stream::Broken`.
-    async fn start_tls(&mut self) -> anyhow::Result<()> {
-        // Checked before the stream is taken apart, so that a missing config
-        // is an error rather than a panic and leaves the stream untouched.
-        let tls_config = self
-            .tls_config
-            .clone()
-            .context("TLS requested, but no TLS config is set")?;
-
-        match std::mem::replace(&mut self.stream, Stream::Broken) {
-            Stream::Unencrypted(plain_stream) => {
-                let acceptor = TlsAcceptor::from(tls_config);
-                let tls_stream = acceptor.accept(plain_stream).await?;
-
-                self.stream = Stream::Tls(Box::new(tls_stream));
-                Ok(())
-            }
-            other => {
-                // Nothing to upgrade: put back what we took out.
-                self.stream = other;
-                anyhow::bail!("TLS already started")
-            }
-        }
-    }
-
-    /// Handles one message received before the connection is established:
-    /// encryption requests, the startup message, cancel requests and the
-    /// password (JWT) message.
-    ///
-    /// Replies are queued in `buf_out` and flushed by the caller, with two
-    /// exceptions that are flushed right here: the reply to an SSLRequest
-    /// (it has to go out in plaintext before the TLS handshake starts), and
-    /// error responses that precede an `Err` return (the caller does not
-    /// flush on error).
-    ///
-    /// Returns `Break` when the connection should be closed without an error.
-    async fn process_handshake_message(
-        &mut self,
-        handler: &mut impl Handler,
-        msg: FeMessage,
-    ) -> Result<ProcessMsgResult, QueryError> {
-        assert!(self.state < ProtoState::Established);
-        let have_tls = self.tls_config.is_some();
-        match msg {
-            FeMessage::StartupPacket(m) => {
-                trace!("got startup message {m:?}");
-
-                match m {
-                    FeStartupPacket::SslRequest => {
-                        debug!("SSL requested");
-
-                        self.write_message(&BeMessage::EncryptionResponse(have_tls))?;
-                        if have_tls {
-                            // The client waits for this reply before it starts
-                            // the TLS handshake, so it has to leave the buffer
-                            // now. Otherwise we would wait for a ClientHello
-                            // from a client that is still waiting for us, and
-                            // the reply would later be sent encrypted.
-                            self.flush().await?;
-                            self.start_tls().await?;
-                            self.state = ProtoState::Encrypted;
-                        }
-                    }
-                    FeStartupPacket::GssEncRequest => {
-                        debug!("GSS requested");
-                        self.write_message(&BeMessage::EncryptionResponse(false))?;
-                    }
-                    FeStartupPacket::StartupMessage { .. } => {
-                        if have_tls && !matches!(self.state, ProtoState::Encrypted) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                "must connect with TLS",
-                                None,
-                            ))?;
-                            // Best effort: the caller does not flush when we
-                            // return an error, and the error below is the one
-                            // worth reporting.
-                            if let Err(flush_err) = self.flush().await {
-                                debug!("failed to send error response: {flush_err}");
-                            }
-                            return Err(QueryError::Other(anyhow::anyhow!(
-                                "client did not connect with TLS"
-                            )));
-                        }
-
-                        // NB: startup() may change self.auth_type -- we are using that in proxy code
-                        // to bypass auth for new users.
-                        handler.startup(self, &m)?;
-
-                        match self.auth_type {
-                            AuthType::Trust => {
-                                self.write_message(&BeMessage::AuthenticationOk)?
-                                    .write_message(&BeMessage::CLIENT_ENCODING)?
-                                    // The async python driver requires a valid server_version
-                                    .write_message(&BeMessage::server_version("14.1"))?
-                                    .write_message(&BeMessage::ReadyForQuery)?;
-                                self.state = ProtoState::Established;
-                            }
-                            AuthType::NeonJWT => {
-                                self.write_message(&BeMessage::AuthenticationCleartextPassword)?;
-                                self.state = ProtoState::Authentication;
-                            }
-                        }
-                    }
-                    FeStartupPacket::CancelRequest { .. } => {
-                        self.state = ProtoState::Closed;
-                        return Ok(ProcessMsgResult::Break);
-                    }
-                }
-            }
-
-            FeMessage::PasswordMessage(m) => {
-                trace!("got password message '{:?}'", m);
-
-                assert!(self.state == ProtoState::Authentication);
-
-                match self.auth_type {
-                    AuthType::Trust => unreachable!(),
-                    AuthType::NeonJWT => {
-                        // The last byte of the payload is not part of the token.
-                        let (_, jwt_response) = m.split_last().context("protocol violation")?;
-
-                        if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
-                            self.write_message(&BeMessage::ErrorResponse(
-                                &e.to_string(),
-                                Some(e.pg_error_code()),
-                            ))?;
-                            // Best effort, see the TLS check above: without
-                            // this the client would never see why it was
-                            // rejected.
-                            if let Err(flush_err) = self.flush().await {
-                                debug!("failed to send error response: {flush_err}");
-                            }
-                            return Err(e);
-                        }
-                    }
-                }
-                self.write_message(&BeMessage::AuthenticationOk)?
-                    .write_message(&BeMessage::CLIENT_ENCODING)?
-                    .write_message(&BeMessage::ReadyForQuery)?;
-                self.state = ProtoState::Established;
-            }
-
-            _ => {
-                self.state = ProtoState::Closed;
-                return Ok(ProcessMsgResult::Break);
-            }
-        }
-        Ok(ProcessMsgResult::Continue)
-    }
-
-    /// Handles one message on an established connection and queues the
-    /// replies in `buf_out`; flushing is left to the caller.
-    ///
-    /// `unnamed_query_string` carries the query text from a Parse message to
-    /// the Execute message that runs it. A failing query is logged and
-    /// answered with an ErrorResponse, it does not end the connection.
-    /// Returns `Break` on Terminate, and an error for messages that are not
-    /// expected in this state.
-    async fn process_message(
-        &mut self,
-        handler: &mut impl Handler,
-        msg: FeMessage,
-        unnamed_query_string: &mut Bytes,
-    ) -> Result<ProcessMsgResult, QueryError> {
-        // Allow only startup and password messages during auth. Otherwise client would be able to bypass auth
-        // TODO: change that to proper top-level match of protocol state with separate message handling for each state
-        assert!(self.state == ProtoState::Established);
-
-        match msg {
-            FeMessage::Terminate => {
-                return Ok(ProcessMsgResult::Break);
-            }
-
-            FeMessage::StartupPacket(_) | FeMessage::PasswordMessage(_) => {
-                let violation = anyhow::anyhow!("protocol violation");
-                return Err(QueryError::Other(violation));
-            }
-
-            // We prefer explicit pattern matching to wildcards, because
-            // this helps us spot the places where new variants are missing
-            FeMessage::CopyData(_) | FeMessage::CopyDone | FeMessage::CopyFail => {
-                let unexpected = anyhow::anyhow!("unexpected message type: {:?}", msg);
-                return Err(QueryError::Other(unexpected));
-            }
-
-            FeMessage::Sync => {
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Parse(m) => {
-                // Remembered for the Execute message that follows.
-                *unnamed_query_string = m.query_string;
-                self.write_message(&BeMessage::ParseComplete)?;
-            }
-
-            FeMessage::Bind(_) => {
-                self.write_message(&BeMessage::BindComplete)?;
-            }
-
-            FeMessage::Describe(_) => {
-                self.write_message(&BeMessage::ParameterDescription)?
-                    .write_message(&BeMessage::NoData)?;
-            }
-
-            FeMessage::Close(_) => {
-                self.write_message(&BeMessage::CloseComplete)?;
-            }
-
-            FeMessage::Query(body) => {
-                // remove null terminator
-                let query_string = cstr_to_str(&body)?;
-
-                trace!("got query {query_string:?}");
-                let outcome = handler.process_query(self, query_string).await;
-                if let Err(e) = outcome {
-                    log_query_error(query_string, &e);
-                    self.write_message(&BeMessage::ErrorResponse(
-                        &short_error(&e),
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                self.write_message(&BeMessage::ReadyForQuery)?;
-            }
-
-            FeMessage::Execute(_) => {
-                let query_string = cstr_to_str(unnamed_query_string)?;
-                trace!("got execute {query_string:?}");
-                let outcome = handler.process_query(self, query_string).await;
-                if let Err(e) = outcome {
-                    log_query_error(query_string, &e);
-                    self.write_message(&BeMessage::ErrorResponse(
-                        &e.to_string(),
-                        Some(e.pg_error_code()),
-                    ))?;
-                }
-                // NOTE there is no ReadyForQuery message. This handler is used
-                // for basebackup and it uses CopyOut which doesn't require
-                // ReadyForQuery message and backend just switches back to
-                // processing mode after sending CopyDone or ErrorResponse.
-            }
-        }
-
-        Ok(ProcessMsgResult::Continue)
-    }
-}
-
-///
-/// A tokio [`AsyncWrite`] implementation that wraps all data written to it in
-/// CopyData messages and sends them over the borrowed backend connection.
-/// Created with [`PostgresBackend::copyout_writer`].
-///
-
-pub struct CopyDataWriter<'a> {
-    pgb: &'a mut PostgresBackend,
-}
-
-impl<'a> AsyncWrite for CopyDataWriter<'a> {
-    /// Queues `buf` as one CopyData message, after everything queued earlier
-    /// has been written to the stream. Always accepts the whole of `buf`.
-    fn poll_write(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<Result<usize, std::io::Error>> {
-        let backend = &mut *self.get_mut().pgb;
-
-        // It's not strictly required to flush between each message, but makes it easier
-        // to view in wireshark, and usually the messages that the callers write are
-        // decently-sized anyway.
-        ready!(backend.poll_write_buf(cx))?;
-
-        // CopyData
-        // XXX: if the input is large, we should split it into multiple messages.
-        // Not sure what the threshold should be, but the ultimate hard limit is that
-        // the length cannot exceed u32.
-        backend.write_message(&BeMessage::CopyData(buf))?;
-
-        Poll::Ready(Ok(buf.len()))
-    }
-
-    /// Writes out everything that is queued, then flushes the stream.
-    fn poll_flush(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
-        let backend = &mut *self.get_mut().pgb;
-        ready!(backend.poll_write_buf(cx))?;
-        backend.poll_flush(cx)
-    }
-
-    /// Same as `poll_flush`: the queued data is written out and the stream is
-    /// flushed, the connection itself is not shut down.
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-    ) -> Poll<Result<(), std::io::Error>> {
-        let backend = &mut *self.get_mut().pgb;
-        ready!(backend.poll_write_buf(cx))?;
-        backend.poll_flush(cx)
-    }
-}
-
-pub fn short_error(e: &QueryError) -> String {
-    match e {
-        QueryError::Disconnected(connection_error) => connection_error.to_string(),
-        QueryError::Other(e) => format!("{e:#}"),
-    }
-}
-
-/// Logs a failed query. Socket errors that `is_expected_io_error` accepts are
-/// logged at info level, everything else at error level.
-pub(super) fn log_query_error(query: &str, e: &QueryError) {
-    match e {
-        QueryError::Disconnected(ConnectionError::Socket(io_error))
-            if is_expected_io_error(io_error) =>
-        {
-            info!("query handler for '{query}' failed with expected io error: {io_error}");
-        }
-        QueryError::Disconnected(ConnectionError::Socket(io_error)) => {
-            error!("query handler for '{query}' failed with io error: {io_error}");
-        }
-        QueryError::Disconnected(other_connection_error) => {
-            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
-        }
-        QueryError::Other(e) => {
-            error!("query handler for '{query}' failed: {e:?}");
-        }
-    }
-}
--- a/libs/utils/src/sock_split.rs
+++ b/libs/utils/src/sock_split.rs
@@ -1,206 +0,0 @@
-use std::{
-    io::{self, BufReader, Write},
-    net::{Shutdown, TcpStream},
-    sync::Arc,
-};
-
-use rustls::Connection;
-
-/// Wrapper supporting reads of a shared TcpStream.
-///
-/// Holding the stream in an `Arc` lets `BidiStream::split` hand another
-/// handle to the same socket to the write half.
-pub struct ArcTcpRead(Arc<TcpStream>);
-
-impl io::Read for ArcTcpRead {
-    /// Reads through a shared reference to the socket; `&TcpStream`
-    /// implements `Read`, so no exclusive access is needed.
-    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
-        let mut shared: &TcpStream = &self.0;
-        shared.read(buf)
-    }
-}
-
-/// Gives direct access to the wrapped `TcpStream`.
-impl std::ops::Deref for ArcTcpRead {
-    type Target = TcpStream;
-
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-/// Wrapper around a TCP Stream supporting buffered reads.
-/// Writes are not buffered: they go straight to the underlying socket.
-pub struct BufStream(BufReader<ArcTcpRead>);
-
-impl io::Read for BufStream {
-    /// Reads through the internal `BufReader`.
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        let BufStream(reader) = self;
-        reader.read(buf)
-    }
-}
-
-/// Writes bypass the read buffer and go to the socket itself.
-impl io::Write for BufStream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        let mut socket: &TcpStream = self.get_ref();
-        socket.write(buf)
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        let mut socket: &TcpStream = self.get_ref();
-        socket.flush()
-    }
-}
-
-impl BufStream {
-    /// Unwrap into the internal BufReader.
-    fn into_reader(self) -> BufReader<ArcTcpRead> {
-        let BufStream(reader) = self;
-        reader
-    }
-
-    /// Returns a reference to the underlying TcpStream.
-    fn get_ref(&self) -> &TcpStream {
-        let BufStream(reader) = self;
-        let shared: &ArcTcpRead = reader.get_ref();
-        &shared.0
-    }
-}
-
-/// Read half of a connection, as produced by [`BidiStream::split`].
-pub enum ReadStream {
-    /// Plain TCP, read through the buffered reader.
-    Tcp(BufReader<ArcTcpRead>),
-    /// TLS, read through the read half of the split TLS connection.
-    Tls(rustls_split::ReadHalf),
-}
-
-/// Reads are forwarded to the active variant.
-impl io::Read for ReadStream {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        match self {
-            ReadStream::Tls(tls_half) => tls_half.read(buf),
-            ReadStream::Tcp(buffered) => buffered.read(buf),
-        }
-    }
-}
-
-impl ReadStream {
-    /// Shuts down the given direction(s) of the underlying connection.
-    pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            ReadStream::Tls(read_half) => read_half.shutdown(how),
-            ReadStream::Tcp(buffered) => buffered.get_ref().shutdown(how),
-        }
-    }
-}
-
-/// Write half of a connection, as produced by [`BidiStream::split`].
-pub enum WriteStream {
-    /// Plain TCP; shares the socket with the read half through the `Arc`.
-    Tcp(Arc<TcpStream>),
-    /// TLS, written through the write half of the split TLS connection.
-    Tls(rustls_split::WriteHalf),
-}
-
-impl WriteStream {
-    /// Shuts down the given direction(s) of the underlying connection.
-    pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            WriteStream::Tls(tls_half) => tls_half.shutdown(how),
-            WriteStream::Tcp(socket) => socket.shutdown(how),
-        }
-    }
-}
-
-/// Writes and flushes are forwarded to the active variant. The plain TCP
-/// variant writes through a shared reference to the socket.
-impl io::Write for WriteStream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        match self {
-            WriteStream::Tls(tls_half) => tls_half.write(buf),
-            WriteStream::Tcp(shared) => {
-                let mut socket: &TcpStream = &**shared;
-                socket.write(buf)
-            }
-        }
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            WriteStream::Tls(tls_half) => tls_half.flush(),
-            WriteStream::Tcp(shared) => {
-                let mut socket: &TcpStream = &**shared;
-                socket.flush()
-            }
-        }
-    }
-}
-
-type TlsStream<T> = rustls::StreamOwned<rustls::ServerConnection, T>;
-
-/// A bi-directional connection: plain TCP at first, TLS after
-/// [`BidiStream::start_tls`]. Can be split into owned read and write halves
-/// with [`BidiStream::split`].
-pub enum BidiStream {
-    Tcp(BufStream),
-    /// This variant is boxed, because [`rustls::ServerConnection`] is quite larger than [`BufStream`].
-    Tls(Box<TlsStream<BufStream>>),
-}
-
-impl BidiStream {
-    /// Wraps a connected TCP stream into a plain (unencrypted) `BidiStream`.
-    pub fn from_tcp(stream: TcpStream) -> Self {
-        Self::Tcp(BufStream(BufReader::new(ArcTcpRead(Arc::new(stream)))))
-    }
-
-    /// Shuts down the given direction(s) of the connection.
-    ///
-    /// On a TLS stream, any shutdown that involves the write direction first
-    /// queues a close_notify alert and flushes it. The socket is shut down
-    /// even if that flush failed; a socket shutdown error is returned first,
-    /// otherwise the result of the flush.
-    pub fn shutdown(&mut self, how: Shutdown) -> io::Result<()> {
-        match self {
-            Self::Tcp(stream) => stream.get_ref().shutdown(how),
-            Self::Tls(tls_boxed) => {
-                if how == Shutdown::Read {
-                    tls_boxed.sock.get_ref().shutdown(how)
-                } else {
-                    tls_boxed.conn.send_close_notify();
-                    let res = tls_boxed.flush();
-                    tls_boxed.sock.get_ref().shutdown(how)?;
-                    res
-                }
-            }
-        }
-    }
-
-    /// Split the bi-directional stream into two owned read and write halves.
-    pub fn split(self) -> (ReadStream, WriteStream) {
-        match self {
-            Self::Tcp(stream) => {
-                let reader = stream.into_reader();
-                // Second handle to the same socket, for the write half.
-                let stream: Arc<TcpStream> = reader.get_ref().0.clone();
-
-                (ReadStream::Tcp(reader), WriteStream::Tcp(stream))
-            }
-            Self::Tls(tls_boxed) => {
-                let reader = tls_boxed.sock.into_reader();
-                // Bytes that were already read from the socket but not consumed
-                // yet must not be lost: they are handed over to the read half.
-                let buffer_data = reader.buffer().to_owned();
-                let read_buf_cfg = rustls_split::BufCfg::with_data(buffer_data, 8192);
-                let write_buf_cfg = rustls_split::BufCfg::with_capacity(8192);
-
-                // TODO would be nice to avoid the Arc here
-                // The unwrap() panics if another handle to the socket exists.
-                let socket = Arc::try_unwrap(reader.into_inner().0).unwrap();
-
-                let (read_half, write_half) = rustls_split::split(
-                    socket,
-                    Connection::Server(tls_boxed.conn),
-                    read_buf_cfg,
-                    write_buf_cfg,
-                );
-                (ReadStream::Tls(read_half), WriteStream::Tls(write_half))
-            }
-        }
-    }
-
-    /// Upgrades a plain stream to TLS by driving `conn` over it; the handshake
-    /// is asserted to be finished once the I/O completes.
-    ///
-    /// Returns an `InvalidInput` error if TLS is already started.
-    pub fn start_tls(self, mut conn: rustls::ServerConnection) -> io::Result<Self> {
-        match self {
-            Self::Tcp(mut stream) => {
-                conn.complete_io(&mut stream)?;
-                assert!(!conn.is_handshaking());
-                Ok(Self::Tls(Box::new(TlsStream::new(conn, stream))))
-            }
-            Self::Tls { .. } => Err(io::Error::new(
-                io::ErrorKind::InvalidInput,
-                "TLS is already started on this stream",
-            )),
-        }
-    }
-}
-
-/// Reads are forwarded to the active variant.
-impl io::Read for BidiStream {
-    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
-        match self {
-            BidiStream::Tls(tls) => tls.read(buf),
-            BidiStream::Tcp(plain) => plain.read(buf),
-        }
-    }
-}
-
-/// Writes and flushes are forwarded to the active variant.
-impl io::Write for BidiStream {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        match self {
-            BidiStream::Tls(tls) => tls.write(buf),
-            BidiStream::Tcp(plain) => plain.write(buf),
-        }
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        match self {
-            BidiStream::Tls(tls) => tls.flush(),
-            BidiStream::Tcp(plain) => plain.flush(),
-        }
-    }
-}
--- a/libs/utils/tests/ssl_test.rs
+++ b/libs/utils/tests/ssl_test.rs
@@ -1,238 +0,0 @@
-use std::{
-    collections::HashMap,
-    io::{Cursor, Read, Write},
-    net::{TcpListener, TcpStream},
-    sync::Arc,
-};
-
-use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
-use bytes::{Buf, BufMut, Bytes, BytesMut};
-use once_cell::sync::Lazy;
-
-use utils::{
-    postgres_backend::{AuthType, Handler, PostgresBackend},
-    postgres_backend_async::QueryError,
-};
-
-fn make_tcp_pair() -> (TcpStream, TcpStream) {
-    let listener = TcpListener::bind("127.0.0.1:0").unwrap();
-    let addr = listener.local_addr().unwrap();
-    let client_stream = TcpStream::connect(addr).unwrap();
-    let (server_stream, _) = listener.accept().unwrap();
-    (server_stream, client_stream)
-}
-
-static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
-    let mut cursor = Cursor::new(include_bytes!("key.pem"));
-    rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
-});
-
-static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
-    let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-    rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
-});
-
-#[test]
-// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274),
-// we resize the vector so doing some modifications after all
-#[allow(clippy::read_zero_byte_vec)]
-fn ssl() {
-    let (mut client_sock, server_sock) = make_tcp_pair();
-
-    const QUERY: &str = "hello world";
-
-    let client_jh = std::thread::spawn(move || {
-        // SSLRequest
-        client_sock.write_u32::<BigEndian>(8).unwrap();
-        client_sock.write_u32::<BigEndian>(80877103).unwrap();
-
-        let ssl_response = client_sock.read_u8().unwrap();
-        assert_eq!(b'S', ssl_response);
-
-        let cfg = rustls::ClientConfig::builder()
-            .with_safe_defaults()
-            .with_root_certificates({
-                let mut store = rustls::RootCertStore::empty();
-                store.add(&CERT).unwrap();
-                store
-            })
-            .with_no_client_auth();
-        let client_config = Arc::new(cfg);
-
-        let dns_name = "localhost".try_into().unwrap();
-        let mut conn = rustls::ClientConnection::new(client_config, dns_name).unwrap();
-
-        conn.complete_io(&mut client_sock).unwrap();
-        assert!(!conn.is_handshaking());
-
-        let mut stream = rustls::Stream::new(&mut conn, &mut client_sock);
-
-        // StartupMessage
-        stream.write_u32::<BigEndian>(9).unwrap();
-        stream.write_u32::<BigEndian>(196608).unwrap();
-        stream.write_u8(0).unwrap();
-        stream.flush().unwrap();
-
-        // wait for ReadyForQuery
-        let mut msg_buf = Vec::new();
-        loop {
-            let msg = stream.read_u8().unwrap();
-            let size = stream.read_u32::<BigEndian>().unwrap() - 4;
-            msg_buf.resize(size as usize, 0);
-            stream.read_exact(&mut msg_buf).unwrap();
-
-            if msg == b'Z' {
-                // ReadyForQuery
-                break;
-            }
-        }
-
-        // Query
-        stream.write_u8(b'Q').unwrap();
-        stream
-            .write_u32::<BigEndian>(4u32 + QUERY.len() as u32)
-            .unwrap();
-        stream.write_all(QUERY.as_ref()).unwrap();
-        stream.flush().unwrap();
-
-        // ReadyForQuery
-        let msg = stream.read_u8().unwrap();
-        assert_eq!(msg, b'Z');
-    });
-
-    struct TestHandler {
-        got_query: bool,
-    }
-    impl Handler for TestHandler {
-        fn process_query(
-            &mut self,
-            _pgb: &mut PostgresBackend,
-            query_string: &str,
-        ) -> Result<(), QueryError> {
-            self.got_query = query_string == QUERY;
-            Ok(())
-        }
-    }
-    let mut handler = TestHandler { got_query: false };
-
-    let cfg = rustls::ServerConfig::builder()
-        .with_safe_defaults()
-        .with_no_client_auth()
-        .with_single_cert(vec![CERT.clone()], KEY.clone())
-        .unwrap();
-    let tls_config = Some(Arc::new(cfg));
-
-    let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
-    pgb.run(&mut handler).unwrap();
-    assert!(handler.got_query);
-
-    client_jh.join().unwrap();
-
-    // TODO consider shutdown behavior
-}
-
-#[test]
-fn no_ssl() {
-    let (mut client_sock, server_sock) = make_tcp_pair();
-
-    let client_jh = std::thread::spawn(move || {
-        let mut buf = BytesMut::new();
-
-        // SSLRequest
-        buf.put_u32(8);
-        buf.put_u32(80877103);
-        client_sock.write_all(&buf).unwrap();
-        buf.clear();
-
-        let ssl_response = client_sock.read_u8().unwrap();
-        assert_eq!(b'N', ssl_response);
-    });
-
-    struct TestHandler;
-
-    impl Handler for TestHandler {
-        fn process_query(
-            &mut self,
-            _pgb: &mut PostgresBackend,
-            _query_string: &str,
-        ) -> Result<(), QueryError> {
-            panic!()
-        }
-    }
-
-    let mut handler = TestHandler;
-
-    let pgb = PostgresBackend::new(server_sock, AuthType::Trust, None, true).unwrap();
-    pgb.run(&mut handler).unwrap();
-
-    client_jh.join().unwrap();
-}
-
-#[test]
-fn server_forces_ssl() {
-    let (mut client_sock, server_sock) = make_tcp_pair();
-
-    let client_jh = std::thread::spawn(move || {
-        // StartupMessage
-        client_sock.write_u32::<BigEndian>(9).unwrap();
-        client_sock.write_u32::<BigEndian>(196608).unwrap();
-        client_sock.write_u8(0).unwrap();
-        client_sock.flush().unwrap();
-
-        // ErrorResponse
-        assert_eq!(client_sock.read_u8().unwrap(), b'E');
-        let len = client_sock.read_u32::<BigEndian>().unwrap() - 4;
-
-        let mut body = vec![0; len as usize];
-        client_sock.read_exact(&mut body).unwrap();
-        let mut body = Bytes::from(body);
-
-        let mut errors = HashMap::new();
-        loop {
-            let field_type = body.get_u8();
-            if field_type == 0u8 {
-                break;
-            }
-
-            let end_idx = body.iter().position(|&b| b == 0u8).unwrap();
-            let mut value = body.split_to(end_idx + 1);
-            assert_eq!(value[end_idx], 0u8);
-            value.truncate(end_idx);
-            let old = errors.insert(field_type, value);
-            assert!(old.is_none());
-        }
-
-        assert!(!body.has_remaining());
-
-        assert_eq!("must connect with TLS", errors.get(&b'M').unwrap());
-
-        // TODO read failure
-    });
-
-    struct TestHandler;
-    impl Handler for TestHandler {
-        fn process_query(
-            &mut self,
-            _pgb: &mut PostgresBackend,
-            _query_string: &str,
-        ) -> Result<(), QueryError> {
-            panic!()
-        }
-    }
-    let mut handler = TestHandler;
-
-    let cfg = rustls::ServerConfig::builder()
-        .with_safe_defaults()
-        .with_no_client_auth()
-        .with_single_cert(vec![CERT.clone()], KEY.clone())
-        .unwrap();
-    let tls_config = Some(Arc::new(cfg));
-
-    let pgb = PostgresBackend::new(server_sock, AuthType::Trust, tls_config, true).unwrap();
-    let res = pgb.run(&mut handler).unwrap_err();
-    assert_eq!("client did not connect with TLS", format!("{}", res));
-
-    client_jh.join().unwrap();
-
-    // TODO consider shutdown behavior
-}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,6 +37,7 @@ num-traits.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
 postgres.workspace = true
+postgres_backend.workspace = true
 postgres-protocol.workspace = true
 postgres-types.workspace = true
 rand.workspace = true
@@ -47,6 +48,7 @@ serde_json = { workspace = true, features = ["raw_value"] }
 serde_with.workspace = true
 signal-hook.workspace = true
 svg_fmt.workspace = true
+sync_wrapper.workspace = true
 tokio-tar.workspace = true
 thiserror.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
@@ -54,6 +56,7 @@ tokio-postgres.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+ubyte = { version = "0.10.3", features = ["serde"] }
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,6 +8,7 @@ use anyhow::{anyhow, Context};
 use clap::{Arg, ArgAction, Command};
 use fail::FailScenario;
 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
+use pageserver::disk_usage_eviction_task::launch_disk_usage_global_eviction_task;
 use remote_storage::GenericRemoteStorage;
 use tracing::*;

@@ -23,11 +24,10 @@ use pageserver::{
    tenant::mgr,
    virtual_file,
 };
+use postgres_backend::AuthType;
 use utils::{
    auth::JwtAuth,
-    logging,
-    postgres_backend::AuthType,
-    project_git_version,
+    logging, project_git_version,
    sentry_init::init_sentry,
    signals::{self, Signal},
    tcp_listener,
@@ -271,43 +271,43 @@ fn start_pageserver(
    WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;

    // Initialize authentication for incoming connections
-    let auth = match &conf.auth_type {
-        AuthType::Trust => None,
-        AuthType::NeonJWT => {
-            // unwrap is ok because check is performed when creating config, so path is set and file exists
-            let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-            Some(JwtAuth::from_key_path(key_path)?.into())
-        }
-    };
-    info!("Using auth: {:#?}", conf.auth_type);
+    let http_auth;
+    let pg_auth;
+    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
+        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
+        info!(
+            "Loading public key for verifying JWT tokens from {:#?}",
+            key_path
+        );
+        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);

-    // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
-    match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
-        (old, Ok(v)) => {
+        http_auth = match &conf.http_auth_type {
+            AuthType::Trust => None,
+            AuthType::NeonJWT => Some(auth.clone()),
+        };
+        pg_auth = match &conf.pg_auth_type {
+            AuthType::Trust => None,
+            AuthType::NeonJWT => Some(auth),
+        };
+    } else {
+        http_auth = None;
+        pg_auth = None;
+    }
+    info!("Using auth for http API: {:#?}", conf.http_auth_type);
+    info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
+
+    match var("NEON_AUTH_TOKEN") {
+        Ok(v) => {
            info!("Loaded JWT token for authentication with Safekeeper");
-            if let Ok(v_old) = old {
-                warn!(
-                    "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
-                );
-                if v_old != v {
-                    warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
-                }
-            }
            pageserver::config::SAFEKEEPER_AUTH_TOKEN
                .set(Arc::new(v))
                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
        }
-        (Ok(v), _) => {
-            info!("Loaded JWT token for authentication with Safekeeper");
-            warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
-            pageserver::config::SAFEKEEPER_AUTH_TOKEN
-                .set(Arc::new(v))
-                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
-        }
-        (_, Err(VarError::NotPresent)) => {
+        Err(VarError::NotPresent) => {
            info!("No JWT token for authentication with Safekeeper detected");
        }
-        (_, Err(e)) => {
+        Err(e) => {
            return Err(e).with_context(|| {
                "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
            })
@@ -320,12 +320,16 @@ fn start_pageserver(
    // Scan the local 'tenants/' directory and start loading the tenants
    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?;

+    if let Some(remote_storage) = &remote_storage {
+        launch_disk_usage_global_eviction_task(conf, remote_storage.clone())?;
+    }
+
    // Start up the service to handle HTTP mgmt API request. We created the
    // listener earlier already.
    {
        let _rt_guard = MGMT_REQUEST_RUNTIME.enter();

-        let router = http::make_router(conf, launch_ts, auth.clone(), remote_storage)?
+        let router = http::make_router(conf, launch_ts, http_auth, remote_storage)?
            .build()
            .map_err(|err| anyhow!(err))?;
        let service = utils::http::RouterService::new(router).unwrap();
@@ -399,9 +403,9 @@ fn start_pageserver(
            async move {
                page_service::libpq_listener_main(
                    conf,
-                    auth,
+                    pg_auth,
                    pageserver_listener,
-                    conf.auth_type,
+                    conf.pg_auth_type,
                    libpq_ctx,
                )
                .await
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -21,12 +21,13 @@ use std::time::Duration;
 use toml_edit;
 use toml_edit::{Document, Item};

+use postgres_backend::AuthType;
 use utils::{
    id::{NodeId, TenantId, TimelineId},
    logging::LogFormat,
-    postgres_backend::AuthType,
 };

+use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
@@ -61,6 +62,7 @@ pub mod defaults {
    pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
    pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
    pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";

    ///
    /// Default built-in configuration file.
@@ -89,6 +91,8 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'

+#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -118,6 +122,9 @@ pub struct PageServerConf {
    /// Example (default): 127.0.0.1:9898
    pub listen_http_addr: String,

+    /// Current availability zone. Used for traffic metrics.
+    pub availability_zone: Option<String>,
+
    // Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
    pub wait_lsn_timeout: Duration,
    // How long to wait for WAL redo to complete.
@@ -138,9 +145,15 @@ pub struct PageServerConf {

    pub pg_distrib_dir: PathBuf,

-    pub auth_type: AuthType,
-
+    // Authentication
+    /// authentication method for the HTTP mgmt API
+    pub http_auth_type: AuthType,
+    /// authentication method for libpq connections from compute
+    pub pg_auth_type: AuthType,
+    /// Path to a file containing public key for verifying JWT tokens.
+    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<PathBuf>,
+
    pub remote_storage_config: Option<RemoteStorageConfig>,

    pub default_tenant_conf: TenantConf,
@@ -161,6 +174,11 @@ pub struct PageServerConf {
    pub metric_collection_endpoint: Option<Url>,
    pub synthetic_size_calculation_interval: Duration,

+    // See the corresponding metric's help string.
+    pub evictions_low_residence_duration_metric_threshold: Duration,
+
+    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+
    pub test_remote_failures: u64,

    pub ondemand_download_behavior_treat_error_as_warn: bool,
@@ -196,6 +214,8 @@ struct PageServerConfigBuilder {

    listen_http_addr: BuilderValue<String>,

+    availability_zone: BuilderValue<Option<String>>,
+
    wait_lsn_timeout: BuilderValue<Duration>,
    wal_redo_timeout: BuilderValue<Duration>,

@@ -208,7 +228,8 @@ struct PageServerConfigBuilder {

    pg_distrib_dir: BuilderValue<PathBuf>,

-    auth_type: BuilderValue<AuthType>,
+    http_auth_type: BuilderValue<AuthType>,
+    pg_auth_type: BuilderValue<AuthType>,

    //
    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
@@ -228,6 +249,10 @@ struct PageServerConfigBuilder {
    metric_collection_endpoint: BuilderValue<Option<Url>>,
    synthetic_size_calculation_interval: BuilderValue<Duration>,

+    evictions_low_residence_duration_metric_threshold: BuilderValue<Duration>,
+
+    disk_usage_based_eviction: BuilderValue<Option<DiskUsageEvictionTaskConfig>>,
+
    test_remote_failures: BuilderValue<u64>,

    ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
@@ -240,6 +265,7 @@ impl Default for PageServerConfigBuilder {
        Self {
            listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
            listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            availability_zone: Set(None),
            wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
                .expect("cannot parse default wait lsn timeout")),
            wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
@@ -251,7 +277,8 @@ impl Default for PageServerConfigBuilder {
            pg_distrib_dir: Set(env::current_dir()
                .expect("cannot access current directory")
                .join("pg_install")),
-            auth_type: Set(AuthType::Trust),
+            http_auth_type: Set(AuthType::Trust),
+            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
            remote_storage_config: Set(None),
            id: NotSet,
@@ -279,6 +306,13 @@ impl Default for PageServerConfigBuilder {
            .expect("cannot parse default synthetic size calculation interval")),
            metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),

+            evictions_low_residence_duration_metric_threshold: Set(humantime::parse_duration(
+                DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")),
+
+            disk_usage_based_eviction: Set(None),
+
            test_remote_failures: Set(0),

            ondemand_download_behavior_treat_error_as_warn: Set(false),
@@ -295,6 +329,10 @@ impl PageServerConfigBuilder {
        self.listen_http_addr = BuilderValue::Set(listen_http_addr)
    }

+    pub fn availability_zone(&mut self, availability_zone: Option<String>) {
+        self.availability_zone = BuilderValue::Set(availability_zone)
+    }
+
    pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
        self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
    }
@@ -323,8 +361,12 @@ impl PageServerConfigBuilder {
        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
    }

-    pub fn auth_type(&mut self, auth_type: AuthType) {
-        self.auth_type = BuilderValue::Set(auth_type)
+    pub fn http_auth_type(&mut self, auth_type: AuthType) {
+        self.http_auth_type = BuilderValue::Set(auth_type)
+    }
+
+    pub fn pg_auth_type(&mut self, auth_type: AuthType) {
+        self.pg_auth_type = BuilderValue::Set(auth_type)
    }

    pub fn auth_validation_public_key_path(
@@ -386,6 +428,14 @@ impl PageServerConfigBuilder {
        self.test_remote_failures = BuilderValue::Set(fail_first);
    }

+    pub fn evictions_low_residence_duration_metric_threshold(&mut self, value: Duration) {
+        self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value);
+    }
+
+    pub fn disk_usage_based_eviction(&mut self, value: Option<DiskUsageEvictionTaskConfig>) {
+        self.disk_usage_based_eviction = BuilderValue::Set(value);
+    }
+
    pub fn ondemand_download_behavior_treat_error_as_warn(
        &mut self,
        ondemand_download_behavior_treat_error_as_warn: bool,
@@ -402,6 +452,9 @@ impl PageServerConfigBuilder {
            listen_http_addr: self
                .listen_http_addr
                .ok_or(anyhow!("missing listen_http_addr"))?,
+            availability_zone: self
+                .availability_zone
+                .ok_or(anyhow!("missing availability_zone"))?,
            wait_lsn_timeout: self
                .wait_lsn_timeout
                .ok_or(anyhow!("missing wait_lsn_timeout"))?,
@@ -419,7 +472,10 @@ impl PageServerConfigBuilder {
            pg_distrib_dir: self
                .pg_distrib_dir
                .ok_or(anyhow!("missing pg_distrib_dir"))?,
-            auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?,
+            http_auth_type: self
+                .http_auth_type
+                .ok_or(anyhow!("missing http_auth_type"))?,
+            pg_auth_type: self.pg_auth_type.ok_or(anyhow!("missing pg_auth_type"))?,
            auth_validation_public_key_path: self
                .auth_validation_public_key_path
                .ok_or(anyhow!("missing auth_validation_public_key_path"))?,
@@ -453,6 +509,14 @@ impl PageServerConfigBuilder {
            synthetic_size_calculation_interval: self
                .synthetic_size_calculation_interval
                .ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
+            evictions_low_residence_duration_metric_threshold: self
+                .evictions_low_residence_duration_metric_threshold
+                .ok_or(anyhow!(
+                    "missing evictions_low_residence_duration_metric_threshold"
+                ))?,
+            disk_usage_based_eviction: self
+                .disk_usage_based_eviction
+                .ok_or(anyhow!("missing disk_usage_based_eviction"))?,
            test_remote_failures: self
                .test_remote_failures
                .ok_or(anyhow!("missing test_remote_failuers"))?,
@@ -599,6 +663,7 @@ impl PageServerConf {
            match key {
                "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
                "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
+                "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)),
                "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
                "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
                "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
@@ -612,7 +677,8 @@ impl PageServerConf {
                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
                    PathBuf::from(parse_toml_string(key, item)?),
                )),
-                "auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
+                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
+                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
                "remote_storage" => {
                    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
                }
@@ -640,6 +706,13 @@ impl PageServerConf {
                "synthetic_size_calculation_interval" =>
                    builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
                "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
+                "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?),
+                "disk_usage_based_eviction" => {
+                    tracing::info!("disk_usage_based_eviction: {:#?}", &item);
+                    builder.disk_usage_based_eviction(
+                    toml_edit::de::from_item(item.clone())
+                    .context("parse disk_usage_based_eviction")?)
+                },
                "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
@@ -647,7 +720,7 @@ impl PageServerConf {

        let mut conf = builder.build().context("invalid config")?;

-        if conf.auth_type == AuthType::NeonJWT {
+        if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
            let auth_validation_public_key_path = conf
                .auth_validation_public_key_path
                .get_or_insert_with(|| workdir.join("auth_public_key.pem"));
@@ -698,6 +771,12 @@ impl PageServerConf {
                Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
        }

+        if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
+            t_conf.image_creation_threshold = Some(
+                parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
+            );
+        }
+
        if let Some(gc_horizon) = item.get("gc_horizon") {
            t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
        }
@@ -738,6 +817,13 @@ impl PageServerConf {
            );
        }

+        if let Some(item) = item.get("min_resident_size_override") {
+            t_conf.min_resident_size_override = Some(
+                toml_edit::de::from_item(item.clone())
+                    .context("parse min_resident_size_override")?,
+            );
+        }
+
        Ok(t_conf)
    }

@@ -757,10 +843,12 @@ impl PageServerConf {
            max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
            listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+            availability_zone: None,
            superuser: "cloud_admin".to_string(),
            workdir: repo_dir,
            pg_distrib_dir,
-            auth_type: AuthType::Trust,
+            http_auth_type: AuthType::Trust,
+            pg_auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
            remote_storage_config: None,
            default_tenant_conf: TenantConf::default(),
@@ -772,6 +860,11 @@ impl PageServerConf {
            cached_metric_collection_interval: Duration::from_secs(60 * 60),
            metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
            synthetic_size_calculation_interval: Duration::from_secs(60),
+            evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD,
+            )
+            .unwrap(),
+            disk_usage_based_eviction: None,
            test_remote_failures: 0,
            ondemand_download_behavior_treat_error_as_warn: false,
        }
@@ -913,6 +1006,9 @@ metric_collection_interval = '222 s'
 cached_metric_collection_interval = '22200 s'
 metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'
+
+evictions_low_residence_duration_metric_threshold = '444 s'
+
 log_format = 'json'

 "#;
@@ -938,6 +1034,7 @@ log_format = 'json'
                id: NodeId(10),
                listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
                listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
+                availability_zone: None,
                wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
                wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
                superuser: defaults::DEFAULT_SUPERUSER.to_string(),
@@ -945,7 +1042,8 @@ log_format = 'json'
                max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
                workdir,
                pg_distrib_dir,
-                auth_type: AuthType::Trust,
+                http_auth_type: AuthType::Trust,
+                pg_auth_type: AuthType::Trust,
                auth_validation_public_key_path: None,
                remote_storage_config: None,
                default_tenant_conf: TenantConf::default(),
@@ -965,6 +1063,10 @@ log_format = 'json'
                synthetic_size_calculation_interval: humantime::parse_duration(
                    defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
                )?,
+                evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
+                    defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD
+                )?,
+                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
            },
@@ -995,6 +1097,7 @@ log_format = 'json'
                id: NodeId(10),
                listen_pg_addr: "127.0.0.1:64000".to_string(),
                listen_http_addr: "127.0.0.1:9898".to_string(),
+                availability_zone: None,
                wait_lsn_timeout: Duration::from_secs(111),
                wal_redo_timeout: Duration::from_secs(111),
                superuser: "zzzz".to_string(),
@@ -1002,7 +1105,8 @@ log_format = 'json'
                max_file_descriptors: 333,
                workdir,
                pg_distrib_dir,
-                auth_type: AuthType::Trust,
+                http_auth_type: AuthType::Trust,
+                pg_auth_type: AuthType::Trust,
                auth_validation_public_key_path: None,
                remote_storage_config: None,
                default_tenant_conf: TenantConf::default(),
@@ -1014,6 +1118,8 @@ log_format = 'json'
                cached_metric_collection_interval: Duration::from_secs(22200),
                metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
                synthetic_size_calculation_interval: Duration::from_secs(333),
+                evictions_low_residence_duration_metric_threshold: Duration::from_secs(444),
+                disk_usage_based_eviction: None,
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
            },
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -0,0 +1,666 @@
+//! This module implements the pageserver-global disk-usage-based layer eviction task.
+//!
+//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background
+//! loop that evicts layers in response to a shortage of available bytes
+//! in the $repo/tenants directory's filesystem.
+//!
+//! The loop runs periodically at a configurable `period`.
+//!
+//! Each loop iteration uses `statvfs` to determine filesystem-level space usage.
+//! It compares the returned usage data against two different types of thresholds.
+//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds.
+//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration.
+//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds.
+//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further.
+//!
+//! There are two thresholds:
+//! `max_usage_pct` is the relative used space, expressed in percent of the total filesystem space.
+//! If the actual usage is higher, the threshold is exceeded.
+//! `min_avail_bytes` is the absolute available space in bytes.
+//! If the actual available space is lower, the threshold is exceeded.
+//!
+//! The iteration evicts layers in LRU fashion.
+//! It tries first with a reservation of up to `tenant_min_resident_size` bytes of the most recent layers per tenant.
+//! The layers not part of the per-tenant reservation are evicted least-recently-used first until we're below all thresholds.
+//! If the per-tenant-reservation strategy doesn't work out, it falls back to global LRU.
+use std::{
+    collections::HashMap,
+    ops::ControlFlow,
+    sync::{Arc, Mutex},
+    time::Duration,
+};
+
+use anyhow::Context;
+use nix::dir::Dir;
+use remote_storage::GenericRemoteStorage;
+use serde::{Deserialize, Serialize};
+use sync_wrapper::SyncWrapper;
+use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error, info, instrument, warn, Instrument};
+use utils::{approx_accurate::ApproxAccurate, id::TenantId};
+
+use crate::{
+    config::PageServerConf,
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    tenant::{self, LocalLayerInfoForDiskUsageEviction, Timeline},
+};
+
+/// Serde helper: deserializes a `u64` and rejects any value greater than 100.
+fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result<u64, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    use serde::de::Error;
+
+    match <u64 as serde::de::Deserialize>::deserialize(deserializer)? {
+        pct if pct <= 100 => Ok(pct),
+        _ => Err(D::Error::custom("must be an integer between 0 and 100")),
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct DiskUsageEvictionTaskConfig {
+    #[serde(deserialize_with = "deserialize_pct_0_to_100")]
+    pub max_usage_pct: u64,
+    pub min_avail_bytes: u64,
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+}
+
+/// Spawns the pageserver-global disk-usage-based eviction task, if it is configured.
+///
+/// Does nothing (and returns `Ok`) when `conf.disk_usage_based_eviction` is `None`.
+/// Returns an error if the tenants directory cannot be opened; in that case no task
+/// is spawned.
+pub fn launch_disk_usage_global_eviction_task(
+    conf: &'static PageServerConf,
+    storage: GenericRemoteStorage,
+) -> anyhow::Result<()> {
+    let task_config = match &conf.disk_usage_based_eviction {
+        Some(task_config) => task_config,
+        None => {
+            info!("disk usage based eviction task not configured");
+            return Ok(());
+        }
+    };
+
+    // Opened once up front; ownership of the directory handle moves into the task.
+    let tenants_path = conf.tenants_path();
+    let open_flags = nix::fcntl::OFlag::O_DIRECTORY;
+    let tenants_dir_fd = Dir::open(&tenants_path, open_flags, nix::sys::stat::Mode::empty())
+        .with_context(|| format!("open tenants_path {tenants_path:?}"))?;
+
+    info!("launching disk usage based eviction task");
+
+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "disk usage based eviction",
+        false,
+        async move {
+            // Fetch the shutdown token from inside the spawned task.
+            let cancel = task_mgr::shutdown_token();
+            disk_usage_eviction_task(task_config, storage, tenants_dir_fd, cancel).await;
+            info!("disk usage based eviction task finishing");
+            Ok(())
+        },
+    );
+
+    Ok(())
+}
+
+/// Background loop of the disk-usage-based eviction task.
+///
+/// Waits for `random_init_delay`, then runs one eviction iteration per `task_config.period`
+/// until `cancel` is cancelled. A failed iteration is logged and does not end the loop.
+#[instrument(skip_all)]
+async fn disk_usage_eviction_task(
+    task_config: &DiskUsageEvictionTaskConfig,
+    storage: GenericRemoteStorage,
+    tenants_dir_fd: Dir,
+    cancel: CancellationToken,
+) {
+    use crate::tenant::tasks::random_init_delay;
+
+    // nix::dir::Dir is Send but not Sync.
+    // One would think that is sufficient, but rustc complains about the reference to
+    // tenants_dir_fd that we pass to disk_usage_eviction_task_iteration below being held
+    // across the .await: because of the stdlib-enforced axiom
+    //  T: Sync <=> &T: Send
+    // such a reference is not Send unless the referent is Sync.
+    // The solution is to use SyncWrapper, which, by owning the tenants_dir_fd, can impl Sync.
+    let mut tenants_dir_fd = SyncWrapper::new(tenants_dir_fd);
+
+    if random_init_delay(task_config.period, &cancel).await.is_err() {
+        info!("shutting down");
+        return;
+    }
+
+    let mut iteration_no = 0;
+    loop {
+        iteration_no += 1;
+        let iteration_started_at = Instant::now();
+
+        let iteration = async {
+            let res = disk_usage_eviction_task_iteration(
+                task_config,
+                &storage,
+                &mut tenants_dir_fd,
+                &cancel,
+            )
+            .await;
+
+            if let Err(e) = res {
+                // these stat failures are expected to be very rare
+                warn!("iteration failed, unexpected error: {e:#}");
+            }
+        };
+        iteration
+            .instrument(tracing::info_span!("iteration", iteration_no))
+            .await;
+
+        // The next iteration is due one period after this one started, not after it ended.
+        let next_iteration_at = iteration_started_at + task_config.period;
+        tokio::select! {
+            _ = tokio::time::sleep_until(next_iteration_at) => {},
+            _ = cancel.cancelled() => {
+                info!("shutting down");
+                break
+            }
+        }
+    }
+}
+
+pub trait Usage: Clone + Copy + std::fmt::Debug {
+    fn has_pressure(&self) -> bool;
+    fn add_available_bytes(&mut self, bytes: u64);
+}
+
+/// Runs one eviction iteration against the actual filesystem usage and logs the result.
+///
+/// Only `statvfs` failures are returned as errors; a failed eviction run is merely logged.
+async fn disk_usage_eviction_task_iteration(
+    task_config: &DiskUsageEvictionTaskConfig,
+    storage: &GenericRemoteStorage,
+    tenants_dir_fd: &mut SyncWrapper<Dir>,
+    cancel: &CancellationToken,
+) -> anyhow::Result<()> {
+    let usage_pre = filesystem_level_usage::get(tenants_dir_fd, task_config)
+        .context("get filesystem-level disk usage before evictions")?;
+    let res = disk_usage_eviction_task_iteration_impl(storage, usage_pre, cancel).await;
+    let outcome = match res {
+        Ok(outcome) => outcome,
+        Err(e) => {
+            error!("disk_usage_eviction_iteration failed: {:#}", e);
+            return Ok(());
+        }
+    };
+    debug!(?outcome, "disk_usage_eviction_iteration finished");
+
+    let outcome = match outcome {
+        // nothing to verify; the caller decides when to run the next iteration or to stop
+        IterationOutcome::NoPressure | IterationOutcome::Cancelled => return Ok(()),
+        IterationOutcome::Finished(outcome) => outcome,
+    };
+
+    // Verify with statvfs whether we made any real progress.
+    // It's quite unlikely to hit the error here. Keep the code simple and bail out.
+    let after = filesystem_level_usage::get(tenants_dir_fd, task_config)
+        .context("get filesystem-level disk usage after evictions")?;
+    debug!(?after, "disk usage");
+
+    if !after.has_pressure() {
+        info!(?outcome, ?after, "disk usage pressure relieved");
+        return Ok(());
+    }
+
+    // Don't bother doing an out-of-order iteration here now: in practice the task period is in
+    // the tens-of-seconds range, which will cause another iteration to happen soon enough.
+    // TODO: deltas between the three different usages would be helpful, consider MiB, GiB, TiB
+    warn!(?outcome, ?after, "disk usage still high");
+    Ok(())
+}
+
+#[derive(Debug, Serialize)]
+#[allow(clippy::large_enum_variant)]
+pub enum IterationOutcome<U> {
+    NoPressure,
+    Cancelled,
+    Finished(IterationOutcomeFinished<U>),
+}
+
+// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
+// We use the Debug impl for logging, so, it's all right.
+#[allow(dead_code)]
+#[derive(Debug, Serialize)]
+pub struct IterationOutcomeFinished<U> {
+    /// The actual usage observed before we started the iteration.
+    before: U,
+    /// The expected value for `after`, according to internal accounting, after phase 1.
+    planned: PlannedUsage<U>,
+    /// The outcome of phase 2, where we actually do the evictions.
+    ///
+    /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will
+    /// be the same as `planned`.
+    assumed: AssumedUsage<U>,
+}
+
+// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
+// We use the Debug impl for logging, so, it's all right.
+#[derive(Debug, Serialize)]
+#[allow(dead_code)]
+struct AssumedUsage<U> {
+    /// The expected value for `after`, after phase 2.
+    projected_after: U,
+    /// The layers we failed to evict during phase 2.
+    failed: LayerCount,
+}
+
+// The `#[allow(dead_code)]` is to suppress warnings about only the Debug impl reading these fields.
+// We use the Debug impl for logging, so, it's all right.
+#[allow(dead_code)]
+#[derive(Debug, Serialize)]
+struct PlannedUsage<U> {
+    respecting_tenant_min_resident_size: U,
+    fallback_to_global_lru: Option<U>,
+}
+
+#[allow(dead_code)]
+#[derive(Debug, Default, Serialize)]
+struct LayerCount {
+    file_sizes: u64,
+    count: usize,
+}
+
+/// Runs a single iteration of disk-usage-based eviction, starting from `usage_pre`.
+///
+/// Phase 1 plans which layers to evict: first while respecting each tenant's minimum
+/// resident size and, if that would not relieve the pressure, by global LRU across all
+/// tenants. Phase 2 evicts the planned layers, batched by timeline.
+///
+/// Fails if another iteration is already executing or the tenants cannot be listed.
+#[allow(clippy::needless_late_init)]
+pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+    storage: &GenericRemoteStorage,
+    usage_pre: U,
+    cancel: &CancellationToken,
+) -> anyhow::Result<IterationOutcome<U>> {
+    static MUTEX: once_cell::sync::Lazy<tokio::sync::Mutex<()>> =
+        once_cell::sync::Lazy::new(|| tokio::sync::Mutex::new(()));
+
+    let _g = MUTEX
+        .try_lock()
+        .map_err(|_| anyhow::anyhow!("iteration is already executing"))?;
+
+    // planned post-eviction usage
+    let mut usage_planned_min_resident_size_respecting = usage_pre;
+    let mut usage_planned_global_lru = None;
+    // achieved post-eviction usage according to internal accounting
+    let mut usage_assumed = usage_pre;
+
+    debug!(?usage_pre, "disk usage");
+
+    if !usage_pre.has_pressure() {
+        return Ok(IterationOutcome::NoPressure);
+    }
+
+    warn!(?usage_pre, "running disk usage based eviction due to pressure");
+
+    let mut lru_candidates: Vec<(_, LocalLayerInfoForDiskUsageEviction)> = Vec::new();
+
+    // get a snapshot of the list of tenants
+    let tenants = tenant::mgr::list_tenants()
+        .await
+        .context("get list of tenants")?;
+
+    // scratch space for `extend_lru_candidates`; asserted to be empty again after every call
+    let mut tmp = Vec::new();
+    for (tenant_id, _state) in &tenants {
+        let flow = extend_lru_candidates(
+            Mode::RespectTenantMinResidentSize,
+            *tenant_id,
+            &mut lru_candidates,
+            &mut tmp,
+            cancel,
+        )
+        .await;
+
+        if let ControlFlow::Break(()) = flow {
+            return Ok(IterationOutcome::Cancelled);
+        }
+
+        assert!(tmp.is_empty(), "tmp has to be fully drained each iteration");
+    }
+
+    if cancel.is_cancelled() {
+        return Ok(IterationOutcome::Cancelled);
+    }
+
+    // phase1: select victims to relieve pressure
+    lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts);
+    let mut batched: HashMap<_, Vec<LocalLayerInfoForDiskUsageEviction>> = HashMap::new();
+    for (i, (timeline, layer)) in lru_candidates.into_iter().enumerate() {
+        if !usage_planned_min_resident_size_respecting.has_pressure() {
+            debug!(
+                no_candidates_evicted = i,
+                "took enough candidates for pressure to be relieved"
+            );
+            break;
+        }
+
+        usage_planned_min_resident_size_respecting.add_available_bytes(layer.file_size());
+        let key = TimelineKey(timeline);
+        batched.entry(key).or_default().push(layer);
+    }
+    // If we can't relieve pressure while respecting tenant_min_resident_size, fall back to global LRU.
+    if usage_planned_min_resident_size_respecting.has_pressure() {
+        // NB: tests depend on parts of this log message
+        warn!(?usage_pre, ?usage_planned_min_resident_size_respecting, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU");
+        batched.clear();
+        let mut usage_planned = usage_pre;
+        let mut global_lru_candidates = Vec::new();
+        for (tenant_id, _state) in &tenants {
+            let flow = extend_lru_candidates(
+                Mode::GlobalLru,
+                *tenant_id,
+                &mut global_lru_candidates,
+                &mut tmp,
+                cancel,
+            )
+            .await;
+
+            if let ControlFlow::Break(()) = flow {
+                return Ok(IterationOutcome::Cancelled);
+            }
+
+            assert!(tmp.is_empty(), "tmp has to be fully drained each iteration");
+        }
+        global_lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts);
+        for (timeline, layer) in global_lru_candidates {
+            if cancel.is_cancelled() {
+                return Ok(IterationOutcome::Cancelled);
+            }
+            // Take only as many victims as needed; the rest of the layers stay resident.
+            if !usage_planned.has_pressure() {
+                break;
+            }
+            usage_planned.add_available_bytes(layer.file_size());
+            let key = TimelineKey(timeline);
+            batched.entry(key).or_default().push(layer);
+        }
+        usage_planned_global_lru = Some(usage_planned);
+    }
+    let usage_planned = PlannedUsage {
+        respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting,
+        fallback_to_global_lru: usage_planned_global_lru,
+    };
+
+    debug!(?usage_planned, "usage planned");
+
+    // phase2: evict victims batched by timeline
+    let mut batch = Vec::new();
+    let mut evictions_failed = LayerCount::default();
+    for (timeline, layers) in batched {
+        let tenant_id = timeline.tenant_id;
+        let timeline_id = timeline.timeline_id;
+
+        batch.clear();
+        batch.extend(layers.iter().map(|x| &x.layer).cloned());
+        let batch_size = batch.len();
+
+        debug!(%timeline_id, "evicting batch for timeline");
+
+        async {
+            let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
+
+            match results {
+                Err(e) => {
+                    warn!("failed to evict batch: {:#}", e);
+                }
+                Ok(results) => {
+                    assert_eq!(results.len(), layers.len());
+                    for (result, layer) in results.into_iter().zip(layers.iter()) {
+                        match result {
+                            Some(Ok(true)) => {
+                                usage_assumed.add_available_bytes(layer.file_size());
+                            }
+                            Some(Ok(false)) => {
+                                // this is:
+                                // - Replacement::{NotFound, Unexpected}
+                                // - it cannot be is_remote_layer, filtered already
+                                evictions_failed.file_sizes += layer.file_size();
+                                evictions_failed.count += 1;
+                            }
+                            None => {
+                                assert!(cancel.is_cancelled());
+                                return;
+                            }
+                            Some(Err(e)) => {
+                                // we really shouldn't be getting this, precondition failure
+                                error!("failed to evict layer: {:#}", e);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
+        .await;
+
+        if cancel.is_cancelled() {
+            return Ok(IterationOutcome::Cancelled);
+        }
+    }
+
+    Ok(IterationOutcome::Finished(IterationOutcomeFinished {
+        before: usage_pre,
+        planned: usage_planned,
+        assumed: AssumedUsage {
+            projected_after: usage_assumed,
+            failed: evictions_failed,
+        },
+    }))
+}
+
+/// Different modes of gathering tenant's least recently used layers.
+#[derive(Debug)]
+enum Mode {
+    /// Add all but the most recently used `min_resident_size` worth of layers to the candidates
+    /// list.
+    ///
+    /// `min_resident_size` defaults to maximum layer file size of the tenant. This ensures that
+    /// the tenant will always have one layer resident. If we cannot compute `min_resident_size`
+    /// accurately because metadata is missing we use hardcoded constant. `min_resident_size` can
+    /// be overridden per tenant for important tenants.
+    RespectTenantMinResidentSize,
+    /// Consider all layer files from all tenants in LRU order.
+    ///
+    /// This is done if the `min_resident_size` respecting does not relieve pressure.
+    GlobalLru,
+}
+
+/// Collects the eviction candidates of one tenant into `lru_candidates`, according to `mode`.
+///
+/// `scratch` is working space; the caller asserts that it is empty again after each call.
+/// Returns `ControlFlow::Break` once `cancel` is observed to be cancelled. A tenant that
+/// cannot be obtained is skipped with `ControlFlow::Continue`.
+#[instrument(skip_all, fields(?mode, %tenant_id))]
+async fn extend_lru_candidates(
+    mode: Mode,
+    tenant_id: TenantId,
+    lru_candidates: &mut Vec<(Arc<Timeline>, LocalLayerInfoForDiskUsageEviction)>,
+    scratch: &mut Vec<(Arc<Timeline>, LocalLayerInfoForDiskUsageEviction)>,
+    cancel: &CancellationToken,
+) -> ControlFlow<()> {
+    debug!("begin");
+
+    let tenant = match tenant::mgr::get_tenant(tenant_id, true).await {
+        Ok(tenant) => tenant,
+        Err(e) => {
+            // this can happen if tenant has lifecycle transition after we fetched it
+            debug!("failed to get tenant: {e:#}");
+            return ControlFlow::Continue(());
+        }
+    };
+
+    if cancel.is_cancelled() {
+        return ControlFlow::Break(());
+    }
+
+    let mut max_layer_size = ApproxAccurate::default();
+    for tl in tenant.list_timelines() {
+        if !tl.is_active() {
+            continue;
+        }
+        let info = tl.get_local_layers_for_disk_usage_eviction();
+        debug!(timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+        scratch.extend(
+            info.resident_layers
+                .into_iter()
+                .map(|layer_infos| (tl.clone(), layer_infos)),
+        );
+        max_layer_size = max_layer_size.max(info.max_layer_size.accurate());
+
+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+    }
+
+    if let Mode::GlobalLru = mode {
+        // Global LRU makes no per-tenant reservation: every resident layer is a candidate.
+        lru_candidates.append(scratch);
+        return ControlFlow::Continue(());
+    }
+
+    // Mode::RespectTenantMinResidentSize: prefer the tenant's override, then the accurate maximum.
+    let min_resident_size = match tenant.get_min_resident_size_override() {
+        Some(size) => size,
+        None => match max_layer_size.accurate() {
+            Some(size) => size,
+            None => {
+                let prod_max_layer_file_size = 332_880_000;
+                // rate-limit the warning in case `LayerMetadata` is missing for many layers
+                static LAST_WARNED: Mutex<Option<Instant>> = Mutex::new(None);
+                let mut last_warned = LAST_WARNED.lock().unwrap();
+                if last_warned.map_or(true, |v| v.elapsed() > Duration::from_secs(60)) {
+                    warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value");
+                    *last_warned = Some(Instant::now());
+                }
+                prod_max_layer_file_size
+            }
+        },
+    };
+
+    scratch.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts);
+    // `current` is the number of bytes that stay resident if we stop taking candidates now
+    let mut current: u64 = scratch.iter().map(|(_, layer)| layer.file_size()).sum();
+    for (tl, layer) in scratch.drain(..) {
+        if cancel.is_cancelled() {
+            return ControlFlow::Break(());
+        }
+        if current <= min_resident_size {
+            break;
+        }
+        current -= layer.file_size();
+        debug!(?layer, "adding layer to lru_candidates");
+        lru_candidates.push((tl, layer));
+    }
+
+    ControlFlow::Continue(())
+}
+
+/// Hash-map key that identifies a timeline by the identity (address) of its `Arc` allocation.
+struct TimelineKey(Arc<Timeline>);
+
+impl PartialEq for TimelineKey {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::ptr_eq(&other.0, &self.0)
+    }
+}
+
+impl Eq for TimelineKey {}
+
+impl std::hash::Hash for TimelineKey {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        std::ptr::hash(Arc::as_ptr(&self.0), state);
+    }
+}
+
+impl std::ops::Deref for TimelineKey {
+    type Target = Timeline;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+mod filesystem_level_usage {
+    use anyhow::Context;
+    use nix::{
+        dir::Dir,
+        sys::statvfs::{self, Statvfs},
+    };
+    use sync_wrapper::SyncWrapper;
+
+    use super::DiskUsageEvictionTaskConfig;
+
+    /// Filesystem-level usage as reported by `fstatvfs`, paired with the configured thresholds.
+    // `#[allow(dead_code)]`: some fields are only read by the Debug impl, which we use for logging.
+    #[derive(Debug, Clone, Copy)]
+    #[allow(dead_code)]
+    pub struct Usage<'a> {
+        config: &'a DiskUsageEvictionTaskConfig,
+
+        /// Filesystem capacity
+        total_bytes: u64,
+        /// Free filesystem space
+        avail_bytes: u64,
+    }
+
+    impl super::Usage for Usage<'_> {
+        /// Returns `true` if at least one of the configured thresholds is exceeded.
+        fn has_pressure(&self) -> bool {
+            // Percentage of the filesystem in use; the cast to `u64` drops the fraction.
+            let avail_fraction = (self.avail_bytes as f64) / (self.total_bytes as f64);
+            let usage_pct = (100.0 * (1.0 - avail_fraction)) as u64;
+
+            // Threshold 1: too few bytes available in absolute terms.
+            let below_min_avail = self.avail_bytes < self.config.min_avail_bytes;
+            // Threshold 2: too large a share of the filesystem is in use.
+            let above_max_usage = usage_pct > self.config.max_usage_pct;
+
+            below_min_avail || above_max_usage
+        }
+
+        /// Accounts for `bytes` having been freed by increasing the available space.
+        fn add_available_bytes(&mut self, bytes: u64) {
+            self.avail_bytes += bytes;
+        }
+    }
+
+    /// Queries `fstatvfs` on the tenants directory and converts the block counts to bytes.
+    ///
+    /// `config` is carried along in the result; it is not consulted here.
+    pub fn get<'a>(
+        tenants_dir_fd: &mut SyncWrapper<Dir>,
+        config: &'a DiskUsageEvictionTaskConfig,
+    ) -> anyhow::Result<Usage<'a>> {
+        let stat: Statvfs = statvfs::fstatvfs(tenants_dir_fd.get_mut())
+            .context("statvfs failed, presumably directory got unlinked")?;
+
+        // Block counts are in units of the fragment size; fall back to the block size if the
+        // fragment size is reported as zero. See https://unix.stackexchange.com/a/703650
+        let blocksize = match stat.fragment_size() {
+            0 => stat.block_size(),
+            frsize => frsize,
+        };
+
+        // use blocks_available (f_bavail) since the pageserver runs as an unprivileged user
+        Ok(Usage {
+            config,
+            total_bytes: stat.blocks() * blocksize,
+            avail_bytes: stat.blocks_available() * blocksize,
+        })
+    }
+}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -27,6 +27,35 @@ paths:
                  id:
                    type: integer

+  /v1/disk_usage_eviction/run:
+    put:
+      description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space.
+      security: []
+      requestBody:
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - evict_bytes
+              properties:
+                evict_bytes:
+                  description: Unsigned bytes or a human writable amount of bytes with IEC kibibyte suffixes.
+                  oneOf:
+                    - type: integer
+                    - type: string
+                      pattern: '^[0-9]+ ?(([KMGTP]i)?B)?$'
+      responses:
+        "200":
+          description: |
+            The run completed.
+            This does not necessarily mean that we actually evicted `evict_bytes`.
+            Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`.
+          content:
+            application/json:
+              schema:
+                type: object
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -245,6 +274,53 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+    put:
+      description: Garbage collect given timeline
+      responses:
+        "200":
+          description: OK
+          content:
+            application/json:
+              schema:
+                type: string
+        "400":
+          description: Error when no tenant id found in path, no timeline id or invalid timestamp
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
  /v1/tenant/{tenant_id}/attach:
    parameters:
      - name: tenant_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,6 +10,7 @@ use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::http::endpoint::RequestSpan;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};

 use super::models::{
@@ -20,7 +21,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::mgr::TenantMapInsertError;
+use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{PageReconstructError, Timeline};
@@ -81,38 +82,52 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
    get_state(request).conf
 }

+/// Check that the requester is authorized to operate on given tenant
 fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Result<(), ApiError> {
    check_permission_with(request, |claims| {
        crate::auth::check_permission(claims, tenant_id)
    })
 }

-fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
-    match err {
-        PageReconstructError::Other(err) => ApiError::InternalServerError(err),
-        PageReconstructError::NeedsDownload(_, _) => {
-            // This shouldn't happen, because we use a RequestContext that requests to
-            // download any missing layer files on-demand.
-            ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
-        }
-        PageReconstructError::Cancelled => {
-            ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
-        }
-        PageReconstructError::WalRedo(err) => {
-            ApiError::InternalServerError(anyhow::Error::new(err))
+impl From<PageReconstructError> for ApiError {
+    fn from(pre: PageReconstructError) -> ApiError {
+        match pre {
+            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::NeedsDownload(_, _) => {
+                // This shouldn't happen, because we use a RequestContext that requests to
+                // download any missing layer files on-demand.
+                ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
+            }
+            PageReconstructError::Cancelled => {
+                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
+            }
+            PageReconstructError::WalRedo(pre) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
+            }
        }
    }
 }

-fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError {
-    match e {
-        TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-            ApiError::InternalServerError(anyhow::Error::new(e))
+impl From<TenantMapInsertError> for ApiError {
+    fn from(tmie: TenantMapInsertError) -> ApiError {
+        match tmie {
+            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+                ApiError::InternalServerError(anyhow::Error::new(tmie))
+            }
+            TenantMapInsertError::TenantAlreadyExists(id, state) => {
+                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+            }
+            TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
        }
-        TenantMapInsertError::TenantAlreadyExists(id, state) => {
-            ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+    }
+}
+
+impl From<TenantStateError> for ApiError {
+    fn from(tse: TenantStateError) -> ApiError {
+        match tse {
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
+            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
-        TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
    }
 }

@@ -216,9 +231,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);

-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
    match tenant.create_timeline(
        new_timeline_id,
        request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -248,9 +261,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)
-            .await
-            .map_err(ApiError::NotFound)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -266,7 +277,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,

            response_data.push(timeline_info);
        }
-        Ok(response_data)
+        Ok::<Vec<TimelineInfo>, ApiError>(response_data)
    }
    .instrument(info_span!("timeline_list", tenant = %tenant_id))
    .await?;
@@ -285,9 +296,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)
-            .await
-            .map_err(ApiError::NotFound)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -323,10 +332,7 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    let result = timeline
-        .find_lsn_for_timestamp(timestamp_pg, &ctx)
-        .await
-        .map_err(apierror_from_prerror)?;
+    let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;

    let result = match result {
        LsnForTimestamp::Present(lsn) => format!("{lsn}"),
@@ -351,8 +357,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
    if let Some(remote_storage) = &state.remote_storage {
        mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx)
            .instrument(info_span!("tenant_attach", tenant = %tenant_id))
-            .await
-            .map_err(apierror_from_tenant_map_insert_error)?;
+            .await?;
    } else {
        return Err(ApiError::BadRequest(anyhow!(
            "attach_tenant is not possible because pageserver was configured without remote storage"
@@ -371,11 +376,7 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body

    mgr::delete_timeline(tenant_id, timeline_id, &ctx)
        .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
-        .await
-        // FIXME: Errors from `delete_timeline` can occur for a number of reasons, incuding both
-        // user and internal errors. Replace this with better handling once the error type permits
-        // it.
-        .map_err(ApiError::InternalServerError)?;
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -388,10 +389,7 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
    let conf = state.conf;
    mgr::detach_tenant(conf, tenant_id)
        .instrument(info_span!("tenant_detach", tenant = %tenant_id))
-        .await
-        // FIXME: Errors from `detach_tenant` can be caused by both both user and internal errors.
-        // Replace this with better handling once the error type permits it.
-        .map_err(ApiError::InternalServerError)?;
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -405,8 +403,7 @@ async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, A
    let state = get_state(&request);
    mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx)
        .instrument(info_span!("load", tenant = %tenant_id))
-        .await
-        .map_err(apierror_from_tenant_map_insert_error)?;
+        .await?;

    json_response(StatusCode::ACCEPTED, ())
 }
@@ -419,10 +416,7 @@ async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>,
    let conf = state.conf;
    mgr::ignore_tenant(conf, tenant_id)
        .instrument(info_span!("ignore_tenant", tenant = %tenant_id))
-        .await
-        // FIXME: Errors from `ignore_tenant` can be caused by both both user and internal errors.
-        // Replace this with better handling once the error type permits it.
-        .map_err(ApiError::InternalServerError)?;
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -496,9 +490,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::InternalServerError)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;

    // this can be long operation
    let inputs = tenant
@@ -746,6 +738,16 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        );
    }

+    if let Some(eviction_policy) = request_data.eviction_policy {
+        tenant_conf.eviction_policy = Some(
+            serde_json::from_value(eviction_policy)
+                .context("parse field `eviction_policy`")
+                .map_err(ApiError::BadRequest)?,
+        );
+    }
+
+    tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
    let target_tenant_id = request_data
        .new_tenant_id
        .map(TenantId::from)
@@ -761,8 +763,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
        &ctx,
    )
    .instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
-    .await
-    .map_err(apierror_from_tenant_map_insert_error)?;
+    .await?;

    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
@@ -786,9 +787,7 @@ async fn get_tenant_config_handler(request: Request<Body>) -> Result<Response<Bo
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false)
-        .await
-        .map_err(ApiError::NotFound)?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;

    let response = HashMap::from([
        (
@@ -880,13 +879,12 @@ async fn update_tenant_config_handler(
        );
    }

+    tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+
    let state = get_state(&request);
    mgr::set_new_tenant_config(state.conf, tenant_conf, tenant_id)
        .instrument(info_span!("tenant_config", tenant = ?tenant_id))
-        .await
-        // FIXME: `update_tenant_config` can fail because of both user and internal errors.
-        // Replace this `map_err` with better error handling once the type permits it
-        .map_err(ApiError::InternalServerError)?;
+        .await?;

    json_response(StatusCode::OK, ())
 }
@@ -971,19 +969,22 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_id))?;
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        timeline
+            .freeze_and_flush()
+            .await
+            .map_err(ApiError::InternalServerError)?;
+        timeline
+            .compact(&ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
-    timeline
-        .freeze_and_flush()
-        .await
-        .map_err(ApiError::InternalServerError)?;
-    timeline
-        .compact(&ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
+    .await
 }

 async fn timeline_download_remote_layers_handler_post(
@@ -1020,9 +1021,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)
-        .await
-        .map_err(ApiError::NotFound)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(ApiError::NotFound)
@@ -1088,7 +1087,8 @@ pub fn make_router(
            let handler = $handler;
            #[cfg(not(feature = "testing"))]
            let handler = cfg_disabled;
-            handler
+
+            move |r| RequestSpan(handler).handle(r)
        }};
    }

@@ -1096,35 +1096,55 @@ pub fn make_router(
        .data(Arc::new(
            State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
        ))
-        .get("/v1/status", status_handler)
+        .get("/v1/status", |r| RequestSpan(status_handler).handle(r))
        .put(
            "/v1/failpoints",
            testing_api!("manage failpoints", failpoints_handler),
        )
-        .get("/v1/tenant", tenant_list_handler)
-        .post("/v1/tenant", tenant_create_handler)
-        .get("/v1/tenant/:tenant_id", tenant_status)
-        .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
-        .put("/v1/tenant/config", update_tenant_config_handler)
-        .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
-        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
-        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
-        .post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
-        .post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
-        .post("/v1/tenant/:tenant_id/load", tenant_load_handler)
-        .post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id",
-            timeline_detail_handler,
-        )
+        .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
+        .post("/v1/tenant", |r| {
+            RequestSpan(tenant_create_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id", |r| {
+            RequestSpan(tenant_status).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
+            RequestSpan(tenant_size_handler).handle(r)
+        })
+        .put("/v1/tenant/config", |r| {
+            RequestSpan(update_tenant_config_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/config", |r| {
+            RequestSpan(get_tenant_config_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/timeline", |r| {
+            RequestSpan(timeline_list_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/timeline", |r| {
+            RequestSpan(timeline_create_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/attach", |r| {
+            RequestSpan(tenant_attach_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/detach", |r| {
+            RequestSpan(tenant_detach_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/load", |r| {
+            RequestSpan(tenant_load_handler).handle(r)
+        })
+        .post("/v1/tenant/:tenant_id/ignore", |r| {
+            RequestSpan(tenant_ignore_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            RequestSpan(timeline_detail_handler).handle(r)
+        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
-            get_lsn_by_timestamp_handler,
-        )
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
-            timeline_gc_handler,
+            |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
        )
+        .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
+            RequestSpan(timeline_gc_handler).handle(r)
+        })
        .put(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
            testing_api!("run timeline compaction", timeline_compact_handler),
@@ -1135,28 +1155,181 @@ pub fn make_router(
        )
        .post(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            timeline_download_remote_layers_handler_post,
+            |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
        )
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
-            timeline_download_remote_layers_handler_get,
-        )
-        .delete(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id",
-            timeline_delete_handler,
-        )
-        .get(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer",
-            layer_map_info_handler,
+            |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
        )
+        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            RequestSpan(timeline_delete_handler).handle(r)
+        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
+            RequestSpan(layer_map_info_handler).handle(r)
+        })
        .get(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            layer_download_handler,
+            |r| RequestSpan(layer_download_handler).handle(r),
        )
        .delete(
            "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
-            evict_timeline_layer_handler,
+            |r| RequestSpan(evict_timeline_layer_handler).handle(r),
        )
-        .get("/v1/panic", always_panic_handler)
+        .put("/v1/disk_usage_eviction/run", |r| {
+            RequestSpan(disk_usage_eviction_run).handle(r)
+        })
+        .put(
+            "/v1/tenant/:tenant_id/break",
+            testing_api!("set tenant state to broken", handle_tenant_break),
+        )
+        .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
        .any(handler_404))
 }
+
+/// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
+#[cfg(feature = "testing")]
+async fn handle_tenant_break(r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    use std::str::FromStr;
+    let tenant_id = get_request_param(&r, "tenant_id")?;
+    let tenant_id = TenantId::from_str(tenant_id).map_err(|e| ApiError::BadRequest(e.into()))?;
+
+    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+        .await
+        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
+
+    tenant.set_broken("broken from test");
+
+    json_response(StatusCode::OK, ())
+}
+
+async fn disk_usage_eviction_run(mut r: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
+
+    #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)]
+    struct Config {
+        /// How many bytes to evict before reporting that pressure is relieved.
+        #[serde(
+            deserialize_with = "deserialize_bytes",
+            serialize_with = "serialize_bytes"
+        )]
+        evict_bytes: u64,
+    }
+
+    fn serialize_bytes<S: serde::Serializer>(x: &u64, ser: S) -> Result<S::Ok, S::Error> {
+        use ubyte::ByteUnit;
+
+        let x = ByteUnit::from(*x);
+
+        // Serialize via ByteUnit's Display impl: human-readable but lossy (e.g. "92.23MiB")
+        ser.collect_str(&x)
+    }
+
+    fn deserialize_bytes<'d, D: serde::Deserializer<'d>>(des: D) -> Result<u64, D::Error> {
+        struct Visitor;
+
+        impl<'de> serde::de::Visitor<'de> for Visitor {
+            type Value = u64;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                formatter.write_str("positive unsigned number of bytes or positive number of bytes with SI/IEC suffix in a string")
+            }
+
+            fn visit_u64<E>(self, v: u64) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                if v == 0 {
+                    Err(serde::de::Error::invalid_value(
+                        serde::de::Unexpected::Unsigned(v),
+                        &self,
+                    ))
+                } else {
+                    Ok(v)
+                }
+            }
+
+            fn visit_str<E>(self, v: &str) -> std::result::Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                use std::str::FromStr;
+                let bytes = ubyte::ByteUnit::from_str(v).map_err(serde::de::Error::custom)?;
+                let bytes = u64::from(bytes);
+                self.visit_u64(bytes)
+            }
+        }
+
+        des.deserialize_any(Visitor)
+    }
+
+    #[derive(Debug, Clone, Copy, serde::Serialize)]
+    struct Usage {
+        // remains unchanged after instantiation of the struct
+        config: Config,
+        // updated by `add_available_bytes`
+        #[serde(serialize_with = "serialize_bytes")]
+        freed_bytes: u64,
+    }
+
+    impl crate::disk_usage_eviction_task::Usage for Usage {
+        fn has_pressure(&self) -> bool {
+            self.config.evict_bytes > self.freed_bytes
+        }
+
+        fn add_available_bytes(&mut self, bytes: u64) {
+            self.freed_bytes += bytes;
+        }
+    }
+
+    let config = json_request::<Config>(&mut r)
+        .await
+        .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?;
+
+    let usage = Usage {
+        config,
+        freed_bytes: 0,
+    };
+
+    use crate::task_mgr::MGMT_REQUEST_RUNTIME;
+
+    let (tx, rx) = tokio::sync::oneshot::channel();
+
+    let state = get_state(&r);
+
+    let Some(storage) = state.remote_storage.clone() else {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "remote storage not configured, cannot run eviction iteration"
+        )))
+    };
+
+    let cancel = CancellationToken::new();
+    let child_cancel = cancel.clone();
+    let _g = cancel.drop_guard();
+
+    crate::task_mgr::spawn(
+        MGMT_REQUEST_RUNTIME.handle(),
+        TaskKind::DiskUsageEviction,
+        None,
+        None,
+        "ondemand disk usage eviction",
+        false,
+        async move {
+            let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
+                &storage,
+                usage,
+                &child_cancel,
+            )
+            .await;
+
+            info!(?res, "disk_usage_eviction_task_iteration_impl finished");
+
+            let _ = tx.send(res);
+            Ok(())
+        }
+        .in_current_span(),
+    );
+
+    let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, response)
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -4,6 +4,7 @@ pub mod broker_client;
 pub mod config;
 pub mod consumption_metrics;
 pub mod context;
+pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub mod keyspace;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -9,22 +9,18 @@ use once_cell::sync::Lazy;
 use pageserver_api::models::state;
 use utils::id::{TenantId, TimelineId};

-/// Prometheus histogram buckets (in seconds) that capture the majority of
-/// latencies in the microsecond range but also extend far enough up to distinguish
-/// "bad" from "really bad".
-fn get_buckets_for_critical_operations() -> Vec<f64> {
-    let buckets_per_digit = 5;
-    let min_exponent = -6;
-    let max_exponent = 2;
-
-    let mut buckets = vec![];
-    // Compute 10^(exp / buckets_per_digit) instead of 10^(1/buckets_per_digit)^exp
-    // because it's more numerically stable and doesn't result in numbers like 9.999999
-    for exp in (min_exponent * buckets_per_digit)..=(max_exponent * buckets_per_digit) {
-        buckets.push(10_f64.powf(exp as f64 / buckets_per_digit as f64))
-    }
-    buckets
-}
+/// Prometheus histogram buckets (in seconds) for operations in the critical
+/// path. In other words, operations that directly affect the latency of user
+/// queries.
+///
+/// The buckets capture the majority of latencies in the microsecond and
+/// millisecond range but also extend far enough up to distinguish "bad" from
+/// "really bad".
+const CRITICAL_OP_BUCKETS: &[f64] = &[
+    0.000_001, 0.000_010, 0.000_100, // 1 us, 10 us, 100 us
+    0.001_000, 0.010_000, 0.100_000, // 1 ms, 10 ms, 100 ms
+    1.0, 10.0, 100.0, // 1 s, 10 s, 100 s
+];

 // Metrics collected on operations on the storage repository.
 const STORAGE_TIME_OPERATIONS: &[&str] = &[
@@ -55,12 +51,15 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+// Buckets for background operations like compaction, GC, size calculation
+const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
+
 pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_storage_operations_seconds_global",
        "Time spent on storage operations",
        &["operation"],
-        get_buckets_for_critical_operations(),
+        STORAGE_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -71,7 +70,7 @@ static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value",
        &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -90,7 +89,7 @@ static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
        &["tenant_id", "timeline_id"],
-        get_buckets_for_critical_operations(),
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -123,6 +122,22 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_remote_ondemand_downloaded_layers_total",
+        "Total on-demand downloaded layers"
+    )
+    .expect("failed to define a metric")
+});
+
+pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_remote_ondemand_downloaded_bytes_total",
+        "Total bytes of layers on-demand downloaded",
+    )
+    .expect("failed to define a metric")
+});
+
 static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_current_logical_size",
@@ -179,15 +194,101 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_evictions",
+        "Number of layers evicted from the pageserver",
+        &["tenant_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
+static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_evictions_with_low_residence_duration",
+        "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
+         Residence duration is determined using the `residence_duration_data_source`.",
+        &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
+    )
+    .expect("failed to define a metric")
+});
+
+/// Each [`Timeline`]'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
+#[derive(Debug)]
+pub struct EvictionsWithLowResidenceDuration {
+    data_source: &'static str,
+    threshold: Duration,
+    counter: Option<IntCounter>,
+}
+
+pub struct EvictionsWithLowResidenceDurationBuilder {
+    data_source: &'static str,
+    threshold: Duration,
+}
+
+impl EvictionsWithLowResidenceDurationBuilder {
+    pub fn new(data_source: &'static str, threshold: Duration) -> Self {
+        Self {
+            data_source,
+            threshold,
+        }
+    }
+
+    fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
+        let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .get_metric_with_label_values(&[
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
+            ])
+            .unwrap();
+        EvictionsWithLowResidenceDuration {
+            data_source: self.data_source,
+            threshold: self.threshold,
+            counter: Some(counter),
+        }
+    }
+}
+
+impl EvictionsWithLowResidenceDuration {
+    fn threshold_label_value(threshold: Duration) -> String {
+        format!("{}", threshold.as_secs())
+    }
+
+    pub fn observe(&self, observed_value: Duration) {
+        if observed_value < self.threshold { // count only residences shorter than the threshold
+            self.counter
+                .as_ref()
+                .expect("nobody calls this function after `remove_from_vec`")
+                .inc();
+        }
+    }
+
+    // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
+    fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
+        let Some(_counter) = self.counter.take() else {
+            return;
+        };
+        EVICTIONS_WITH_LOW_RESIDENCE_DURATION
+            .remove_label_values(&[
+                tenant_id,
+                timeline_id,
+                self.data_source,
+                &Self::threshold_label_value(self.threshold),
+            ])
+            .expect("we own the metric, no-one else should remove it");
+    }
+}
+
 // Metrics collected on disk IO operations
+//
+// Roughly logarithmic scale.
 const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
-    0.000001, // 1 usec
-    0.00001,  // 10 usec
-    0.0001,   // 100 usec
-    0.001,    // 1 msec
-    0.01,     // 10 msec
-    0.1,      // 100 msec
-    1.0,      // 1 sec
+    0.000030, // 30 usec
+    0.001000, // 1000 usec
+    0.030,    // 30 ms
+    1.000,    // 1000 ms
 ];

 const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
@@ -222,20 +323,12 @@ const SMGR_QUERY_TIME_OPERATIONS: &[&str] = &[
    "get_db_size",
 ];

-const SMGR_QUERY_TIME_BUCKETS: &[f64] = &[
-    0.00001, // 1/100000 s
-    0.0001, 0.00015, 0.0002, 0.00025, 0.0003, 0.00035, 0.0005, 0.00075, // 1/10000 s
-    0.001, 0.0025, 0.005, 0.0075, // 1/1000 s
-    0.01, 0.0125, 0.015, 0.025, 0.05, // 1/100 s
-    0.1,  // 1/10 s
-];
-
 pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling",
        &["smgr_query_type", "tenant_id", "timeline_id"],
-        SMGR_QUERY_TIME_BUCKETS.into()
+        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
 });
@@ -504,10 +597,16 @@ pub struct TimelineMetrics {
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
    pub persistent_bytes_written: IntCounter,
+    pub evictions: IntCounter,
+    pub evictions_with_low_residence_duration: EvictionsWithLowResidenceDuration,
 }

 impl TimelineMetrics {
-    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+    pub fn new(
+        tenant_id: &TenantId,
+        timeline_id: &TimelineId,
+        evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
+    ) -> Self {
        let tenant_id = tenant_id.to_string();
        let timeline_id = timeline_id.to_string();
        let reconstruct_time_histo = RECONSTRUCT_TIME
@@ -544,6 +643,11 @@ impl TimelineMetrics {
        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
            .unwrap();
+        let evictions = EVICTIONS
+            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .unwrap();
+        let evictions_with_low_residence_duration =
+            evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);

        TimelineMetrics {
            tenant_id,
@@ -563,6 +667,8 @@ impl TimelineMetrics {
            current_logical_size_gauge,
            num_persistent_files_created,
            persistent_bytes_written,
+            evictions,
+            evictions_with_low_residence_duration,
        }
    }
 }
@@ -579,7 +685,9 @@ impl Drop for TimelineMetrics {
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
-
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        self.evictions_with_low_residence_duration
+            .remove(tenant_id, timeline_id);
        for op in STORAGE_TIME_OPERATIONS {
            let _ =
                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
@@ -614,7 +722,7 @@ use std::collections::HashMap;
 use std::pin::Pin;
 use std::sync::{Arc, Mutex};
 use std::task::{Context, Poll};
-use std::time::Instant;
+use std::time::{Duration, Instant};

 pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -12,7 +12,7 @@
 use anyhow::Context;
 use bytes::Buf;
 use bytes::Bytes;
-use futures::{Stream, StreamExt};
+use futures::Stream;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -20,7 +20,9 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
-use pq_proto::ConnectionError;
+use postgres_backend::PostgresBackendTCP;
+use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
+use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
 use std::io;
@@ -29,14 +31,13 @@ use std::str;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio_util::io::StreamReader;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
    auth::{Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
    lsn::Lsn,
-    postgres_backend::AuthType,
-    postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError},
    simple_rcu::RcuReadGuard,
 };

@@ -55,7 +56,7 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Bytes>> + '_ {
+fn copyin_stream(pgb: &mut PostgresBackendTCP) -> impl Stream<Item = io::Result<Bytes>> + '_ {
    async_stream::try_stream! {
        loop {
            let msg = tokio::select! {
@@ -64,11 +65,11 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                _ = task_mgr::shutdown_watcher() => {
                    // We were requested to shut down.
                    let msg = format!("pageserver is shutting down");
-                    let _ = pgb.write_message(&BeMessage::ErrorResponse(&msg, None));
+                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None));
                    Err(QueryError::Other(anyhow::anyhow!(msg)))
                }

-                msg = pgb.read_message() => { msg }
+                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
            };

            match msg {
@@ -79,14 +80,16 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                        FeMessage::Sync => continue,
                        FeMessage::Terminate => {
                            let msg = "client terminated connection with Terminate message during COPY";
-                            let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                            pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
+                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                            break;
                        }
                        m => {
                            let msg = format!("unexpected message {m:?}");
-                            pgb.write_message(&BeMessage::ErrorResponse(&msg, None))?;
+                            // error can't happen here, ErrorResponse serialization should be always ok
+                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
                            Err(io::Error::new(io::ErrorKind::Other, msg))?;
                            break;
                        }
@@ -96,22 +99,66 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                }
                Ok(None) => {
                    let msg = "client closed connection during COPY";
-                    let query_error_error = QueryError::Disconnected(ConnectionError::Socket(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                    pgb.write_message(&BeMessage::ErrorResponse(msg, Some(query_error_error.pg_error_code())))?;
+                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                    // error can't happen here, ErrorResponse serialization should be always ok
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
                    pgb.flush().await?;
                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                }
-                Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
                    Err(io_error)?;
                }
                Err(other) => {
-                    Err(io::Error::new(io::ErrorKind::Other, other))?;
+                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
                }
            };
        }
    }
 }

+/// Consume the tail of a tar archive from `reader`.
+///
+/// Tar archives normally finish with two consecutive 512-byte blocks of zeros, and
+/// `tokio_tar` has already consumed the first one. This reads the second block, fails
+/// if it is short or not all zeros, and then drains the reader until end of stream.
+///
+/// XXX: Data found after the EOF marker is currently only reported with a warning.
+/// Perhaps it should be a hard error?
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> {
+    use tokio::io::AsyncReadExt;
+    const BLOCK_LEN: usize = 512;
+    let mut block = [0u8; BLOCK_LEN];
+
+    // Fill the block; a zero-length read means the stream ended early.
+    let mut filled = 0;
+    while filled < BLOCK_LEN {
+        match reader.read(&mut block[filled..]).await? {
+            0 => break,
+            n => filled += n,
+        }
+    }
+    if filled < BLOCK_LEN {
+        anyhow::bail!("incomplete or invalid tar EOF marker");
+    }
+    // A valid marker block consists solely of zero bytes.
+    if block.iter().any(|&byte| byte != 0) {
+        anyhow::bail!("invalid tar EOF marker");
+    }
+
+    // Consume whatever follows the marker, counting it so it can be reported.
+    let mut trailing_bytes = 0;
+    loop {
+        match reader.read(&mut block).await? {
+            0 => break,
+            n => trailing_bytes += n,
+        }
+    }
+    if trailing_bytes > 0 {
+        warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
+    }
+    Ok(())
+}
+
 ///////////////////////////////////////////////////////////////////////////////

 ///
@@ -212,7 +259,7 @@ async fn page_service_conn_main(
            // we've been requested to shut down
            Ok(())
        }
-        Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => {
+        Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
            if is_expected_io_error(&io_error) {
                info!("Postgres client disconnected ({io_error})");
                Ok(())
@@ -286,7 +333,7 @@ impl PageServerHandler {
    #[instrument(skip(self, pgb, ctx))]
    async fn handle_pagerequests(
        &self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackendTCP,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        ctx: RequestContext,
@@ -311,7 +358,7 @@ impl PageServerHandler {
        let timeline = tenant.get_timeline(timeline_id, true)?;

        // switch client to COPYBOTH
-        pgb.write_message(&BeMessage::CopyBothResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
        pgb.flush().await?;

        let metrics = PageRequestMetrics::new(&tenant_id, &timeline_id);
@@ -380,7 +427,7 @@ impl PageServerHandler {
                })
            });

-            pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
+            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
            pgb.flush().await?;
        }
        Ok(())
@@ -390,7 +437,7 @@ impl PageServerHandler {
    #[instrument(skip(self, pgb, ctx))]
    async fn handle_import_basebackup(
        &self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackendTCP,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        base_lsn: Lsn,
@@ -416,22 +463,17 @@ impl PageServerHandler {

        // Import basebackup provided via CopyData
        info!("importing basebackup");
-        pgb.write_message(&BeMessage::CopyInResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
        pgb.flush().await?;

-        let mut copyin_stream = Box::pin(copyin_stream(pgb));
+        let copyin_reader = StreamReader::new(copyin_stream(pgb));
+        tokio::pin!(copyin_reader);
        timeline
-            .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx)
+            .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
            .await?;

-        // Drain the rest of the Copy data
-        let mut bytes_after_tar = 0;
-        while let Some(bytes) = copyin_stream.next().await {
-            bytes_after_tar += bytes?.len();
-        }
-        if bytes_after_tar > 0 {
-            warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive");
-        }
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;

        // TODO check checksum
        // Meanwhile you can verify client-side by taking fullbackup
@@ -446,7 +488,7 @@ impl PageServerHandler {
    #[instrument(skip(self, pgb, ctx))]
    async fn handle_import_wal(
        &self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackendTCP,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        start_lsn: Lsn,
@@ -468,21 +510,15 @@ impl PageServerHandler {

        // Import wal provided via CopyData
        info!("importing wal");
-        pgb.write_message(&BeMessage::CopyInResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
        pgb.flush().await?;
-        let mut copyin_stream = Box::pin(copyin_stream(pgb));
-        let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream);
-        import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?;
+        let copyin_reader = StreamReader::new(copyin_stream(pgb));
+        tokio::pin!(copyin_reader);
+        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

-        // Drain the rest of the Copy data
-        let mut bytes_after_tar = 0;
-        while let Some(bytes) = copyin_stream.next().await {
-            bytes_after_tar += bytes?.len();
-        }
-        if bytes_after_tar > 0 {
-            warn!("ignored {bytes_after_tar} unexpected bytes after the tar archive");
-        }
+        // Read the end of the tar archive.
+        read_tar_eof(copyin_reader).await?;

        // TODO Does it make sense to overshoot?
        if timeline.get_last_record_lsn() < end_lsn {
@@ -657,7 +693,7 @@ impl PageServerHandler {
    #[instrument(skip(self, pgb, ctx))]
    async fn handle_basebackup_request(
        &mut self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackendTCP,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        lsn: Option<Lsn>,
@@ -678,7 +714,7 @@ impl PageServerHandler {
        }

        // switch client to COPYOUT
-        pgb.write_message(&BeMessage::CopyOutResponse)?;
+        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
        pgb.flush().await?;

        // Send a tarball of the latest layer on the timeline
@@ -695,7 +731,7 @@ impl PageServerHandler {
            .await?;
        }

-        pgb.write_message(&BeMessage::CopyDone)?;
+        pgb.write_message_noflush(&BeMessage::CopyDone)?;
        pgb.flush().await?;
        info!("basebackup complete");

@@ -721,10 +757,10 @@ impl PageServerHandler {
 }

 #[async_trait::async_trait]
-impl postgres_backend_async::Handler for PageServerHandler {
+impl postgres_backend::Handler<tokio::net::TcpStream> for PageServerHandler {
    fn check_auth_jwt(
        &mut self,
-        _pgb: &mut PostgresBackend,
+        _pgb: &mut PostgresBackendTCP,
        jwt_response: &[u8],
    ) -> Result<(), QueryError> {
        // this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
@@ -752,7 +788,7 @@ impl postgres_backend_async::Handler for PageServerHandler {

    fn startup(
        &mut self,
-        _pgb: &mut PostgresBackend,
+        _pgb: &mut PostgresBackendTCP,
        _sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        Ok(())
@@ -760,7 +796,7 @@ impl postgres_backend_async::Handler for PageServerHandler {

    async fn process_query(
        &mut self,
-        pgb: &mut PostgresBackend,
+        pgb: &mut PostgresBackendTCP,
        query_string: &str,
    ) -> Result<(), QueryError> {
        let ctx = self.connection_ctx.attached_child();
@@ -812,7 +848,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
            // Check that the timeline exists
            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
                .await?;
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        // return pair of prev_lsn and last_lsn
        else if query_string.starts_with("get_last_record_rlsn ") {
@@ -835,15 +871,15 @@ impl postgres_backend_async::Handler for PageServerHandler {

            let end_of_timeline = timeline.get_last_record_rlsn();

-            pgb.write_message(&BeMessage::RowDescription(&[
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::text_col(b"prev_lsn"),
                RowDescriptor::text_col(b"last_lsn"),
            ]))?
-            .write_message(&BeMessage::DataRow(&[
+            .write_message_noflush(&BeMessage::DataRow(&[
                Some(end_of_timeline.prev.to_string().as_bytes()),
                Some(end_of_timeline.last.to_string().as_bytes()),
            ]))?
-            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        }
        // same as basebackup, but result includes relational data as well
        else if query_string.starts_with("fullbackup ") {
@@ -884,7 +920,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
            // Check that the timeline exists
            self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx)
                .await?;
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("import basebackup ") {
            // Import the `base` section (everything but the wal) of a basebackup.
            // Assumes the tenant already exists on this pageserver.
@@ -929,10 +965,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                )
                .await
            {
-                Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
                Err(e) => {
                    error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
                        &e.to_string(),
                        Some(e.pg_error_code()),
                    ))?
@@ -965,10 +1001,10 @@ impl postgres_backend_async::Handler for PageServerHandler {
                .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx)
                .await
            {
-                Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
+                Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
                Err(e) => {
                    error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
-                    pgb.write_message(&BeMessage::ErrorResponse(
+                    pgb.write_message_noflush(&BeMessage::ErrorResponse(
                        &e.to_string(),
                        Some(e.pg_error_code()),
                    ))?
@@ -977,7 +1013,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
        } else if query_string.to_ascii_lowercase().starts_with("set ") {
            // important because psycopg2 executes "SET datestyle TO 'ISO'"
            // on connect
-            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("show ") {
            // show <tenant_id>
            let (_, params_raw) = query_string.split_at("show ".len());
@@ -993,7 +1029,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
            self.check_permission(Some(tenant_id))?;

            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
-            pgb.write_message(&BeMessage::RowDescription(&[
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
                RowDescriptor::int8_col(b"compaction_target_size"),
@@ -1004,7 +1040,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
                RowDescriptor::int8_col(b"image_creation_threshold"),
                RowDescriptor::int8_col(b"pitr_interval"),
            ]))?
-            .write_message(&BeMessage::DataRow(&[
+            .write_message_noflush(&BeMessage::DataRow(&[
                Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
                Some(
                    tenant
@@ -1027,7 +1063,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
                Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
                Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
            ]))?
-            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
+            .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            return Err(QueryError::Other(anyhow::anyhow!(
                "unknown command {query_string}"
@@ -1055,7 +1091,7 @@ impl From<GetActiveTenantError> for QueryError {
    fn from(e: GetActiveTenantError) -> Self {
        match e {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
-                ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
+                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
            GetActiveTenantError::Other(e) => QueryError::Other(e),
        }
@@ -1071,7 +1107,10 @@ async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
    _ctx: &RequestContext, /* require get a context to support cancellation in the future */
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    let tenant = mgr::get_tenant(tenant_id, false).await?;
+    let tenant = match mgr::get_tenant(tenant_id, false).await {
+        Ok(tenant) => tenant,
+        Err(e) => return Err(GetActiveTenantError::Other(e.into())),
+    };
    let wait_time = Duration::from_secs(30);
    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
        Ok(Ok(())) => Ok(tenant),
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -234,6 +234,9 @@ pub enum TaskKind {
    // Eviction. One per timeline.
    Eviction,

+    /// See [`crate::disk_usage_eviction_task`].
+    DiskUsageEviction,
+
    // Initial logical size calculation
    InitialLogicalSizeCalculation,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,9 +12,7 @@
 //!

 use anyhow::{bail, Context};
-use bytes::Bytes;
 use futures::FutureExt;
-use futures::Stream;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
@@ -96,7 +94,7 @@ mod timeline;

 pub mod size;

-pub use timeline::{PageReconstructError, Timeline};
+pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};

 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -239,14 +237,13 @@ impl UninitializedTimeline<'_> {
    /// Prepares timeline data by loading it from the basebackup archive.
    pub async fn import_basebackup_from_tar(
        self,
-        copyin_stream: &mut (impl Stream<Item = io::Result<Bytes>> + Sync + Send + Unpin),
+        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        let raw_timeline = self.raw_timeline()?;

-        let mut reader = tokio_util::io::StreamReader::new(copyin_stream);
-        import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx)
+        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
            .await
            .context("Failed to import basebackup")?;

@@ -1243,11 +1240,8 @@ impl Tenant {
            "Cannot run GC iteration on inactive tenant"
        );

-        let gc_result = self
-            .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
-            .await;
-
-        gc_result
+        self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
+            .await
    }

    /// Perform one compaction iteration.
@@ -1699,6 +1693,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

+    /// Tenant-level `min_resident_size_override`, else the pageserver default.
+    pub fn get_min_resident_size_override(&self) -> Option<u64> {
+        let fallback = self.conf.default_tenant_conf.min_resident_size_override;
+        let overrides = self.tenant_conf.read().unwrap();
+        overrides.min_resident_size_override.or(fallback)
+    }
+
    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
        *self.tenant_conf.write().unwrap() = new_tenant_conf;
    }
@@ -2768,6 +2769,7 @@ pub mod harness {
                max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
                trace_read_requests: Some(tenant_conf.trace_read_requests),
                eviction_policy: Some(tenant_conf.eviction_policy),
+                min_resident_size_override: tenant_conf.min_resident_size_override,
            }
        }
    }
@@ -3176,6 +3178,44 @@ mod tests {
    }
     */

+    #[tokio::test]
+    async fn test_get_branchpoints_from_an_inactive_timeline() -> anyhow::Result<()> {
+        let (tenant, ctx) =
+            TenantHarness::create("test_get_branchpoints_from_an_inactive_timeline")?
+                .load()
+                .await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?
+            .initialize(&ctx)?;
+        make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
+        // Branch a child timeline off the parent at LSN 0x40.
+        tenant
+            .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx)
+            .await?;
+        let newtline = tenant
+            .get_timeline(NEW_TIMELINE_ID, true)
+            .expect("Should have a local timeline");
+
+        make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
+        // Mark the parent timeline as Broken before running GC on it.
+        tline.set_state(TimelineState::Broken);
+
+        tenant
+            .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
+            .await?;
+        // A read through the child at LSN 0x50 must return the `foo at <Lsn(0x40)>` image.
+        assert_eq!(
+            newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
+            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
+        );
+        // The parent's gc_info must retain exactly one LSN: the branch point 0x40.
+        let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
+        assert_eq!(branchpoints.len(), 1);
+        assert_eq!(branchpoints[0], Lsn(0x40));
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
        let (tenant, ctx) =
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -51,9 +51,6 @@ where
 ///
 /// A "cursor" for efficiently reading multiple pages from a BlockReader
 ///
-/// A cursor caches the last accessed page, allowing for faster access if the
-/// same block is accessed repeatedly.
-///
 /// You can access the last page with `*cursor`. 'read_blk' returns 'self', so
 /// that in many cases you can use a BlockCursor as a drop-in replacement for
 /// the underlying BlockReader. For example:
@@ -73,8 +70,6 @@ where
    R: BlockReader,
 {
    reader: R,
-    /// last accessed page
-    cache: Option<(u32, R::BlockLease)>,
 }

 impl<R> BlockCursor<R>
@@ -82,40 +77,13 @@ where
    R: BlockReader,
 {
    pub fn new(reader: R) -> Self {
-        BlockCursor {
-            reader,
-            cache: None,
-        }
+        BlockCursor { reader }
    }

-    pub fn read_blk(&mut self, blknum: u32) -> Result<&Self, std::io::Error> {
-        // Fast return if this is the same block as before
-        if let Some((cached_blk, _buf)) = &self.cache {
-            if *cached_blk == blknum {
-                return Ok(self);
-            }
-        }
-
-        // Read the block from the underlying reader, and cache it
-        self.cache = None;
-        let buf = self.reader.read_blk(blknum)?;
-        self.cache = Some((blknum, buf));
-
-        Ok(self)
+    pub fn read_blk(&mut self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum)
    }
 }
-
-impl<R> Deref for BlockCursor<R>
-where
-    R: BlockReader,
-{
-    type Target = [u8; PAGE_SZ];
-
-    fn deref(&self) -> &<Self as Deref>::Target {
-        &self.cache.as_ref().unwrap().1
-    }
-}
-
 static NEXT_ID: AtomicU64 = AtomicU64::new(1);

 /// An adapter for reading a (virtual) file using the page cache.
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -92,6 +92,7 @@ pub struct TenantConf {
    pub max_lsn_wal_lag: NonZeroU64,
    pub trace_read_requests: bool,
    pub eviction_policy: EvictionPolicy,
+    pub min_resident_size_override: Option<u64>,
 }

 /// Same as TenantConf, but this struct preserves the information about
@@ -103,6 +104,7 @@ pub struct TenantConfOpt {
    pub checkpoint_distance: Option<u64>,

    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
    #[serde(default)]
    pub checkpoint_timeout: Option<Duration>,

@@ -158,6 +160,10 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub eviction_policy: Option<EvictionPolicy>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(default)]
+    pub min_resident_size_override: Option<u64>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -219,48 +225,9 @@ impl TenantConfOpt {
                .trace_read_requests
                .unwrap_or(global_conf.trace_read_requests),
            eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
-        }
-    }
-
-    pub fn update(&mut self, other: &TenantConfOpt) {
-        if let Some(checkpoint_distance) = other.checkpoint_distance {
-            self.checkpoint_distance = Some(checkpoint_distance);
-        }
-        if let Some(checkpoint_timeout) = other.checkpoint_timeout {
-            self.checkpoint_timeout = Some(checkpoint_timeout);
-        }
-        if let Some(compaction_target_size) = other.compaction_target_size {
-            self.compaction_target_size = Some(compaction_target_size);
-        }
-        if let Some(compaction_period) = other.compaction_period {
-            self.compaction_period = Some(compaction_period);
-        }
-        if let Some(compaction_threshold) = other.compaction_threshold {
-            self.compaction_threshold = Some(compaction_threshold);
-        }
-        if let Some(gc_horizon) = other.gc_horizon {
-            self.gc_horizon = Some(gc_horizon);
-        }
-        if let Some(gc_period) = other.gc_period {
-            self.gc_period = Some(gc_period);
-        }
-        if let Some(image_creation_threshold) = other.image_creation_threshold {
-            self.image_creation_threshold = Some(image_creation_threshold);
-        }
-        if let Some(pitr_interval) = other.pitr_interval {
-            self.pitr_interval = Some(pitr_interval);
-        }
-        if let Some(walreceiver_connect_timeout) = other.walreceiver_connect_timeout {
-            self.walreceiver_connect_timeout = Some(walreceiver_connect_timeout);
-        }
-        if let Some(lagging_wal_timeout) = other.lagging_wal_timeout {
-            self.lagging_wal_timeout = Some(lagging_wal_timeout);
-        }
-        if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
-            self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-        }
-        if let Some(trace_read_requests) = other.trace_read_requests {
-            self.trace_read_requests = Some(trace_read_requests);
+            min_resident_size_override: self
+                .min_resident_size_override
+                .or(global_conf.min_resident_size_override),
        }
    }
 }
@@ -292,6 +259,7 @@ impl Default for TenantConf {
                .expect("cannot parse default max walreceiver Lsn wal lag"),
            trace_read_requests: false,
            eviction_policy: EvictionPolicy::NoEviction,
+            min_resident_size_override: None,
        }
    }
 }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,9 +2,7 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
-use crate::page_cache;
-use crate::page_cache::PAGE_SZ;
-use crate::page_cache::{ReadBufResult, WriteBufResult};
+use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::virtual_file::VirtualFile;
@@ -427,7 +425,6 @@ mod tests {
            let actual = cursor.read_blob(pos)?;
            assert_eq!(actual, expected);
        }
-        drop(cursor);

        // Test a large blob that spans multiple pages
        let mut large_data = Vec::new();
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -154,11 +154,7 @@ where
        expected: &Arc<L>,
        new: Arc<L>,
    ) -> anyhow::Result<Replacement<Arc<L>>> {
-        fail::fail_point!("layermap-replace-notfound", |_| Ok(
-            // this is not what happens if an L0 layer was not found a anyhow error but perhaps
-            // that should be changed. this is good enough to show a replacement failure.
-            Replacement::NotFound
-        ));
+        fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));

        self.layer_map.replace_historic_noflush(expected, new)
    }
@@ -340,12 +336,15 @@ where

        let l0_index = if expected_l0 {
            // find the index in case replace worked, we need to replace that as well
-            Some(
-                self.l0_delta_layers
-                    .iter()
-                    .position(|slot| Self::compare_arced_layers(slot, expected))
-                    .ok_or_else(|| anyhow::anyhow!("existing l0 delta layer was not found"))?,
-            )
+            let pos = self
+                .l0_delta_layers
+                .iter()
+                .position(|slot| Self::compare_arced_layers(slot, expected));
+
+            if pos.is_none() {
+                return Ok(Replacement::NotFound);
+            }
+            pos
        } else {
            None
        };
@@ -804,6 +803,26 @@ mod tests {
            )
        }

+        #[test]
+        fn replacing_missing_l0_is_notfound() {
+            // the original impl had an oversight: a missing L0 layer produced an anyhow::Error,
+            // but anyhow::Error should only be returned for precondition failures.
+
+            let layer = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000053423C21-0000000053424D69";
+            let layer = LayerFileName::from_str(layer).unwrap();
+            let layer = LayerDescriptor::from(layer);
+
+            // same skeleton construction as in l0_delta_layers_updated_scenario below
+            let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
+            let new_version: Arc<dyn Layer> = Arc::new(layer);
+
+            let mut map = LayerMap::default();
+            // nothing is inserted into the map before the replacement is attempted
+            let res = map.batch_update().replace_historic(&not_found, new_version);
+
+            assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
+        }
+
        fn l0_delta_layers_updated_scenario(layer_name: &str, expected_l0: bool) {
            let name = LayerFileName::from_str(layer_name).unwrap();
            let skeleton = LayerDescriptor::from(name);
@@ -813,7 +832,8 @@ mod tests {

            let mut map = LayerMap::default();

-            // two disjoint Arcs in different lifecycle phases.
+            // two disjoint Arcs in different lifecycle phases. even if it seems they must be the
+            // same layer, we use LayerMap::compare_arced_layers as the identity of layers.
            assert!(!LayerMap::compare_arced_layers(&remote, &downloaded));

            let expected_in_counts = (1, usize::from(expected_l0));
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -289,7 +289,7 @@ pub async fn set_new_tenant_config(
    conf: &'static PageServerConf,
    new_tenant_conf: TenantConfOpt,
    tenant_id: TenantId,
-) -> anyhow::Result<()> {
+) -> Result<(), TenantStateError> {
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_id, true).await?;

@@ -306,16 +306,20 @@ pub async fn set_new_tenant_config(

 /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
+pub async fn get_tenant(
+    tenant_id: TenantId,
+    active_only: bool,
+) -> Result<Arc<Tenant>, TenantStateError> {
    let m = TENANTS.read().await;
    let tenant = m
        .get(&tenant_id)
-        .with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
+        .ok_or(TenantStateError::NotFound(tenant_id))?;
    if active_only && !tenant.is_active() {
-        anyhow::bail!(
+        tracing::warn!(
            "Tenant {tenant_id} is not active. Current state: {:?}",
            tenant.current_state()
-        )
+        );
+        Err(TenantStateError::NotActive(tenant_id))
    } else {
        Ok(Arc::clone(tenant))
    }
@@ -325,21 +329,28 @@ pub async fn delete_timeline(
    tenant_id: TenantId,
    timeline_id: TimelineId,
    ctx: &RequestContext,
-) -> anyhow::Result<()> {
-    match get_tenant(tenant_id, true).await {
-        Ok(tenant) => {
-            tenant.delete_timeline(timeline_id, ctx).await?;
-        }
-        Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
-    }
-
+) -> Result<(), TenantStateError> {
+    let tenant = get_tenant(tenant_id, true).await?;
+    tenant.delete_timeline(timeline_id, ctx).await?;
    Ok(())
 }

+#[derive(Debug, thiserror::Error)]
+pub enum TenantStateError {
+    #[error("Tenant {0} not found")]
+    NotFound(TenantId),
+    #[error("Tenant {0} is stopping")]
+    IsStopping(TenantId),
+    #[error("Tenant {0} is not active")]
+    NotActive(TenantId),
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 pub async fn detach_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
-) -> anyhow::Result<()> {
+) -> Result<(), TenantStateError> {
    remove_tenant_from_memory(tenant_id, async {
        let local_tenant_directory = conf.tenant_path(&tenant_id);
        fs::remove_dir_all(&local_tenant_directory)
@@ -379,7 +390,7 @@ pub async fn load_tenant(
 pub async fn ignore_tenant(
    conf: &'static PageServerConf,
    tenant_id: TenantId,
-) -> anyhow::Result<()> {
+) -> Result<(), TenantStateError> {
    remove_tenant_from_memory(tenant_id, async {
        let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
        fs::File::create(&ignore_mark_file)
@@ -489,7 +500,7 @@ where
 async fn remove_tenant_from_memory<V, F>(
    tenant_id: TenantId,
    tenant_cleanup: F,
-) -> anyhow::Result<V>
+) -> Result<V, TenantStateError>
 where
    F: std::future::Future<Output = anyhow::Result<V>>,
 {
@@ -505,11 +516,9 @@ where
                | TenantState::Loading
                | TenantState::Broken
                | TenantState::Active => tenant.set_stopping(),
-                TenantState::Stopping => {
-                    anyhow::bail!("Tenant {tenant_id} is stopping already")
-                }
+                TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)),
            },
-            None => anyhow::bail!("Tenant not found for id {tenant_id}"),
+            None => return Err(TenantStateError::NotFound(tenant_id)),
        }
    }

@@ -532,10 +541,15 @@ where
        Err(e) => {
            let tenants_accessor = TENANTS.read().await;
            match tenants_accessor.get(&tenant_id) {
-                Some(tenant) => tenant.set_broken(&e.to_string()),
-                None => warn!("Tenant {tenant_id} got removed from memory"),
+                Some(tenant) => {
+                    tenant.set_broken(&e.to_string());
+                }
+                None => {
+                    warn!("Tenant {tenant_id} got removed from memory");
+                    return Err(TenantStateError::NotFound(tenant_id));
+                }
            }
-            Err(e)
+            Err(TenantStateError::Other(e))
        }
    }
 }
@@ -555,7 +569,7 @@ pub async fn immediate_gc(
    let tenant = guard
        .get(&tenant_id)
        .map(Arc::clone)
-        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .with_context(|| format!("tenant {tenant_id}"))
        .map_err(ApiError::NotFound)?;

    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
@@ -605,7 +619,7 @@ pub async fn immediate_compact(
    let tenant = guard
        .get(&tenant_id)
        .map(Arc::clone)
-        .with_context(|| format!("Tenant {tenant_id} not found"))
+        .with_context(|| format!("tenant {tenant_id}"))
        .map_err(ApiError::NotFound)?;

    let timeline = tenant
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -218,9 +218,10 @@ use tracing::{debug, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;

-use crate::metrics::RemoteOpFileKind;
-use crate::metrics::RemoteOpKind;
-use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
+use crate::metrics::{
+    MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
+    REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+};
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::{
    config::PageServerConf,
@@ -446,6 +447,10 @@ impl RemoteTimelineClient {
                );
            }
        }
+
+        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
+        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);
+
        Ok(downloaded_size)
    }

--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,11 +6,13 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::path::Path;
+use std::time::Duration;

 use anyhow::{anyhow, Context};
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
-use tracing::{error, info, warn};
+
+use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
@@ -26,6 +28,8 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
    fs::File::open(path).await?.sync_all().await
 }

+static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
+
 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
 /// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
@@ -64,22 +68,28 @@ pub async fn download_layer_file<'a>(
            // TODO: this doesn't use the cached fd for some reason?
            let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
                format!(
-                    "Failed to create a destination file for layer '{}'",
+                    "create a destination file for layer '{}'",
                    temp_file_path.display()
                )
            })
            .map_err(DownloadError::Other)?;
            let mut download = storage.download(&remote_path).await.with_context(|| {
                format!(
-                    "Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
+                    "open a download stream for layer with remote storage path '{remote_path:?}'"
                )
            })
            .map_err(DownloadError::Other)?;
-            let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
-                format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
-            })
-            .map_err(DownloadError::Other)?;
+
+            let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
+                .await
+                .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out  {:?}", e)))?
+                .with_context(|| {
+                    format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
+                })
+                .map_err(DownloadError::Other)?;
+
            Ok((destination_file, bytes_amount))
+
        },
        &format!("download {remote_path:?}"),
    ).await?;
@@ -300,7 +310,7 @@ where
            }
            Err(DownloadError::Other(ref err)) => {
                // Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
-                error!("{description} still failed after {attempts} retries, giving up: {err:?}");
+                warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
                return result;
            }
        }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -121,10 +121,10 @@ struct LayerAccessStatsInner {
 }

 #[derive(Debug, Clone, Copy)]
-pub(super) struct LayerAccessStatFullDetails {
-    pub(super) when: SystemTime,
-    pub(super) task_kind: TaskKind,
-    pub(super) access_kind: LayerAccessKind,
+pub(crate) struct LayerAccessStatFullDetails {
+    pub(crate) when: SystemTime,
+    pub(crate) task_kind: TaskKind,
+    pub(crate) access_kind: LayerAccessKind,
 }

 #[derive(Clone, Copy, strum_macros::EnumString)]
@@ -255,7 +255,7 @@ impl LayerAccessStats {
        ret
    }

-    pub(super) fn most_recent_access_or_residence_event(
+    fn most_recent_access_or_residence_event(
        &self,
    ) -> Either<LayerAccessStatFullDetails, LayerResidenceEvent> {
        let locked = self.0.lock().unwrap();
@@ -268,6 +268,13 @@ impl LayerAccessStats {
            }
        }
    }
+
+    pub(crate) fn latest_activity(&self) -> SystemTime {
+        match self.most_recent_access_or_residence_event() {
+            Either::Left(mra) => mra.when,
+            Either::Right(re) => re.timestamp,
+        }
+    }
 }

 /// Supertrait of the [`Layer`] trait that captures the bare minimum interface
@@ -364,7 +371,7 @@ pub trait PersistentLayer: Layer {
    }

    /// Permanently remove this layer from disk.
-    fn delete(&self) -> Result<()>;
+    fn delete_resident_layer_file(&self) -> Result<()>;

    fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
        None
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -438,7 +438,7 @@ impl PersistentLayer for DeltaLayer {
        ))
    }

-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
        Ok(())
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -252,7 +252,7 @@ impl PersistentLayer for ImageLayer {
        unimplemented!();
    }

-    fn delete(&self) -> Result<()> {
+    fn delete_resident_layer_file(&self) -> Result<()> {
        // delete underlying file
        fs::remove_file(self.path())?;
        Ok(())
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -155,8 +155,8 @@ impl PersistentLayer for RemoteLayer {
        bail!("cannot iterate a remote layer");
    }

-    fn delete(&self) -> Result<()> {
-        Ok(())
+    fn delete_resident_layer_file(&self) -> Result<()> {
+        bail!("remote layer has no layer file");
    }

    fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -13,6 +13,7 @@ use pageserver_api::models::{
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState,
 };
+use remote_storage::GenericRemoteStorage;
 use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -55,6 +56,7 @@ use pageserver_api::reltag::RelTag;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::to_pg_timestamp;
 use utils::{
+    approx_accurate::ApproxAccurate,
    id::{TenantId, TimelineId},
    lsn::{AtomicLsn, Lsn, RecordLsn},
    seqwait::SeqWait,
@@ -662,8 +664,8 @@ impl Timeline {
            // update the index file on next flush iteration too. But it
            // could take a while until that happens.
            //
-            // Additionally, only do this on the terminal round before sleeping.
-            if last_round {
+            // Additionally, only do this once before we return from this function.
+            if last_round || res.is_ok() {
                if let Some(remote_client) = &self.remote_client {
                    remote_client.schedule_index_upload_for_file_changes()?;
                }
@@ -860,7 +862,7 @@ impl Timeline {
            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
            // we want to stay below that with a big margin.  The LSN distance determines how
            // much WAL the safekeepers need to store.
-            if distance >= self.get_checkpoint_distance().into()
+            if distance >= i128::from(self.get_checkpoint_distance())
                || open_layer_size > self.get_checkpoint_distance()
                || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
            {
@@ -974,6 +976,25 @@ impl Timeline {
        }
    }

+    /// Evict a batch of layers.
+    ///
+    /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured."
+    ///
+    /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
+    pub async fn evict_layers(
+        &self,
+        _: &GenericRemoteStorage,
+        layers_to_evict: &[Arc<dyn PersistentLayer>],
+        cancel: CancellationToken,
+    ) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
+        let remote_client = self.remote_client.clone().expect(
+            "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
+        );
+
+        self.evict_layer_batch(&remote_client, layers_to_evict, cancel)
+            .await
+    }
+
    /// Evict multiple layers at once, continuing through errors.
    ///
    /// Try to evict the given `layers_to_evict` by
@@ -1011,6 +1032,15 @@ impl Timeline {
        // now lock out layer removal (compaction, gc, timeline deletion)
        let layer_removal_guard = self.layer_removal_cs.lock().await;

+        {
+            // to avoid racing with detach and delete_timeline
+            let state = self.current_state();
+            anyhow::ensure!(
+                state == TimelineState::Active,
+                "timeline is not active but {state:?}"
+            );
+        }
+
        // start the batch update
        let mut layer_map = self.layers.write().unwrap();
        let mut batch_updates = layer_map.batch_update();
@@ -1044,14 +1074,33 @@ impl Timeline {
        use super::layer_map::Replacement;

        if local_layer.is_remote_layer() {
+            // TODO: consider returning an error here instead of false, which is the same value
+            // that comes out of the match later
            return Ok(false);
        }

-        let layer_metadata = LayerFileMetadata::new(
-            local_layer
-                .file_size()
-                .expect("Local layer should have a file size"),
-        );
+        let layer_file_size = local_layer
+            .file_size()
+            .expect("Local layer should have a file size");
+
+        let local_layer_mtime = local_layer
+            .local_path()
+            .expect("local layer should have a local path")
+            .metadata()
+            .context("get local layer file stat")?
+            .modified()
+            .context("get mtime of layer file")?;
+        let local_layer_residence_duration =
+            match SystemTime::now().duration_since(local_layer_mtime) {
+                Err(e) => {
+                    warn!("layer mtime is in the future: {}", e);
+                    None
+                }
+                Ok(delta) => Some(delta),
+            };
+
+        let layer_metadata = LayerFileMetadata::new(layer_file_size);
+
        let new_remote_layer = Arc::new(match local_layer.filename() {
            LayerFileName::Image(image_name) => RemoteLayer::new_img(
                self.tenant_id,
@@ -1075,14 +1124,29 @@ impl Timeline {

        let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
            Replacement::Replaced { .. } => {
-                let layer_size = local_layer.file_size();
-
-                if let Err(e) = local_layer.delete() {
+                if let Err(e) = local_layer.delete_resident_layer_file() {
                    error!("failed to remove layer file on evict after replacement: {e:#?}");
                }
+                // Always decrement the physical size gauge, even if we failed to delete the file.
+                // Rationale: we already replaced the layer with a remote layer in the layer map,
+                // and any subsequent download_remote_layer will
+                // 1. overwrite the file on disk and
+                // 2. add the downloaded size to the resident size gauge.
+                //
+                // If there is no re-download, and we restart the pageserver, then load_layer_map
+                // will treat the file as a local layer again, count it towards resident size,
+                // and it'll be like the layer removal never happened.
+                // The bump in resident size is perhaps unexpected but overall a robust behavior.
+                self.metrics
+                    .resident_physical_size_gauge
+                    .sub(layer_file_size);

-                if let Some(layer_size) = layer_size {
-                    self.metrics.resident_physical_size_gauge.sub(layer_size);
+                self.metrics.evictions.inc();
+
+                if let Some(delta) = local_layer_residence_duration {
+                    self.metrics
+                        .evictions_with_low_residence_duration
+                        .observe(delta);
                }

                true
@@ -1104,6 +1168,8 @@ impl Timeline {
            }
        };

+        // TODO: update metrics for how
+
        Ok(replaced)
    }
 }
@@ -1200,7 +1266,14 @@ impl Timeline {
                ancestor_timeline: ancestor,
                ancestor_lsn: metadata.ancestor_lsn(),

-                metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
+                metrics: TimelineMetrics::new(
+                    &tenant_id,
+                    &timeline_id,
+                    crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                        "mtime",
+                        conf.evictions_low_residence_duration_metric_threshold,
+                    ),
+                ),

                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

@@ -1327,6 +1400,7 @@ impl Timeline {
            lagging_wal_timeout,
            max_lsn_wal_lag,
            crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
+            self.conf.availability_zone.clone(),
            background_ctx,
        );
    }
@@ -1942,11 +2016,14 @@ impl Timeline {
        layer: Arc<dyn PersistentLayer>,
        updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
    ) -> anyhow::Result<()> {
-        let layer_size = layer.file_size();
-
-        layer.delete()?;
-        if let Some(layer_size) = layer_size {
-            self.metrics.resident_physical_size_gauge.sub(layer_size);
+        if !layer.is_remote_layer() {
+            layer.delete_resident_layer_file()?;
+            let layer_file_size = layer
+                .file_size()
+                .expect("Local layer should have a file size");
+            self.metrics
+                .resident_physical_size_gauge
+                .sub(layer_file_size);
        }

        // TODO Removing from the bottom of the layer map is expensive.
@@ -2704,10 +2781,22 @@ impl Timeline {
    ) -> Result<HashMap<LayerFileName, LayerFileMetadata>, PageReconstructError> {
        let timer = self.metrics.create_images_time_histo.start_timer();
        let mut image_layers: Vec<ImageLayer> = Vec::new();
+
+        // We need to avoid holes between generated image layers.
+        // Otherwise LayerMap::image_layer_exists will return false if the key range of some layer is covered by
+        // more than one image layer with a hole between them. In that case the layer cannot be utilized by GC.
+        //
+        // How can such a hole between partitions appear?
+        // If we have a relation with relid=1 and size 100 and a relation with relid=2 and size 200, then the result
+        // of KeySpace::partition may contain partitions <100000000..100000099> and <200000000..200000199>.
+        // If there is a delta layer <100000000..300000000>, then it will never be garbage collected, because the
+        // image layers <100000000..100000099> and <200000000..200000199> do not completely cover it.
+        let mut start = Key::MIN;
+
        for partition in partitioning.parts.iter() {
+            let img_range = start..partition.ranges.last().unwrap().end;
+            start = img_range.end;
            if force || self.time_for_new_image_layer(partition, lsn)? {
-                let img_range =
-                    partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
                let mut image_layer_writer = ImageLayerWriter::new(
                    self.conf,
                    self.timeline_id,
@@ -2721,7 +2810,6 @@ impl Timeline {
                        "failpoint image-layer-writer-fail-before-finish"
                    )))
                });
-
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
@@ -3136,9 +3224,7 @@ impl Timeline {
            }

            fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                return Err(
-                    anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(),
-                );
+                Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
            });

            writer.as_mut().unwrap().put_value(key, lsn, value)?;
@@ -3808,7 +3894,7 @@ impl Timeline {
                    remote_layer.ongoing_download.close();
                } else {
                    // Keep semaphore open. We'll drop the permit at the end of the function.
-                    info!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
+                    error!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
                }

                // Don't treat it as an error if the task that triggered the download
@@ -3962,6 +4048,68 @@ impl Timeline {
    }
 }

+pub struct DiskUsageEvictionInfo {
+    /// Timeline's largest layer (remote or resident)
+    pub max_layer_size: ApproxAccurate<u64>,
+    /// Timeline's resident layers
+    pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
+}
+
+pub struct LocalLayerInfoForDiskUsageEviction {
+    pub layer: Arc<dyn PersistentLayer>,
+    pub last_activity_ts: SystemTime,
+}
+
+impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Render the timestamp as an RFC 3339 UTC string so it is readable in debug output.
+        // Allocating a String here is unfortunate, but this is rarely formatted.
+        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
+        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
+        f.debug_struct("LocalLayerInfoForDiskUsageEviction")
+            .field("layer", &self.layer)
+            .field("last_activity", &ts)
+            .finish()
+    }
+}
+
+impl LocalLayerInfoForDiskUsageEviction {
+    pub fn file_size(&self) -> u64 {
+        self.layer
+            .file_size()
+            .expect("we know this is a local layer")
+    }
+}
+
+impl Timeline {
+    pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
+        let layers = self.layers.read().unwrap();
+
+        let mut max_layer_size = ApproxAccurate::default();
+        let mut resident_layers = Vec::new();
+
+        for l in layers.iter_historic_layers() {
+            max_layer_size = max_layer_size.max(l.file_size());
+
+            if l.is_remote_layer() {
+                continue;
+            }
+
+            let last_activity_ts = l.access_stats().latest_activity();
+
+            resident_layers.push(LocalLayerInfoForDiskUsageEviction {
+                layer: l,
+                last_activity_ts,
+            });
+        }
+
+        DiskUsageEvictionInfo {
+            max_layer_size,
+            resident_layers,
+        }
+    }
+}
+
 type TraversalPathItem = (
    ValueReconstructResult,
    Lsn,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -6,7 +6,6 @@ use std::{
    time::{Duration, SystemTime},
 };

-use either::Either;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn};
@@ -126,13 +125,7 @@ impl Timeline {
                if hist_layer.is_remote_layer() {
                    continue;
                }
-                let last_activity_ts = match hist_layer
-                    .access_stats()
-                    .most_recent_access_or_residence_event()
-                {
-                    Either::Left(mra) => mra.when,
-                    Either::Right(re) => re.timestamp,
-                };
+                let last_activity_ts = hist_layer.access_stats().latest_activity();
                let no_activity_for = match now.duration_since(last_activity_ts) {
                    Ok(d) => d,
                    Err(_e) => {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -45,6 +45,7 @@ pub fn spawn_connection_manager_task(
    lagging_wal_timeout: Duration,
    max_lsn_wal_lag: NonZeroU64,
    auth_token: Option<Arc<String>>,
+    availability_zone: Option<String>,
    ctx: RequestContext,
 ) {
    let mut broker_client = get_broker_client().clone();
@@ -67,6 +68,7 @@ pub fn spawn_connection_manager_task(
                lagging_wal_timeout,
                max_lsn_wal_lag,
                auth_token,
+                availability_zone,
            );
            loop {
                select! {
@@ -334,6 +336,7 @@ struct WalreceiverState {
    /// Data about all timelines, available for connection, fetched from storage broker, grouped by their corresponding safekeeper node id.
    wal_stream_candidates: HashMap<NodeId, BrokerSkTimeline>,
    auth_token: Option<Arc<String>>,
+    availability_zone: Option<String>,
 }

 /// Current connection data.
@@ -381,6 +384,7 @@ impl WalreceiverState {
        lagging_wal_timeout: Duration,
        max_lsn_wal_lag: NonZeroU64,
        auth_token: Option<Arc<String>>,
+        availability_zone: Option<String>,
    ) -> Self {
        let id = TenantTimelineId {
            tenant_id: timeline.tenant_id,
@@ -396,6 +400,7 @@ impl WalreceiverState {
            wal_stream_candidates: HashMap::new(),
            wal_connection_retries: HashMap::new(),
            auth_token,
+            availability_zone,
        }
    }

@@ -740,6 +745,7 @@ impl WalreceiverState {
                        None => None,
                        Some(x) => Some(x),
                    },
+                    self.availability_zone.as_deref(),
                ) {
                    Ok(connstr) => Some((*sk_id, info, connstr)),
                    Err(e) => {
@@ -824,17 +830,24 @@ fn wal_stream_connection_config(
    }: TenantTimelineId,
    listen_pg_addr_str: &str,
    auth_token: Option<&str>,
+    availability_zone: Option<&str>,
 ) -> anyhow::Result<PgConnectionConfig> {
    let (host, port) =
        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
    let port = port.unwrap_or(5432);
-    Ok(PgConnectionConfig::new_host_port(host, port)
+    let mut connstr = PgConnectionConfig::new_host_port(host, port)
        .extend_options([
            "-c".to_owned(),
            format!("timeline_id={}", timeline_id),
            format!("tenant_id={}", tenant_id),
        ])
-        .set_password(auth_token.map(|s| s.to_owned())))
+        .set_password(auth_token.map(|s| s.to_owned()));
+
+    if let Some(availability_zone) = availability_zone {
+        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
+    }
+
+    Ok(connstr)
 }

 #[cfg(test)]
@@ -1273,6 +1286,7 @@ mod tests {
            wal_stream_candidates: HashMap::new(),
            wal_connection_retries: HashMap::new(),
            auth_token: None,
+            availability_zone: None,
        }
    }
 }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -33,10 +33,11 @@ use crate::{
    walingest::WalIngest,
    walrecord::DecodedWALRecord,
 };
+use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use pq_proto::ReplicationFeedback;
-use utils::{lsn::Lsn, postgres_backend_async::is_expected_io_error};
+use utils::lsn::Lsn;

 /// Status of the connection.
 #[derive(Debug, Clone, Copy)]
@@ -353,7 +354,7 @@ pub async fn handle_walreceiver_connection(
            debug!("neon_status_update {status_update:?}");

            let mut data = BytesMut::new();
-            status_update.serialize(&mut data)?;
+            status_update.serialize(&mut data);
            physical_stream
                .as_mut()
                .zenith_status_update(data.len() as u64, &data)
@@ -434,8 +435,8 @@ fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result<postgres:
    {
        return Ok(pg_error);
    } else if let Some(db_error) = pg_error.as_db_error() {
-        if db_error.code() == &SqlState::CONNECTION_FAILURE
-            && db_error.message().contains("end streaming")
+        if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION
+            && db_error.message().contains("ending streaming")
        {
            return Ok(pg_error);
        }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -23,13 +23,11 @@ use bytes::{BufMut, Bytes, BytesMut};
 use nix::poll::*;
 use serde::Serialize;
 use std::collections::VecDeque;
-use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
 use std::ops::{Deref, DerefMut};
 use std::os::unix::io::{AsRawFd, RawFd};
 use std::os::unix::prelude::CommandExt;
-use std::path::PathBuf;
 use std::process::Stdio;
 use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
 use std::sync::{Mutex, MutexGuard};
@@ -256,52 +254,53 @@ impl PostgresRedoManager {
        pg_version: u32,
    ) -> Result<Bytes, WalRedoError> {
        let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
-
+        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let start_time = Instant::now();
+        let mut n_attempts = 0u32;
+        loop {
+            let mut proc = self.stdin.lock().unwrap();
+            let lock_time = Instant::now();

-        let mut proc = self.stdin.lock().unwrap();
-        let lock_time = Instant::now();
+            // launch the WAL redo process on first use
+            if proc.is_none() {
+                self.launch(&mut proc, pg_version)?;
+            }
+            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());

-        // launch the WAL redo process on first use
-        if proc.is_none() {
-            self.launch(&mut proc, pg_version)?;
-        }
-        WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+            // Relational WAL records are applied using wal-redo-postgres
+            let buf_tag = BufferTag { rel, blknum };
+            let result = self
+                .apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
+                .map_err(WalRedoError::IoError);

-        // Relational WAL records are applied using wal-redo-postgres
-        let buf_tag = BufferTag { rel, blknum };
-        let result = self
-            .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout)
-            .map_err(WalRedoError::IoError);
+            let end_time = Instant::now();
+            let duration = end_time.duration_since(lock_time);

-        let end_time = Instant::now();
-        let duration = end_time.duration_since(lock_time);
+            let len = records.len();
+            let nbytes = records.iter().fold(0, |acumulator, record| {
+                acumulator
+                    + match &record.1 {
+                        NeonWalRecord::Postgres { rec, .. } => rec.len(),
+                        _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
+                    }
+            });

-        let len = records.len();
-        let nbytes = records.iter().fold(0, |acumulator, record| {
-            acumulator
-                + match &record.1 {
-                    NeonWalRecord::Postgres { rec, .. } => rec.len(),
-                    _ => unreachable!("Only PostgreSQL records are accepted in this batch"),
-                }
-        });
+            WAL_REDO_TIME.observe(duration.as_secs_f64());
+            WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
+            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

-        WAL_REDO_TIME.observe(duration.as_secs_f64());
-        WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
-        WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
+            debug!(
+				"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+				len,
+				nbytes,
+				duration.as_micros(),
+				lsn
+			);

-        debug!(
-            "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-            len,
-            nbytes,
-            duration.as_micros(),
-            lsn
-        );
-
-        // If something went wrong, don't try to reuse the process. Kill it, and
-        // next request will launch a new one.
-        if result.is_err() {
-            error!(
+            // If something went wrong, don't try to reuse the process. Kill it, and
+            // next request will launch a new one.
+            if result.is_err() {
+                error!(
                "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
                records.len(),
 				records.first().map(|p| p.0).unwrap_or(Lsn(0)),
@@ -310,24 +309,28 @@ impl PostgresRedoManager {
 				base_img_lsn,
                lsn
            );
-            // self.stdin only holds stdin & stderr as_raw_fd().
-            // Dropping it as part of take() doesn't close them.
-            // The owning objects (ChildStdout and ChildStderr) are stored in
-            // self.stdout and self.stderr, respsectively.
-            // We intentionally keep them open here to avoid a race between
-            // currently running `apply_wal_records()` and a `launch()` call
-            // after we return here.
-            // The currently running `apply_wal_records()` must not read from
-            // the newly launched process.
-            // By keeping self.stdout and self.stderr open here, `launch()` will
-            // get other file descriptors for the new child's stdout and stderr,
-            // and hence the current `apply_wal_records()` calls will observe
-            //  `output.stdout.as_raw_fd() != stdout_fd` .
-            if let Some(proc) = self.stdin.lock().unwrap().take() {
-                proc.child.kill_and_wait();
+                // self.stdin only holds stdin & stderr as_raw_fd().
+                // Dropping it as part of take() doesn't close them.
+                // The owning objects (ChildStdout and ChildStderr) are stored in
+                // self.stdout and self.stderr, respectively.
+                // We intentionally keep them open here to avoid a race between
+                // currently running `apply_wal_records()` and a `launch()` call
+                // after we return here.
+                // The currently running `apply_wal_records()` must not read from
+                // the newly launched process.
+                // By keeping self.stdout and self.stderr open here, `launch()` will
+                // get other file descriptors for the new child's stdout and stderr,
+                // and hence the current `apply_wal_records()` calls will observe
+                //  `output.stdout.as_raw_fd() != stdout_fd` .
+                if let Some(proc) = self.stdin.lock().unwrap().take() {
+                    proc.child.kill_and_wait();
+                }
+            }
+            n_attempts += 1;
+            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+                return result;
            }
        }
-        result
    }

    ///
@@ -634,26 +637,26 @@ impl PostgresRedoManager {
        input: &mut MutexGuard<Option<ProcessInput>>,
        pg_version: u32,
    ) -> Result<(), Error> {
-        // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
-        // just create one with constant name. That fails if you try to launch more than
-        // one WAL redo manager concurrently.
-        let datadir = path_with_suffix_extension(
+        // Previous versions of wal-redo required a data directory, and those
+        // directories occupied some space on disk. Remove one if we find it.
+        //
+        // This code could be dropped after one release cycle.
+        let legacy_datadir = path_with_suffix_extension(
            self.conf
                .tenant_path(&self.tenant_id)
                .join("wal-redo-datadir"),
            TEMP_FILE_SUFFIX,
        );
-
-        // Create empty data directory for wal-redo postgres, deleting old one first.
-        if datadir.exists() {
-            info!("old temporary datadir {datadir:?} exists, removing");
-            fs::remove_dir_all(&datadir).map_err(|e| {
+        if legacy_datadir.exists() {
+            info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing");
+            fs::remove_dir_all(&legacy_datadir).map_err(|e| {
                Error::new(
                    e.kind(),
-                    format!("Old temporary dir {datadir:?} removal failure: {e}"),
+                    format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"),
                )
            })?;
        }
+
        let pg_bin_dir_path = self
            .conf
            .pg_bin_dir(pg_version)
@@ -663,35 +666,6 @@ impl PostgresRedoManager {
            .pg_lib_dir(pg_version)
            .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?;

-        info!("running initdb in {}", datadir.display());
-        let initdb = Command::new(pg_bin_dir_path.join("initdb"))
-            .args(["-D", &datadir.to_string_lossy()])
-            .arg("-N")
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS
-            .close_fds()
-            .output()
-            .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?;
-
-        if !initdb.status.success() {
-            return Err(Error::new(
-                ErrorKind::Other,
-                format!(
-                    "initdb failed\nstdout: {}\nstderr:\n{}",
-                    String::from_utf8_lossy(&initdb.stdout),
-                    String::from_utf8_lossy(&initdb.stderr)
-                ),
-            ));
-        } else {
-            // Limit shared cache for wal-redo-postgres
-            let mut config = OpenOptions::new()
-                .append(true)
-                .open(PathBuf::from(&datadir).join("postgresql.conf"))?;
-            config.write_all(b"shared_buffers=128kB\n")?;
-            config.write_all(b"fsync=off\n")?;
-        }
-
        // Start postgres itself
        let child = Command::new(pg_bin_dir_path.join("postgres"))
            .arg("--wal-redo")
@@ -701,7 +675,6 @@ impl PostgresRedoManager {
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("PGDATA", &datadir)
            // The redo process is not trusted, and runs in seccomp mode that
            // doesn't allow it to open any files. We have to also make sure it
            // doesn't inherit any file descriptors from the pageserver, that
@@ -771,7 +744,7 @@ impl PostgresRedoManager {
        &self,
        mut input: MutexGuard<Option<ProcessInput>>,
        tag: BufferTag,
-        base_img: Option<Bytes>,
+        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> Result<Bytes, std::io::Error> {
@@ -787,7 +760,7 @@ impl PostgresRedoManager {
        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
-            build_push_page_msg(tag, &img, &mut writebuf);
+            build_push_page_msg(tag, img, &mut writebuf);
        }
        for (lsn, rec) in records.iter() {
            if let NeonWalRecord::Postgres {
@@ -910,7 +883,7 @@ impl PostgresRedoManager {
            // into this buffer.
            let mut resultbuf = vec![0; BLCKSZ.into()];
            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
+            while nresult < usize::from(BLCKSZ) {
                // We do two things simultaneously: reading response from stdout
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -32,6 +32,9 @@

 #define PageStoreTrace DEBUG5

+#define MAX_RECONNECT_ATTEMPTS 5
+#define RECONNECT_INTERVAL_USEC 1000000
+
 bool		connected = false;
 PGconn	   *pageserver_conn = NULL;

@@ -52,8 +55,8 @@ int			readahead_buffer_size = 128;

 static void pageserver_flush(void);

-static void
-pageserver_connect()
+static bool
+pageserver_connect(int elevel)
 {
 	char	   *query;
 	int			ret;
@@ -69,10 +72,11 @@ pageserver_connect()
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;

-		ereport(ERROR,
+		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
 				 errmsg(NEON_TAG "could not establish connection to pageserver"),
 				 errdetail_internal("%s", msg)));
+		return false;
 	}

 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
@@ -81,7 +85,8 @@ pageserver_connect()
 	{
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
-		neon_log(ERROR, "could not send pagestream command to pageserver");
+		neon_log(elevel, "could not send pagestream command to pageserver");
+		return false;
 	}

 	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
@@ -113,8 +118,9 @@ pageserver_connect()
 				FreeWaitEventSet(pageserver_conn_wes);
 				pageserver_conn_wes = NULL;

-				neon_log(ERROR, "could not complete handshake with pageserver: %s",
+				neon_log(elevel, "could not complete handshake with pageserver: %s",
 						 msg);
+				return false;
 			}
 		}
 	}
@@ -122,6 +128,7 @@ pageserver_connect()
 	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);

 	connected = true;
+	return true;
 }

 /*
@@ -149,8 +156,11 @@ retry:
 		if (event.events & WL_SOCKET_READABLE)
 		{
 			if (!PQconsumeInput(pageserver_conn))
-				neon_log(ERROR, "could not get response from pageserver: %s",
+			{
+				neon_log(LOG, "could not get response from pageserver: %s",
 						 PQerrorMessage(pageserver_conn));
+				return -1;
+			}
 		}

 		goto retry;
@@ -190,31 +200,62 @@ static void
 pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;
+	int n_reconnect_attempts = 0;

 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
 		pageserver_disconnect();

-	if (!connected)
-		pageserver_connect();

 	req_buff = nm_pack_request(request);

 	/*
-	 * Send request.
-	 *
-	 * In principle, this could block if the output buffer is full, and we
-	 * should use async mode and check for interrupts while waiting. In
-	 * practice, our requests are small enough to always fit in the output and
-	 * TCP buffer.
+	 * If pageserver is stopped, the connections from compute node are broken.
+	 * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
+	 * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
+	 * See https://github.com/neondatabase/neon/issues/1138
+	 * So try to reestablish connection in case of failure.
 	 */
-	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+	while (true)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		if (!connected)
+		{
+			if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
+			{
+				n_reconnect_attempts += 1;
+				pg_usleep(RECONNECT_INTERVAL_USEC);
+				continue;
+			}
+		}

-		pageserver_disconnect();
-		neon_log(ERROR, "failed to send page request: %s", msg);
+		/*
+		 * Send request.
+		 *
+		 * In principle, this could block if the output buffer is full, and we
+		 * should use async mode and check for interrupts while waiting. In
+		 * practice, our requests are small enough to always fit in the output and
+		 * TCP buffer.
+		 */
+		if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+		{
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+			/* Drop the broken connection, else a retry would resend on it. */
+			pageserver_disconnect();
+			if (n_reconnect_attempts >= MAX_RECONNECT_ATTEMPTS)
+				neon_log(ERROR, "failed to send page request: %s", msg);
+
+			neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
+			pfree(msg);			/* palloc'ed by pchomp(); do not leak it per retry */
+			/* No sleep before the first retry: pageserver may already be back. */
+			if (n_reconnect_attempts != 0)
+				pg_usleep(RECONNECT_INTERVAL_USEC);
+			n_reconnect_attempts += 1;
+			continue;
+		}
+		break;
 	}
+
 	pfree(req_buff.data);

 	n_unflushed_requests++;
--- a/pgxn/neon_utils/Makefile
+++ b/pgxn/neon_utils/Makefile
@@ -0,0 +1,15 @@
+# pgxn/neon_utils/Makefile
+
+
+MODULE_big = neon_utils
+OBJS = \
+	$(WIN32RES) \
+	neon_utils.o
+
+EXTENSION = neon_utils
+DATA = neon_utils--1.0.sql
+PGFILEDESC = "neon_utils - small useful functions"
+
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
--- a/pgxn/neon_utils/neon_utils--1.0.sql
+++ b/pgxn/neon_utils/neon_utils--1.0.sql
@@ -0,0 +1,6 @@
+-- num_cpus(): bound to the C symbol 'num_cpus' in this extension's library.
+CREATE FUNCTION num_cpus() RETURNS int
+    LANGUAGE C
+    VOLATILE STRICT
+    PARALLEL UNSAFE
+    AS 'MODULE_PATHNAME', 'num_cpus';
--- a/pgxn/neon_utils/neon_utils.c
+++ b/pgxn/neon_utils/neon_utils.c
@@ -0,0 +1,55 @@
+/*-------------------------------------------------------------------------
+ *
+ * neon_utils.c
+ *	  neon_utils - small useful functions
+ *
+ * IDENTIFICATION
+ *	 pgxn/neon_utils/neon_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "fmgr.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(num_cpus);
+
+/*
+ * num_cpus
+ *	  SQL-callable: number of processors the operating system reports.
+ *
+ * Takes no arguments. Uses dwNumberOfProcessors from GetSystemInfo() on
+ * Windows and sysconf(_SC_NPROCESSORS_ONLN) elsewhere.
+ *
+ * The result is returned as int32 because the SQL declaration says
+ * "RETURNS int". sysconf() returns -1 when it cannot answer; any value
+ * below 1 is reported as 1 instead of leaking -1 (or a huge number after an
+ * unsigned cast) to the caller.
+ */
+Datum
+num_cpus(PG_FUNCTION_ARGS)
+{
+	/* Declared before any statement: PostgreSQL C style forbids mixing them. */
+	long		ncpus;
+#ifdef _WIN32
+	SYSTEM_INFO sysinfo;
+
+	GetSystemInfo(&sysinfo);
+	ncpus = (long) sysinfo.dwNumberOfProcessors;
+#else
+	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+#endif
+
+	if (ncpus < 1)
+		ncpus = 1;
+
+	PG_RETURN_INT32((int32) ncpus);
+}
--- a/pgxn/neon_utils/neon_utils.control
+++ b/pgxn/neon_utils/neon_utils.control
@@ -0,0 +1,6 @@
+# neon_utils extension
+comment = 'neon_utils - small useful functions'
+default_version = '1.0'
+module_pathname = '$libdir/neon_utils'
+relocatable = true
+trusted = true
--- a/pgxn/neon_walredo/walredoproc.c
+++ b/pgxn/neon_walredo/walredoproc.c
@@ -65,6 +65,14 @@
 #include "rusagestub.h"
 #endif

+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/nbtree.h"
+#include "access/subtrans.h"
+#include "access/syncscan.h"
+#include "access/twophase.h"
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
 #if PG_VERSION_NUM >= 150000
@@ -72,18 +80,36 @@
 #endif
 #include "access/xlogutils.h"
 #include "catalog/pg_class.h"
-#include "libpq/libpq.h"
+#include "commands/async.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker_internals.h"
+#include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "replication/logicallauncher.h"
+#include "replication/origin.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "replication/walsender.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/dsm.h"
 #include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/predicate.h"
 #include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/sinvaladt.h"
 #include "storage/smgr.h"
+#include "storage/spin.h"
 #include "tcop/tcopprot.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
+#include "utils/snapmgr.h"

 #include "inmem_smgr.h"

@@ -101,6 +127,7 @@ static void apply_error_callback(void *arg);
 static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
 static void GetPage(StringInfo input_message);
 static ssize_t buffered_read(void *buf, size_t count);
+static void CreateFakeSharedMemoryAndSemaphores();

 static BufferTag target_redo_tag;

@@ -141,7 +168,7 @@ enter_seccomp_mode(void)
 		PG_SCMP_ALLOW(shmctl),
 		PG_SCMP_ALLOW(shmdt),
 		PG_SCMP_ALLOW(unlink), // shm_unlink
-		*/
+	 */
 	};

 #ifdef MALLOC_NO_MMAP
@@ -177,6 +204,7 @@ WalRedoMain(int argc, char *argv[])
 	 * buffers. So let's keep it small (default value is 1024)
 	 */
 	num_temp_buffers = 4;
+	NBuffers = 4;

 	/*
 	 * install the simple in-memory smgr
@@ -184,49 +212,33 @@ WalRedoMain(int argc, char *argv[])
 	smgr_hook = smgr_inmem;
 	smgr_init_hook = smgr_init_inmem;

-	/*
-	 * Validate we have been given a reasonable-looking DataDir and change into it.
-	 */
-	checkDataDir();
-	ChangeToDataDir();
-
-	/*
-	 * Create lockfile for data directory.
-	 */
-	CreateDataDirLockFile(false);
-
-	/* read control file (error checking and contains config ) */
-	LocalProcessControlFile(false);
-
-	/*
-	 * process any libraries that should be preloaded at postmaster start
-	 */
-	process_shared_preload_libraries();

 	/* Initialize MaxBackends (if under postmaster, was done already) */
+	MaxConnections = 1;
+	max_worker_processes = 0;
+	max_parallel_workers = 0;
+	max_wal_senders = 0;
 	InitializeMaxBackends();

-#if PG_VERSION_NUM >= 150000
-	/*
-	 * Give preloaded libraries a chance to request additional shared memory.
-	 */
-	process_shmem_requests();
+	/* Disable lastWrittenLsnCache */
+	lastWrittenLsnCacheSize = 0;

-	/*
-	 * Now that loadable modules have had their chance to request additional
-	 * shared memory, determine the value of any runtime-computed GUCs that
-	 * depend on the amount of shared memory required.
-	 */
+#if PG_VERSION_NUM >= 150000
+	process_shmem_requests();
 	InitializeShmemGUCs();

 	/*
-	 * Now that modules have been loaded, we can process any custom resource
-	 * managers specified in the wal_consistency_checking GUC.
+	 * This will try to access data directory which we do not set.
+	 * Seems to be pretty safe to disable.
 	 */
-	InitializeWalConsistencyChecking();
+	/* InitializeWalConsistencyChecking(); */
 #endif

-	CreateSharedMemoryAndSemaphores();
+	/*
+	 * We have our own version of CreateSharedMemoryAndSemaphores() that
+	 * sets up local memory instead of shared one.
+	 */
+	CreateFakeSharedMemoryAndSemaphores();

 	/*
 	 * Remember stand-alone backend startup time,roughly at the same point
@@ -354,6 +366,172 @@ WalRedoMain(int argc, char *argv[])
 }


+/*
+ * Initialize dummy shmem (process-local memory obtained with malloc()).
+ *
+ * This code follows CreateSharedMemoryAndSemaphores() but manually sets up
+ * the shmem header and skips a few initialization steps that are not needed
+ * for WAL redo.
+ *
+ * Removing most of the initialization functions that request some memory
+ * (like ApplyLauncherShmemInit and friends) was also tried, but it had no
+ * sizeable effect on RSS, so such a cleanup is probably not worth the risk
+ * of running a half-initialized postgres.
+ */
+static void
+CreateFakeSharedMemoryAndSemaphores()
+{
+	PGShmemHeader *shim = NULL;	/* set to hdr below, for dsm_postmaster_startup() */
+	PGShmemHeader *hdr;
+	Size		size;
+	int			numSemas;
+	char		cwd[MAXPGPATH];	/* temporary DataDir value, see below */
+
+#if PG_VERSION_NUM >= 150000
+	size = CalculateShmemSize(&numSemas);	/* also fills in numSemas */
+#else
+	/*
+	 * Postgres v14 doesn't have a separate CalculateShmemSize(). Use the result
+	 * of the corresponding calculation in CreateSharedMemoryAndSemaphores().
+	 */
+	size = 1409024;		/* NOTE(review): hard-coded; re-check if sizing inputs change */
+	numSemas = 10;
+#endif
+
+	/* Dummy implementation of PGSharedMemoryCreate() */
+	{
+		hdr = (PGShmemHeader *) malloc(size);	/* plain heap, not a shared segment */
+		if (!hdr)
+			ereport(FATAL,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("[neon-wal-redo] can not allocate (pseudo-) shared memory")));
+
+		hdr->creatorPID = getpid();
+		hdr->magic = PGShmemMagic;
+		hdr->dsm_control = 0;
+		hdr->device = 42; /* not relevant for non-shared memory */
+		hdr->inode = 43; /* not relevant for non-shared memory */
+		hdr->totalsize = size;
+		hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+
+		shim = hdr;
+		UsedShmemSegAddr = hdr;
+		UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */
+	}
+
+	InitShmemAccess(hdr);
+
+	/*
+	 * PGReserveSemaphores() uses the DataDir name as a source of entropy, so
+	 * point it at the current directory for that call. Nothing later needs
+	 * DataDir, so reset it to NULL afterwards to error out on any access to it.
+	 */
+	if (!getcwd(cwd, MAXPGPATH))
+		ereport(FATAL,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("[neon-wal-redo] can not read current directory name")));
+	DataDir = cwd;
+	PGReserveSemaphores(numSemas);
+	DataDir = NULL;		/* cwd is a local buffer; do not leave DataDir pointing at it */
+
+	/*
+	 * The rest of function follows CreateSharedMemoryAndSemaphores() closely,
+	 * skipped parts are marked with comments.
+	 */
+	InitShmemAllocation();
+
+	/*
+	 * Now initialize LWLocks, which do shared memory allocation and are
+	 * needed for InitShmemIndex.
+	 */
+	CreateLWLocks();
+
+	/*
+	 * Set up shmem.c index hashtable
+	 */
+	InitShmemIndex();
+
+	dsm_shmem_init();
+
+	/*
+	 * Set up xlog, clog, and buffers
+	 */
+	XLOGShmemInit();
+	CLOGShmemInit();
+	CommitTsShmemInit();
+	SUBTRANSShmemInit();
+	MultiXactShmemInit();
+	InitBufferPool();
+
+	/*
+	 * Set up lock manager
+	 */
+	InitLocks();
+
+	/*
+	 * Set up predicate lock manager
+	 */
+	InitPredicateLocks();
+
+	/*
+	 * Set up process table
+	 */
+	if (!IsUnderPostmaster)
+		InitProcGlobal();
+	CreateSharedProcArray();
+	CreateSharedBackendStatus();
+	TwoPhaseShmemInit();
+	BackgroundWorkerShmemInit();
+
+	/*
+	 * Set up shared-inval messaging
+	 */
+	CreateSharedInvalidationState();
+
+	/*
+	 * Set up interprocess signaling mechanisms
+	 */
+	PMSignalShmemInit();
+	ProcSignalShmemInit();
+	CheckpointerShmemInit();
+	AutoVacuumShmemInit();
+	ReplicationSlotsShmemInit();
+	ReplicationOriginShmemInit();
+	WalSndShmemInit();
+	WalRcvShmemInit();
+	PgArchShmemInit();
+	ApplyLauncherShmemInit();
+
+	/*
+	 * Set up other modules that need some shared memory space
+	 */
+	SnapMgrInit();
+	BTreeShmemInit();
+	SyncScanShmemInit();
+	/* Skip due to the 'pg_notify' directory check */
+	/* AsyncShmemInit(); */
+
+#ifdef EXEC_BACKEND
+
+	/*
+	 * Alloc the win32 shared backend array
+	 */
+	if (!IsUnderPostmaster)
+		ShmemBackendArrayAllocation();
+#endif
+
+	/* Initialize dynamic shared memory facilities. */
+	if (!IsUnderPostmaster)
+		dsm_postmaster_startup(shim);
+
+	/*
+	 * Now give loadable modules a chance to set up their shmem allocations
+	 */
+	if (shmem_startup_hook)
+		shmem_startup_hook();
+}
+
+
 /* Version compatility wrapper for ReadBufferWithoutRelcache */
 static inline Buffer
 NeonRedoReadBuffer(RelFileNode rnode,
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
@@ -253,43 +253,46 @@ files = [

 [[package]]
 name = "black"
-version = "22.6.0"
+version = "23.1.0"
 description = "The uncompromising code formatter."
 category = "dev"
 optional = false
-python-versions = ">=3.6.2"
+python-versions = ">=3.7"
 files = [
-    {file = "black-22.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:f586c26118bc6e714ec58c09df0157fe2d9ee195c764f630eb0d8e7ccce72e69"},
-    {file = "black-22.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b270a168d69edb8b7ed32c193ef10fd27844e5c60852039599f9184460ce0807"},
-    {file = "black-22.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6797f58943fceb1c461fb572edbe828d811e719c24e03375fd25170ada53825e"},
-    {file = "black-22.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c85928b9d5f83b23cee7d0efcb310172412fbf7cb9d9ce963bd67fd141781def"},
-    {file = "black-22.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:f6fe02afde060bbeef044af7996f335fbe90b039ccf3f5eb8f16df8b20f77666"},
-    {file = "black-22.6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cfaf3895a9634e882bf9d2363fed5af8888802d670f58b279b0bece00e9a872d"},
-    {file = "black-22.6.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94783f636bca89f11eb5d50437e8e17fbc6a929a628d82304c80fa9cd945f256"},
-    {file = "black-22.6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:2ea29072e954a4d55a2ff58971b83365eba5d3d357352a07a7a4df0d95f51c78"},
-    {file = "black-22.6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e439798f819d49ba1c0bd9664427a05aab79bfba777a6db94fd4e56fae0cb849"},
-    {file = "black-22.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187d96c5e713f441a5829e77120c269b6514418f4513a390b0499b0987f2ff1c"},
-    {file = "black-22.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:074458dc2f6e0d3dab7928d4417bb6957bb834434516f21514138437accdbe90"},
-    {file = "black-22.6.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a218d7e5856f91d20f04e931b6f16d15356db1c846ee55f01bac297a705ca24f"},
-    {file = "black-22.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:568ac3c465b1c8b34b61cd7a4e349e93f91abf0f9371eda1cf87194663ab684e"},
-    {file = "black-22.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c1734ab264b8f7929cef8ae5f900b85d579e6cbfde09d7387da8f04771b51c6"},
-    {file = "black-22.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9a3ac16efe9ec7d7381ddebcc022119794872abce99475345c5a61aa18c45ad"},
-    {file = "black-22.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:b9fd45787ba8aa3f5e0a0a98920c1012c884622c6c920dbe98dbd05bc7c70fbf"},
-    {file = "black-22.6.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7ba9be198ecca5031cd78745780d65a3f75a34b2ff9be5837045dce55db83d1c"},
-    {file = "black-22.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a3db5b6409b96d9bd543323b23ef32a1a2b06416d525d27e0f67e74f1446c8f2"},
-    {file = "black-22.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:560558527e52ce8afba936fcce93a7411ab40c7d5fe8c2463e279e843c0328ee"},
-    {file = "black-22.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b154e6bbde1e79ea3260c4b40c0b7b3109ffcdf7bc4ebf8859169a6af72cd70b"},
-    {file = "black-22.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:4af5bc0e1f96be5ae9bd7aaec219c901a94d6caa2484c21983d043371c733fc4"},
-    {file = "black-22.6.0-py3-none-any.whl", hash = "sha256:ac609cf8ef5e7115ddd07d85d988d074ed00e10fbc3445aee393e70164a2219c"},
-    {file = "black-22.6.0.tar.gz", hash = "sha256:6c6d39e28aed379aec40da1c65434c77d75e65bb59a1e1c283de545fb4e7c6c9"},
+    {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"},
+    {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"},
+    {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"},
+    {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"},
+    {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"},
+    {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"},
+    {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"},
+    {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"},
+    {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"},
+    {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"},
+    {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"},
+    {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"},
+    {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"},
+    {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"},
+    {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"},
+    {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"},
+    {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"},
+    {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"},
+    {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"},
+    {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"},
+    {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"},
+    {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"},
+    {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"},
+    {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"},
+    {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"},
 ]

 [package.dependencies]
 click = ">=8.0.0"
 mypy-extensions = ">=0.4.3"
+packaging = ">=22.0"
 pathspec = ">=0.9.0"
 platformdirs = ">=2"
-tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""}
+tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
 typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}

 [package.extras]
@@ -884,6 +887,8 @@ files = [
    {file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
    {file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
    {file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
+    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
+    {file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
    {file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
    {file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
@@ -963,23 +968,6 @@ files = [
 [package.extras]
 testing = ["pre-commit"]

-[[package]]
-name = "flake8"
-version = "5.0.4"
-description = "the modular source code checker: pep8 pyflakes and co"
-category = "dev"
-optional = false
-python-versions = ">=3.6.1"
-files = [
-    {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
-    {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
-]
-
-[package.dependencies]
-mccabe = ">=0.7.0,<0.8.0"
-pycodestyle = ">=2.9.0,<2.10.0"
-pyflakes = ">=2.5.0,<2.6.0"
-
 [[package]]
 name = "flask"
 version = "2.1.3"
@@ -1075,24 +1063,6 @@ files = [
    {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
 ]

-[[package]]
-name = "isort"
-version = "5.10.1"
-description = "A Python utility / library to sort Python imports."
-category = "dev"
-optional = false
-python-versions = ">=3.6.1,<4.0"
-files = [
-    {file = "isort-5.10.1-py3-none-any.whl", hash = "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7"},
-    {file = "isort-5.10.1.tar.gz", hash = "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951"},
-]
-
-[package.extras]
-colors = ["colorama (>=0.4.3,<0.5.0)"]
-pipfile-deprecated-finder = ["pipreqs", "requirementslib"]
-plugins = ["setuptools"]
-requirements-deprecated-finder = ["pip-api", "pipreqs"]
-
 [[package]]
 name = "itsdangerous"
 version = "2.1.2"
@@ -1238,6 +1208,7 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
+    {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"},
    {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"},
 ]

@@ -1294,18 +1265,6 @@ files = [
    {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"},
 ]

-[[package]]
-name = "mccabe"
-version = "0.7.0"
-description = "McCabe checker, plugin for flake8"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
-    {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
-]
-
 [[package]]
 name = "moto"
 version = "4.1.2"
@@ -1453,46 +1412,42 @@ files = [

 [[package]]
 name = "mypy"
-version = "0.991"
+version = "1.1.1"
 description = "Optional static typing for Python"
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"},
-    {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"},
-    {file = "mypy-0.991-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0c8f3be99e8a8bd403caa8c03be619544bc2c77a7093685dcf308c6b109426c6"},
-    {file = "mypy-0.991-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc9ec663ed6c8f15f4ae9d3c04c989b744436c16d26580eaa760ae9dd5d662eb"},
-    {file = "mypy-0.991-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:4307270436fd7694b41f913eb09210faff27ea4979ecbcd849e57d2da2f65305"},
-    {file = "mypy-0.991-cp310-cp310-win_amd64.whl", hash = "sha256:901c2c269c616e6cb0998b33d4adbb4a6af0ac4ce5cd078afd7bc95830e62c1c"},
-    {file = "mypy-0.991-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d13674f3fb73805ba0c45eb6c0c3053d218aa1f7abead6e446d474529aafc372"},
-    {file = "mypy-0.991-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1c8cd4fb70e8584ca1ed5805cbc7c017a3d1a29fb450621089ffed3e99d1857f"},
-    {file = "mypy-0.991-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:209ee89fbb0deed518605edddd234af80506aec932ad28d73c08f1400ef80a33"},
-    {file = "mypy-0.991-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37bd02ebf9d10e05b00d71302d2c2e6ca333e6c2a8584a98c00e038db8121f05"},
-    {file = "mypy-0.991-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:26efb2fcc6b67e4d5a55561f39176821d2adf88f2745ddc72751b7890f3194ad"},
-    {file = "mypy-0.991-cp311-cp311-win_amd64.whl", hash = "sha256:3a700330b567114b673cf8ee7388e949f843b356a73b5ab22dd7cff4742a5297"},
-    {file = "mypy-0.991-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f7d1a520373e2272b10796c3ff721ea1a0712288cafaa95931e66aa15798813"},
-    {file = "mypy-0.991-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:641411733b127c3e0dab94c45af15fea99e4468f99ac88b39efb1ad677da5711"},
-    {file = "mypy-0.991-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3d80e36b7d7a9259b740be6d8d906221789b0d836201af4234093cae89ced0cd"},
-    {file = "mypy-0.991-cp37-cp37m-win_amd64.whl", hash = "sha256:e62ebaad93be3ad1a828a11e90f0e76f15449371ffeecca4a0a0b9adc99abcef"},
-    {file = "mypy-0.991-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b86ce2c1866a748c0f6faca5232059f881cda6dda2a893b9a8373353cfe3715a"},
-    {file = "mypy-0.991-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ac6e503823143464538efda0e8e356d871557ef60ccd38f8824a4257acc18d93"},
-    {file = "mypy-0.991-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0cca5adf694af539aeaa6ac633a7afe9bbd760df9d31be55ab780b77ab5ae8bf"},
-    {file = "mypy-0.991-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a12c56bf73cdab116df96e4ff39610b92a348cc99a1307e1da3c3768bbb5b135"},
-    {file = "mypy-0.991-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:652b651d42f155033a1967739788c436491b577b6a44e4c39fb340d0ee7f0d70"},
-    {file = "mypy-0.991-cp38-cp38-win_amd64.whl", hash = "sha256:4175593dc25d9da12f7de8de873a33f9b2b8bdb4e827a7cae952e5b1a342e243"},
-    {file = "mypy-0.991-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98e781cd35c0acf33eb0295e8b9c55cdbef64fcb35f6d3aa2186f289bed6e80d"},
-    {file = "mypy-0.991-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6d7464bac72a85cb3491c7e92b5b62f3dcccb8af26826257760a552a5e244aa5"},
-    {file = "mypy-0.991-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c9166b3f81a10cdf9b49f2d594b21b31adadb3d5e9db9b834866c3258b695be3"},
-    {file = "mypy-0.991-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8472f736a5bfb159a5e36740847808f6f5b659960115ff29c7cecec1741c648"},
-    {file = "mypy-0.991-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5e80e758243b97b618cdf22004beb09e8a2de1af481382e4d84bc52152d1c476"},
-    {file = "mypy-0.991-cp39-cp39-win_amd64.whl", hash = "sha256:74e259b5c19f70d35fcc1ad3d56499065c601dfe94ff67ae48b85596b9ec1461"},
-    {file = "mypy-0.991-py3-none-any.whl", hash = "sha256:de32edc9b0a7e67c2775e574cb061a537660e51210fbf6006b0b36ea695ae9bb"},
-    {file = "mypy-0.991.tar.gz", hash = "sha256:3c0165ba8f354a6d9881809ef29f1a9318a236a6d81c690094c5df32107bde06"},
+    {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"},
+    {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"},
+    {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"},
+    {file = "mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"},
+    {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"},
+    {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"},
+    {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"},
+    {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"},
+    {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"},
+    {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"},
+    {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"},
+    {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"},
+    {file = "mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"},
+    {file = "mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"},
+    {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"},
+    {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"},
+    {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"},
+    {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"},
+    {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"},
+    {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"},
+    {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"},
+    {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"},
+    {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"},
+    {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"},
+    {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"},
+    {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"},
 ]

 [package.dependencies]
-mypy-extensions = ">=0.4.3"
+mypy-extensions = ">=1.0.0"
 tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
 typing-extensions = ">=3.10"

@@ -1519,14 +1474,14 @@ typing-extensions = ">=4.1.0"

 [[package]]
 name = "mypy-extensions"
-version = "0.4.3"
-description = "Experimental type system extensions for programs checked with the mypy typechecker."
+version = "1.0.0"
+description = "Type system extensions for programs checked with the mypy type checker."
 category = "dev"
 optional = false
-python-versions = "*"
+python-versions = ">=3.5"
 files = [
-    {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
-    {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
+    {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
+    {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
 ]

 [[package]]
@@ -1591,19 +1546,16 @@ requests = ["requests"]

 [[package]]
 name = "packaging"
-version = "21.3"
+version = "23.0"
 description = "Core utilities for Python packages"
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
-    {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
+    {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"},
+    {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
 ]

-[package.dependencies]
-pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
-
 [[package]]
 name = "pathspec"
 version = "0.9.0"
@@ -1712,6 +1664,7 @@ python-versions = ">=3.6"
 files = [
    {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
+    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
    {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -1745,6 +1698,7 @@ files = [
    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
    {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
+    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -1756,6 +1710,7 @@ files = [
    {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
    {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
+    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
    {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -1788,33 +1743,10 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
-    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
-    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
-    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
    {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
-    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
-    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
-    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
-    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
-    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
-    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
-    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
    {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]

-[[package]]
-name = "pycodestyle"
-version = "2.9.1"
-description = "Python style guide checker"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
-    {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
-]
-
 [[package]]
 name = "pycparser"
 version = "2.21"
@@ -1827,18 +1759,6 @@ files = [
    {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"},
 ]

-[[package]]
-name = "pyflakes"
-version = "2.5.0"
-description = "passive checker of Python programs"
-category = "dev"
-optional = false
-python-versions = ">=3.6"
-files = [
-    {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
-    {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
-]
-
 [[package]]
 name = "pyjwt"
 version = "2.4.0"
@@ -2008,8 +1928,8 @@ files = [

 [package.dependencies]
 pytest = [
-    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
    {version = ">=5.0", markers = "python_version < \"3.10\""},
+    {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
 ]

 [[package]]
@@ -2121,6 +2041,13 @@ files = [
    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
+    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
+    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
+    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
+    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2205,6 +2132,33 @@ files = [
 [package.dependencies]
 pyasn1 = ">=0.1.3"

+[[package]]
+name = "ruff"
+version = "0.0.255"
+description = "An extremely fast Python linter, written in Rust."
+category = "dev"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "ruff-0.0.255-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:b2d71fb6a7e50501a2473864acffc85dee6b750c25db198f7e71fe1dbbff1aad"},
+    {file = "ruff-0.0.255-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6c97d746861a6010f941179e84bba9feb8a871815667471d9ed6beb98d45c252"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a7fa60085079b91a298b963361be9b1b1c724582af6c84be954cbabdbd9309a"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c089f7141496334ab5a127b54ce55e41f0d6714e68a4453a1e09d2204cdea8c3"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0423908caa7d437a416b853214565b9c33bbd1106c4f88147982216dddcbbd96"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:981493e92547cacbb8e0874904ec049fe744507ee890dc8736caf89a8864f9a7"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:59d5193d2aedb35db180824462b374dbcfc306b2e76076245088afa6e5837df2"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dd5e00733c9d160c8a34a22e62b390da9d1e9f326676402421cb8c1236beefc3"},
+    {file = "ruff-0.0.255-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:694418cf41838bd19c6229e4e1b2d04505b1e6b86fe3ab81165484fc96d36f01"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5d0408985c9777369daebb5d3340a99e9f7294bdd7120642239261508185cf89"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abd6376ef9d12f370d95a8c7c98682fbb9bfedfba59f40e84a816fef8ddcb8de"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f9b1a5df0bc09193cbef58a6f78e4a9a0b058a4f9733c0442866d078006d1bb9"},
+    {file = "ruff-0.0.255-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6a25c5f4ff087445b2e1bbcb9963f2ae7c868d65e4a8d5f84c36c12f71571179"},
+    {file = "ruff-0.0.255-py3-none-win32.whl", hash = "sha256:1ff87a8310354f9f1a099625e54a27fdd6756d9cd2a40b45922f2e943daf982d"},
+    {file = "ruff-0.0.255-py3-none-win_amd64.whl", hash = "sha256:f3d8416be618f023f93ec4fd6ee3048585ef85dba9563b2a7e38fc7e5131d5b1"},
+    {file = "ruff-0.0.255-py3-none-win_arm64.whl", hash = "sha256:8ba124819624145d7b6b53add40c367c44318893215ffc1bfe3d72e0225a1c9c"},
+    {file = "ruff-0.0.255.tar.gz", hash = "sha256:f9eb1d3b2eecbeedae419fa494c4e2a5e4484baf93a1ce0f81eddb005e1919c5"},
+]
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"
@@ -2643,4 +2597,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd"
+content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91"
--- a/pre-commit.py
+++ b/pre-commit.py
@@ -43,17 +43,13 @@ def black(fix_inplace: bool) -> str:
    return cmd


-def isort(fix_inplace: bool) -> str:
-    cmd = "poetry run isort"
-    if not fix_inplace:
-        cmd += " --diff --check"
+def ruff(fix_inplace: bool) -> str:
+    cmd = "poetry run ruff"
+    if fix_inplace:
+        cmd += " --fix"
    return cmd


-def flake8() -> str:
-    return "poetry run flake8"
-
-
 def mypy() -> str:
    return "poetry run mypy"

@@ -112,13 +108,6 @@ if __name__ == "__main__":
        changed_files=files,
        no_color=args.no_color,
    )
-    check(
-        name="isort",
-        suffix=".py",
-        cmd=isort(fix_inplace=args.fix_inplace),
-        changed_files=files,
-        no_color=args.no_color,
-    )
    check(
        name="black",
        suffix=".py",
@@ -127,9 +116,9 @@ if __name__ == "__main__":
        no_color=args.no_color,
    )
    check(
-        name="flake8",
+        name="ruff",
        suffix=".py",
-        cmd=flake8(),
+        cmd=ruff(fix_inplace=args.fix_inplace),
        changed_files=files,
        no_color=args.no_color,
    )
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -31,6 +31,7 @@ once_cell.workspace = true
 opentelemetry.workspace = true
 parking_lot.workspace = true
 pin-project-lite.workspace = true
+postgres_backend.workspace = true
 pq_proto.workspace = true
 prometheus.workspace = true
 rand.workspace = true
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -3,7 +3,7 @@
 pub mod backend;
 pub use backend::BackendType;

-pub mod credentials;
+mod credentials;
 pub use credentials::ClientCredentials;

 mod password_hack;
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -11,7 +11,7 @@ use crate::{
        provider::{CachedNodeInfo, ConsoleReqExtra},
        Api,
    },
-    scram, stream, url,
+    stream, url,
 };
 use futures::TryFutureExt;
 use std::borrow::Cow;
@@ -59,8 +59,8 @@ impl std::fmt::Display for BackendType<'_, ()> {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use BackendType::*;
        match self {
-            Console(api, _) => fmt.debug_tuple("Console").field(&api.url()).finish(),
-            Postgres(api, _) => fmt.debug_tuple("Postgres").field(&api.url()).finish(),
+            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
+            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
        }
    }
@@ -106,23 +106,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
    }
 }

-impl console::AuthInfo {
-    /// Either it's our way ([SCRAM](crate::scram)) or the highway :)
-    /// But seriously, we don't aim to support anything but SCRAM for now.
-    fn scram_or_goodbye(self) -> auth::Result<scram::ServerSecret> {
-        match self {
-            Self::Md5(_) => {
-                info!("auth endpoint chooses MD5");
-                Err(auth::AuthError::bad_auth_method("MD5"))
-            }
-            Self::Scram(secret) => {
-                info!("auth endpoint chooses SCRAM");
-                Ok(secret)
-            }
-        }
-    }
-}
-
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 async fn auth_quirks(
@@ -200,9 +183,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
        info!("user successfully authenticated");
        Ok(res)
    }
-}

-impl BackendType<'_, ClientCredentials<'_>> {
    /// When applicable, wake the compute node, gaining its connection info in the process.
    /// The link auth flow doesn't support this, so we return [`None`] in that case.
    pub async fn wake_compute(
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -2,54 +2,57 @@ use super::AuthSuccess;
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
-    console::{self, CachedNodeInfo, ConsoleReqExtra},
+    console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
    sasl, scram,
    stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tokio_postgres::config::AuthKeys;
 use tracing::info;

-async fn do_scram(
-    secret: scram::ServerSecret,
-    creds: &ClientCredentials<'_>,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<compute::ScramKeys> {
-    let outcome = AuthFlow::new(client)
-        .begin(auth::Scram(&secret))
-        .await?
-        .authenticate()
-        .await?;
-
-    let client_key = match outcome {
-        sasl::Outcome::Success(key) => key,
-        sasl::Outcome::Failure(reason) => {
-            info!("auth backend failed with an error: {reason}");
-            return Err(auth::AuthError::auth_failed(creds.user));
-        }
-    };
-
-    let keys = compute::ScramKeys {
-        client_key: client_key.as_bytes(),
-        server_key: secret.server_key.as_bytes(),
-    };
-
-    Ok(keys)
-}
-
-pub async fn authenticate(
+pub(super) async fn authenticate(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &ClientCredentials<'_>,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    let info = console::get_auth_info(api, extra, creds).await?;
+    info!("fetching user's authentication info");
+    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
+        // If we don't have an authentication secret, we mock one to
+        // prevent malicious probing (possible due to missing protocol steps).
+        // This mocked secret will never lead to successful authentication.
+        info!("authentication info not found, mocking it");
+        AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
+    });

-    let secret = info.scram_or_goodbye()?;
-    let scram_keys = do_scram(secret, creds, client).await?;
+    let flow = AuthFlow::new(client);
+    let scram_keys = match info {
+        AuthInfo::Md5(_) => {
+            info!("auth endpoint chooses MD5");
+            return Err(auth::AuthError::bad_auth_method("MD5"));
+        }
+        AuthInfo::Scram(secret) => {
+            info!("auth endpoint chooses SCRAM");
+            let scram = auth::Scram(&secret);
+            let client_key = match flow.begin(scram).await?.authenticate().await? {
+                sasl::Outcome::Success(key) => key,
+                sasl::Outcome::Failure(reason) => {
+                    info!("auth backend failed with an error: {reason}");
+                    return Err(auth::AuthError::auth_failed(creds.user));
+                }
+            };
+
+            Some(compute::ScramKeys {
+                client_key: client_key.as_bytes(),
+                server_key: secret.server_key.as_bytes(),
+            })
+        }
+    };

    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.auth_keys(AuthKeys::ScramSha256(scram_keys));
+    if let Some(keys) = scram_keys {
+        use tokio_postgres::config::AuthKeys;
+        node.config.auth_keys(AuthKeys::ScramSha256(keys));
+    }

    Ok(AuthSuccess {
        reported_auth_ok: false,
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -5,33 +5,11 @@ use crate::{
        self,
        provider::{CachedNodeInfo, ConsoleReqExtra},
    },
-    stream::PqStream,
+    stream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};

-/// Wake the compute node, but only if the password is valid.
-async fn get_compute(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
-    password: Vec<u8>,
-) -> auth::Result<CachedNodeInfo> {
-    // TODO: this will slow down both "hacks" below; we probably need a cache.
-    let info = console::get_auth_info(api, extra, creds).await?;
-
-    let secret = info.scram_or_goodbye()?;
-    if !secret.matches_password(&password) {
-        info!("our obscure magic indicates that the password doesn't match");
-        return Err(auth::AuthError::auth_failed(creds.user));
-    }
-
-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(password);
-
-    Ok(node)
-}
-
 /// Compared to [SCRAM](crate::scram), cleartext password auth saves
 /// one round trip and *expensive* computations (>= 4096 HMAC iterations).
 /// These properties are beneficial for serverless JS workers, so we
@@ -40,7 +18,7 @@ pub async fn cleartext_hack(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    warn!("cleartext auth flow override is enabled, proceeding");
    let password = AuthFlow::new(client)
@@ -49,7 +27,8 @@ pub async fn cleartext_hack(
        .authenticate()
        .await?;

-    let node = get_compute(api, extra, creds, password).await?;
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(password);

    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
@@ -64,7 +43,7 @@ pub async fn password_hack(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    warn!("project not specified, resorting to the password hack auth flow");
    let payload = AuthFlow::new(client)
@@ -76,7 +55,8 @@ pub async fn password_hack(
    info!(project = &payload.project, "received missing parameter");
    creds.project = Some(payload.project.into());

-    let node = get_compute(api, extra, creds, payload.password).await?;
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(payload.password);

    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -31,22 +31,12 @@ pub enum ClientCredsParseError {

 impl UserFacingError for ClientCredsParseError {}

-/// eSNI parameters which might contain endpoint/project name.
-#[derive(Default)]
-pub struct SniParams<'a> {
-    /// Server Name Indication (TLS jargon).
-    pub sni: Option<&'a str>,
-    /// Common Name from a TLS certificate.
-    pub common_name: Option<&'a str>,
-}
-
 /// Various client credentials which we use for authentication.
 /// Note that we don't store any kind of client key or password here.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ClientCredentials<'a> {
-    /// Name of postgres role.
    pub user: &'a str,
-    /// Also known as endpoint in the console.
+    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<Cow<'a, str>>,
 }

@@ -59,17 +49,18 @@ impl ClientCredentials<'_> {

 impl<'a> ClientCredentials<'a> {
    pub fn parse(
-        startup_params: &'a StartupMessageParams,
-        &SniParams { sni, common_name }: &SniParams<'_>,
+        params: &'a StartupMessageParams,
+        sni: Option<&str>,
+        common_name: Option<&str>,
    ) -> Result<Self, ClientCredsParseError> {
        use ClientCredsParseError::*;

        // Some parameters are stored in the startup message.
-        let get_param = |key| startup_params.get(key).ok_or(MissingKey(key));
+        let get_param = |key| params.get(key).ok_or(MissingKey(key));
        let user = get_param("user")?;

        // Project name might be passed via PG's command-line options.
-        let project_option = startup_params.options_raw().and_then(|mut options| {
+        let project_option = params.options_raw().and_then(|mut options| {
            options
                .find_map(|opt| opt.strip_prefix("project="))
                .map(Cow::Borrowed)
@@ -131,9 +122,7 @@ mod tests {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);

-        let sni = SniParams::default();
-
-        let creds = ClientCredentials::parse(&options, &sni)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);

@@ -142,15 +131,13 @@ mod tests {

    #[test]
    fn parse_excessive() -> anyhow::Result<()> {
-        let startup = StartupMessageParams::new([
+        let options = StartupMessageParams::new([
            ("user", "john_doe"),
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);

-        let sni = SniParams::default();
-
-        let creds = ClientCredentials::parse(&startup, &sni)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);

@@ -159,14 +146,12 @@ mod tests {

    #[test]
    fn parse_project_from_sni() -> anyhow::Result<()> {
-        let startup = StartupMessageParams::new([("user", "john_doe")]);
+        let options = StartupMessageParams::new([("user", "john_doe")]);

-        let sni = SniParams {
-            sni: Some("foo.localhost"),
-            common_name: Some("localhost"),
-        };
+        let sni = Some("foo.localhost");
+        let common_name = Some("localhost");

-        let creds = ClientCredentials::parse(&startup, &sni)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));

@@ -175,14 +160,12 @@ mod tests {

    #[test]
    fn parse_project_from_options() -> anyhow::Result<()> {
-        let startup = StartupMessageParams::new([
+        let options = StartupMessageParams::new([
            ("user", "john_doe"),
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let sni = SniParams::default();
-
-        let creds = ClientCredentials::parse(&startup, &sni)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));

@@ -191,17 +174,12 @@ mod tests {

    #[test]
    fn parse_projects_identical() -> anyhow::Result<()> {
-        let startup = StartupMessageParams::new([
-            ("user", "john_doe"),
-            ("options", "project=baz"), // fmt
-        ]);
+        let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);

-        let sni = SniParams {
-            sni: Some("baz.localhost"),
-            common_name: Some("localhost"),
-        };
+        let sni = Some("baz.localhost");
+        let common_name = Some("localhost");

-        let creds = ClientCredentials::parse(&startup, &sni)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("baz"));

@@ -210,17 +188,13 @@ mod tests {

    #[test]
    fn parse_projects_different() {
-        let startup = StartupMessageParams::new([
-            ("user", "john_doe"),
-            ("options", "project=first"), // fmt
-        ]);
+        let options =
+            StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);

-        let sni = SniParams {
-            sni: Some("second.localhost"),
-            common_name: Some("localhost"),
-        };
+        let sni = Some("second.localhost");
+        let common_name = Some("localhost");

-        let err = ClientCredentials::parse(&startup, &sni).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
        match err {
            InconsistentProjectNames { domain, option } => {
                assert_eq!(option, "first");
@@ -232,14 +206,12 @@ mod tests {

    #[test]
    fn parse_inconsistent_sni() {
-        let startup = StartupMessageParams::new([("user", "john_doe")]);
+        let options = StartupMessageParams::new([("user", "john_doe")]);

-        let sni = SniParams {
-            sni: Some("project.localhost"),
-            common_name: Some("example.com"),
-        };
+        let sni = Some("project.localhost");
+        let common_name = Some("example.com");

-        let err = ClientCredentials::parse(&startup, &sni).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
        match err {
            InconsistentSni { sni, cn } => {
                assert_eq!(sni, "project.localhost");
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -6,9 +6,9 @@ use std::{io, net::SocketAddr};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio_postgres::NoTls;
-use tracing::{error, info};
+use tracing::{error, info, warn};

-const COULD_NOT_CONNECT: &str = "Could not connect to compute node";
+const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";

 #[derive(Debug, Error)]
 pub enum ConnectionError {
@@ -131,7 +131,7 @@ impl ConnCfg {
        use tokio_postgres::config::Host;

        let connect_once = |host, port| {
-            info!("trying to connect to a compute node at {host}:{port}");
+            info!("trying to connect to compute node at {host}:{port}");
            TcpStream::connect((host, port)).and_then(|socket| async {
                let socket_addr = socket.peer_addr()?;
                // This prevents load balancer from severing the connection.
@@ -151,7 +151,7 @@ impl ConnCfg {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
-                    "couldn't connect: bad compute config, \
+                    "bad compute config, \
                     ports and hosts entries' count does not match: {:?}",
                    self.0
                ),
@@ -170,7 +170,7 @@ impl ConnCfg {
                Ok(socket) => return Ok(socket),
                Err(err) => {
                    // We can't throw an error here, as there might be more hosts to try.
-                    error!("failed to connect to a compute node at {host}:{port}: {err}");
+                    warn!("couldn't connect to compute node at {host}:{port}: {err}");
                    connection_error = Some(err);
                }
            }
@@ -179,7 +179,7 @@ impl ConnCfg {
        Err(connection_error.unwrap_or_else(|| {
            io::Error::new(
                io::ErrorKind::Other,
-                format!("couldn't connect: bad compute config: {:?}", self.0),
+                format!("bad compute config: {:?}", self.0),
            )
        }))
    }
@@ -195,12 +195,11 @@ pub struct PostgresConnection {
 }

 impl ConnCfg {
-    /// Connect to a corresponding compute node.
-    pub async fn connect(&self) -> Result<PostgresConnection, ConnectionError> {
+    async fn do_connect(&self) -> Result<PostgresConnection, ConnectionError> {
        // TODO: establish a secure connection to the DB.
        let (socket_addr, mut stream) = self.connect_raw().await?;
        let (client, connection) = self.0.connect_raw(&mut stream, NoTls).await?;
-        info!("connected to user's compute node at {socket_addr}");
+        info!("connected to compute node at {socket_addr}");

        // This is very ugly but as of now there's no better way to
        // extract the connection parameters from tokio-postgres' connection.
@@ -219,6 +218,16 @@ impl ConnCfg {

        Ok(connection)
    }
+
+    /// Connect to a corresponding compute node.
+    pub async fn connect(&self) -> Result<PostgresConnection, ConnectionError> {
+        self.do_connect()
+            .inspect_err(|err| {
+                // Immediately log the error we have at our disposal.
+                error!("couldn't connect to compute node: {err}");
+            })
+            .await
+    }
 }

 /// Retrieve `options` from a startup message, dropping all proxy-specific flags.
--- a/Show More
+++ b/Show More