Compare commits

..

43 Commits

Author SHA1 Message Date
Egor Suvorov
8261455019 persistent_range_query: add layer_map_test 2022-11-24 04:47:19 +02:00
Egor Suvorov
aad88d6c39 persistent_range_query: add stress test 2022-11-24 03:50:18 +02:00
Egor Suvorov
6188315b51 persistent_range_query: more refs 2022-11-24 03:45:02 +02:00
Egor Suvorov
3a4b932d8a Draft generic persistent segment tree 2022-11-24 02:31:48 +02:00
Egor Suvorov
cc2b3c986c Simplify code: fewer lifetimes, auto-impl VecFrozenVersion 2022-11-24 02:11:06 +02:00
Egor Suvorov
c250c2664b Always require Clone for RangeQueryResult 2022-11-24 01:42:14 +02:00
Egor Suvorov
e5550a01b0 VecVersion: make it read-only, only the latest version can be modified 2022-11-24 01:28:13 +02:00
Egor Suvorov
45617ceaef RangeModification: add is_no_op/is_reinitialization 2022-11-24 00:12:05 +02:00
Egor Suvorov
29b39301fe RangeQueryResult: do not require Clone and do not provide default combine/add implementations 2022-11-24 00:11:52 +02:00
Egor Suvorov
b01a93be60 persistent_range_query: second draft
Move some generic parameters to associated types.
Work around private fields of SumResult<T>
2022-11-23 23:09:25 +02:00
Egor Suvorov
4c68d019e3 persistent_range_query: first draft 2022-11-23 20:52:34 +02:00
Sergey Melnikov
85f0975c5a Setup eu-west-1 as region for PR testing (#2757) 2022-11-23 10:54:39 +01:00
Konstantin Knizhnik
1af087449a Reduce max_replication_write_lag to 10Mb (#1793) 2022-11-23 08:41:22 +02:00
Heikki Linnakangas
37625c4433 Remove obsolete design doc.
I considered archiving this under docs/rfcs, but looking at the contents,
I don't think it's relevant at all anymore. So let's just remove it.
2022-11-23 00:40:17 +02:00
Heikki Linnakangas
e9f4ca5972 Remove references to obsolete files in .gitignore 2022-11-23 00:40:17 +02:00
Alexey Kondratov
4bf3087aed [pageserver] list latest_gc_cutoff_lsn in the OpenAPI spec (#2894)
It seems that it has been present in the API response for quite a while;
it's just not listed in the spec, so fix that.
2022-11-22 21:10:49 +01:00
Dmitry Ivanov
9470bc9fe0 [proxy] Implement per-tenant traffic metrics 2022-11-22 18:50:57 +03:00
Heikki Linnakangas
86e483f87b Fix tenant size modeling code to include WAL at end of branch
Imagine that you have a tenant with a single branch like this:

---------------==========>
               ^
               gc horizon
where:

----  is the portion of the branch that is older than retention period
====  is the portion of the branch that is newer than retention period.

Before this commit, the sizing model included the logical size at the
GC horizon, but not the WAL after that. In particular, that meant that
on a newly created tenant with just one timeline, where the retention
period covered the whole history of the timeline, i.e. gc_cutoff was 0,
the calculated tenant size was always zero.

We now include the WAL after the GC horizon in the size. So in the
above example, the calculated tenant size would be the logical size
of the database at the GC horizon, plus all the WAL after it (marked with
===).

This adds a new `insert_point` function to the sizing model, alongside
`modify_branch`, and changes the code in size.rs to use the new
function. The new function takes an absolute lsn and logical size as
argument, so we no longer need to calculate the difference to the
previous point. Also, the end-size is now optional, because we now
need to add a point to represent the end of each branch to the model,
but we don't want to or need to calculate the logical size at that
point.
2022-11-22 17:11:27 +02:00
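A hypothetical sketch of the API shape described in that commit message; the `SizeModel` name, parameter names, and integer types are assumptions, not the actual code from this change:

// Hypothetical sketch only -- illustrates the difference between expressing
// branch growth as a delta vs. inserting an absolute point.
struct SizeModel {
    // ... model points, branches, etc.
}

impl SizeModel {
    // Old style: the caller expresses growth as a size difference
    // relative to the previous point on the branch.
    fn modify_branch(&mut self, _branch: &str, _size_delta: i64) {
        // ...
    }

    // New style: takes the absolute LSN and the logical size at that point.
    // The size is optional, because the point added at the end of each
    // branch does not need a logical size calculated for it.
    fn insert_point(&mut self, _branch: &str, _lsn: u64, _logical_size: Option<u64>) {
        // ...
    }
}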
Christian Schwarz
f50d0ec0c9 test_runner: ignore 'sender is dropped while join handle is still alive' warnings
The need for a proper solution to this is tracked in
https://github.com/neondatabase/neon/issues/2885
2022-11-22 11:30:34 +01:00
Sergey Melnikov
74ec36a1bf Add pageserver-1.us-east-2.aws.neon.build (#2881) 2022-11-22 10:55:02 +01:00
Anastasia Lubennikova
a63ebb6446 Update vendor postgres to 14.6 and 15.1 2022-11-22 10:46:21 +02:00
Alexander Stanovoy
a5b898a31c Fix the order of checks in LSN (#2882)
We should first check whether the LSN is below the lower bound, because that
bound is constant, and only after that wait for the LSN to arrive if needed.
2022-11-22 02:28:41 +02:00
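A hypothetical sketch of that ordering; the `Lsn` alias, names, and error text are illustrative, not the actual pageserver code:

type Lsn = u64; // illustrative stand-in for the real LSN type

// Check against the constant lower bound first; only then (possibly) wait
// for the WAL to arrive, since waiting can block for a long time.
fn validate_then_wait(request_lsn: Lsn, min_lsn: Lsn, last_record_lsn: Lsn) -> Result<(), String> {
    if request_lsn < min_lsn {
        return Err(format!(
            "requested LSN {request_lsn} is below the lower bound {min_lsn}"
        ));
    }
    if request_lsn > last_record_lsn {
        // wait for WAL up to request_lsn to arrive ...
    }
    Ok(())
}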
bojanserafimov
c6f095a821 Fix remote seqscan test (#2878) 2022-11-21 17:21:47 -05:00
Alexander Bayandin
6b2bc7f775 Nightly Benchmarks: Add RDS Postgres (#2859)
Add RDS Postgres `db.m5.large` instance to Nightly Benchmarks
2022-11-21 15:25:09 +00:00
Heikki Linnakangas
6c97fc941a Enable passing FAILPOINTS at startup.
- Pass through FAILPOINTS environment variable to the pageserver in
  "neon_local pageserver start" command

- On startup, log any failpoints that were set with FAILPOINTS

- Add optional "extra_env_vars" argument to the NeonPageserver.start()
  function in the python fixture, so that you can pass FAILPOINTS

None of the tests use this functionality yet; that comes in a separate
commit.

closes https://github.com/neondatabase/neon/pull/2865
2022-11-21 16:24:19 +01:00
Alexander Bayandin
cb9b26776e Fix test_seqscans on remote cluster (#2869)
A remote project is reused between tests, so we need to ensure that we
don't have a table with the same name already created.
2022-11-19 23:39:42 +00:00
Heikki Linnakangas
684329d4d2 Another attempt at silencing test_gc_cutoff failures.
Increase the pgbench runtimes even further. The theory is that when
there are many other tests running at the same time, one pgbench run
could take a long time to generate enough layers for GC to kick
in.
2022-11-19 19:28:56 +02:00
Heikki Linnakangas
ed40a045c0 Add more logging to track down test_gc_cutoff failure.
see https://github.com/neondatabase/neon/issues/2856
2022-11-19 14:12:21 +02:00
Heikki Linnakangas
3f39327622 Silence a few compiler warnings
I saw these from the build of the compute docker image in the CI
(compute-node-image-v15):

    pagestore_smgr.c: In function 'neon_prefetch':
    pagestore_smgr.c:1654:2: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
     1654 |  BufferTag tag = (BufferTag) {
          |  ^~~~~~~~~
    walproposer.c:197:1: warning: no previous prototype for 'WalProposerSync' [-Wmissing-prototypes]
      197 | WalProposerSync(int argc, char *argv[])
          | ^~~~~~~~~~~~~~~
    libpagestore.c: In function 'pageserver_connect':
    libpagestore.c:100:9: warning: variable 'wc' set but not used [-Wunused-but-set-variable]
      100 |   int   wc;
          |         ^~
    libpagestore.c: In function 'call_PQgetCopyData':
    libpagestore.c:144:9: warning: variable 'wc' set but not used [-Wunused-but-set-variable]
      144 |   int   wc;
          |         ^~

Harmless warnings, but let's be tidy.

In passing, I added "extern" to a few function declarations that were
missing it, and marked WalProposerSync as "static". Those changes are
also purely cosmetic.
2022-11-19 14:11:04 +02:00
Heikki Linnakangas
a50a7e8ac0 Try to silence test_gc_cutoff flakiness.
Commit d013a2b227 changed the test, so that it fails if pgbench runs
to completion without triggering the failpoint. That has now happened
several times in the CI. That's not expected, so this needs some
investigation, but as a quick fix just make the pgbench runs longer so
that we're closer to the situation before commit d013a2b227.

See https://github.com/neondatabase/neon/issues/2856
2022-11-19 01:19:09 +02:00
Egor Suvorov
e28eda7939 sourcetree/docs: mention hakari generate (#2864) 2022-11-18 22:30:41 +00:00
Christian Schwarz
f564dff0e3 make test_tenant_detach_smoke fail reproducibly
Add failpoint that triggers the race condition.
Skip the test until we land the fix from
https://github.com/neondatabase/neon/pull/2851
together with
https://github.com/neondatabase/neon/pull/2785
2022-11-18 17:15:34 +01:00
Christian Schwarz
d783889a1f timeline: explicit tracking of flush loop state: NotStarted, Running, Exited
This allows us to error out in the case where we request flush but the
flush loop is not running.
Before, we would only track whether it had started, but not whether it
had exited.
Better to use an enum with 3 states than a 2-state bool, because then
the error message can answer whether we ever started the flush loop
at all.
2022-11-18 17:15:34 +01:00
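A hypothetical sketch of the three-state tracking described above; the names are illustrative, not the actual pageserver code:

// Illustrative only: a 3-state enum lets the error message distinguish
// "never started" from "already exited", which a bool cannot.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum FlushLoopState {
    NotStarted,
    Running,
    Exited,
}

fn check_flush_allowed(state: FlushLoopState) -> Result<(), String> {
    match state {
        FlushLoopState::Running => Ok(()),
        FlushLoopState::NotStarted => {
            Err("flush requested, but the flush loop was never started".to_string())
        }
        FlushLoopState::Exited => {
            Err("flush requested, but the flush loop has already exited".to_string())
        }
    }
}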
bojanserafimov
2655bdbb2e Add remote seqscans test (#2840) 2022-11-18 09:05:13 -05:00
Konstantin Knizhnik
b9152f1ef4 Correctly terminate prefetch in case of pageserver restart (#2850)
Refers to #2819.

This patch requires deep knowledge of prefetch internals,
so @MMeent please review it or suggest a better solution.
2022-11-18 15:04:58 +02:00
Heikki Linnakangas
328ec1ce24 Print a fuller error message, with a stack trace, on GC failure.
In a CI run, I got a test failure because of this error in the log,
from the test_get_tenant_size_with_multiple_branches test:

    ERROR gc_loop{tenant_id=f1630516d4b526139836ced93be0c878}: Gc failed, retrying in 2s: No such file or directory (os error 2)

There are known race conditions between GC and timeline deletion,
which surely caused that error. But if we didn't know the cause, it
would be pretty hard to debug without a stack trace.
2022-11-18 11:44:00 +02:00
Heikki Linnakangas
dcb79ef08f Silence yet another test failure from race condition between GC and delete.
Another similar case to commit 9ae4da4f31.
2022-11-18 10:18:15 +02:00
Konstantin Knizhnik
fd99e0fbc4 Build pg_prewarm extension (#2794) 2022-11-18 09:10:32 +02:00
Kirill Bulatov
60ac227196 Use modern flex and bison in macOS compilations (#2847) 2022-11-17 14:48:21 +00:00
MMeent
4a60051b0d Add codeowners section for /vendor/ (#2849)
After this, consent of @neondatabase/compute is required to update the
vendored PostgreSQL versions.
2022-11-17 14:31:34 +00:00
Heikki Linnakangas
24d3ed0952 Ignore another ERROR that's expected in a test.
Got a test failure in CI because of this.
2022-11-17 12:42:56 +02:00
Alexander Bayandin
0a87d71294 test_runner: make proxy mgmt port mandatory (#2839)
Make the `mgmt` port a mandatory argument for `NeonProxy` (and set it for
`static_proxy`) to avoid port collisions when tests run in parallel.
2022-11-16 17:57:48 +00:00
Heikki Linnakangas
150bddb929 Clean up process start/stop handling
* Poll more frequently when waiting for process start/stop. This
  speeds up startup and shutdown in tests. We did this already in
  commit 52ce1c9d53, which reduced the interval to 100 ms, but it was
  inadvertently increased back to 500 ms in commit d42700280f. Reduce
  it to 100 ms again, for both start and stop operations.

* Harmonize the start and stop loops, printing the dots and notices
  the same way in both. I considered extracting the logic to a
  separate retry-function that takes a closure as argument that does
  the polling, but as long as we only have two copies, the code
  duplication isn't that bad.

* Remove newline after "Starting pageserver" and "Starting etcd"
  messages, so that the progress-indicator dots that are printed once
  a second are printed on the same line. Before:

    Starting pageserver at '127.0.0.1:64000' in '.neon'
    ...
    pageserver started, pid: 2538937

  After:

    Starting pageserver at '127.0.0.1:64000' in '.neon'...
    pageserver started, pid: 2538937

  The "Starting safekeeper" message already got this right.

* Update example output in README.md to match
2022-11-16 19:51:37 +02:00
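A minimal, self-contained sketch of the polling behaviour described in the first bullet; the constants mirror the ones added in the diff further down, but the helper name and return type are illustrative:

use std::io::{self, Write};
use std::thread;
use std::time::Duration;

// Poll every 100 ms for up to 10 s, print a dot once a second,
// and print a notice after 5 s of waiting.
const RETRY_UNTIL_SECS: u64 = 10;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const DOT_EVERY_RETRIES: u64 = 10; // one dot per second
const NOTICE_AFTER_RETRIES: u64 = 50; // notice after 5 seconds

fn wait_until(process_name: &str, mut is_done: impl FnMut() -> bool) -> Result<(), String> {
    for retries in 0..RETRIES {
        if is_done() {
            println!("\n{process_name} is ready");
            return Ok(());
        }
        if retries == NOTICE_AFTER_RETRIES {
            // Taking a long time; keep waiting, but tell the user.
            print!("\n{process_name} has not finished yet, continuing to wait");
        }
        if retries % DOT_EVERY_RETRIES == 0 {
            print!(".");
            io::stdout().flush().unwrap();
        }
        thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
    }
    println!();
    Err(format!("{process_name} was not ready in {RETRY_UNTIL_SECS} seconds"))
}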
55 changed files with 1878 additions and 689 deletions

View File

@@ -1,5 +1,3 @@
zenith_install.tar.gz
.zenith_current_version
neon_install.tar.gz
.neon_current_version

View File

@@ -0,0 +1,33 @@
storage:
vars:
bucket_name: neon-dev-storage-eu-west-1
bucket_region: eu-west-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-west-1
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
console_region_id: aws-eu-west-1
children:
pageservers:
hosts:
pageserver-0.eu-west-1.aws.neon.build:
ansible_host: i-01d496c5041c7f34c
safekeepers:
hosts:
safekeeper-0.eu-west-1.aws.neon.build:
ansible_host: i-05226ef85722831bf
safekeeper-1.eu-west-1.aws.neon.build:
ansible_host: i-06969ee1bf2958bfc
safekeeper-2.eu-west-1.aws.neon.build:
ansible_host: i-087892e9625984a0b

View File

@@ -22,6 +22,8 @@ storage:
hosts:
pageserver-0.us-east-2.aws.neon.build:
ansible_host: i-0c3e70929edb5d691
pageserver-1.us-east-2.aws.neon.build:
ansible_host: i-0565a8b4008aa3f40
safekeepers:
hosts:

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.eu-west-1.aws.neon.build"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: dev
zenith_region: eu-west-1
zenith_region_slug: eu-west-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -144,7 +144,9 @@ jobs:
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
db_size: [ 10gb ]
include:
- platform: neon-captest-new
@@ -207,8 +209,11 @@ jobs:
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

View File

@@ -761,7 +761,6 @@ jobs:
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
@@ -770,6 +769,38 @@ jobs:
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-pr-test-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
if: |
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ eu-west-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
@@ -780,7 +811,7 @@ jobs:
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
@@ -861,7 +892,7 @@ jobs:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
@@ -873,6 +904,8 @@ jobs:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -894,7 +927,7 @@ jobs:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'

View File

@@ -8,3 +8,4 @@
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

Cargo.lock generated
View File

@@ -2255,6 +2255,14 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
[[package]]
name = "persistent_range_query"
version = "0.1.0"
dependencies = [
"rand",
"workspace_hack",
]
[[package]]
name = "petgraph"
version = "0.6.2"

View File

@@ -20,18 +20,18 @@ else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
else ifeq ($(UNAME_S),Darwin)
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# Use -C option so that when PostgreSQL "make install" installs the
@@ -73,7 +73,8 @@ $(POSTGRES_INSTALL_DIR)/build/v14/config.status:
+@echo "Configuring Postgres v14 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
@@ -81,7 +82,8 @@ $(POSTGRES_INSTALL_DIR)/build/v15/config.status:
+@echo "Configuring Postgres v15 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
@@ -111,6 +113,8 @@ postgres-v14: postgres-v14-configure \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
+@echo "Compiling libpq v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
+@echo "Compiling pageinspect v14"
@@ -123,6 +127,8 @@ postgres-v15: postgres-v15-configure \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
+@echo "Compiling libpq v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
+@echo "Compiling pageinspect v15"

View File

@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf etcd openssl
brew install protobuf etcd openssl flex bison
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -125,24 +125,23 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
# Create repository in .neon with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/neon_local init
Starting pageserver at '127.0.0.1:64000' in '.neon'
Pageserver started
Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
Stopping pageserver gracefully...done!
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2545906
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
Stopped pageserver 1 process with pid 2545906
# start pageserver and safekeeper
> ./target/debug/neon_local start
Starting etcd broker using /usr/bin/etcd
Starting pageserver at '127.0.0.1:64000' in '.neon'
Pageserver started
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
Safekeeper started
Starting etcd broker using "/usr/bin/etcd"
etcd started, pid: 2545996
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2546005
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2546041
# start postgres compute node
> ./target/debug/neon_local pg start main
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

View File

@@ -1,188 +0,0 @@
Create a new Zenith repository in the current directory:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
The files belonging to this database system will be owned by user "heikki".
This user must also own the server process.
The database cluster will be initialized with locale "en_GB.UTF-8".
The default database encoding has accordingly been set to "UTF8".
The default text search configuration will be set to "english".
Data page checksums are disabled.
creating directory tmp ... ok
creating subdirectories ... ok
selecting dynamic shared memory implementation ... posix
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting default time zone ... Europe/Helsinki
creating configuration files ... ok
running bootstrap script ... ok
performing post-bootstrap initialization ... ok
syncing data to disk ... ok
initdb: warning: enabling "trust" authentication for local connections
You can change this by editing pg_hba.conf or using the option -A, or
--auth-local and --auth-host, the next time you run initdb.
new zenith repository was created in .zenith
Initially, there is only one branch:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
main
Start a local Postgres instance on the branch:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
done
server started
Run some commands against it:
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
CREATE TABLE
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
INSERT 0 1
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)
Create a new branch called 'experimental'. We create it from the
current end of the 'main' branch, but you could specify a different
LSN as the start point instead.
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
branching at end of WAL: 0/161F478
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
experimental
main
Start another Postgres instance off the 'experimental' branch:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
done
server started
Insert a row on the 'experimental' branch:
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
INSERT 0 1
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
inserted on experimental
(2 rows)
See that the other Postgres instance is still running on 'main' branch on port 5432:
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)
Everything is stored in the .zenith directory:
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
total 12
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
The 'datadirs' directory contains the datadirs of the running instances:
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
total 8
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
total 124
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid
Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
datadir is ephemeral, you can delete it at any time, and it can be reconstructed
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
the repository, the 'datadirs' are not included. (They are like git working trees)
~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
done
server started
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
inserted on experimental
(2 rows)

View File

@@ -26,8 +26,18 @@ use nix::unistd::Pid;
use utils::lock_file;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
// These constants control the loop used to poll for process start / stop.
//
// The loop waits for at most 10 seconds, polling every 100 ms.
// Once a second, it prints a dot ("."), to give the user an indication that
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const RETRY_UNTIL_SECS: u64 = 10;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
const NOTICE_AFTER_RETRIES: u64 = 50;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
@@ -107,16 +117,16 @@ where
return Ok(spawned_process);
}
Ok(false) => {
if retries < 5 {
if retries == NOTICE_AFTER_RETRIES {
// The process is taking a long time to start up. Keep waiting, but
// print a message
print!("\n{process_name} has not started yet, continuing to wait");
}
if retries % DOT_EVERY_RETRIES == 0 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
@@ -127,7 +137,8 @@ where
}
}
}
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
println!();
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -158,7 +169,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
// Wait until process is gone
for _ in 0..RETRIES {
for retries in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
@@ -170,9 +181,16 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
return Ok(());
}
Ok(false) => {
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
if retries == NOTICE_AFTER_RETRIES {
// The process is taking a long time to stop. Keep waiting, but
// print a message
print!("\n{process_name} has not stopped yet, continuing to wait");
}
if retries % DOT_EVERY_RETRIES == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -180,24 +198,21 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
}
}
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
println!();
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
// Pass through these environment variables to the command
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
filled_cmd
}
filled_cmd
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {

View File

@@ -343,7 +343,7 @@ impl PostgresNode {
// To be able to restore database in case of pageserver node crash, safekeeper should not
// remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
// (if they are not able to upload WAL to S3).
conf.append("max_replication_write_lag", "500MB");
conf.append("max_replication_write_lag", "15MB");
conf.append("max_replication_flush_lag", "10GB");
if !self.env.safekeepers.is_empty() {

View File

@@ -6,7 +6,7 @@ use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
print!(
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);

View File

@@ -237,7 +237,7 @@ impl PageServerNode {
datadir: &Path,
update_config: bool,
) -> anyhow::Result<Child> {
println!(
print!(
"Starting pageserver at '{}' in '{}'",
self.pg_connection_config.raw_address(),
datadir.display()

View File

@@ -83,6 +83,16 @@ A subject for future modularization.
`/libs/metrics`:
Helpers for exposing Prometheus metrics from the server.
### Adding dependencies
When you add a Cargo dependency, you should update the hakari manifest by running the commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes; that's fine.
```bash
cargo hakari generate
cargo hakari manage-deps
```
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
## Using Python
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
so manual installation of dependencies is not recommended.

View File

@@ -0,0 +1,12 @@
[package]
name = "persistent_range_query"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
rand = "0.8.3"

View File

@@ -0,0 +1,78 @@
use std::ops::Range;
pub mod naive;
pub mod ops;
pub mod segment_tree;
/// Should be a monoid:
/// * Identity element: for all a: combine(new_for_empty_range(), a) = combine(a, new_for_empty_range()) = a
/// * Associativity: for all a, b, c: combine(combine(a, b), c) == combine(a, combine(b, c))
pub trait RangeQueryResult<Key>: Sized + Clone {
// Clone is equivalent to combine with an empty range.
fn new_for_empty_range() -> Self;
// Contract: left_range.end == right_range.start
// left_range.start == left_range.end == right_range.start == right_range.end is still possible
fn combine(
left: &Self,
left_range: &Range<Key>,
right: &Self,
right_range: &Range<Key>,
) -> Self;
fn add(left: &mut Self, left_range: &Range<Key>, right: &Self, right_range: &Range<Key>);
}
pub trait LazyRangeInitializer<Result: RangeQueryResult<Key>, Key> {
fn get(&self, range: &Range<Key>) -> Result;
}
/// Should be a monoid:
/// * Identity element: for all op: compose(no_op(), op) == compose(op, no_op()) == op
/// * Associativity: for all op_1, op_2, op_3: compose(compose(op_1, op_2), op_3) == compose(op_1, compose(op_2, op_3))
///
/// Should act on Result from the left:
/// * Identity operation: for all r: no_op().apply(r) == r
/// * Compatibility: for all op_1, op_2, r: op_1.apply(op_2.apply(r)) == compose(op_1, op_2).apply(r)
pub trait RangeModification<Key> {
type Result: RangeQueryResult<Key>;
fn no_op() -> Self;
fn is_no_op(&self) -> bool;
fn is_reinitialization(&self) -> bool;
fn apply(&self, result: &mut Self::Result, range: &Range<Key>);
fn compose(later: &Self, earlier: &mut Self);
}
pub trait VecReadableVersion<Modification: RangeModification<Key>, Key> {
fn get(&self, keys: &Range<Key>) -> Modification::Result;
}
// TODO: use trait alias when stabilized
pub trait VecFrozenVersion<Modification: RangeModification<Key>, Key>:
Clone + VecReadableVersion<Modification, Key>
{
}
impl<
T: Clone + VecReadableVersion<Modification, Key>,
Modification: RangeModification<Key>,
Key,
> VecFrozenVersion<Modification, Key> for T
{
}
pub trait PersistentVecStorage<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key,
>: VecReadableVersion<Modification, Key>
{
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self;
type FrozenVersion: VecFrozenVersion<Modification, Key>;
fn modify(&mut self, keys: &Range<Key>, modification: &Modification);
fn freeze(&mut self) -> Self::FrozenVersion;
}
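As a hypothetical illustration of the monoid contract stated in the doc comments above (not part of this change), a minimal RangeQueryResult implementation that merely counts covered keys might look like this:

use persistent_range_query::RangeQueryResult;
use std::ops::Range;

// Illustrative only: counts how many individual keys a range covers.
// `combine` is associative and `new_for_empty_range` is its identity element.
#[derive(Clone, Debug, PartialEq, Eq)]
struct CountResult {
    count: usize,
}

impl RangeQueryResult<usize> for CountResult {
    fn new_for_empty_range() -> Self {
        CountResult { count: 0 }
    }

    fn combine(
        left: &Self,
        _left_range: &Range<usize>,
        right: &Self,
        _right_range: &Range<usize>,
    ) -> Self {
        CountResult {
            count: left.count + right.count,
        }
    }

    fn add(left: &mut Self, _left_range: &Range<usize>, right: &Self, _right_range: &Range<usize>) {
        left.count += right.count;
    }
}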

View File

@@ -0,0 +1,115 @@
use crate::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::marker::PhantomData;
use std::ops::Range;
use std::rc::Rc;
pub struct NaiveFrozenVersion<Modification: RangeModification<Key>, Key> {
all_keys: Range<Key>,
values: Rc<Box<Vec<Modification::Result>>>,
}
pub trait IndexableKey: Clone {
fn index(all_keys: &Range<Self>, key: &Self) -> usize;
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self>;
}
fn get<Modification: RangeModification<Key>, Key: IndexableKey>(
all_keys: &Range<Key>,
values: &Vec<Modification::Result>,
keys: &Range<Key>,
) -> Modification::Result {
let mut result = Modification::Result::new_for_empty_range();
let mut result_range = keys.start.clone()..keys.start.clone();
for index in
IndexableKey::index(&all_keys, &keys.start)..IndexableKey::index(&all_keys, &keys.end)
{
let element_range = IndexableKey::element_range(&all_keys, index);
Modification::Result::add(&mut result, &result_range, &values[index], &element_range);
result_range.end = element_range.end;
}
result
}
impl<Modification: RangeModification<Key>, Key: IndexableKey> VecReadableVersion<Modification, Key>
for NaiveFrozenVersion<Modification, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get::<Modification, Key>(&self.all_keys, &self.values, keys)
}
}
// Manual implementation of `Clone` because `derive` requires `Modification: Clone`
impl<Modification: RangeModification<Key>, Key: Clone> Clone
for NaiveFrozenVersion<Modification, Key>
{
fn clone(&self) -> Self {
Self {
all_keys: self.all_keys.clone(),
values: self.values.clone(),
}
}
}
// TODO: is it at all possible to store previous versions in this struct,
// without any Rc<>?
pub struct NaiveVecStorage<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> {
all_keys: Range<Key>,
last_version: Vec<Modification::Result>,
_initializer: PhantomData<Initializer>,
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> VecReadableVersion<Modification, Key> for NaiveVecStorage<Modification, Initializer, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get::<Modification, Key>(&self.all_keys, &self.last_version, keys)
}
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> PersistentVecStorage<Modification, Initializer, Key>
for NaiveVecStorage<Modification, Initializer, Key>
{
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
let mut values = Vec::with_capacity(IndexableKey::index(&all_keys, &all_keys.end));
for index in 0..values.capacity() {
values.push(initializer.get(&IndexableKey::element_range(&all_keys, index)));
}
NaiveVecStorage {
all_keys,
last_version: values,
_initializer: PhantomData,
}
}
type FrozenVersion = NaiveFrozenVersion<Modification, Key>;
fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
for index in IndexableKey::index(&self.all_keys, &keys.start)
..IndexableKey::index(&self.all_keys, &keys.end)
{
let element_range = IndexableKey::element_range(&self.all_keys, index);
modification.apply(&mut self.last_version[index], &element_range);
}
}
fn freeze(&mut self) -> Self::FrozenVersion {
NaiveFrozenVersion::<Modification, Key> {
all_keys: self.all_keys.clone(),
values: Rc::new(Box::new(self.last_version.clone())),
}
}
}

View File

@@ -0,0 +1,14 @@
pub mod rsq;
#[derive(Copy, Clone, Debug)]
pub struct SameElementsInitializer<T> {
initial_element_value: T,
}
impl<T> SameElementsInitializer<T> {
pub fn new(initial_element_value: T) -> Self {
SameElementsInitializer {
initial_element_value,
}
}
}

View File

@@ -0,0 +1,118 @@
//! # Range Sum Query
use crate::ops::SameElementsInitializer;
use crate::{LazyRangeInitializer, RangeModification, RangeQueryResult};
use std::borrow::Borrow;
use std::ops::{Add, AddAssign, Range};
// TODO: commutative Add
#[derive(Clone, Copy, Debug)]
pub struct SumResult<T> {
sum: T,
}
impl<T> SumResult<T> {
pub fn sum(&self) -> &T {
&self.sum
}
}
impl<T: Clone + for<'a> AddAssign<&'a T> + From<u8>, Key> RangeQueryResult<Key> for SumResult<T>
where
for<'a> &'a T: Add<&'a T, Output = T>,
{
fn new_for_empty_range() -> Self {
SumResult { sum: 0.into() }
}
fn combine(
left: &Self,
_left_range: &Range<Key>,
right: &Self,
_right_range: &Range<Key>,
) -> Self {
SumResult {
sum: &left.sum + &right.sum,
}
}
fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
left.sum += &right.sum
}
}
pub trait SumOfSameElements<Key> {
fn sum(initial_element_value: &Self, keys: &Range<Key>) -> Self;
}
impl<T: SumOfSameElements<Key>, TB: Borrow<T>, Key> LazyRangeInitializer<SumResult<T>, Key>
for SameElementsInitializer<TB>
where
SumResult<T>: RangeQueryResult<Key>,
{
fn get(&self, range: &Range<Key>) -> SumResult<T> {
SumResult {
sum: SumOfSameElements::sum(self.initial_element_value.borrow(), range),
}
}
}
#[derive(Copy, Clone, Debug)]
pub enum AddAssignModification<T> {
None,
Add(T),
Assign(T),
}
impl<T: Clone + for<'a> AddAssign<&'a T>, Key> RangeModification<Key> for AddAssignModification<T>
where
SumResult<T>: RangeQueryResult<Key>,
for<'a> SameElementsInitializer<&'a T>: LazyRangeInitializer<SumResult<T>, Key>,
{
type Result = SumResult<T>;
fn no_op() -> Self {
AddAssignModification::None
}
fn is_no_op(&self) -> bool {
match self {
AddAssignModification::None => true,
_ => false,
}
}
fn is_reinitialization(&self) -> bool {
match self {
AddAssignModification::Assign(_) => true,
_ => false,
}
}
fn apply(&self, result: &mut SumResult<T>, range: &Range<Key>) {
use AddAssignModification::*;
match self {
None => {}
Add(x) | Assign(x) => {
let to_add = SameElementsInitializer::new(x).get(range).sum;
if let Assign(_) = self {
result.sum = to_add;
} else {
result.sum += &to_add;
}
}
}
}
fn compose(later: &Self, earlier: &mut Self) {
use AddAssignModification::*;
match (later, earlier) {
(_, e @ None) => *e = later.clone(),
(None, _) => {}
(Assign(_), e) => *e = later.clone(),
(Add(x), Add(y)) => *y += x,
(Add(x), Assign(value)) => *value += x,
}
}
}

View File

@@ -0,0 +1,255 @@
//! # Segment Tree
//! It is a competitive programming folklore data structure. Do not confuse it with the interval tree.
use crate::{LazyRangeInitializer, PersistentVecStorage, RangeQueryResult, VecReadableVersion};
use std::ops::Range;
use std::rc::Rc;
pub trait MidpointableKey: Clone + Ord + Sized {
fn midpoint(range: &Range<Self>) -> Self;
}
pub trait RangeModification<Key>: Clone + crate::RangeModification<Key> {}
// TODO: use trait alias when stabilized
impl<T: Clone + crate::RangeModification<Key>, Key> RangeModification<Key> for T {}
#[derive(Debug)]
struct Node<Modification: RangeModification<Key>, Key> {
result: Modification::Result,
modify_children: Modification,
left: Option<Rc<Self>>,
right: Option<Rc<Self>>,
}
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
impl<Modification: RangeModification<Key>, Key> Clone for Node<Modification, Key> {
fn clone(&self) -> Self {
Node {
result: self.result.clone(),
modify_children: self.modify_children.clone(),
left: self.left.clone(),
right: self.right.clone(),
}
}
}
impl<Modification: RangeModification<Key>, Key> Node<Modification, Key> {
fn new<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
range: &Range<Key>,
initializer: &Initializer,
) -> Self {
Node {
result: initializer.get(range),
modify_children: Modification::no_op(),
left: None,
right: None,
}
}
pub fn apply(&mut self, modification: &Modification, range: &Range<Key>) {
modification.apply(&mut self.result, range);
Modification::compose(modification, &mut self.modify_children);
if self.modify_children.is_reinitialization() {
self.left = None;
self.right = None;
}
}
pub fn force_children<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
&mut self,
initializer: &Initializer,
range_left: &Range<Key>,
range_right: &Range<Key>,
) {
let left = Rc::make_mut(
self.left
.get_or_insert_with(|| Rc::new(Node::new(&range_left, initializer))),
);
let right = Rc::make_mut(
self.right
.get_or_insert_with(|| Rc::new(Node::new(&range_right, initializer))),
);
left.apply(&self.modify_children, &range_left);
right.apply(&self.modify_children, &range_right);
self.modify_children = Modification::no_op();
}
pub fn recalculate_from_children(&mut self, range_left: &Range<Key>, range_right: &Range<Key>) {
assert!(self.modify_children.is_no_op());
assert!(self.left.is_some());
assert!(self.right.is_some());
self.result = Modification::Result::combine(
&self.left.as_ref().unwrap().result,
&range_left,
&self.right.as_ref().unwrap().result,
&range_right,
);
}
}
fn split_range<Key: MidpointableKey>(range: &Range<Key>) -> (Range<Key>, Range<Key>) {
let range_left = range.start.clone()..MidpointableKey::midpoint(range);
let range_right = range_left.end.clone()..range.end.clone();
(range_left, range_right)
}
pub struct PersistentSegmentTreeVersion<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: Clone,
> {
root: Rc<Node<Modification, Key>>,
all_keys: Range<Key>,
initializer: Rc<Initializer>,
}
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: Clone,
> Clone for PersistentSegmentTreeVersion<Modification, Initializer, Key>
{
fn clone(&self) -> Self {
Self {
root: self.root.clone(),
all_keys: self.all_keys.clone(),
initializer: self.initializer.clone(),
}
}
}
fn get<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(
node: &mut Rc<Node<Modification, Key>>,
node_keys: &Range<Key>,
initializer: &Initializer,
keys: &Range<Key>,
) -> Modification::Result {
if node_keys.end <= keys.start || keys.end <= node_keys.start {
return Modification::Result::new_for_empty_range();
}
if keys.start <= node_keys.start && node_keys.end <= keys.end {
return node.result.clone();
}
let node = Rc::make_mut(node);
let (left_keys, right_keys) = split_range(node_keys);
node.force_children(initializer, &left_keys, &right_keys);
let mut result = get(node.left.as_mut().unwrap(), &left_keys, initializer, keys);
Modification::Result::add(
&mut result,
&left_keys,
&get(node.right.as_mut().unwrap(), &right_keys, initializer, keys),
&right_keys,
);
result
}
fn modify<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(
node: &mut Rc<Node<Modification, Key>>,
node_keys: &Range<Key>,
initializer: &Initializer,
keys: &Range<Key>,
modification: &Modification,
) {
if modification.is_no_op() || node_keys.end <= keys.start || keys.end <= node_keys.start {
return;
}
let node = Rc::make_mut(node);
if keys.start <= node_keys.start && node_keys.end <= keys.end {
node.apply(modification, node_keys);
return;
}
let (left_keys, right_keys) = split_range(node_keys);
node.force_children(initializer, &left_keys, &right_keys);
modify(
node.left.as_mut().unwrap(),
&left_keys,
initializer,
keys,
&modification,
);
modify(
node.right.as_mut().unwrap(),
&right_keys,
initializer,
keys,
&modification,
);
node.recalculate_from_children(&left_keys, &right_keys);
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> VecReadableVersion<Modification, Key>
for PersistentSegmentTreeVersion<Modification, Initializer, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get(
&mut self.root.clone(), // TODO: do not always force a branch
&self.all_keys,
self.initializer.as_ref(),
keys,
)
}
}
pub struct PersistentSegmentTree<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(PersistentSegmentTreeVersion<Modification, Initializer, Key>);
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> VecReadableVersion<Modification, Key>
for PersistentSegmentTree<Modification, Initializer, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
self.0.get(keys)
}
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> PersistentVecStorage<Modification, Initializer, Key>
for PersistentSegmentTree<Modification, Initializer, Key>
{
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
PersistentSegmentTree(PersistentSegmentTreeVersion {
root: Rc::new(Node::new(&all_keys, &initializer)),
all_keys: all_keys,
initializer: Rc::new(initializer),
})
}
type FrozenVersion = PersistentSegmentTreeVersion<Modification, Initializer, Key>;
fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
modify(
&mut self.0.root, // TODO: do not always force a branch
&self.0.all_keys,
self.0.initializer.as_ref(),
keys,
modification,
)
}
fn freeze(&mut self) -> Self::FrozenVersion {
self.0.clone()
}
}
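A hypothetical end-to-end usage sketch of the range-sum-query pieces above, assuming the public paths shown in this diff; the `Pos` key type, the values, and the assertions are illustrative, not taken from the crate's own tests:

use persistent_range_query::ops::rsq::{AddAssignModification, SumOfSameElements};
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
use std::ops::Range;

// A newtype key: the crate's key traits can't be implemented for plain
// integers from outside the crate.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct Pos(u32);

impl MidpointableKey for Pos {
    fn midpoint(range: &Range<Self>) -> Self {
        Pos(range.start.0 + (range.end.0 - range.start.0) / 2)
    }
}

impl SumOfSameElements<Pos> for i64 {
    fn sum(initial_element_value: &Self, keys: &Range<Pos>) -> Self {
        initial_element_value * i64::from(keys.end.0 - keys.start.0)
    }
}

fn main() {
    // All 8 positions start at 0.
    let mut s: PersistentSegmentTree<
        AddAssignModification<i64>,
        SameElementsInitializer<i64>,
        Pos,
    > = PersistentSegmentTree::new(Pos(0)..Pos(8), SameElementsInitializer::new(0i64));

    // Add 5 to positions 2..6, then remember this version.
    s.modify(&(Pos(2)..Pos(6)), &AddAssignModification::Add(5));
    assert_eq!(*s.get(&(Pos(0)..Pos(8))).sum(), 20);
    let old = s.freeze();

    // Overwrite everything with 1 in the latest version only.
    s.modify(&(Pos(0)..Pos(8)), &AddAssignModification::Assign(1));
    assert_eq!(*s.get(&(Pos(0)..Pos(8))).sum(), 8);

    // The frozen version still sees the old contents.
    assert_eq!(*old.get(&(Pos(2)..Pos(4))).sum(), 10);
}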

View File

@@ -0,0 +1,295 @@
use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::cmp::Ordering;
use std::ops::Range;
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct PageIndex(u32);
type LayerId = String;
impl IndexableKey for PageIndex {
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
(key.0 as usize) - (all_keys.start.0 as usize)
}
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
PageIndex(all_keys.start.0 + index as u32)..PageIndex(all_keys.start.0 + index as u32 + 1)
}
}
impl MidpointableKey for PageIndex {
fn midpoint(range: &Range<Self>) -> Self {
PageIndex(range.start.0 + (range.end.0 - range.start.0) / 2)
}
}
#[derive(Clone, Debug, Eq, PartialEq)]
struct LayerMapInformation {
// Only makes sense for a range of length 1.
last_layer: Option<LayerId>,
last_image_layer: Option<LayerId>,
// Works for all ranges
max_delta_layers: (usize, Range<PageIndex>),
}
impl LayerMapInformation {
fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
(&self.last_layer, &self.last_image_layer)
}
fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
&self.max_delta_layers
}
}
fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
if left.is_empty() {
right.clone()
} else if right.is_empty() {
left.clone()
} else if left.end == right.start {
left.start..right.end
} else {
left.clone()
}
}
impl RangeQueryResult<PageIndex> for LayerMapInformation {
fn new_for_empty_range() -> Self {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, PageIndex(0)..PageIndex(0)),
}
}
fn combine(
left: &Self,
_left_range: &Range<PageIndex>,
right: &Self,
_right_range: &Range<PageIndex>,
) -> Self {
// Note that either range may be empty.
LayerMapInformation {
last_layer: left
.last_layer
.as_ref()
.or_else(|| right.last_layer.as_ref())
.cloned(),
last_image_layer: left
.last_image_layer
.as_ref()
.or_else(|| right.last_image_layer.as_ref())
.cloned(),
max_delta_layers: match left.max_delta_layers.0.cmp(&right.max_delta_layers.0) {
Ordering::Less => right.max_delta_layers.clone(),
Ordering::Greater => left.max_delta_layers.clone(),
Ordering::Equal => (
left.max_delta_layers.0,
merge_ranges(&left.max_delta_layers.1, &right.max_delta_layers.1),
),
},
}
}
fn add(
left: &mut Self,
left_range: &Range<PageIndex>,
right: &Self,
right_range: &Range<PageIndex>,
) {
*left = Self::combine(&left, left_range, right, right_range);
}
}
#[derive(Clone, Debug)]
struct AddDeltaLayers {
last_layer: LayerId,
count: usize,
}
#[derive(Clone, Debug)]
struct LayerMapModification {
add_image_layer: Option<LayerId>,
add_delta_layers: Option<AddDeltaLayers>,
}
impl LayerMapModification {
fn add_image_layer(layer: impl Into<LayerId>) -> Self {
LayerMapModification {
add_image_layer: Some(layer.into()),
add_delta_layers: None,
}
}
fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: Some(AddDeltaLayers {
last_layer: layer.into(),
count: 1,
}),
}
}
}
impl RangeModification<PageIndex> for LayerMapModification {
type Result = LayerMapInformation;
fn no_op() -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: None,
}
}
fn is_no_op(&self) -> bool {
self.add_image_layer.is_none() && self.add_delta_layers.is_none()
}
fn is_reinitialization(&self) -> bool {
self.add_image_layer.is_some()
}
fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
if let Some(layer) = &self.add_image_layer {
result.last_layer = Some(layer.clone());
result.last_image_layer = Some(layer.clone());
result.max_delta_layers = (0, range.clone());
}
if let Some(AddDeltaLayers { last_layer, count }) = &self.add_delta_layers {
result.last_layer = Some(last_layer.clone());
result.max_delta_layers.0 += count;
}
}
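/// Folds `later` into `earlier`, so that applying the composed modification is equivalent to applying `earlier` first and then `later`.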
fn compose(later: &Self, earlier: &mut Self) {
if later.add_image_layer.is_some() {
*earlier = later.clone();
return;
}
if let Some(AddDeltaLayers { last_layer, count }) = &later.add_delta_layers {
let res = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
last_layer: LayerId::default(),
count: 0,
});
res.last_layer = last_layer.clone();
res.count += count;
}
}
}
impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, range.clone()),
}
}
}
fn test_layer_map<
S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
>() {
let mut s = S::new(
PageIndex(0)..PageIndex(100),
SameElementsInitializer::new(()),
);
s.modify(
&(PageIndex(0)..PageIndex(70)),
&LayerMapModification::add_image_layer("Img0..70"),
);
s.modify(
&(PageIndex(50)..PageIndex(100)),
&LayerMapModification::add_image_layer("Img50..100"),
);
s.modify(
&(PageIndex(10)..PageIndex(60)),
&LayerMapModification::add_delta_layer("Delta10..60"),
);
let s_before_last_delta = s.freeze();
s.modify(
&(PageIndex(20)..PageIndex(80)),
&LayerMapModification::add_delta_layer("Delta20..80"),
);
assert_eq!(
s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
(&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
);
assert_eq!(
s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
(
&Some("Delta10..60".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
(
&Some("Img50..100".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
&(2, PageIndex(20)..PageIndex(60)),
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(0)..PageIndex(100)))
.max_delta_layers(),
(1, PageIndex(10)..PageIndex(60)),
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
(2, PageIndex(20)..PageIndex(30))
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
(1, PageIndex(10)..PageIndex(20))
);
assert_eq!(
*s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
(1, PageIndex(70)..PageIndex(80))
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(70)..PageIndex(80)))
.max_delta_layers(),
(0, PageIndex(70)..PageIndex(80))
);
}
#[test]
fn test_naive() {
test_layer_map::<NaiveVecStorage<_, _, _>>();
}
#[test]
fn test_segment_tree() {
test_layer_map::<PersistentSegmentTree<_, _, _>>();
}

View File

@@ -0,0 +1,116 @@
use persistent_range_query::naive::*;
use persistent_range_query::ops::rsq::AddAssignModification::Add;
use persistent_range_query::ops::rsq::*;
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
use rand::{Rng, SeedableRng};
use std::ops::Range;
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct K(u16);
impl IndexableKey for K {
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
(key.0 as usize) - (all_keys.start.0 as usize)
}
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
K(all_keys.start.0 + index as u16)..K(all_keys.start.0 + index as u16 + 1)
}
}
impl SumOfSameElements<K> for i32 {
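// A range of identical elements sums to the element value times the number of keys in the range.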
fn sum(initial_element_value: &Self, keys: &Range<K>) -> Self {
initial_element_value * (keys.end.0 - keys.start.0) as Self
}
}
impl MidpointableKey for K {
fn midpoint(range: &Range<Self>) -> Self {
K(range.start.0 + (range.end.0 - range.start.0) / 2)
}
}
fn test_storage<
S: PersistentVecStorage<AddAssignModification<i32>, SameElementsInitializer<i32>, K>,
>() {
let mut s = S::new(K(0)..K(12), SameElementsInitializer::new(0i32));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 0);
s.modify(&(K(2)..K(5)), &AddAssignModification::Add(3));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 3 + 3);
let s_old = s.freeze();
s.modify(&(K(3)..K(6)), &AddAssignModification::Assign(10));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 10 + 10);
s.modify(&(K(4)..K(7)), &AddAssignModification::Add(2));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 12 + 12 + 2);
assert_eq!(*s.get(&(K(4)..K(6))).sum(), 12 + 12);
assert_eq!(*s_old.get(&(K(4)..K(6))).sum(), 3);
}
#[test]
fn test_naive() {
test_storage::<NaiveVecStorage<_, _, _>>();
}
#[test]
fn test_segment_tree() {
test_storage::<PersistentSegmentTree<_, _, _>>();
}
#[test]
fn test_stress() {
const LEN: u16 = 17_238;
const OPERATIONS: i32 = 20_000;
let mut rng = rand::rngs::StdRng::seed_from_u64(0);
let mut naive: NaiveVecStorage<AddAssignModification<i32>, _, _> =
NaiveVecStorage::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
let mut segm_tree: PersistentSegmentTree<AddAssignModification<i32>, _, _> =
PersistentSegmentTree::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
fn gen_range(rng: &mut impl Rng) -> Range<K> {
let l: u16 = rng.gen_range(0..LEN);
let r: u16 = rng.gen_range(0..LEN);
if l <= r {
K(l)..K(r)
} else {
K(r)..K(l)
}
}
for _ in 0..2 {
let checksum_range = gen_range(&mut rng);
let checksum_before: i32 = *naive.get(&checksum_range).sum();
assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
let naive_before = naive.freeze();
let segm_tree_before = segm_tree.freeze();
assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
for _ in 0..OPERATIONS {
{
let range = gen_range(&mut rng);
assert_eq!(naive.get(&range).sum(), segm_tree.get(&range).sum());
}
{
let range = gen_range(&mut rng);
let val = rng.gen_range(-10i32..=10i32);
let op = Add(val);
naive.modify(&range, &op);
segm_tree.modify(&range, &op);
}
}
assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
assert_eq!(
checksum_before,
*segm_tree_before.get(&checksum_range).sum()
);
}
}

View File

@@ -33,8 +33,8 @@ pub struct Segment {
/// Logical size before this state
start_size: u64,
/// Logical size at this state
pub end_size: u64,
/// Logical size at this state. Can be None in the last Segment of a branch.
pub end_size: Option<u64>,
/// Indices to [`Storage::segments`]
///
@@ -115,7 +115,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
start_lsn: 0,
end_lsn: 0,
start_size: 0,
end_size: 0,
end_size: Some(0),
children_after: Vec::new(),
};
@@ -125,6 +125,39 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
}
}
/// Advances the branch with a new point, at given LSN.
pub fn insert_point<Q: ?Sized>(
&mut self,
branch: &Q,
op: Cow<'static, str>,
lsn: u64,
size: Option<u64>,
) where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
let lastseg_id = *self.branches.get(branch).unwrap();
let newseg_id = self.segments.len();
let lastseg = &mut self.segments[lastseg_id];
assert!(lsn > lastseg.end_lsn);
let newseg = Segment {
op,
parent: Some(lastseg_id),
start_lsn: lastseg.end_lsn,
end_lsn: lsn,
start_size: lastseg.end_size.unwrap(),
end_size: size,
children_after: Vec::new(),
needed: false,
};
lastseg.children_after.push(newseg_id);
self.segments.push(newseg);
*self.branches.get_mut(branch).expect("read already") = newseg_id;
}
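For orientation, a hedged sketch of how a caller can drive insert_point with absolute values (the branch name and numbers are purely illustrative; Storage::new is given the key of the initial branch, as in size.rs further down):

let mut storage = tenant_size_model::Storage::<&'static str>::new("main");
// Advance "main" to an absolute LSN with a known logical size ...
storage.insert_point("main", "checkpoint".into(), 0x100, Some(10_000));
// ... and later mark the open end of the branch, where no end size is computed.
storage.insert_point("main", "end".into(), 0x180, None);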
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
pub fn modify_branch<Q: ?Sized>(
&mut self,
@@ -145,8 +178,8 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
parent: Some(lastseg_id),
start_lsn: lastseg.end_lsn,
end_lsn: lastseg.end_lsn + lsn_bytes,
start_size: lastseg.end_size,
end_size: (lastseg.end_size as i64 + size_bytes) as u64,
start_size: lastseg.end_size.unwrap(),
end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
children_after: Vec::new(),
needed: false,
};
@@ -321,7 +354,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
Some(SegmentSize {
seg_id,
method: SnapshotAfter,
this_size: seg.end_size,
this_size: seg.end_size.unwrap(),
children,
})
} else {

View File

@@ -174,7 +174,7 @@ fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
let seg_id = node.seg_id;
let seg = segments.get(seg_id).unwrap();
let lsn = seg.end_lsn;
let size = seg.end_size;
let size = seg.end_size.unwrap_or(0);
let method = node.method;
println!(" {{");
@@ -226,7 +226,7 @@ fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
print!(
" label=\"{} / {}\"",
next.end_lsn - seg.end_lsn,
(next.end_size as i128 - seg.end_size as i128)
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
);
} else {
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);

View File

@@ -48,6 +48,25 @@ pub mod nonblock;
// Default signal handling
pub mod signals;
/// use with fail::cfg("$name", "return(2000)")
#[macro_export]
macro_rules! failpoint_sleep_millis_async {
($name:literal) => {{
let should_sleep: Option<std::time::Duration> = (|| {
fail::fail_point!($name, |v: Option<_>| {
let millis = v.unwrap().parse::<u64>().unwrap();
Some(Duration::from_millis(millis))
});
None
})();
if let Some(d) = should_sleep {
tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
tokio::time::sleep(d).await;
tracing::info!("failpoint {:?}: sleep done", $name);
}
}};
}
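For orientation, a hypothetical call site (the failpoint name is illustrative). The macro sleeps only when the failpoint is armed, e.g. via fail::cfg("my-failpoint", "return(2000)") or the FAILPOINTS environment variable:

// Inside an async function; a no-op unless "my-failpoint" is configured.
utils::failpoint_sleep_millis_async!("my-failpoint");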
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -199,6 +199,20 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
logging::init(conf.log_format)?;
info!("version: {}", version());
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
let failpoints = fail::list();
if !failpoints.is_empty() {
info!(
"started with failpoints: {}",
failpoints
.iter()
.map(|(name, actions)| format!("{name}={actions}"))
.collect::<Vec<String>>()
.join(";")
)
}
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {

View File

@@ -667,6 +667,7 @@ components:
- disk_consistent_lsn
- awaits_download
- state
- latest_gc_cutoff_lsn
properties:
timeline_id:
type: string
@@ -711,6 +712,9 @@ components:
type: boolean
state:
type: string
latest_gc_cutoff_lsn:
type: string
format: hex
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,

View File

@@ -461,14 +461,7 @@ impl Tenant {
.context("Cannot branch off the timeline that's not present in pageserver")?;
if let Some(lsn) = ancestor_start_lsn.as_mut() {
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
*lsn = lsn.align();
ancestor_timeline.wait_lsn(*lsn).await?;
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
@@ -480,6 +473,14 @@ impl Tenant {
ancestor_ancestor_lsn,
);
}
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline.wait_lsn(*lsn).await?;
}
self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
@@ -1010,6 +1011,10 @@ impl Tenant {
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
info!("starting on {} timelines", gc_timelines.len());
// Perform GC for each timeline.
//
// Note that we don't hold the GC lock here because we don't want

View File

@@ -183,6 +183,19 @@ pub(super) async fn gather_inputs(
}
}
// all timelines also have an end point if they have made any progress
if last_record_lsn > timeline.get_ancestor_lsn()
&& !interesting_lsns
.iter()
.any(|(lsn, _)| lsn == &last_record_lsn)
{
updates.push(Update {
lsn: last_record_lsn,
command: Command::EndOfBranch,
timeline_id: timeline.timeline_id,
});
}
timeline_inputs.insert(
timeline.timeline_id,
TimelineInputs {
@@ -270,48 +283,22 @@ impl ModelInputs {
// impossible to always determine a single main branch.
let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
// tracking these not to require modifying the current implementation of the size model,
// which works in relative LSNs and sizes.
let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
for update in &self.updates {
let Update {
lsn,
command: op,
timeline_id,
} = update;
let Lsn(now) = *lsn;
match op {
Command::Update(sz) => {
let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
anyhow::anyhow!(
"ordering-mismatch: there must had been a previous state for {timeline_id}"
)
})?;
let lsn_bytes = {
let Lsn(now) = lsn;
let Lsn(prev) = latest.0;
debug_assert!(prev <= *now, "self.updates should have been sorted");
now - prev
};
let size_diff =
i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
format!("size difference i64 overflow for {timeline_id}")
})?;
storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
*latest = (*lsn, *sz);
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
}
Command::EndOfBranch => {
storage.insert_point(&Some(*timeline_id), "".into(), now, None);
}
Command::BranchFrom(parent) => {
storage.branch(parent, Some(*timeline_id));
let size = parent
.as_ref()
.and_then(|id| last_state.get(id))
.map(|x| x.1)
.unwrap_or(0);
last_state.insert(*timeline_id, (*lsn, size));
}
}
}
@@ -320,10 +307,7 @@ impl ModelInputs {
}
}
/// Single size model update.
///
/// Sizing model works with relative increments over latest branch state.
/// Updates are absolute, so additional state needs to be tracked when applying.
/// A point of interest in the tree of branches
#[serde_with::serde_as]
#[derive(
Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
@@ -342,6 +326,7 @@ struct Update {
enum Command {
Update(u64),
BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
EndOfBranch,
}
impl std::fmt::Debug for Command {
@@ -351,6 +336,7 @@ impl std::fmt::Debug for Command {
match self {
Self::Update(arg0) => write!(f, "Update({arg0})"),
Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
Self::EndOfBranch => write!(f, "EndOfBranch"),
}
}
}

View File

@@ -61,6 +61,13 @@ use crate::{
storage_sync::{self, index::LayerFileMetadata},
};
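/// State of the layer-flushing background task: it moves from NotStarted to Running when spawned and from Running to Exited when the loop returns; an exited loop is never restarted.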
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum FlushLoopState {
NotStarted,
Running,
Exited,
}
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -122,7 +129,7 @@ pub struct Timeline {
write_lock: Mutex<()>,
/// Used to avoid multiple `flush_loop` tasks running
flush_loop_started: Mutex<bool>,
flush_loop_state: Mutex<FlushLoopState>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// The value is a counter, incremented every time a new flush cycle is requested.
@@ -755,7 +762,7 @@ impl Timeline {
upload_layers: AtomicBool::new(upload_layers),
flush_loop_started: Mutex::new(false),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
layer_flush_start_tx,
layer_flush_done_tx,
@@ -794,13 +801,23 @@ impl Timeline {
}
pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
if *flush_loop_started {
info!(
"skipping attempt to start flush_loop twice {}/{}",
self.tenant_id, self.timeline_id
);
return;
let mut flush_loop_state = self.flush_loop_state.lock().unwrap();
match *flush_loop_state {
FlushLoopState::NotStarted => (),
FlushLoopState::Running => {
info!(
"skipping attempt to start flush_loop twice {}/{}",
self.tenant_id, self.timeline_id
);
return;
}
FlushLoopState::Exited => {
warn!(
"ignoring attempt to restart exited flush_loop {}/{}",
self.tenant_id, self.timeline_id
);
return;
}
}
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
@@ -813,11 +830,16 @@ impl Timeline {
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
async move {
self_clone.flush_loop(layer_flush_start_rx).await;
let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap();
assert_eq!(*flush_loop_state, FlushLoopState::Running);
*flush_loop_state = FlushLoopState::Exited;
Ok(()) }
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
);
*flush_loop_started = true;
*flush_loop_state = FlushLoopState::Running;
}
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
@@ -1365,8 +1387,9 @@ impl Timeline {
// finished, instead of some other flush that was started earlier.
let mut my_flush_request = 0;
if !&*self.flush_loop_started.lock().unwrap() {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if flush_loop_state != FlushLoopState::Running {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|counter| {

View File

@@ -71,7 +71,7 @@ async fn compaction_loop(tenant_id: TenantId) {
let mut sleep_duration = tenant.get_compaction_period();
if let Err(e) = tenant.compaction_iteration() {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
}
// Sleep
@@ -120,7 +120,7 @@ async fn gc_loop(tenant_id: TenantId) {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
}
}

View File

@@ -32,11 +32,6 @@
#define PageStoreTrace DEBUG5
#define NEON_TAG "[NEON_SMGR] "
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true)))
bool connected = false;
PGconn *pageserver_conn = NULL;
@@ -97,11 +92,10 @@ pageserver_connect()
while (PQisBusy(pageserver_conn))
{
int wc;
WaitEvent event;
/* Sleep until there's something to do */
wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -141,11 +135,10 @@ retry:
if (ret == 0)
{
int wc;
WaitEvent event;
/* Sleep until there's something to do */
wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
@@ -239,6 +232,9 @@ pageserver_receive(void)
StringInfoData resp_buff;
NeonResponse *resp;
if (!connected)
return NULL;
PG_TRY();
{
/* read response */
@@ -248,7 +244,10 @@ pageserver_receive(void)
if (resp_buff.len < 0)
{
if (resp_buff.len == -1)
neon_log(ERROR, "end of COPY");
{
pageserver_disconnect();
return NULL;
}
else if (resp_buff.len == -2)
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
}

View File

@@ -49,6 +49,11 @@ typedef struct
#define messageTag(m) (((const NeonMessage *)(m))->tag)
#define NEON_TAG "[NEON_SMGR] "
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true)))
/*
* supertype of all the Neon*Request structs below
*

View File

@@ -251,9 +251,9 @@ XLogRecPtr prefetch_lsn = 0;
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
static void prefetch_read(PrefetchRequest *slot);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
static void prefetch_wait_for(uint64 ring_index);
static bool prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup(void);
static inline void prefetch_set_unused(uint64 ring_index);
@@ -393,7 +393,7 @@ prefetch_cleanup(void)
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static void
static bool
prefetch_wait_for(uint64 ring_index)
{
PrefetchRequest *entry;
@@ -412,8 +412,10 @@ prefetch_wait_for(uint64 ring_index)
entry = GetPrfSlot(MyPState->ring_receive);
Assert(entry->status == PRFS_REQUESTED);
prefetch_read(entry);
if (!prefetch_read(entry))
return false;
}
return true;
}
/*
@@ -425,7 +427,7 @@ prefetch_wait_for(uint64 ring_index)
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static void
static bool
prefetch_read(PrefetchRequest *slot)
{
NeonResponse *response;
@@ -438,15 +440,22 @@ prefetch_read(PrefetchRequest *slot)
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive();
MemoryContextSwitchTo(old);
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
MyPState->ring_receive += 1;
if (response)
{
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
MyPState->ring_receive += 1;
/* update slot state */
slot->status = PRFS_RECEIVED;
slot->response = response;
/* update slot state */
slot->status = PRFS_RECEIVED;
slot->response = response;
return true;
}
else
{
return false;
}
}
/*
@@ -746,11 +755,16 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
static NeonResponse *
page_server_request(void const *req)
{
page_server->send((NeonRequest *) req);
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
return page_server->receive();
NeonResponse* resp;
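/* receive() may now return NULL if the pageserver connection was lost mid-request; in that case resend the whole request and wait for a response again. */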
do {
page_server->send((NeonRequest *) req);
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
resp = page_server->receive();
} while (resp == NULL);
return resp;
}
@@ -1635,7 +1649,8 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
BufferTag tag;
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
switch (reln->smgr_relpersistence)
{
@@ -1651,7 +1666,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
BufferTag tag = (BufferTag) {
tag = (BufferTag) {
.rnode = reln->smgr_rnode.node,
.forkNum = forknum,
.blockNum = blocknum
@@ -1755,22 +1770,24 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
}
}
if (entry == NULL)
do
{
n_prefetch_misses += 1;
if (entry == NULL)
{
n_prefetch_misses += 1;
ring_index = prefetch_register_buffer(buftag, &request_latest,
&request_lsn);
slot = GetPrfSlot(ring_index);
}
ring_index = prefetch_register_buffer(buftag, &request_latest,
&request_lsn);
slot = GetPrfSlot(ring_index);
}
Assert(slot->my_ring_index == ring_index);
Assert(MyPState->ring_last <= ring_index &&
MyPState->ring_unused > ring_index);
Assert(slot->status != PRFS_UNUSED);
Assert(GetPrfSlot(ring_index) == slot);
Assert(slot->my_ring_index == ring_index);
Assert(MyPState->ring_last <= ring_index &&
MyPState->ring_unused > ring_index);
Assert(slot->status != PRFS_UNUSED);
Assert(GetPrfSlot(ring_index) == slot);
prefetch_wait_for(ring_index);
} while (!prefetch_wait_for(ring_index));
Assert(slot->status == PRFS_RECEIVED);

View File

@@ -119,6 +119,7 @@ static TimestampTz last_reconnect_attempt;
static WalproposerShmemState * walprop_shared;
/* Prototypes for private functions */
static void WalProposerRegister(void);
static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
static void WalProposerStart(void);
static void WalProposerLoop(void);
@@ -455,7 +456,7 @@ WalProposerPoll(void)
/*
* Register a background worker proposing WAL to wal acceptors.
*/
void
static void
WalProposerRegister(void)
{
BackgroundWorker bgw;

View File

@@ -377,18 +377,18 @@ typedef struct Safekeeper
AppendResponse appendResponse; /* feedback for master */
} Safekeeper;
extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
void WalProposerPoll(void);
void WalProposerRegister(void);
void ParseReplicationFeedbackMessage(StringInfo reply_message,
ReplicationFeedback * rf);
extern void WalProposerSync(int argc, char *argv[]);
extern void WalProposerMain(Datum main_arg);
extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
extern void WalProposerPoll(void);
extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
ReplicationFeedback *rf);
extern void StartProposerReplication(StartReplicationCmd *cmd);
Size WalproposerShmemSize(void);
bool WalproposerShmemInit(void);
void replication_feedback_set(ReplicationFeedback * rf);
void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
extern Size WalproposerShmemSize(void);
extern bool WalproposerShmemInit(void);
extern void replication_feedback_set(ReplicationFeedback *rf);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
/* libpqwalproposer hooks & helper type */

View File

@@ -1,7 +1,7 @@
//! Client authentication mechanisms.
pub mod backend;
pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo};
pub use backend::{BackendType, ConsoleReqExtra};
mod credentials;
pub use credentials::ClientCredentials;

View File

@@ -12,7 +12,6 @@ use crate::{
waiters::{self, Waiter, Waiters},
};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
@@ -36,45 +35,6 @@ pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), wait
CPLANE_WAITERS.notify(psql_session_id, msg)
}
/// Compute node connection params provided by the cloud.
/// Note how it implements serde traits, since we receive it over the wire.
#[derive(Serialize, Deserialize, Default)]
pub struct DatabaseInfo {
pub host: String,
pub port: u16,
pub dbname: String,
pub user: String,
pub password: Option<String>,
}
// Manually implement debug to omit personal and sensitive info.
impl std::fmt::Debug for DatabaseInfo {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
fmt.debug_struct("DatabaseInfo")
.field("host", &self.host)
.field("port", &self.port)
.finish_non_exhaustive()
}
}
impl From<DatabaseInfo> for tokio_postgres::Config {
fn from(db_info: DatabaseInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
config
}
}
/// Extra query params we'd like to pass to the console.
pub struct ConsoleReqExtra<'a> {
/// A unique identifier for a connection.
@@ -158,54 +118,107 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
}
}
/// A product of successful authentication.
pub struct AuthSuccess<T> {
/// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to the client?
pub reported_auth_ok: bool,
/// Something to be considered a positive result.
pub value: T,
}
impl<T> AuthSuccess<T> {
/// Very similar to [`std::option::Option::map`].
/// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
/// a function to a contained value.
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
AuthSuccess {
reported_auth_ok: self.reported_auth_ok,
value: f(self.value),
}
}
}
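For orientation, a hypothetical use of map, wrapping a bare connection config into a NodeInfo the way authenticate does further down (the project name is illustrative):

let success = AuthSuccess { reported_auth_ok: false, value: compute::ConnCfg::new() };
let node: AuthSuccess<NodeInfo> = success.map(|config| NodeInfo {
    project: "illustrative-project".to_owned(),
    config,
});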
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
pub struct NodeInfo {
/// Project from [`auth::ClientCredentials`].
pub project: String,
/// Compute node connection params.
pub config: compute::ConnCfg,
}
impl BackendType<'_, ClientCredentials<'_>> {
/// Do something special if user didn't provide the `project` parameter.
async fn try_password_hack(
&mut self,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<Option<AuthSuccess<NodeInfo>>> {
use BackendType::*;
// If there's no project so far, that entails that the client doesn't
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in place of the password.
let fetch_magic_payload = async {
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
.authenticate()
.await?;
info!(project = &payload.project, "received missing parameter");
auth::Result::Ok(payload)
};
// TODO: find a proper way to merge those very similar blocks.
let (mut config, payload) = match self {
Console(endpoint, creds) if creds.project.is_none() => {
let payload = fetch_magic_payload.await?;
let mut creds = creds.as_ref();
creds.project = Some(payload.project.as_str().into());
let config = console::Api::new(endpoint, extra, &creds)
.wake_compute()
.await?;
(config, payload)
}
Postgres(endpoint, creds) if creds.project.is_none() => {
let payload = fetch_magic_payload.await?;
let mut creds = creds.as_ref();
creds.project = Some(payload.project.as_str().into());
let config = postgres::Api::new(endpoint, &creds).wake_compute().await?;
(config, payload)
}
_ => return Ok(None),
};
config.password(payload.password);
Ok(Some(AuthSuccess {
reported_auth_ok: false,
value: NodeInfo {
project: payload.project,
config,
},
}))
}
/// Authenticate the client via the requested backend, possibly using credentials.
pub async fn authenticate(
mut self,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> super::Result<compute::NodeInfo> {
) -> auth::Result<AuthSuccess<NodeInfo>> {
use BackendType::*;
if let Console(_, creds) | Postgres(_, creds) = &mut self {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in the place of password.
if creds.project().is_none() {
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
.authenticate()
.await?;
// Finally we may finish the initialization of `creds`.
// TODO: add missing type safety to ClientCredentials.
info!(project = &payload.project, "received missing parameter");
creds.project = Some(payload.project.into());
let mut config = match &self {
Console(endpoint, creds) => {
console::Api::new(endpoint, extra, creds)
.wake_compute()
.await?
}
Postgres(endpoint, creds) => {
postgres::Api::new(endpoint, creds).wake_compute().await?
}
_ => unreachable!("see the patterns above"),
};
// We should use a password from payload as well.
config.password(payload.password);
info!("user successfully authenticated (using the password hack)");
return Ok(compute::NodeInfo {
reported_auth_ok: false,
config,
});
}
// Handle cases when `project` is missing in `creds`.
// TODO: type safety: return `creds` with irrefutable `project`.
if let Some(res) = self.try_password_hack(extra, client).await? {
info!("user successfully authenticated (using the password hack)");
return Ok(res);
}
let res = match self {
@@ -215,22 +228,34 @@ impl BackendType<'_, ClientCredentials<'_>> {
project = creds.project(),
"performing authentication using the console"
);
assert!(creds.project.is_some());
console::Api::new(&endpoint, extra, &creds)
.handle_user(client)
.await
.await?
.map(|config| NodeInfo {
project: creds.project.unwrap().into_owned(),
config,
})
}
Postgres(endpoint, creds) => {
info!("performing mock authentication using a local postgres instance");
assert!(creds.project.is_some());
postgres::Api::new(&endpoint, &creds)
.handle_user(client)
.await
.await?
.map(|config| NodeInfo {
project: creds.project.unwrap().into_owned(),
config,
})
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
info!("performing link authentication");
link::handle_user(&url, client).await
link::handle_user(&url, client).await?
}
}?;
};
info!("user successfully authenticated");
Ok(res)

View File

@@ -1,9 +1,9 @@
//! Cloud API V2.
use super::ConsoleReqExtra;
use super::{AuthSuccess, ConsoleReqExtra};
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute::{self, ComputeConnCfg},
compute,
error::{io_error, UserFacingError},
http, scram,
stream::PqStream,
@@ -128,7 +128,7 @@ impl<'a> Api<'a> {
pub(super) async fn handle_user(
self,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<compute::NodeInfo> {
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
}
@@ -164,7 +164,7 @@ impl<'a> Api<'a> {
}
/// Wake up the compute node and return the corresponding connection info.
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
let request_id = uuid::Uuid::new_v4().to_string();
let req = self
.endpoint
@@ -195,7 +195,7 @@ impl<'a> Api<'a> {
Some(x) => x,
};
let mut config = ComputeConnCfg::new();
let mut config = compute::ConnCfg::new();
config
.host(host)
.port(port)
@@ -213,10 +213,10 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
endpoint: &'a Endpoint,
get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo,
wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
) -> auth::Result<compute::NodeInfo>
) -> auth::Result<AuthSuccess<compute::ConnCfg>>
where
GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
{
info!("fetching user's authentication info");
let auth_info = get_auth_info(endpoint).await?;
@@ -243,9 +243,9 @@ where
config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
}
Ok(compute::NodeInfo {
Ok(AuthSuccess {
reported_auth_ok: false,
config,
value: config,
})
}

View File

@@ -1,3 +1,4 @@
use super::{AuthSuccess, NodeInfo};
use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
use pq_proto::{BeMessage as Be, BeParameterStatusMessage};
use thiserror::Error;
@@ -49,7 +50,7 @@ pub fn new_psql_session_id() -> String {
pub async fn handle_user(
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<compute::NodeInfo> {
) -> auth::Result<AuthSuccess<NodeInfo>> {
let psql_session_id = new_psql_session_id();
let span = info_span!("link", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
@@ -71,8 +72,22 @@ pub async fn handle_user(
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
Ok(compute::NodeInfo {
let mut config = compute::ConnCfg::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
Ok(AuthSuccess {
reported_auth_ok: true,
config: db_info.into(),
value: NodeInfo {
project: db_info.project,
config,
},
})
}

View File

@@ -1,12 +1,12 @@
//! Local mock of Cloud API V2.
use super::{
console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
AuthSuccess,
};
use crate::{
auth::{
self,
backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
ClientCredentials,
},
compute::{self, ComputeConnCfg},
auth::{self, ClientCredentials},
compute,
error::io_error,
scram,
stream::PqStream,
@@ -37,7 +37,7 @@ impl<'a> Api<'a> {
pub(super) async fn handle_user(
self,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<compute::NodeInfo> {
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
// We reuse user handling logic from a production module.
console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
}
@@ -82,8 +82,8 @@ impl<'a> Api<'a> {
}
/// We don't need to wake anything locally, so we just return the connection info.
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
let mut config = ComputeConnCfg::new();
pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
let mut config = compute::ConnCfg::new();
config
.host(self.endpoint.host_str().unwrap_or("localhost"))
.port(self.endpoint.port().unwrap_or(5432))

View File

@@ -36,11 +36,23 @@ pub struct ClientCredentials<'a> {
}
impl ClientCredentials<'_> {
#[inline]
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
impl<'a> ClientCredentials<'a> {
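/// Cheap borrowed copy of the credentials; the password hack flow uses it to fill in `project` without consuming the original.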
#[inline]
pub fn as_ref(&'a self) -> ClientCredentials<'a> {
Self {
user: self.user,
dbname: self.dbname,
project: self.project().map(Cow::Borrowed),
}
}
}
impl<'a> ClientCredentials<'a> {
pub fn parse(
params: &'a StartupMessageParams,

View File

@@ -40,17 +40,36 @@ impl UserFacingError for ConnectionError {
/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
pub type ComputeConnCfg = tokio_postgres::Config;
/// A config for establishing a connection to compute node.
/// Eventually, `tokio_postgres` will be replaced with something better.
/// Newtype allows us to implement methods on top of it.
#[repr(transparent)]
pub struct ConnCfg(pub tokio_postgres::Config);
/// Various compute node info for establishing connection etc.
pub struct NodeInfo {
/// Did we send [`pq_proto::BeMessage::AuthenticationOk`]?
pub reported_auth_ok: bool,
/// Compute node connection params.
pub config: tokio_postgres::Config,
impl ConnCfg {
/// Construct a new connection config.
pub fn new() -> Self {
Self(tokio_postgres::Config::new())
}
}
impl NodeInfo {
impl std::ops::Deref for ConnCfg {
type Target = tokio_postgres::Config;
fn deref(&self) -> &Self::Target {
&self.0
}
}
/// For now, let's make it easier to set up the config.
impl std::ops::DerefMut for ConnCfg {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
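For orientation, a minimal sketch of what the Deref/DerefMut pair buys: callers can use the builder methods of the wrapped tokio_postgres::Config directly (values are illustrative):

let mut cfg = ConnCfg::new();
// host/port/dbname/user are methods of tokio_postgres::Config, reachable through DerefMut.
cfg.host("localhost").port(5432).dbname("postgres").user("admin");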
impl ConnCfg {
/// Establish a raw TCP connection to the compute node.
async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
use tokio_postgres::config::Host;
@@ -68,8 +87,8 @@ impl NodeInfo {
// because it has no means for extracting the underlying socket which we
// require for our business.
let mut connection_error = None;
let ports = self.config.get_ports();
let hosts = self.config.get_hosts();
let ports = self.0.get_ports();
let hosts = self.0.get_hosts();
// the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array
if ports.len() > 1 && ports.len() != hosts.len() {
return Err(io::Error::new(
@@ -77,7 +96,7 @@ impl NodeInfo {
format!(
"couldn't connect: bad compute config, \
ports and hosts entries' count does not match: {:?}",
self.config
self.0
),
));
}
@@ -103,7 +122,7 @@ impl NodeInfo {
Err(connection_error.unwrap_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("couldn't connect: bad compute config: {:?}", self.config),
format!("couldn't connect: bad compute config: {:?}", self.0),
)
}))
}
@@ -116,7 +135,7 @@ pub struct PostgresConnection {
pub version: String,
}
impl NodeInfo {
impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect(
mut self,
@@ -130,21 +149,21 @@ impl NodeInfo {
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
self.config.options(&options);
self.0.options(&options);
}
if let Some(app_name) = params.get("application_name") {
self.config.application_name(app_name);
self.0.application_name(app_name);
}
if let Some(replication) = params.get("replication") {
use tokio_postgres::config::ReplicationMode;
match replication {
"true" | "on" | "yes" | "1" => {
self.config.replication_mode(ReplicationMode::Physical);
self.0.replication_mode(ReplicationMode::Physical);
}
"database" => {
self.config.replication_mode(ReplicationMode::Logical);
self.0.replication_mode(ReplicationMode::Logical);
}
_other => {}
}
@@ -160,7 +179,7 @@ impl NodeInfo {
.map_err(|_| ConnectionError::FailedToConnectToCompute)?;
// TODO: establish a secure connection to the DB
let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?;
let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?;
let version = conn
.parameter("server_version")
.ok_or(ConnectionError::FailedToFetchPgVersion)?

View File

@@ -6,16 +6,11 @@ use std::{
net::{TcpListener, TcpStream},
thread,
};
use tracing::{error, info};
use tracing::{error, info, info_span};
use utils::postgres_backend::{self, AuthType, PostgresBackend};
/// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend
///
/// Main proxy listener loop.
///
/// Listens for connections, and launches a new handler thread for each.
///
/// Console management API listener thread.
/// It spawns console response handlers needed for the link auth.
pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
scopeguard::defer! {
info!("mgmt has shut down");
@@ -24,6 +19,7 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
listener
.set_nonblocking(false)
.context("failed to set listener to blocking")?;
loop {
let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
info!("accepted connection from {peer_addr}");
@@ -31,9 +27,19 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
.set_nodelay(true)
.context("failed to set client socket option")?;
// TODO: replace with async tasks.
thread::spawn(move || {
if let Err(err) = handle_connection(socket) {
error!("{err}");
let tid = std::thread::current().id();
let span = info_span!("mgmt", thread = format_args!("{tid:?}"));
let _enter = span.enter();
info!("started a new console management API thread");
scopeguard::defer! {
info!("console management API thread is about to finish");
}
if let Err(e) = handle_connection(socket) {
error!("thread failed with an error: {e}");
}
});
}
@@ -44,44 +50,21 @@ fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
pgbackend.run(&mut MgmtHandler)
}
struct MgmtHandler;
/// Serialized examples:
// {
// "session_id": "71d6d03e6d93d99a",
// "result": {
// "Success": {
// "host": "127.0.0.1",
// "port": 5432,
// "dbname": "stas",
// "user": "stas",
// "password": "mypass"
// }
// }
// }
// {
// "session_id": "71d6d03e6d93d99a",
// "result": {
// "Failure": "oops"
// }
// }
//
// // to test manually by sending a query to mgmt interface:
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
#[derive(Deserialize)]
/// Known as `kickResponse` in the console.
#[derive(Debug, Deserialize)]
struct PsqlSessionResponse {
session_id: String,
result: PsqlSessionResult,
}
#[derive(Deserialize)]
#[derive(Debug, Deserialize)]
enum PsqlSessionResult {
Success(auth::DatabaseInfo),
Success(DatabaseInfo),
Failure(String),
}
/// A message received by `mgmt` when a compute node is ready.
pub type ComputeReady = Result<auth::DatabaseInfo, String>;
pub type ComputeReady = Result<DatabaseInfo, String>;
impl PsqlSessionResult {
fn into_compute_ready(self) -> ComputeReady {
@@ -92,25 +75,51 @@ impl PsqlSessionResult {
}
}
impl postgres_backend::Handler for MgmtHandler {
fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> anyhow::Result<()> {
let res = try_process_query(pgb, query_string);
// intercept and log error message
if res.is_err() {
error!("mgmt query failed: {res:?}");
}
res
/// Compute node connection params provided by the console.
/// This struct and its parents are an mgmt API implementation
/// detail and thus should remain in this module.
// TODO: restore deserialization tests from git history.
#[derive(Deserialize)]
pub struct DatabaseInfo {
pub host: String,
pub port: u16,
pub dbname: String,
pub user: String,
/// Console always provides a password, but it might
/// be inconvenient when debugging with a local PG instance.
pub password: Option<String>,
pub project: String,
}
// Manually implement debug to omit sensitive info.
impl std::fmt::Debug for DatabaseInfo {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
fmt.debug_struct("DatabaseInfo")
.field("host", &self.host)
.field("port", &self.port)
.field("dbname", &self.dbname)
.field("user", &self.user)
.finish_non_exhaustive()
}
}
fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> {
info!("got mgmt query [redacted]"); // Content contains password, don't print it
// TODO: replace with an http-based protocol.
struct MgmtHandler;
impl postgres_backend::Handler for MgmtHandler {
fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
try_process_query(pgb, query).map_err(|e| {
error!("failed to process response: {e:?}");
e
})
}
}
let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;
fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
let resp: PsqlSessionResponse = serde_json::from_str(query)?;
let span = info_span!("event", session_id = resp.session_id);
let _enter = span.enter();
info!("got response: {:?}", resp.result);
match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
Ok(()) => {
@@ -119,9 +128,50 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
Err(e) => {
error!("failed to deliver response to per-client task");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn parse_db_info() -> anyhow::Result<()> {
// with password
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"password": "password",
"project": "hello_world",
}))?;
// without password
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"project": "hello_world",
}))?;
// new field (forward compatibility)
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"project": "hello_world",
"N.E.W": "forward compatibility check",
}))?;
Ok(())
}
}

View File

@@ -4,7 +4,7 @@ use crate::config::{ProxyConfig, TlsConfig};
use crate::stream::{MeasuredStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use metrics::{register_int_counter, IntCounter};
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, *};
use std::sync::Arc;
@@ -30,10 +30,16 @@ static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_io_bytes_total",
"Number of bytes sent/received between any client and backend."
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes_per_client",
"Number of bytes sent/received between client and backend.",
&[
// Received (rx) / sent (tx).
"direction",
// Proxy can keep calling it `project` internally.
"endpoint_id"
]
)
.unwrap()
});
@@ -230,16 +236,17 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
application_name: params.get("application_name"),
};
// Authenticate and connect to a compute node.
let auth = creds
.authenticate(&extra, &mut stream)
.instrument(info_span!("auth"))
.await;
let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;
let reported_auth_ok = node.reported_auth_ok;
let auth_result = async {
// `&mut stream` doesn't let us merge those 2 lines.
let res = creds.authenticate(&extra, &mut stream).await;
async { res }.or_else(|e| stream.throw_error(e)).await
}
.instrument(info_span!("auth"))
.await?;
let node = auth_result.value;
let (db, cancel_closure) = node
.config
.connect(params)
.or_else(|e| stream.throw_error(e))
.await?;
@@ -247,7 +254,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
let cancel_key_data = session.enable_query_cancellation(cancel_closure);
// Report authentication success if we haven't done this already.
if !reported_auth_ok {
// Note that we do this only (for the most part) after we've connected
// to a compute (see above) which performs its own authentication.
if !auth_result.reported_auth_ok {
stream
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
@@ -261,17 +270,23 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
.write_message(&BeMessage::ReadyForQuery)
.await?;
/// This function will be called for writes to either direction.
fn inc_proxied(cnt: usize) {
// Consider inventing something more sophisticated
// if this ever becomes a bottleneck (cacheline bouncing).
NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64);
}
// TODO: add more identifiers.
let metric_id = node.project;
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]);
let mut client = MeasuredStream::new(stream.into_inner(), |cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
});
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]);
let mut db = MeasuredStream::new(db.stream, |cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
});
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");
let mut db = MeasuredStream::new(db.stream, inc_proxied);
let mut client = MeasuredStream::new(stream.into_inner(), inc_proxied);
let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;
Ok(())

View File

@@ -1568,6 +1568,7 @@ class NeonCli(AbstractNeonCli):
def pageserver_start(
self,
overrides: Tuple[str, ...] = (),
extra_env_vars: Optional[Dict[str, str]] = None,
) -> "subprocess.CompletedProcess[str]":
start_args = ["pageserver", "start", *overrides]
append_pageserver_param_overrides(
@@ -1577,11 +1578,11 @@ class NeonCli(AbstractNeonCli):
pageserver_config_override=self.env.pageserver.config_override,
)
s3_env_vars = None
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
extra_env_vars = (extra_env_vars or {}) | s3_env_vars
return self.raw_cli(start_args, extra_env_vars=s3_env_vars)
return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]":
cmd = ["pageserver", "stop"]
@@ -1760,9 +1761,15 @@ class NeonPageserver(PgProtocol):
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
".*Removing intermediate uninit mark file.*",
# FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885
".*sender is dropped while join handle is still alive.*",
]
def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver":
def start(
self,
overrides: Tuple[str, ...] = (),
extra_env_vars: Optional[Dict[str, str]] = None,
) -> "NeonPageserver":
"""
Start the page server.
`overrides` allows to add some config to this pageserver start.
@@ -1770,7 +1777,7 @@ class NeonPageserver(PgProtocol):
"""
assert self.running is False
self.env.neon_cli.pageserver_start(overrides=overrides)
self.env.neon_cli.pageserver_start(overrides=overrides, extra_env_vars=extra_env_vars)
self.running = True
return self
@@ -2073,9 +2080,9 @@ class NeonProxy(PgProtocol):
self,
proxy_port: int,
http_port: int,
mgmt_port: int,
neon_binpath: Path,
auth_endpoint=None,
mgmt_port=None,
):
super().__init__(dsn=auth_endpoint, port=proxy_port)
self.host = "127.0.0.1"
@@ -2089,7 +2096,8 @@ class NeonProxy(PgProtocol):
def start(self):
"""
Starts a proxy with option '--auth-backend postgres' and a postgres instance already provided though '--auth-endpoint <postgress-instance>'."
Starts a proxy with option '--auth-backend postgres' and a postgres instance
already provided through '--auth-endpoint <postgres-instance>'.
"""
assert self._popen is None
assert self.auth_endpoint is not None
@@ -2099,6 +2107,7 @@ class NeonProxy(PgProtocol):
str(self.neon_binpath / "proxy"),
*["--http", f"{self.host}:{self.http_port}"],
*["--proxy", f"{self.host}:{self.proxy_port}"],
*["--mgmt", f"{self.host}:{self.mgmt_port}"],
*["--auth-backend", "postgres"],
*["--auth-endpoint", self.auth_endpoint],
]
@@ -2175,11 +2184,13 @@ def static_proxy(
auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
http_port = port_distributor.get_port()
with NeonProxy(
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
neon_binpath=neon_binpath,
auth_endpoint=auth_endpoint,
) as proxy:

View File

@@ -6,6 +6,7 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import PgCompare
from fixtures.log_helper import log
from pytest_lazyfixture import lazy_fixture # type: ignore
@pytest.mark.parametrize(
@@ -20,16 +21,24 @@ from fixtures.log_helper import log
pytest.param(10000000, 1, 4),
],
)
def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int):
env = neon_with_baseline
@pytest.mark.parametrize(
"env, scale",
[
# Run on all envs. Use 50x larger table on remote cluster to make sure
# it doesn't fit in shared buffers, which are larger on remote than local.
pytest.param(lazy_fixture("neon_compare"), 1, id="neon"),
pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"),
pytest.param(
lazy_fixture("remote_compare"), 50, id="remote", marks=pytest.mark.remote_cluster
),
],
)
def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int):
rows = scale * rows
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
if True:
cur.execute("set enable_seqscan_prefetch = on;")
cur.execute("set seqscan_prefetch_buffers = 10;")
cur.execute("drop table if exists t;")
cur.execute("create table t (i integer);")
cur.execute(f"insert into t values (generate_series(1,{rows}));")

View File

@@ -8,6 +8,7 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
# normally restarts after it. Also, there should be GC ERRORs in the log,
# but the fixture checks the log for any unexpected ERRORs after every
# test anyway, so it doesn't need any special attention here.
@pytest.mark.timeout(600)
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
env = neon_env_builder.init_start()
@@ -38,7 +39,7 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
for _ in range(5):
with pytest.raises(Exception):
pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr])
env.pageserver.stop()
env.pageserver.start()
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))


@@ -1,5 +1,4 @@
import json
import subprocess
from urllib.parse import urlparse
import psycopg2
@@ -8,11 +7,11 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres
def test_proxy_select_1(static_proxy):
def test_proxy_select_1(static_proxy: NeonProxy):
static_proxy.safe_psql("select 1", options="project=generic-project-name")
def test_password_hack(static_proxy):
def test_password_hack(static_proxy: NeonProxy):
user = "borat"
password = "password"
static_proxy.safe_psql(
@@ -24,118 +23,75 @@ def test_password_hack(static_proxy):
static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
# Must also check that invalid magic won't be accepted.
with pytest.raises(psycopg2.errors.OperationalError):
with pytest.raises(psycopg2.OperationalError):
magic = "broken"
static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
def get_session_id_from_uri_line(uri_prefix, uri_line):
def get_session_id(uri_prefix, uri_line):
assert uri_prefix in uri_line
url_parts = urlparse(uri_line)
psql_session_id = url_parts.path[1:]
assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars."
link_auth_uri_prefix = uri_line[: -len(url_parts.path)]
# invariant: the prefix must match the uri_prefix.
assert (
link_auth_uri_prefix == uri_prefix
), f"Line='{uri_line}' should contain a http auth link of form '{uri_prefix}/<psql_session_id>'."
# invariant: the entire link_auth_uri should be on its own line, module spaces.
assert " ".join(uri_line.split(" ")) == f"{uri_prefix}/{psql_session_id}"
assert psql_session_id.isalnum(), "session_id should only contain alphanumeric chars"
return psql_session_id
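For illustration, a minimal sketch of what `get_session_id` extracts, using a made-up auth link (the host, port, and session id below are hypothetical):

from urllib.parse import urlparse

uri_prefix = "http://127.0.0.1:7777"           # hypothetical link_auth_uri_prefix
uri_line = f"{uri_prefix}/abc123def456"        # hypothetical welcome-notice line
psql_session_id = urlparse(uri_line).path[1:]  # -> "abc123def456"
assert psql_session_id.isalnum()
assert f"{uri_prefix}/{psql_session_id}" == uri_line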
def create_and_send_db_info(local_vanilla_pg, psql_session_id, mgmt_port):
pg_user = "proxy"
pg_password = "password"
local_vanilla_pg.start()
query = f"create user {pg_user} with login superuser password '{pg_password}'"
local_vanilla_pg.safe_psql(query)
port = local_vanilla_pg.default_options["port"]
host = local_vanilla_pg.default_options["host"]
dbname = local_vanilla_pg.default_options["dbname"]
db_info_dict = {
"session_id": psql_session_id,
"result": {
"Success": {
"host": host,
"port": port,
"dbname": dbname,
"user": pg_user,
"password": pg_password,
}
},
}
db_info_str = json.dumps(db_info_dict)
cmd_args = [
"psql",
"-h",
"127.0.0.1", # localhost
"-p",
f"{mgmt_port}",
"-c",
db_info_str,
]
log.info(f"Sending to proxy the user and db info: {' '.join(cmd_args)}")
p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE)
out, err = p.communicate()
assert "ok" in str(out)
async def get_uri_line_from_process_welcome_notice(link_auth_uri_prefix, proc):
"""
Returns the line of proc's welcome notice that contains link_auth_uri_prefix.
:param link_auth_uri_prefix: the uri prefix used to indicate the line of interest
:param proc: the process to read the welcome message from.
:return: a line containing the full link authentication uri.
"""
max_num_lines_of_welcome_message = 15
for attempt in range(max_num_lines_of_welcome_message):
raw_line = await proc.stderr.readline()
line = raw_line.decode("utf-8").strip()
async def find_auth_link(link_auth_uri_prefix, proc):
for _ in range(100):
line = (await proc.stderr.readline()).decode("utf-8").strip()
log.info(f"psql line: {line}")
if link_auth_uri_prefix in line:
log.info(f"SUCCESS, found auth url: {line}")
return line
assert False, f"did not find line containing '{link_auth_uri_prefix}'"
async def activate_link_auth(local_vanilla_pg, link_proxy, psql_session_id):
pg_user = "proxy"
log.info("creating a new user for link auth test")
local_vanilla_pg.start()
local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser")
db_info = json.dumps(
{
"session_id": psql_session_id,
"result": {
"Success": {
"host": local_vanilla_pg.default_options["host"],
"port": local_vanilla_pg.default_options["port"],
"dbname": local_vanilla_pg.default_options["dbname"],
"user": pg_user,
"project": "irrelevant",
}
},
}
)
log.info("sending session activation message")
psql = await PSQL(host=link_proxy.host, port=link_proxy.mgmt_port).run(db_info)
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "ok"
@pytest.mark.asyncio
async def test_psql_session_id(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy):
"""
Test copied and modified from the test_project_psql_link_auth test in cloud/tests_e2e/tests/test_project.py
Step 1. establish connection to the proxy
Step 2. retrieve session_id:
Step 2.1: read welcome message
Step 2.2: parse session_id
Step 3. create a vanilla_pg and send the user and db info to the proxy as a psql query over the mgmt port.
Step 4. assert that select 1 has been executed correctly.
"""
psql = PSQL(
host=link_proxy.host,
port=link_proxy.proxy_port,
)
proc = await psql.run("select 42")
psql = await PSQL(host=link_proxy.host, port=link_proxy.proxy_port).run("select 42")
uri_prefix = link_proxy.link_auth_uri_prefix
line_str = await get_uri_line_from_process_welcome_notice(uri_prefix, proc)
link = await find_auth_link(uri_prefix, psql)
psql_session_id = get_session_id_from_uri_line(uri_prefix, line_str)
log.info(f"Parsed psql_session_id='{psql_session_id}' from Neon welcome message.")
psql_session_id = get_session_id(uri_prefix, link)
await activate_link_auth(vanilla_pg, link_proxy, psql_session_id)
create_and_send_db_info(vanilla_pg, psql_session_id, link_proxy.mgmt_port)
assert proc.stdout is not None
out = (await proc.stdout.read()).decode("utf-8").strip()
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
# Pass extra options to the server.
def test_proxy_options(static_proxy):
def test_proxy_options(static_proxy: NeonProxy):
with static_proxy.connect(options="project=irrelevant -cproxytest.option=value") as conn:
with conn.cursor() as cur:
cur.execute("SHOW proxytest.option")


@@ -1,3 +1,4 @@
import time
from threading import Thread
import pytest
@@ -11,11 +12,21 @@ def do_gc_target(
):
"""Hack to unblock main, see https://github.com/neondatabase/neon/issues/2211"""
try:
log.info("sending gc http request")
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
except Exception as e:
log.error("do_gc failed: %s", e)
finally:
log.info("gc http thread returning")
@pytest.mark.skip(
reason="""
Commit 'make test_tenant_detach_smoke fail reproducibly' adds a failpoint to make this test fail reproducibly.
The fix in https://github.com/neondatabase/neon/pull/2851 will come as part of
https://github.com/neondatabase/neon/pull/2785.
"""
)
def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -51,7 +62,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
]
)
# gc should not try to even start
# gc should not try to even start on a timeline that doesn't exist
with pytest.raises(
expected_exception=PageserverApiException, match="gc target timeline does not exist"
):
@@ -61,25 +72,24 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
# try to concurrently run gc and detach
# Detach while running manual GC.
# It should wait for the manual GC to finish (right now it doesn't, which is why this test sometimes fails)
pageserver_http.configure_failpoints(
("gc_iteration_internal_after_getting_gc_timelines", "return(2000)")
)
gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
gc_thread.start()
time.sleep(1)
# By now the gc task has been spawned, but it is still sleeping for another second due to the failpoint.
last_error = None
for i in range(3):
try:
pageserver_http.tenant_detach(tenant_id)
except Exception as e:
last_error = e
log.error(f"try {i} error detaching tenant: {e}")
continue
else:
break
# else is called if the loop finished without reaching "break"
else:
pytest.fail(f"could not detach tenant: {last_error}")
log.info("detaching tenant")
pageserver_http.tenant_detach(tenant_id)
log.info("tenant detached without error")
log.info("wait for gc thread to return")
gc_thread.join(timeout=10)
assert not gc_thread.is_alive()
log.info("gc thread returned")
# check that nothing is left on disk for deleted tenant
assert not (env.repo_dir / "tenants" / str(tenant_id)).exists()


@@ -166,6 +166,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
env = neon_env_builder.init_start()
# FIXME: we have a race condition between GC and delete timeline. GC might fail with this
# error. Similar to https://github.com/neondatabase/neon/issues/2671
env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*")
tenant_id = env.initial_tenant
main_branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
@@ -188,10 +192,8 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
"first-branch", main_branch_name, tenant_id
)
# unsure why this happens, the size difference is more than a page alignment
size_after_first_branch = http_client.tenant_size(tenant_id)
assert size_after_first_branch > size_at_branch
assert size_after_first_branch - size_at_branch == gc_horizon
assert size_after_first_branch == size_at_branch
first_branch_pg = env.postgres.create_start("first-branch", tenant_id=tenant_id)
@@ -217,7 +219,7 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
"second-branch", main_branch_name, tenant_id
)
size_after_second_branch = http_client.tenant_size(tenant_id)
assert size_after_second_branch > size_after_continuing_on_main
assert size_after_second_branch == size_after_continuing_on_main
second_branch_pg = env.postgres.create_start("second-branch", tenant_id=tenant_id)
@@ -263,6 +265,8 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
except PageserverApiException as e:
# compaction is ok but just retry if this fails; related to #2442
if "cannot lock compaction critical section" in str(e):
# also ignore it in the log
env.pageserver.allowed_errors.append(".*cannot lock compaction critical section.*")
time.sleep(1)
continue
raise