Compare commits: remove-lay... → prefetch_s... (72 commits)
| Author | SHA1 | Date |
|---|---|---|
| | e1e533243b | |
| | 407d6b120b | |
| | f6f87bb6f3 | |
| | 700a36ee6b | |
| | b8a5664fb9 | |
| | 861dc8e64e | |
| | 4d6137e0e6 | |
| | 8684b1b582 | |
| | 3321eea679 | |
| | 28667ce724 | |
| | 6c8b2af1f8 | |
| | 3122f3282f | |
| | 4752385470 | |
| | 9747e90f3a | |
| | a19c487766 | |
| | 5c701f9a75 | |
| | 4de4217247 | |
| | 2baf6c09a8 | |
| | f5a735ac3b | |
| | 0d04cd0b99 | |
| | e1ef62f086 | |
| | b50e0793cf | |
| | ac0c167a85 | |
| | 6dfd7cb1d0 | |
| | a46a81b5cb | |
| | c74dca95fc | |
| | b513619503 | |
| | b447eb4d1e | |
| | 6a57d5bbf9 | |
| | 09393279c6 | |
| | 634d0eab68 | |
| | 8f2b3cbded | |
| | 4530544bb8 | |
| | 98ff0396f8 | |
| | d6bfe955c6 | |
| | 046ba67d68 | |
| | 61825dfb57 | |
| | c0480facc1 | |
| | b38473d367 | |
| | 7a9cb75e02 | |
| | 38af453553 | |
| | 79fdd3d51b | |
| | ab073696d0 | |
| | 4f443c339d | |
| | ed27c98022 | |
| | 788823ebe3 | |
| | 145e7e4b96 | |
| | d90b52b405 | |
| | c21104465e | |
| | fe280f70aa | |
| | faf1d20e6a | |
| | d9ab42013f | |
| | edfebad3a1 | |
| | b9544adcb4 | |
| | ebb51f16e0 | |
| | 136b029d7a | |
| | 33834c01ec | |
| | 9a6c0be823 | |
| | baa8d5a16a | |
| | fbd5f65938 | |
| | 1f1324ebed | |
| | fb633b16ac | |
| | f277140234 | |
| | 52166799bd | |
| | 0a4e5f8aa3 | |
| | 0c1195c30d | |
| | 3ba92d238e | |
| | 67469339fa | |
| | 0205a44265 | |
| | 480175852f | |
| | 9fdd228dee | |
| | 15db566420 | |
.github/actions/allure-report/action.yml (39 changed lines)
@@ -32,8 +32,8 @@ runs:
exit 2
fi

- name: Calculate key
id: calculate-key
- name: Calculate variables
id: calculate-vars
shell: bash -euxo pipefail {0}
run: |
# TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
@@ -41,14 +41,22 @@ runs:
pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${pr_number}" != "null" ]; then
key=pr-${pr_number}
elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then
elif [ "${GITHUB_REF_NAME}" = "main" ]; then
# Shortcut for a special branch
key=main
elif [ "${GITHUB_REF_NAME}" = "release" ]; then
# Shortcut for a special branch
key=release
else
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
fi
echo "KEY=${key}" >> $GITHUB_OUTPUT

# Sanitize test selection to remove `/` and any other special characters
# Use printf instead of echo to avoid having `\n` at the end of the string
test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT

- uses: actions/setup-java@v3
if: ${{ inputs.action == 'generate' }}
with:
@@ -74,10 +82,11 @@ runs:
- name: Upload Allure results
if: ${{ inputs.action == 'store' }}
env:
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
shell: bash -euxo pipefail {0}
run: |
# Add metadata
@@ -98,7 +107,7 @@ runs:
BUILD_TYPE=${{ inputs.build_type }}
EOF

ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
ZSTD_NBTHREADS=0

tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
@@ -109,8 +118,9 @@ runs:
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
LOCK_TIMEOUT=300 # seconds

@@ -123,12 +133,12 @@ runs:
fi
sleep 1
done
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"

# A double-check that exactly WE have acquired the lock
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
break
fi
done
@@ -137,8 +147,8 @@ runs:
if: ${{ inputs.action == 'generate' }}
id: generate-report
env:
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
@@ -192,12 +202,13 @@ runs:
if: ${{ inputs.action == 'generate' && always() }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
run: |
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0

if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi

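The net effect of the hunks above is that the renamed calculate-vars step now derives its key from GITHUB_REF_NAME and sanitizes both the key and the test selection with `tr`. A minimal standalone sketch of that sanitization, using made-up branch and test-selection values (only the `tr` invocations and the printf-instead-of-echo choice come from the diff):

```bash
#!/usr/bin/env bash
# Sketch of the sanitization used by the updated calculate-vars step.
# The example values below are hypothetical.
set -euo pipefail

GITHUB_REF_NAME="feature/prefetch#2"           # hypothetical branch name
test_selection="performance/test_perf_olap.py" # hypothetical test selection

# Any character outside [:alnum:]._- is replaced with "-";
# printf (not echo) avoids a trailing newline in the sanitized value.
key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
test_selection=$(printf "${test_selection}" | tr -c "[:alnum:]._-" "-")

echo "KEY=${key}"                       # KEY=branch-feature-prefetch-2
echo "TEST_SELECTION=${test_selection}" # TEST_SELECTION=performance-test_perf_olap.py
```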
.github/actions/neon-branch-create/action.yml (new file, 138 lines)
@@ -0,0 +1,138 @@
name: 'Create Branch'
description: 'Create Branch using API'

inputs:
api_key:
desctiption: 'Neon API key'
required: true
project_id:
desctiption: 'ID of the Project to create Branch in'
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech
outputs:
dsn:
description: 'Created Branch DSN (for main database)'
value: ${{ steps.change-password.outputs.dsn }}
branch_id:
description: 'Created Branch ID'
value: ${{ steps.create-branch.outputs.branch_id }}

runs:
using: "composite"
steps:
- name: Create New Branch
id: create-branch
shell: bash -euxo pipefail {0}
run: |
for i in $(seq 1 10); do
branch=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches" \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}" \
--data "{
\"branch\": {
\"name\": \"Created by actions/neon-branch-create; GITHUB_RUN_ID=${GITHUB_RUN_ID} at $(date +%s)\"
},
\"endpoints\": [
{
\"type\": \"read_write\"
}
]
}")

if [ -z "${branch}" ]; then
sleep 1
continue
fi

branch_id=$(echo $branch | jq --raw-output '.branch.id')
if [ "${branch_id}" == "null" ]; then
sleep 1
continue
fi

break
done

if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
echo 2>&1 "Failed to create branch after 10 attempts, the latest response was: ${branch}"
exit 1
fi

branch_id=$(echo $branch | jq --raw-output '.branch.id')
echo "branch_id=${branch_id}" >> $GITHUB_OUTPUT

host=$(echo $branch | jq --raw-output '.endpoints[0].host')
echo "host=${host}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}

- name: Get Role name
id: role-name
shell: bash -euxo pipefail {0}
run: |
roles=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles" \
--fail \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
)

role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
echo "role_name=${role_name}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}

- name: Change Password
id: change-password
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
for i in $(seq 1 10); do
reset_password=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}/roles/${ROLE_NAME}/reset_password" \
--request POST \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
)

if [ -z "${reset_password}" ]; then
sleep 1
continue
fi

password=$(echo $reset_password | jq --raw-output '.role.password')
if [ "${password}" == "null" ]; then
sleep 1
continue
fi

echo "::add-mask::${password}"
break
done

if [ -z "${password}" ] || [ "${password}" == "null" ]; then
echo 2>&1 "Failed to reset password after 10 attempts, the latest response was: ${reset_password}"
exit 1
fi

dsn="postgres://${ROLE_NAME}:${password}@${HOST}/neondb"
echo "::add-mask::${dsn}"
echo "dsn=${dsn}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
BRANCH_ID: ${{ steps.create-branch.outputs.branch_id }}
ROLE_NAME: ${{ steps.role-name.outputs.role_name }}
HOST: ${{ steps.create-branch.outputs.host }}

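For reference, the "Get Role name" step above boils down to a single jq filter over the roles payload. A small sketch against a hypothetical payload (the filter is taken from the step; the JSON document is invented for illustration):

```bash
#!/usr/bin/env bash
# Sketch of the jq selection used by the "Get Role name" step, run against a
# made-up roles payload; the real one comes from the branches/.../roles endpoint.
set -euo pipefail

roles='{"roles":[{"name":"web_access","protected":true},{"name":"casual-morning-123","protected":false}]}'

# Picks the first role that is not protected; with the payload above this prints "casual-morning-123".
echo "${roles}" | jq --raw-output '.roles[] | select(.protected == false) | .name'
```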
.github/actions/neon-branch-delete/action.yml (new file, 58 lines)
@@ -0,0 +1,58 @@
name: 'Delete Branch'
description: 'Delete Branch using API'

inputs:
api_key:
desctiption: 'Neon API key'
required: true
project_id:
desctiption: 'ID of the Project which should be deleted'
required: true
branch_id:
desctiption: 'ID of the branch to delete'
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech

runs:
using: "composite"
steps:
- name: Delete Branch
# Do not try to delete a branch if .github/actions/neon-project-create
# or .github/actions/neon-branch-create failed before
if: ${{ inputs.project_id != '' && inputs.branch_id != '' }}
shell: bash -euxo pipefail {0}
run: |
for i in $(seq 1 10); do
deleted_branch=$(curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}/branches/${BRANCH_ID}" \
--request DELETE \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
)

if [ -z "${deleted_branch}" ]; then
sleep 1
continue
fi

branch_id=$(echo $deleted_branch | jq --raw-output '.branch.id')
if [ "${branch_id}" == "null" ]; then
sleep 1
continue
fi

break
done

if [ -z "${branch_id}" ] || [ "${branch_id}" == "null" ]; then
echo 2>&1 "Failed to delete branch after 10 attempts, the latest response was: ${deleted_branch}"
exit 1
fi
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
BRANCH_ID: ${{ inputs.branch_id }}

.github/actions/neon-project-create/action.yml (50 changed lines)
@@ -5,12 +5,16 @@ inputs:
api_key:
desctiption: 'Neon API key'
required: true
environment:
desctiption: 'dev (aka captest) or stage'
required: true
region_id:
desctiption: 'Region ID, if not set the project will be created in the default region'
required: false
default: aws-us-east-2
postgres_version:
desctiption: 'Postgres version; default is 15'
default: 15
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech

outputs:
dsn:
description: 'Created Project DSN (for main database)'
@@ -22,38 +26,13 @@ outputs:
runs:
using: "composite"
steps:
- name: Parse Input
id: parse-input
shell: bash -euxo pipefail {0}
run: |
case "${ENVIRONMENT}" in
dev)
API_HOST=console.dev.neon.tech
REGION_ID=${REGION_ID:-eu-west-1}
;;
staging)
API_HOST=console.stage.neon.tech
REGION_ID=${REGION_ID:-us-east-1}
;;
*)
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
exit 1
;;
esac

echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
echo "region_id=${REGION_ID}" >> $GITHUB_OUTPUT
env:
ENVIRONMENT: ${{ inputs.environment }}
REGION_ID: ${{ inputs.region_id }}

- name: Create Neon Project
id: create-neon-project
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
project=$(curl \
"https://${API_HOST}/api/v1/projects" \
"https://${API_HOST}/api/v2/projects" \
--fail \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
@@ -61,7 +40,7 @@ runs:
--data "{
\"project\": {
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"platform_id\": \"aws\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"settings\": { }
}
@@ -70,13 +49,14 @@ runs:
# Mask password
echo "::add-mask::$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .password')"

dsn=$(echo $project | jq --raw-output '.roles[] | select(.name != "web_access") | .dsn')/main
dsn=$(echo $project | jq --raw-output '.connection_uris[0].connection_uri')
echo "::add-mask::${dsn}"
echo "dsn=${dsn}" >> $GITHUB_OUTPUT

project_id=$(echo $project | jq --raw-output '.id')
project_id=$(echo $project | jq --raw-output '.project.id')
echo "project_id=${project_id}" >> $GITHUB_OUTPUT
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
API_HOST: ${{ steps.parse-input.outputs.api_host }}
REGION_ID: ${{ steps.parse-input.outputs.region_id }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}

.github/actions/neon-project-delete/action.yml (47 changed lines)
@@ -5,50 +5,29 @@ inputs:
api_key:
desctiption: 'Neon API key'
required: true
environment:
desctiption: 'dev (aka captest) or stage'
required: true
project_id:
desctiption: 'ID of the Project to delete'
required: true
api_host:
desctiption: 'Neon API host'
default: console.stage.neon.tech

runs:
using: "composite"
steps:
- name: Parse Input
id: parse-input
shell: bash -euxo pipefail {0}
run: |
case "${ENVIRONMENT}" in
dev)
API_HOST=console.dev.neon.tech
;;
staging)
API_HOST=console.stage.neon.tech
;;
*)
echo 2>&1 "Unknown environment=${ENVIRONMENT}. Allowed 'dev' or 'staging' only"
exit 1
;;
esac

echo "api_host=${API_HOST}" >> $GITHUB_OUTPUT
env:
ENVIRONMENT: ${{ inputs.environment }}

- name: Delete Neon Project
# Do not try to delete a project if .github/actions/neon-project-create failed before
if: ${{ inputs.project_id != '' }}
shell: bash -euxo pipefail {0}
run: |
# Allow PROJECT_ID to be empty/null for cases when .github/actions/neon-project-create failed
if [ -n "${PROJECT_ID}" ]; then
curl -X "POST" \
"https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \
--fail \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
fi
curl \
"https://${API_HOST}/api/v2/projects/${PROJECT_ID}" \
--fail \
--request DELETE \
--header "Accept: application/json" \
--header "Content-Type: application/json" \
--header "Authorization: Bearer ${API_KEY}"
env:
API_HOST: ${{ inputs.api_host }}
API_KEY: ${{ inputs.api_key }}
PROJECT_ID: ${{ inputs.project_id }}
API_HOST: ${{ steps.parse-input.outputs.api_host }}

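The change above swaps the v1 deletion endpoint for the v2 resource-style call. A side-by-side sketch of the two requests, with placeholder host, project id, and API key (both curl forms are lifted from the diff):

```bash
#!/usr/bin/env bash
# Old vs. new project-deletion call in neon-project-delete.
# Host, project id, and key below are placeholders.
API_HOST="console.stage.neon.tech"  # the action's default api_host input
PROJECT_ID="example-project-123456" # placeholder
API_KEY="..."                       # placeholder

# Old call (removed): POST to a dedicated /delete endpoint on API v1.
curl -X "POST" \
  "https://${API_HOST}/api/v1/projects/${PROJECT_ID}/delete" \
  --fail \
  --header "Accept: application/json" \
  --header "Authorization: Bearer ${API_KEY}"

# New call (added): DELETE on the project resource itself on API v2.
curl \
  "https://${API_HOST}/api/v2/projects/${PROJECT_ID}" \
  --fail \
  --request DELETE \
  --header "Accept: application/json" \
  --header "Authorization: Bearer ${API_KEY}"
```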
.github/ansible/scripts/init_pageserver.sh (6 changed lines)
@@ -1,7 +1,8 @@
#!/bin/sh

# get instance id from meta-data service
# fetch params from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)

# store fqdn hostname in var
HOST=$(hostname -f)
@@ -16,7 +17,8 @@ cat <<EOF | tee /tmp/payload
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 9898,
"active": false
"active": false,
"availability_zone_id": "${AZ_ID}"
}
EOF

.github/ansible/systemd/pageserver.service (2 changed lines)
@@ -5,7 +5,7 @@ After=network.target auditd.service
[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }}
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed

.github/ansible/systemd/safekeeper.service (2 changed lines)
@@ -5,7 +5,7 @@ After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }}
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed

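Both unit files now template a SENTRY_DSN into their Environment= line. A possible spot check on a storage host, assuming the units are installed under these names (the systemctl/grep commands are illustrative and not part of the change):

```bash
#!/usr/bin/env bash
# Verify that the rendered units carry a SENTRY_DSN after the Ansible rollout.
set -euo pipefail

for unit in pageserver.service safekeeper.service; do
  # `systemctl show -p Environment` prints the Environment= line the unit was loaded with.
  systemctl show -p Environment "${unit}" | grep -o 'SENTRY_DSN=[^ ]*' \
    || echo "WARNING: ${unit} has no SENTRY_DSN set"
done
```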
.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.zeta.eu-west-1.internal.aws.neon.build
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker.zeta.eu-west-1.internal.aws.neon.build
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram-legacy
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.beta.us-east-2.aws.neon.build
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.beta.us-east-2.internal.aws.neon.build
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker.beta.us-east-2.internal.aws.neon.build
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/neon-stress.neon-storage-broker.yaml (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: neon-stress
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker-stress.stage.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker.gamma.eu-central-1.internal.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.delta.us-east-2.internal.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker.delta.us-east-2.internal.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
|
||||
# Helm chart values for neon-proxy-scram.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: us-west-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
|
||||
|
||||
#metrics:
|
||||
# enabled: true
|
||||
# serviceMonitor:
|
||||
# enabled: true
|
||||
# selector:
|
||||
# release: kube-prometheus-stack
|
||||
.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml (new file, 53 lines)
@@ -0,0 +1,53 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: nginx-internal
|
||||
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
|
||||
nginx.ingress.kubernetes.io/ssl-redirect: "true"
|
||||
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
|
||||
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.eta.us-west-2.internal.aws.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
tls:
|
||||
- hosts:
|
||||
- storage-broker.eta.us-west-2.internal.aws.neon.tech
|
||||
secretName: storage-broker-tls
|
||||
|
||||
|
||||
metrics:
|
||||
enabled: false
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/production.neon-storage-broker.yaml (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: production
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/helm-values/staging.neon-storage-broker.yaml (new file, 54 lines)
@@ -0,0 +1,54 @@
|
||||
# Helm chart values for neon-storage-broker
|
||||
podLabels:
|
||||
neon_env: staging
|
||||
neon_service: storage-broker
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
annotations:
|
||||
kubernetes.io/ingress.class: alb
|
||||
alb.ingress.kubernetes.io/healthcheck-path: /status
|
||||
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
|
||||
alb.ingress.kubernetes.io/scheme: "internal"
|
||||
alb.ingress.kubernetes.io/target-type: "ip"
|
||||
alb.ingress.kubernetes.io/ssl-redirect: "443"
|
||||
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
|
||||
|
||||
hosts:
|
||||
- host: storage-broker.stage.neon.tech
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
|
||||
extraManifests:
|
||||
- apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMServiceScrape
|
||||
metadata:
|
||||
name: "{{ include \"neon-storage-broker.fullname\" . }}"
|
||||
labels:
|
||||
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
|
||||
app.kubernetes.io/name: neon-storage-broker
|
||||
app.kubernetes.io/instance: neon-storage-broker
|
||||
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
namespace: "{{ .Release.Namespace }}"
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: "neon-storage-broker"
|
||||
endpoints:
|
||||
- port: broker
|
||||
path: /metrics
|
||||
interval: 10s
|
||||
scrapeTimeout: 10s
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Release.Namespace }}"
|
||||
|
||||
.github/workflows/benchmarking.yml (374 changed lines)
@@ -15,9 +15,6 @@ on:
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
environment:
|
||||
description: 'Environment to run remote tests on (dev or staging)'
|
||||
required: false
|
||||
region_id:
|
||||
description: 'Use a particular region. If not set the default region will be used'
|
||||
required: false
|
||||
@@ -37,97 +34,69 @@ concurrency:
|
||||
|
||||
jobs:
|
||||
bench:
|
||||
# this workflow runs on self hosteed runner
|
||||
# it's environment is quite different from usual guthub runner
|
||||
# probably the most important difference is that it doesn't start from clean workspace each time
|
||||
# e g if you install system packages they are not cleaned up since you install them directly in host machine
|
||||
# not a container or something
|
||||
# See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /usr/pgsql
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
# actions/setup-python@v2 is not working correctly on self-hosted runners
|
||||
# see https://github.com/actions/setup-python/issues/162
|
||||
# and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
|
||||
# so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
|
||||
# there is Python 3.7.10 already installed on the machine so use it to install poetry and then use poetry's virtuealenvs
|
||||
- name: Install poetry & deps
|
||||
run: |
|
||||
python3 -m pip install --upgrade poetry wheel
|
||||
# since pip/poetry caches are reused there shouldn't be any troubles with install every time
|
||||
./scripts/pysync
|
||||
|
||||
- name: Show versions
|
||||
run: |
|
||||
echo Python
|
||||
python3 --version
|
||||
poetry run python3 --version
|
||||
echo Poetry
|
||||
poetry --version
|
||||
echo Pgbench
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
environment: ${{ github.event.inputs.environment || 'staging' }}
|
||||
api_key: ${{ ( github.event.inputs.environment || 'staging' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }}
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
# https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# via
|
||||
# sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
|
||||
# [pgdg13]
|
||||
# name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
|
||||
# baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# enabled=1
|
||||
# gpgcheck=0
|
||||
# EOF
|
||||
# sudo yum makecache
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
# Set --sparse-ordering option of pytest-order plugin
|
||||
# to ensure tests are running in order of appears in the file.
|
||||
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py
|
||||
env:
|
||||
# The pgbench test runs two tests of given duration against each scale.
|
||||
# So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes.
|
||||
# Plus time needed to initialize the test databases.
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
PLATFORM: "neon-staging"
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
run: |
|
||||
# just to be sure that no data was cached on self hosted runner
|
||||
# since it might generate duplicates when calling ingest_perf_test_result.py
|
||||
rm -rf perf-report-staging
|
||||
mkdir -p perf-report-staging
|
||||
# Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file,
|
||||
# it's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
run: |
|
||||
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
environment: staging
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
@@ -146,15 +115,22 @@ jobs:
|
||||
# neon-captest-prefetch: Same, with prefetching enabled (new project)
|
||||
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
|
||||
platform: [ neon-captest-new, neon-captest-prefetch, rds-postgres ]
|
||||
db_size: [ 10gb ]
|
||||
runner: [ us-east-2 ]
|
||||
include:
|
||||
- platform: neon-captest-reuse
|
||||
db_size: 10gb
|
||||
runner: dev # TODO: Switch to us-east-2 after dry-bonus-223539 migration to staging
|
||||
- platform: neon-captest-new
|
||||
db_size: 50gb
|
||||
runner: us-east-2
|
||||
- platform: neon-captest-prefetch
|
||||
db_size: 50gb
|
||||
runner: us-east-2
|
||||
- platform: rds-aurora
|
||||
db_size: 50gb
|
||||
runner: us-east-2
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
@@ -166,9 +142,9 @@ jobs:
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
@@ -193,8 +169,9 @@ jobs:
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
environment: ${{ github.event.inputs.environment || 'dev' }}
|
||||
api_key: ${{ ( github.event.inputs.environment || 'dev' ) == 'staging' && secrets.NEON_STAGING_API_KEY || secrets.NEON_CAPTEST_API_KEY }}
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
@@ -207,7 +184,7 @@ jobs:
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
||||
@@ -225,8 +202,11 @@ jobs:
|
||||
- name: Set database options
|
||||
if: matrix.platform == 'neon-captest-prefetch'
|
||||
run: |
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10"
|
||||
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
|
||||
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
@@ -269,6 +249,13 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
uses: ./.github/actions/allure-report
|
||||
@@ -276,14 +263,6 @@ jobs:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ steps.create-neon-project.outputs.project_id && always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
environment: dev
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_CAPTEST_API_KEY }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
@@ -292,3 +271,226 @@ jobs:
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
clickbench-compare:
|
||||
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
# Run this job only when pgbench-compare is finished to avoid the intersection.
|
||||
# We might change it after https://github.com/neondatabase/neon/issues/2900.
|
||||
#
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: success() || failure()
|
||||
needs: [ pgbench-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
|
||||
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-prefetch)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Set database options
|
||||
if: matrix.platform == 'neon-captest-prefetch'
|
||||
run: |
|
||||
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
|
||||
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
  tpch-compare:
    # The TPC-H DBs for rds-aurora and rds-postgres are deployed to the same clusters
    # we use for performance testing in pgbench-compare & clickbench-compare.
    # Run this job only when clickbench-compare is finished, to avoid overlapping runs.
    # We might change it after https://github.com/neondatabase/neon/issues/2900.
    #
    # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
    if: success() || failure()
    needs: [ clickbench-compare ]

    strategy:
      fail-fast: false
      matrix:
        # neon-captest-prefetch: We have pre-created projects with prefetch enabled
        # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
        # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
        platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]

    env:
      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output
      BUILD_TYPE: remote
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
      PLATFORM: ${{ matrix.platform }}

    runs-on: [ self-hosted, us-east-2, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
      options: --init

    timeout-minutes: 360 # 6h

    steps:
      - uses: actions/checkout@v3

      - name: Download Neon artifact
        uses: ./.github/actions/download
        with:
          name: neon-${{ runner.os }}-release-artifact
          path: /tmp/neon/
          prefix: latest

      - name: Add Postgres binaries to PATH
        run: |
          ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
          echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

      - name: Set up Connection String
        id: set-up-connstr
        run: |
          case "${PLATFORM}" in
            neon-captest-prefetch)
              CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }}
              ;;
            rds-aurora)
              CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_TPCH_S10_CONNSTR }}
              ;;
            rds-postgres)
              CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }}
              ;;
            *)
              echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
              exit 1
              ;;
          esac

          echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

          psql ${CONNSTR} -c "SELECT version();"

      - name: Set database options
        if: matrix.platform == 'neon-captest-prefetch'
        run: |
          DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")

          psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
          psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
          psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
        env:
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

      - name: Run TPC-H benchmark
        uses: ./.github/actions/run-python-test-set
        with:
          build_type: ${{ env.BUILD_TYPE }}
          test_selection: performance/test_perf_olap.py
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 21600 -k test_tpch
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

      - name: Create Allure report
        if: success() || failure()
        uses: ./.github/actions/allure-report
        with:
          action: generate
          build_type: ${{ env.BUILD_TYPE }}

      - name: Post to a Slack channel
        if: ${{ github.event.schedule && failure() }}
        uses: slackapi/slack-github-action@v1
        with:
          channel-id: "C033QLM5P7D" # dev-staging-stream
          slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

.github/workflows/build_and_test.yml (vendored): 148 changed lines
@@ -305,7 +305,7 @@ jobs:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
    needs: [ regress-tests, benchmarks ]
    if: success() || failure()
    if: ${{ !cancelled() }}
    strategy:
      fail-fast: false
      matrix:
@@ -668,11 +668,11 @@ jobs:
      - id: set-matrix
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
            NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
            NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY", storage_broker_config: }'
            echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
            PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
            echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
@@ -732,7 +732,7 @@ jobs:
          ssh-add ssh-key
          rm -f ssh-key ssh-key-cert.pub
          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
          ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-new:
@@ -770,7 +770,7 @@ jobs:
            exit 1
          fi
          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-pr-test-new:
@@ -803,7 +803,7 @@ jobs:
          ./get_binaries.sh

          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-prod-new:
@@ -843,7 +843,7 @@ jobs:
          fi

          ansible-galaxy collection install sivel.toiletwater
          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
@@ -885,8 +885,48 @@ jobs:
      - name: Re-deploy proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

  deploy-storage-broker-staging:
    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
    if: |
      (github.ref_name == 'main' || github.ref_name == 'release') &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
    env:
      KUBECONFIG: .kubeconfig
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0

      - name: Add curl
        run: apt update && apt install curl -y

      - name: Store kubeconfig file
        run: |
          echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
          chmod 0600 ${KUBECONFIG}

      - name: Setup helm v3
        run: |
          curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
          helm repo add neondatabase https://neondatabase.github.io/helm-charts

      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s

  deploy-proxy-new:
    runs-on: [ self-hosted, dev, x64 ]
@@ -905,9 +945,11 @@ jobs:
          - target_region: us-east-2
            target_cluster: dev-us-east-2-beta
            deploy_link_proxy: true
            deploy_legacy_scram_proxy: true
          - target_region: eu-west-1
            target_cluster: dev-eu-west-1-zeta
            deploy_link_proxy: false
            deploy_legacy_scram_proxy: false
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -923,13 +965,53 @@ jobs:
      - name: Re-deploy scram proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Re-deploy link proxy
        if: matrix.deploy_link_proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

      - name: Re-deploy legacy scram proxy
        if: matrix.deploy_legacy_scram_proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

  deploy-storage-broker-dev-new:
    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'main') &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: dev-us-east-2-beta
          - target_region: eu-west-1
            target_cluster: dev-eu-west-1-zeta
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s

  deploy-proxy-prod-new:
    runs-on: prod
@@ -947,6 +1029,8 @@ jobs:
        include:
          - target_region: us-east-2
            target_cluster: prod-us-east-2-delta
          - target_region: us-west-2
            target_cluster: prod-us-west-2-eta
          - target_region: eu-central-1
            target_cluster: prod-eu-central-1-gamma
          - target_region: ap-southeast-1
@@ -966,7 +1050,45 @@ jobs:
      - name: Re-deploy proxy
        run: |
          DOCKER_TAG=${{needs.tag.outputs.build-tag}}
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s

  deploy-storage-broker-prod-new:
    runs-on: prod
    container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, tag, regress-tests ]
    if: |
      (github.ref_name == 'release') &&
      github.event_name != 'workflow_dispatch'
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
        include:
          - target_region: us-east-2
            target_cluster: prod-us-east-2-delta
          - target_region: us-west-2
            target_cluster: prod-us-west-2-eta
          - target_region: eu-central-1
            target_cluster: prod-eu-central-1-gamma
          - target_region: ap-southeast-1
            target_cluster: prod-ap-southeast-1-epsilon
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0

      - name: Configure environment
        run: |
          helm repo add neondatabase https://neondatabase.github.io/helm-charts
          aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}

      - name: Deploy storage-broker
        run:
          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --wait --timeout 5m0s

  promote-compatibility-data:
    runs-on: [ self-hosted, dev, x64 ]

.github/workflows/pg_clients.yml (vendored): 6 changed lines
@@ -23,6 +23,7 @@ jobs:
    runs-on: [ ubuntu-latest ]

    env:
      DEFAULT_PG_VERSION: 14
      TEST_OUTPUT: /tmp/test_output

    steps:
@@ -51,8 +52,8 @@ jobs:
        id: create-neon-project
        uses: ./.github/actions/neon-project-create
        with:
          environment: staging
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
          postgres_version: ${{ env.DEFAULT_PG_VERSION }}

      - name: Run pytest
        env:
@@ -63,7 +64,7 @@ jobs:
        run: |
          # Test framework expects we have psql binary;
          # but since we don't really need it in this test, let's mock it
          mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql";
          mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
          ./scripts/pytest \
            --junitxml=$TEST_OUTPUT/junit.xml \
            --tb=short \
@@ -75,7 +76,6 @@ jobs:
        if: ${{ always() }}
        uses: ./.github/actions/neon-project-delete
        with:
          environment: staging
          project_id: ${{ steps.create-neon-project.outputs.project_id }}
          api_key: ${{ secrets.NEON_STAGING_API_KEY }}

Cargo.lock (generated): 784 changed lines. File diff suppressed because it is too large.
README.md: 27 changed lines
@@ -2,29 +2,20 @@

Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

The project used to be called "Zenith". Many of the commands and code comments
still refer to "zenith", but we are in the process of renaming things.

## Quick start
[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor.
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.

Alternatively, compile and run the project [locally](#running-local-installation).

## Architecture overview

A Neon installation consists of compute nodes and a Neon storage engine.

Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.

The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.
- Safekeepers. The safekeepers form a redundant WAL service that receives WAL from the compute node and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.

Pageserver consists of:
- Repository - Neon storage implementation.
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
- Page service - service that communicates with compute nodes and responds with pages from the repository.
- WAL redo - service that builds pages from base images and WAL records on Page service request
See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.

## Running local installation

@@ -229,12 +220,20 @@ CARGO_BUILD_FLAGS="--features=testing" make

## Documentation

Now we use README files to cover design ideas and overall architecture for each module and `rustdoc` style documentation comments. See also [/docs/](/docs/) a top-level overview of all available markdown documentation.
[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.

- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.

To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`

See also README files in some source directories, and `rustdoc` style documentation comments.

Other resources:

- [SELECT 'Hello, World'](https://neon.tech/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture
- [Architecture decisions in Neon](https://neon.tech/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas
- [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series

### Postgres-specific terms

Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used.

@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
chrono = "0.4"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = "4.0"
|
||||
env_logger = "0.9"
|
||||
futures = "0.3.13"
|
||||
|
||||
@@ -14,17 +14,19 @@

use std::ffi::OsStr;
use std::io::Write;
use std::path::Path;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::{Path, PathBuf};
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};

use anyhow::{anyhow, bail, Context, Result};
use anyhow::Context;
use nix::errno::Errno;
use nix::fcntl::{FcntlArg, FdFlag};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;

use utils::lock_file;
use utils::pid_file::{self, PidFileRead};

// These constants control the loop used to poll for process start / stop.
//
@@ -86,6 +88,14 @@ where
    let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match initial_pid_file {
        InitialPidFile::Create(path) => {
            pre_exec_create_pidfile(filled_cmd, path);
            path
        }
        InitialPidFile::Expect(path) => path,
    };

    let mut spawned_process = filled_cmd.spawn().with_context(|| {
        format!("Could not spawn {process_name}, see console output and log files for details.")
    })?;
@@ -95,29 +105,8 @@ where
            .with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
    );

    let pid_file_to_check = match initial_pid_file {
        InitialPidFile::Create(target_pid_file_path) => {
            match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
                lock_file::LockCreationResult::Created { .. } => {
                    // We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
                    // as this CLI invocation exits, so it's a bit useless, but doesn't do any harm either.
                }
                lock_file::LockCreationResult::AlreadyLocked { .. } => {
                    anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
                }
                lock_file::LockCreationResult::CreationFailed(e) => {
                    return Err(e.context(format!(
                        "Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
                    )))
                }
            }
            None
        }
        InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
    };

    for retries in 0..RETRIES {
        match process_started(pid, pid_file_to_check, &process_status_check) {
        match process_started(pid, Some(pid_file_to_check), &process_status_check) {
            Ok(true) => {
                println!("\n{process_name} started, pid: {pid}");
                return Ok(spawned_process);
@@ -147,14 +136,45 @@ where
    anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
}

/// Send SIGTERM to child process
pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()> {
    let pid = child.id();
    match kill(
        nix::unistd::Pid::from_raw(pid.try_into().unwrap()),
        Signal::SIGTERM,
    ) {
        Ok(()) => Ok(()),
        Err(Errno::ESRCH) => {
            println!("child process with pid {pid} does not exist");
            Ok(())
        }
        Err(e) => anyhow::bail!("Failed to send signal to child process with pid {pid}: {e}"),
    }
}

/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
    if !pid_file.exists() {
        println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
        return Ok(());
    }
    let pid = read_pidfile(pid_file)?;
    let pid = match pid_file::read(pid_file)
        .with_context(|| format!("read pid_file {pid_file:?}"))?
    {
        PidFileRead::NotExist => {
            println!("{process_name} is already stopped: no pid file present at {pid_file:?}");
            return Ok(());
        }
        PidFileRead::NotHeldByAnyProcess(_) => {
            // Don't try to kill according to file contents because the pid might have been re-used by another process.
            // Don't delete the file either, it can race with new pid file creation.
            // Read `pid_file` module comment for details.
            println!(
                "No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}"
            );
            return Ok(());
        }
        PidFileRead::LockedByOtherProcess(pid) => pid,
    };
    // XXX the pid could become invalid (and recycled) at any time before the kill() below.

    // send signal
    let sig = if immediate {
        print!("Stopping {process_name} with pid {pid} immediately..");
        Signal::SIGQUIT
@@ -166,8 +186,9 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
    match kill(pid, sig) {
        Ok(()) => (),
        Err(Errno::ESRCH) => {
            // Again, don't delete the pid file. The unlink can race with a new pid file being created.
            println!(
                "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
                "{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone."
            );
            return Ok(());
        }
@@ -179,11 +200,6 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
        match process_has_stopped(pid) {
            Ok(true) => {
                println!("\n{process_name} stopped");
                if let Err(e) = fs::remove_file(pid_file) {
                    if e.kind() != io::ErrorKind::NotFound {
                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
                    }
                }
                return Ok(());
            }
            Ok(false) => {
@@ -209,7 +225,14 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}

fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
    // If RUST_BACKTRACE is set, pass it through. But if it's not set, default
    // to RUST_BACKTRACE=1.
    let backtrace_setting = std::env::var_os("RUST_BACKTRACE");
    let backtrace_setting = backtrace_setting
        .as_deref()
        .unwrap_or_else(|| OsStr::new("1"));

    let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", backtrace_setting);

    // Pass through these environment variables to the command
    for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
@@ -234,6 +257,69 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    cmd
}

/// Add a `pre_exec` to the cmd that, in between fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
///    will remain held until the cmd exits.
fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
where
    P: Into<PathBuf>,
{
    let path: PathBuf = path.into();
    // SAFETY
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer: https://github.com/rust-lang/rust/issues/39575
    // Short answer: in a multi-threaded program, other threads may have
    // been inside of critical sections at the time of fork. In the
    // original process, that was all right, assuming they protected
    // the critical sections appropriately, e.g., through locks.
    // Fork adds another process to the mix that
    // 1. Has a single thread T
    // 2. In an exact copy of the address space at the time of fork.
    // A variety of problems can occur now:
    // 1. T tries to grab a lock that was locked at the time of fork.
    //    It will wait forever since in its address space, the lock
    //    is in state 'taken' but the thread that would unlock it is
    //    not there.
    // 2. A rust object that represented some external resource in the
    //    parent now got implicitly copied by the fork, even though
    //    the object's type is not `Copy`. The parent program may use
    //    non-copyability as a way to enforce unique ownership of an
    //    external resource in the typesystem. The fork breaks that
    //    assumption, as now both parent and child process have an
    //    owned instance of the object that represents the same
    //    underlying resource.
    // While these seem like niche problems, (1) in particular is
    // highly relevant. For example, `malloc()` may grab a mutex internally,
    // and so, if we forked while another thread was malloc'ing and our
    // pre_exec closure allocates as well, it will block on the malloc
    // mutex forever
    //
    // The proper solution is to only use C library functions that are marked
    // "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html
    //
    // With this specific pre_exec() closure, the non-error path doesn't allocate.
    // The error path uses `anyhow`, and hence does allocate.
    // We take our chances there, hoping that any potential disaster is constrained
    // to the child process (e.g., malloc has no state outside of the child process).
    // Last, `expect` prints to stderr, and stdio is not async-signal-safe.
    // Again, we take our chances, making the same assumptions as for malloc.
    unsafe {
        cmd.pre_exec(move || {
            let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
            // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
            // remains locked after exec.
            nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
                .expect("remove FD_CLOEXEC");
            // Don't run drop(file), it would close the file before we actually exec.
            std::mem::forget(file);
            Ok(())
        });
    }
    cmd
}

fn process_started<F>(
    pid: Pid,
    pid_file_to_check: Option<&Path>,
@@ -244,14 +330,11 @@ where
{
    match status_check() {
        Ok(true) => match pid_file_to_check {
            Some(pid_file_path) => {
                if pid_file_path.exists() {
                    let pid_in_file = read_pidfile(pid_file_path)?;
                    Ok(pid_in_file == pid)
                } else {
                    Ok(false)
                }
            }
            Some(pid_file_path) => match pid_file::read(pid_file_path)? {
                PidFileRead::NotExist => Ok(false),
                PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
                PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
            },
            None => Ok(true),
        },
        Ok(false) => Ok(false),
@@ -259,21 +342,6 @@ where
    }
}

/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
    let pid_str = fs::read_to_string(pidfile)
        .with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
    let pid: i32 = pid_str
        .parse()
        .map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
    if pid < 1 {
        bail!("pidfile {pidfile:?} contained bad value '{pid}'");
    }
    Ok(Pid::from_raw(pid))
}

fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
    match kill(pid, None) {
        // Process exists, keep waiting

@@ -324,7 +324,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        pg_version,
    )
    .unwrap_or_else(|e| {
        eprintln!("pageserver init failed: {e}");
        eprintln!("pageserver init failed: {e:?}");
        exit(1);
    });

@@ -1,12 +1,12 @@
use std::collections::HashMap;
use std::fs::{self, File};
use std::fs::File;
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::{Path, PathBuf};
use std::process::Child;
use std::{io, result};

use anyhow::{bail, Context};
use anyhow::{bail, ensure, Context};
use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
@@ -168,29 +168,21 @@ impl PageServerNode {
            }
            Err(e) => eprintln!("{e:#}"),
        }
        match pageserver_process.kill() {
            Err(e) => {
                eprintln!(
                    "Failed to stop pageserver {} process with pid {}: {e:#}",
                    self.env.pageserver.id,
                    pageserver_process.id(),
                )
            }
            Ok(()) => {
                println!(
                    "Stopped pageserver {} process with pid {}",
                    self.env.pageserver.id,
                    pageserver_process.id(),
                );
                // cleanup after pageserver startup, since we do not call regular `stop_process` during init
                let pid_file = self.pid_file();
                if let Err(e) = fs::remove_file(&pid_file) {
                    if e.kind() != io::ErrorKind::NotFound {
                        eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
                    }
                }
            }
        }
        background_process::send_stop_child_process(&pageserver_process)?;

        let exit_code = pageserver_process.wait()?;
        ensure!(
            exit_code.success(),
            format!(
                "pageserver init failed with exit code {:?}",
                exit_code.code()
            )
        );
        println!(
            "Stopped pageserver {} process with pid {}",
            self.env.pageserver.id,
            pageserver_process.id(),
        );
        init_result
    }

@@ -45,9 +45,9 @@ and create new databases and accounts (control plane API in our case).

Integration tests, written in Python using the `pytest` framework.

`/vendor/postgres-v14`:
`/vendor/postgres-v14` and `/vendor/postgres-v15`:

PostgreSQL source tree, with the modifications needed for Neon.
PostgreSQL source tree per version, with the modifications needed for Neon.

`/pgxn/neon`:

@@ -23,7 +23,7 @@ pub enum TenantState {
    Active,
    /// A tenant is recognized by pageserver, but it is being detached or the
    /// system is being shut down.
    Paused,
    Stopping,
    /// A tenant is recognized by the pageserver, but can no longer be used for
    /// any operations, because it failed to be activated.
    Broken,
@@ -35,7 +35,7 @@ impl TenantState {
            Self::Loading => true,
            Self::Attaching => true,
            Self::Active => false,
            Self::Paused => false,
            Self::Stopping => false,
            Self::Broken => false,
        }
    }
@@ -53,7 +53,7 @@ pub enum TimelineState {
    Suspended,
    /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
    /// automatically become Active after certain events: only a management call can change this status.
    Paused,
    Stopping,
    /// A timeline is recognized by the pageserver, but can no longer be used for
    /// any operations, because it failed to be activated.
    Broken,
@@ -201,8 +201,6 @@ pub struct TimelineInfo {
    pub last_received_msg_ts: Option<u128>,
    pub pg_version: u32,

    pub awaits_download: bool,

    pub state: TimelineState,

    // Some of the above fields are duplicated in 'local' and 'remote', for backwards-

@@ -1,7 +1,6 @@
use anyhow::*;
use core::time::Duration;
use log::*;
use once_cell::sync::Lazy;
use postgres::types::PgLsn;
use postgres::Client;
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -26,15 +25,13 @@ pub struct PostgresServer {
    client_config: postgres::Config,
}

pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
    vec![
        "wal_keep_size=50MB",            // Ensure old WAL is not removed
        "shared_preload_libraries=neon", // can only be loaded at startup
        // Disable background processes as much as possible
        "wal_writer_delay=10s",
        "autovacuum=off",
    ]
});
pub static REQUIRED_POSTGRES_CONFIG: [&str; 4] = [
    "wal_keep_size=50MB",            // Ensure old WAL is not removed
    "shared_preload_libraries=neon", // can only be loaded at startup
    // Disable background processes as much as possible
    "wal_writer_delay=10s",
    "autovacuum=off",
];
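
The move from a `Lazy<Vec<&str>>` to a plain array does not change how callers consume the options; a slice of `&str` iterates the same way in either form. A minimal, hypothetical caller sketch (the `postgres` invocation below is illustrative and not taken from this diff; only REQUIRED_POSTGRES_CONFIG comes from the code above):

// Sketch only: pass each required option to a postgres server process via `-c`.
// `datadir` and the use of the `postgres` binary here are assumptions for illustration.
fn postgres_command(datadir: &std::path::Path) -> std::process::Command {
    let mut cmd = std::process::Command::new("postgres");
    cmd.arg("-D").arg(datadir);
    for opt in REQUIRED_POSTGRES_CONFIG.iter() {
        cmd.arg("-c").arg(*opt);
    }
    cmd
}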

impl Conf {
    pub fn pg_distrib_dir(&self) -> anyhow::Result<PathBuf> {

@@ -9,8 +9,11 @@ async-trait = "0.1"
metrics = { version = "0.1", path = "../metrics" }
utils = { version = "0.1", path = "../utils" }
once_cell = "1.13.0"
rusoto_core = "0.48"
rusoto_s3 = "0.48"
aws-smithy-http = "0.51.0"
aws-types = "0.51.0"
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.21.0"
hyper = { version = "0.14", features = ["stream"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }

@@ -10,7 +10,7 @@ mod s3_bucket;

use std::{
    collections::HashMap,
    fmt::{Debug, Display},
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    ops::Deref,
    path::{Path, PathBuf},
@@ -41,44 +41,27 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;

const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';

#[derive(Clone, PartialEq, Eq)]
pub struct RemoteObjectId(String);
/// Path on the remote storage, relative to some inner prefix.
/// The prefix is an implementation detail, that allows representing local paths
/// as the remote ones, stripping the local storage prefix away.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);

impl RemotePath {
    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
            relative_path.is_relative(),
            "Path {relative_path:?} is not relative"
        );
        Ok(Self(relative_path.to_path_buf()))
    }

    pub fn with_base(&self, base_path: &Path) -> PathBuf {
        base_path.join(&self.0)
    }

///
/// A key that refers to an object in remote storage. It works much like a Path,
/// but it's a separate datatype so that you don't accidentally mix local paths
/// and remote keys.
///
impl RemoteObjectId {
    // Needed to retrieve last component for RemoteObjectId.
    // In other words a file name
    /// Turn a/b/c or a/b/c/ into c
    pub fn object_name(&self) -> Option<&str> {
        // corner case, char::to_string is not const, that's why this is more verbose than it needs to be
        // see https://github.com/rust-lang/rust/issues/88674
        if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR {
            return None;
        }

        if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
            self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1)
        } else {
            self.0
                .rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR)
                .map(|(_, last)| last)
        }
    }
}

impl Debug for RemoteObjectId {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        Debug::fmt(&self.0, fmt)
    }
}

impl Display for RemoteObjectId {
    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        Display::fmt(&self.0, fmt)
        self.0.file_name().and_then(|os_str| os_str.to_str())
    }
}

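A brief usage sketch of the new `RemotePath` type introduced above; it only relies on `RemotePath::new`, `with_base`, and `object_name` as declared in this diff, and the concrete key and root directory are made up for illustration:

// Sketch only (hypothetical values): remote paths are always relative,
// and a concrete storage root is applied only when resolving them locally.
use std::path::Path;

fn remote_path_example() -> anyhow::Result<()> {
    let remote = RemotePath::new(Path::new("tenants/1234/timelines/aaaa/000001"))?;
    // Resolve the same key against a concrete local storage root.
    let on_disk = remote.with_base(Path::new("/srv/pageserver/remote_storage"));
    assert!(on_disk.ends_with("tenants/1234/timelines/aaaa/000001"));
    // The last path component doubles as the object name.
    assert_eq!(remote.object_name(), Some("000001"));
    Ok(())
}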
@@ -87,49 +70,40 @@ impl Display for RemoteObjectId {
/// providing basic CRUD operations for storage files.
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync + 'static {
    /// Attempts to derive the storage path out of the local path, if the latter is correct.
    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId>;

    /// Gets the download path of the given storage file.
    fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf>;

    /// Lists all items the storage has right now.
    async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>>;
    async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;

    /// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
    /// so this method doesn't need to.
    async fn list_prefixes(
        &self,
        prefix: Option<&RemoteObjectId>,
    ) -> anyhow::Result<Vec<RemoteObjectId>>;
    async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;

    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
        from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
        data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        from_size_bytes: usize,
        to: &RemoteObjectId,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()>;

    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
    /// Returns the metadata, if any was stored with the file previously.
    async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError>;
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;

    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
    /// Returns the metadata, if any was stored with the file previously.
    async fn download_byte_range(
        &self,
        from: &RemoteObjectId,
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
    ) -> Result<Download, DownloadError>;

    async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>;
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

    /// Downcast to LocalFs implementation. For tests.
    fn as_local(&self) -> Option<&LocalFs> {
@@ -178,34 +152,35 @@ impl std::error::Error for DownloadError {}
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
pub struct GenericRemoteStorage(Arc<dyn RemoteStorage>);
pub enum GenericRemoteStorage {
    LocalFs(LocalFs),
    AwsS3(Arc<S3Bucket>),
}

impl Deref for GenericRemoteStorage {
    type Target = dyn RemoteStorage;

    fn deref(&self) -> &Self::Target {
        self.0.as_ref()
        match self {
            GenericRemoteStorage::LocalFs(local_fs) => local_fs,
            GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
        }
    }
}

impl GenericRemoteStorage {
    pub fn new(storage: impl RemoteStorage) -> Self {
        Self(Arc::new(storage))
    }

    pub fn from_config(
        working_directory: PathBuf,
        storage_config: &RemoteStorageConfig,
    ) -> anyhow::Result<GenericRemoteStorage> {
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs(root) => {
                info!("Using fs root '{}' as a remote storage", root.display());
                GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?)
                GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
                GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?)
                GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
            }
        })
    }
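To make the new enum shape concrete, here is a minimal, hypothetical caller sketch. It relies only on the `LocalFs` variant, `LocalFs::new`, and the `Deref` impl shown above; the directory name is an assumption for illustration:

// Sketch only (not from the diff): wrap a LocalFs backend in GenericRemoteStorage
// and use it through Deref as a `dyn RemoteStorage`.
use std::path::PathBuf;

async fn list_everything() -> anyhow::Result<Vec<RemotePath>> {
    let storage = GenericRemoteStorage::LocalFs(LocalFs::new(PathBuf::from("remote_storage"))?);
    // Deref lets the enum be used wherever `&dyn RemoteStorage` is expected.
    storage.list().await
}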
@@ -219,23 +194,12 @@ impl GenericRemoteStorage {
        &self,
        from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
        from_size_bytes: usize,
        from_path: &Path,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
        let target_storage_path = self.remote_object_id(from_path).with_context(|| {
            format!(
                "Failed to get the storage path for source local path '{}'",
                from_path.display()
            )
        })?;

        self.upload(from, from_size_bytes, &target_storage_path, None)
        self.upload(from, from_size_bytes, to, None)
            .await
            .with_context(|| {
                format!(
                    "Failed to upload from '{}' to storage path '{:?}'",
                    from_path.display(),
                    target_storage_path
                )
                format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
            })
    }

@@ -244,24 +208,11 @@ impl GenericRemoteStorage {
    pub async fn download_storage_object(
        &self,
        byte_range: Option<(u64, Option<u64>)>,
        to_path: &Path,
        from: &RemotePath,
    ) -> Result<Download, DownloadError> {
        let remote_object_path = self
            .remote_object_id(to_path)
            .with_context(|| {
                format!(
                    "Failed to get the storage path for target local path '{}'",
                    to_path.display()
                )
            })
            .map_err(DownloadError::BadInput)?;

        match byte_range {
            Some((start, end)) => {
                self.download_byte_range(&remote_object_path, start, end)
                    .await
            }
            None => self.download(&remote_object_path).await,
            Some((start, end)) => self.download_byte_range(from, start, end).await,
            None => self.download(from).await,
        }
    }
}
@@ -271,23 +222,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);

fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
    if prefix == path {
        anyhow::bail!(
            "Prefix and the path are equal, cannot strip: '{}'",
            prefix.display()
        )
    } else {
        path.strip_prefix(prefix).with_context(|| {
            format!(
                "Path '{}' is not prefixed with '{}'",
                path.display(),
                prefix.display(),
            )
        })
    }
}

/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {
@@ -431,21 +365,24 @@ mod tests {
    use super::*;

    #[test]
    fn object_name() {
        let k = RemoteObjectId("a/b/c".to_owned());
    fn test_object_name() {
        let k = RemotePath::new(Path::new("a/b/c")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

        let k = RemoteObjectId("a/b/c/".to_owned());
        let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

        let k = RemoteObjectId("a/".to_owned());
        let k = RemotePath::new(Path::new("a/")).unwrap();
        assert_eq!(k.object_name(), Some("a"));

        // XXX is it impossible to have an empty key?
        let k = RemoteObjectId("".to_owned());
        assert_eq!(k.object_name(), None);

        let k = RemoteObjectId("/".to_owned());
        let k = RemotePath::new(Path::new("")).unwrap();
        assert_eq!(k.object_name(), None);
    }

    #[test]
    fn rempte_path_cannot_be_created_from_absolute_ones() {
        let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
        assert_eq!(err.to_string(), "Path \"/\" is not relative");
    }
}

@@ -5,6 +5,7 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
@@ -18,60 +19,33 @@ use tokio::{
|
||||
use tracing::*;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
use crate::{Download, DownloadError, RemoteObjectId};
|
||||
use crate::{Download, DownloadError, RemotePath};
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
|
||||
/// Convert a Path in the remote storage into a RemoteObjectId
|
||||
fn remote_object_id_from_path(path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
Ok(RemoteObjectId(
|
||||
path.to_str()
|
||||
.ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))?
|
||||
.to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LocalFs {
|
||||
working_directory: PathBuf,
|
||||
storage_root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path '{}'",
|
||||
root.display(),
|
||||
)
|
||||
/// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
|
||||
pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
|
||||
if !storage_root.exists() {
|
||||
std::fs::create_dir_all(&storage_root).with_context(|| {
|
||||
format!("Failed to create all directories in the given root path {storage_root:?}")
|
||||
})?;
|
||||
}
|
||||
Ok(Self {
|
||||
working_directory,
|
||||
storage_root: root,
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Get the absolute path in the local filesystem to given remote object.
|
||||
///
|
||||
/// This is public so that it can be used in tests. Should not be used elsewhere.
|
||||
///
|
||||
pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
let path = PathBuf::from(&remote_object_id.0);
|
||||
if path.is_relative() {
|
||||
Ok(self.storage_root.join(path))
|
||||
} else if path.starts_with(&self.storage_root) {
|
||||
Ok(path)
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
if !storage_root.is_absolute() {
|
||||
storage_root = storage_root.canonicalize().with_context(|| {
|
||||
format!("Failed to represent path {storage_root:?} as an absolute path")
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(Self { storage_root })
|
||||
}
|
||||
|
||||
async fn read_storage_metadata(
|
||||
@@ -103,45 +77,48 @@ impl LocalFs {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
/// Convert a "local" path into a "remote path"
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
let path = self.storage_root.join(
|
||||
strip_path_prefix(&self.working_directory, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
);
|
||||
remote_object_id_from_path(&path)
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip storage root prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
let storage_path = PathBuf::from(&remote_object_id.0);
|
||||
let relative_path = strip_path_prefix(&self.storage_root, &storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.working_directory.join(relative_path))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
get_all_files(&self.storage_root, true).await
|
||||
}
|
||||
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Path::new(&prefix.0),
|
||||
None => &self.storage_root,
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
get_all_files(path, false).await
|
||||
Ok(get_all_files(path.as_ref(), false)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip preifix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
)
|
||||
})
|
||||
.collect())
|
||||
}
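Both list and list_prefixes above map absolute filesystem paths back to storage-relative RemotePaths by stripping the storage root; a hedged sketch of that mapping (the root and layout below are illustrative):

let storage_root = PathBuf::from("/srv/remote_storage");
let absolute = storage_root.join("timelines").join("tl1").join("layer_0");
// Strip the root to get the storage-relative key, e.g. "timelines/tl1/layer_0".
let relative = absolute.strip_prefix(&storage_root)?;
let remote = RemotePath::new(relative)?;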
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from_size_bytes: usize,
|
||||
to: &RemoteObjectId,
|
||||
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
let target_file_path = to.with_base(&self.storage_root);
|
||||
create_target_directory(&target_file_path).await?;
|
||||
// We need this dance with sort of durable rename (without fsyncs)
|
||||
// to prevent partial uploads. This was really hit when pageserver shutdown
|
||||
@@ -162,8 +139,8 @@ impl RemoteStorage for LocalFs {
|
||||
})?,
|
||||
);
|
||||
|
||||
let from_size_bytes = from_size_bytes as u64;
|
||||
let mut buffer_to_read = from.take(from_size_bytes);
|
||||
let from_size_bytes = data_size_bytes as u64;
|
||||
let mut buffer_to_read = data.take(from_size_bytes);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
@@ -220,27 +197,22 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
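The comment in upload above refers to writing into a temporary file and renaming it into place so that readers never observe a partially written object; a minimal sketch of that pattern with tokio, assuming the temp path is derived from the target path via LOCAL_FS_TEMP_FILE_SUFFIX (the helper itself is hypothetical):

async fn write_then_rename(target: &Path, mut data: impl io::AsyncRead + Unpin) -> anyhow::Result<()> {
    let temp_path = PathBuf::from(format!("{}{}", target.display(), LOCAL_FS_TEMP_FILE_SUFFIX));
    let mut temp_file = fs::File::create(&temp_path).await?;
    io::copy(&mut data, &mut temp_file).await?;
    // Renaming within one filesystem is atomic, so a concurrent reader sees
    // either no file or the fully written one, never a partial upload.
    fs::rename(&temp_path, target).await?;
    Ok(())
}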
|
||||
|
||||
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
|
||||
let file_path = self
|
||||
.resolve_in_storage(from)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&file_path)
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Download {
|
||||
@@ -254,7 +226,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemoteObjectId,
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
@@ -266,20 +238,15 @@ impl RemoteStorage for LocalFs {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||
}
|
||||
}
|
||||
let file_path = self
|
||||
.resolve_in_storage(from)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
|
||||
let target_path = from.with_base(&self.storage_root);
|
||||
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.open(&target_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
format!("Failed to open source file {target_path:?} to use in the download")
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
@@ -289,7 +256,7 @@ impl RemoteStorage for LocalFs {
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
let metadata = self
|
||||
.read_storage_metadata(&file_path)
|
||||
.read_storage_metadata(&target_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -308,15 +275,12 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let file_path = path.with_base(&self.storage_root);
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
bail!("File {file_path:?} either does not exist or is not a file")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -332,7 +296,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
recursive: bool,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<RemoteObjectId>>> + Send + Sync + 'a>>
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
@@ -346,20 +310,20 @@ where
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
debug!("{:?} us a symlink, skipping", entry_path)
|
||||
debug!("{entry_path:?} us a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
if recursive {
|
||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||
} else {
|
||||
paths.push(remote_object_id_from_path(&dir_entry.path())?)
|
||||
paths.push(entry_path)
|
||||
}
|
||||
} else {
|
||||
paths.push(remote_object_id_from_path(&dir_entry.path())?);
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
@@ -394,173 +358,6 @@ fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);
|
||||
|
||||
let actual_path = PathBuf::from(
|
||||
storage
|
||||
.remote_object_id(&local_path)
|
||||
.expect("Matching path should map to storage path normally")
|
||||
.0,
|
||||
);
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
actual_path,
|
||||
"File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected path '{}' to error, but got storage path: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let error_string = storage_path_error(&storage, &workdir);
|
||||
assert!(error_string.contains("does not belong to this storage"));
|
||||
assert!(error_string.contains(workdir.to_str().unwrap()));
|
||||
|
||||
let mismatching_path_str = "/something/else";
|
||||
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
|
||||
assert!(
|
||||
error_message.contains(mismatching_path_str),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(error_message.contains("does not belong to this storage"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: workdir.clone(),
|
||||
storage_root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let name = "not a metadata";
|
||||
let local_path = workdir.join("timelines").join("some_timeline").join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
storage
|
||||
.local_path(&remote_object_id_from_path(
|
||||
&storage_root.join(local_path.strip_prefix(&workdir)?)
|
||||
)?)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let local_metadata_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("metadata");
|
||||
let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
storage
|
||||
.local_path(&remote_metadata_path)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String {
|
||||
match storage.local_path(storage_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected local path input {:?} to cause an error, but got file path: {:?}",
|
||||
storage_path, wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
working_directory: tempdir()?.path().to_owned(),
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
let error_message =
|
||||
local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string()));
|
||||
assert!(error_message.contains(totally_wrong_path));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let dummy_storage = LocalFs {
|
||||
working_directory: workdir,
|
||||
storage_root,
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.remote_object_id(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
@@ -572,7 +369,7 @@ mod fs_tests {
|
||||
storage: &LocalFs,
|
||||
#[allow(clippy::ptr_arg)]
|
||||
// have to use &PathBuf due to `storage.local_path` parameter requirements
|
||||
remote_storage_path: &RemoteObjectId,
|
||||
remote_storage_path: &RemotePath,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut download = storage
|
||||
@@ -595,41 +392,16 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = create_storage()?;
|
||||
|
||||
let (file, size) = create_file_for_upload(
|
||||
&storage.working_directory.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = "/somewhere/else";
|
||||
match storage
|
||||
.upload(
|
||||
Box::new(file),
|
||||
size,
|
||||
&RemoteObjectId(target_path.to_string()),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
assert!(message.contains(target_path));
|
||||
assert!(message.contains("does not belong to the current storage"));
|
||||
}
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
|
||||
let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
@@ -643,7 +415,7 @@ mod fs_tests {
|
||||
async fn upload_file_negatives() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
|
||||
let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?;
|
||||
let id = RemotePath::new(Path::new("dummy"))?;
|
||||
let content = std::io::Cursor::new(b"12345");
|
||||
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
@@ -668,16 +440,14 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
|
||||
LocalFs::new(tempdir()?.path().to_owned())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
||||
assert_eq!(
|
||||
@@ -687,7 +457,7 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let non_existing_path = "somewhere/else";
|
||||
match storage.download(&RemoteObjectId(non_existing_path.to_string())).await {
|
||||
match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
|
||||
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
|
||||
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
|
||||
}
|
||||
@@ -696,11 +466,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
||||
@@ -766,11 +534,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
let start = 1_000_000_000;
|
||||
let end = start + 1;
|
||||
@@ -812,11 +578,9 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
@@ -826,7 +590,8 @@ mod fs_tests {
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&upload_target.0));
|
||||
let expected_path = upload_target.with_base(&storage.storage_root);
|
||||
assert!(error_string.contains(expected_path.to_str().unwrap()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -834,8 +599,6 @@ mod fs_tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn file_with_metadata() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let metadata = StorageMetadata(HashMap::from([
|
||||
@@ -843,7 +606,7 @@ mod fs_tests {
|
||||
("two".to_string(), "2".to_string()),
|
||||
]));
|
||||
let upload_target =
|
||||
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
|
||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
|
||||
@@ -883,23 +646,32 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
workdir: &Path,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<RemoteObjectId> {
|
||||
let timeline_path = workdir.join("timelines").join("some_timeline");
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
|
||||
let storage_path = storage.storage_root.join(relative_timeline_path).join(name);
|
||||
let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string());
|
||||
|
||||
let from_path = storage.working_directory.join(name);
|
||||
) -> anyhow::Result<RemotePath> {
|
||||
let from_path = storage
|
||||
.storage_root
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join(name);
|
||||
let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
|
||||
|
||||
let relative_path = from_path
|
||||
.strip_prefix(&storage.storage_root)
|
||||
.context("Failed to strip storage root prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
from_path, storage.storage_root
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(Box::new(file), size, &remote_object_id, metadata)
|
||||
.upload(Box::new(file), size, &relative_path, metadata)
|
||||
.await?;
|
||||
remote_object_id_from_path(&storage_path)
|
||||
Ok(relative_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
@@ -924,7 +696,7 @@ mod fs_tests {
|
||||
format!("contents for {name}")
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut files = storage.list().await?;
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
|
||||
@@ -4,27 +4,34 @@
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::env::var;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
use rusoto_core::{
|
||||
credential::{InstanceMetadataProvider, StaticProvider},
|
||||
HttpClient, Region, RusotoError,
|
||||
use aws_config::{
|
||||
environment::credentials::EnvironmentVariableCredentialsProvider, imds,
|
||||
imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn,
|
||||
};
|
||||
use rusoto_s3::{
|
||||
DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
|
||||
S3Client, StreamingBody, S3,
|
||||
use aws_sdk_s3::{
|
||||
config::Config,
|
||||
error::{GetObjectError, GetObjectErrorKind},
|
||||
types::{ByteStream, SdkError},
|
||||
Client, Endpoint, Region,
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use aws_types::credentials::{CredentialsError, ProvideCredentials};
|
||||
use hyper::Body;
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use super::StorageMetadata;
|
||||
const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
pub(super) mod metrics {
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
@@ -91,32 +98,9 @@ pub(super) mod metrics {
|
||||
}
|
||||
}
|
||||
|
||||
fn download_destination(
|
||||
id: &RemoteObjectId,
|
||||
workdir: &Path,
|
||||
prefix_to_strip: Option<&str>,
|
||||
) -> PathBuf {
|
||||
let path_without_prefix = match prefix_to_strip {
|
||||
Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| {
|
||||
panic!(
|
||||
"Could not strip prefix '{}' from S3 object key '{}'",
|
||||
prefix, id.0
|
||||
)
|
||||
}),
|
||||
None => &id.0,
|
||||
};
|
||||
|
||||
workdir.join(
|
||||
path_without_prefix
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.collect::<PathBuf>(),
|
||||
)
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
workdir: PathBuf,
|
||||
client: S3Client,
|
||||
client: Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
@@ -125,50 +109,53 @@ pub struct S3Bucket {
|
||||
concurrency_limiter: Semaphore,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct GetObjectRequest {
|
||||
bucket: String,
|
||||
key: String,
|
||||
range: Option<String>,
|
||||
}
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
);
|
||||
let region = match aws_config.endpoint.clone() {
|
||||
Some(custom_endpoint) => Region::Custom {
|
||||
name: aws_config.bucket_region.clone(),
|
||||
endpoint: custom_endpoint,
|
||||
},
|
||||
None => aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?,
|
||||
};
|
||||
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
|
||||
let mut config_builder = Config::builder()
|
||||
.region(Region::new(aws_config.bucket_region.clone()))
|
||||
.credentials_provider(provide_credentials_fn(|| async {
|
||||
match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() {
|
||||
true => {
|
||||
EnvironmentVariableCredentialsProvider::new()
|
||||
.provide_credentials()
|
||||
.await
|
||||
}
|
||||
false => {
|
||||
let imds_client = imds::Client::builder()
|
||||
.connect_timeout(DEFAULT_IMDS_TIMEOUT)
|
||||
.read_timeout(DEFAULT_IMDS_TIMEOUT)
|
||||
.build()
|
||||
.await
|
||||
.map_err(CredentialsError::unhandled)?;
|
||||
ImdsCredentialsProvider::builder()
|
||||
.imds_client(imds_client)
|
||||
.build()
|
||||
.provide_credentials()
|
||||
.await
|
||||
}
|
||||
}
|
||||
}));
|
||||
|
||||
let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
|
||||
let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
|
||||
// session token is used when authorizing through sso
|
||||
// which is typically the case when testing locally on developer machine
|
||||
let session_token = std::env::var("AWS_SESSION_TOKEN").ok();
|
||||
|
||||
let client = if access_key_id.is_none() && secret_access_key.is_none() {
|
||||
debug!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
debug!(
|
||||
"Using credentials-based AWS access. Session token is set: {}",
|
||||
session_token.is_some()
|
||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||
let endpoint = Endpoint::immutable(
|
||||
custom_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse S3 custom endpoint"),
|
||||
);
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new(
|
||||
access_key_id.unwrap_or_default(),
|
||||
secret_access_key.unwrap_or_default(),
|
||||
session_token,
|
||||
None,
|
||||
),
|
||||
region,
|
||||
)
|
||||
};
|
||||
config_builder.set_endpoint_resolver(Some(Arc::new(endpoint)));
|
||||
}
|
||||
let client = Client::from_conf(config_builder.build());
|
||||
|
||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||
let mut prefix = prefix;
|
||||
@@ -182,16 +169,41 @@ impl S3Bucket {
|
||||
}
|
||||
prefix
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
client,
|
||||
workdir,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
})
|
||||
}
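A hedged sketch of constructing the bucket client above; the S3Config field names mirror the aws_config accesses visible in this diff, and the NonZero type used for concurrency_limit is an assumption:

let config = S3Config {
    bucket_name: "my-bucket".to_string(),
    bucket_region: "eu-central-1".to_string(),
    prefix_in_bucket: Some("pageserver/v1".to_string()),
    endpoint: None, // e.g. Some("http://127.0.0.1:9000".to_string()) for a MinIO-style endpoint
    concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(), // assumed NonZeroUsize
};
let bucket = S3Bucket::new(&config)?;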
|
||||
|
||||
fn s3_object_to_relative_path(&self, key: &str) -> RemotePath {
|
||||
let relative_path =
|
||||
match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) {
|
||||
Some(stripped) => stripped,
|
||||
// we rely on AWS to return properly prefixed paths
|
||||
// for requests with a certain prefix
|
||||
None => panic!(
|
||||
"Key {} does not start with bucket prefix {:?}",
|
||||
key, self.prefix_in_bucket
|
||||
),
|
||||
};
|
||||
RemotePath(
|
||||
relative_path
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in path.0.iter() {
|
||||
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
full_path.push_str(segment.to_str().unwrap_or_default());
|
||||
}
|
||||
full_path
|
||||
}
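The two helpers above are intended to be inverses of each other, translating between S3 object keys and storage-relative RemotePaths; a round-trip sketch, assuming the stored prefix_in_bucket is Some("pageserver"):

let path = RemotePath::new(Path::new("timelines/tl1/layer_0"))?;
let key = storage.relative_path_to_s3_object(&path); // roughly "pageserver/timelines/tl1/layer_0"
let back = storage.s3_object_to_relative_path(&key);
// Keys returned by list_objects_v2 are expected to map back to the same relative path.
assert_eq!(back, path);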
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
@@ -202,20 +214,33 @@ impl S3Bucket {
|
||||
|
||||
metrics::inc_get_object();
|
||||
|
||||
match self.client.get_object(request).await {
|
||||
Ok(object_output) => match object_output.body {
|
||||
None => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Got no body for the S3 object given"
|
||||
)))
|
||||
}
|
||||
Some(body) => Ok(Download {
|
||||
metadata: object_output.metadata.map(StorageMetadata),
|
||||
download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
|
||||
}),
|
||||
},
|
||||
Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
|
||||
let get_object = self
|
||||
.client
|
||||
.get_object()
|
||||
.bucket(request.bucket)
|
||||
.key(request.key)
|
||||
.set_range(request.range)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match get_object {
|
||||
Ok(object_output) => {
|
||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(io::BufReader::new(
|
||||
object_output.body.into_async_read(),
|
||||
)),
|
||||
})
|
||||
}
|
||||
Err(SdkError::ServiceError {
|
||||
err:
|
||||
GetObjectError {
|
||||
kind: GetObjectErrorKind::NoSuchKey(..),
|
||||
..
|
||||
},
|
||||
..
|
||||
}) => Err(DownloadError::NotFound),
|
||||
Err(e) => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
@@ -228,25 +253,7 @@ impl S3Bucket {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
|
||||
let relative_path = strip_path_prefix(&self.workdir, local_path)?;
|
||||
let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
for segment in relative_path {
|
||||
key.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(RemoteObjectId(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result<PathBuf> {
|
||||
Ok(download_destination(
|
||||
storage_path,
|
||||
&self.workdir,
|
||||
self.prefix_in_bucket.as_deref(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
@@ -261,12 +268,11 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: self.bucket_name.clone(),
|
||||
prefix: self.prefix_in_bucket.clone(),
|
||||
continuation_token,
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(self.prefix_in_bucket.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
@@ -277,7 +283,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.contents
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(RemoteObjectId(o.key?))),
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
@@ -291,13 +297,10 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it won't include empty "directories"
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<RemoteObjectId>> {
|
||||
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| p.0.clone())
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
@@ -322,13 +325,12 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: self.bucket_name.clone(),
|
||||
prefix: list_prefix.clone(),
|
||||
continuation_token,
|
||||
delimiter: Some(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
@@ -340,7 +342,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.common_prefixes
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(RemoteObjectId(o.prefix?))),
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
@@ -356,7 +358,7 @@ impl RemoteStorage for S3Bucket {
|
||||
&self,
|
||||
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
|
||||
from_size_bytes: usize,
|
||||
to: &RemoteObjectId,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
@@ -366,17 +368,18 @@ impl RemoteStorage for S3Bucket {
|
||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||
|
||||
metrics::inc_put_object();
|
||||
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
let bytes_stream = ByteStream::new(SdkBody::from(body));
|
||||
|
||||
self.client
|
||||
.put_object(PutObjectRequest {
|
||||
body: Some(StreamingBody::new_with_size(
|
||||
ReaderStream::new(from),
|
||||
from_size_bytes,
|
||||
)),
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: to.0.to_owned(),
|
||||
metadata: metadata.map(|m| m.0),
|
||||
..PutObjectRequest::default()
|
||||
})
|
||||
.put_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(to))
|
||||
.set_metadata(metadata.map(|m| m.0))
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_put_object_fail();
|
||||
@@ -385,10 +388,10 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.0.to_owned(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
@@ -396,7 +399,7 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemoteObjectId,
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
@@ -404,20 +407,19 @@ impl RemoteStorage for S3Bucket {
|
||||
// and needs both ends to be exclusive
|
||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
let range = Some(match end_inclusive {
|
||||
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
||||
None => format!("bytes={}-", start_inclusive),
|
||||
Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
|
||||
None => format!("bytes={start_inclusive}-"),
|
||||
});
|
||||
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.0.to_owned(),
|
||||
key: self.relative_path_to_s3_object(from),
|
||||
range,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
}
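The byte-range download above translates the half-open [start_inclusive, end_exclusive) interval into the inclusive Range header that S3 expects; the mapping in isolation, as a small self-contained sketch:

fn range_header(start_inclusive: u64, end_exclusive: Option<u64>) -> String {
    match end_exclusive.map(|end| end.saturating_sub(1)) {
        Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
        None => format!("bytes={start_inclusive}-"),
    }
}
// range_header(0, Some(1024)) == "bytes=0-1023"; range_header(4096, None) == "bytes=4096-"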
|
||||
|
||||
async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> {
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
@@ -427,11 +429,10 @@ impl RemoteStorage for S3Bucket {
|
||||
metrics::inc_delete_object();
|
||||
|
||||
self.client
|
||||
.delete_object(DeleteObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: remote_object_id.0.to_owned(),
|
||||
..DeleteObjectRequest::default()
|
||||
})
|
||||
.delete_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(path))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_delete_object_fail();
|
||||
@@ -440,181 +441,3 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_download_destination() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let local_path = workdir.join("one").join("two").join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&workdir)?;
|
||||
|
||||
let key = RemoteObjectId(format!(
|
||||
"{}{}",
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
relative_path
|
||||
.iter()
|
||||
.map(|segment| segment.to_str().unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
local_path,
|
||||
download_destination(&key, &workdir, None),
|
||||
"Download destination should consist of s3 path joined with the workdir prefix"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let local_path = &workdir.join(segment_1).join(segment_2);
|
||||
|
||||
let storage = dummy_storage(workdir);
|
||||
|
||||
let expected_key = RemoteObjectId(format!(
|
||||
"{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}",
|
||||
storage.prefix_in_bucket.as_deref().unwrap_or_default(),
|
||||
));
|
||||
|
||||
let actual_key = storage
|
||||
.remote_object_id(local_path)
|
||||
.expect("Matching path should map to S3 path normally");
|
||||
assert_eq!(
|
||||
expected_key,
|
||||
actual_key,
|
||||
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
|
||||
match storage.remote_object_id(mismatching_path) {
|
||||
Ok(wrong_key) => panic!(
|
||||
"Expected path '{}' to error, but got S3 key: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_key,
|
||||
),
|
||||
Err(e) => e.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
|
||||
let error_message = storage_path_error(&storage, &workdir);
|
||||
assert!(
|
||||
error_message.contains("Prefix and the path are equal"),
|
||||
"Message '{}' does not contain the required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
let mismatching_path = PathBuf::from("somewhere").join("else");
|
||||
let error_message = storage_path_error(&storage, &mismatching_path);
|
||||
assert!(
|
||||
error_message.contains(mismatching_path.to_str().unwrap()),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains("is not prefixed with"),
|
||||
"Message '{}' does not contain a required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let storage = dummy_storage(workdir.clone());
|
||||
let timeline_dir = workdir.join("timelines").join("test_timeline");
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("not a metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(
|
||||
&relative_timeline_path.join("metadata"),
|
||||
storage.prefix_in_bucket.as_deref(),
|
||||
);
|
||||
assert_eq!(
|
||||
download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
let original_path = workdir
|
||||
.join("timelines")
|
||||
.join("some_timeline")
|
||||
.join("some name");
|
||||
|
||||
let dummy_storage = dummy_storage(workdir);
|
||||
|
||||
let key = dummy_storage.remote_object_id(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dummy_storage(workdir: PathBuf) -> S3Bucket {
|
||||
S3Bucket {
|
||||
workdir,
|
||||
client: S3Client::new("us-east-1".parse().unwrap()),
|
||||
bucket_name: "dummy-bucket".to_string(),
|
||||
prefix_in_bucket: Some("dummy_prefix/".to_string()),
|
||||
concurrency_limiter: Semaphore::new(1),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId {
|
||||
RemoteObjectId(relative_file_path.iter().fold(
|
||||
prefix.unwrap_or_default().to_string(),
|
||||
|mut path_string, segment| {
|
||||
path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
sentry = "0.29.0"
|
||||
async-trait = "0.1"
|
||||
anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
|
||||
@@ -34,6 +34,7 @@ pub mod sock_split;
|
||||
pub mod logging;
|
||||
|
||||
pub mod lock_file;
|
||||
pub mod pid_file;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
@@ -46,6 +47,7 @@ pub mod tcp_listener;
|
||||
pub mod nonblock;
|
||||
|
||||
// Default signal handling
|
||||
pub mod sentry_init;
|
||||
pub mod signals;
|
||||
|
||||
pub mod fs_ext;
|
||||
|
||||
@@ -1,81 +1,133 @@
|
||||
//! A module to create and read lock files. A lock file ensures that only one
|
||||
//! process is running at a time, in a particular directory.
|
||||
//! A module to create and read lock files.
|
||||
//!
|
||||
//! File locking is done using [`fcntl::flock`], which means that holding the
|
||||
//! lock on the file only prevents acquiring another lock on it; all other
//! operations are still possible on files. Other processes can still open, read,
|
||||
//! write, or remove the file, for example.
|
||||
//! If the file is removed while a process is holding a lock on it,
|
||||
//! the process that holds the lock does not get any error or notification.
|
||||
//! Furthermore, you can create a new file with the same name and lock the new file,
|
||||
//! while the old process is still running.
|
||||
//! Deleting the lock file while the locking process is still running is a bad idea!
|
||||
//! File locking is done using [`fcntl::flock`] exclusive locks.
|
||||
//! The only consumer of this module is currently [`pid_file`].
|
||||
//! See the module-level comment there for potential pitfalls
|
||||
//! with lock files that are used to store PIDs (pidfiles).
|
||||
|
||||
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
|
||||
use std::{
|
||||
fs,
|
||||
io::{Read, Write},
|
||||
ops::Deref,
|
||||
os::unix::prelude::AsRawFd,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::fcntl;
|
||||
use nix::{errno::Errno::EAGAIN, fcntl};
|
||||
|
||||
use crate::crashsafe;
|
||||
|
||||
pub enum LockCreationResult {
|
||||
Created {
|
||||
new_lock_contents: String,
|
||||
file: fs::File,
|
||||
},
|
||||
AlreadyLocked {
|
||||
existing_lock_contents: String,
|
||||
},
|
||||
CreationFailed(anyhow::Error),
|
||||
/// A handle to an open and unlocked, but not-yet-written lock file.
|
||||
/// Returned by [`create_exclusive`].
|
||||
#[must_use]
|
||||
pub struct UnwrittenLockFile {
|
||||
path: PathBuf,
|
||||
file: fs::File,
|
||||
}
|
||||
|
||||
/// Creates a lock file in the path given and writes the given contents into the file.
|
||||
/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
|
||||
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
|
||||
let lock_file = match fs::OpenOptions::new()
|
||||
/// Returned by [`UnwrittenLockFile::write_content`].
|
||||
#[must_use]
|
||||
pub struct LockFileGuard(fs::File);
|
||||
|
||||
impl Deref for LockFileGuard {
|
||||
type Target = fs::File;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl UnwrittenLockFile {
|
||||
/// Replace the content of this lock file with the byte representation of `contents`.
|
||||
pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
|
||||
self.file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")?;
|
||||
self.file
|
||||
.write_all(contents.as_bytes())
|
||||
.with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
|
||||
crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
|
||||
Ok(LockFileGuard(self.file))
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns
|
||||
/// a handle that allows overwriting the locked file's content.
|
||||
///
|
||||
/// The exclusive lock is released when dropping the returned handle.
|
||||
///
|
||||
/// It is not an error if the file already exists.
|
||||
/// It is an error if the file is already locked.
|
||||
pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
|
||||
let lock_file = fs::OpenOptions::new()
|
||||
.create(true) // O_CREAT
|
||||
.write(true)
|
||||
.open(lock_file_path)
|
||||
.context("Failed to open lock file")
|
||||
{
|
||||
Ok(file) => file,
|
||||
Err(e) => return LockCreationResult::CreationFailed(e),
|
||||
};
|
||||
.context("open lock file")?;
|
||||
|
||||
match fcntl::flock(
|
||||
let res = fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
) {
|
||||
Ok(()) => {
|
||||
match lock_file
|
||||
.set_len(0)
|
||||
.context("Failed to truncate lockfile")
|
||||
.and_then(|()| {
|
||||
fs::write(lock_file_path, &contents).with_context(|| {
|
||||
format!("Failed to write '{contents}' contents into lockfile")
|
||||
})
|
||||
})
|
||||
.and_then(|()| {
|
||||
crashsafe::fsync_file_and_parent(lock_file_path)
|
||||
.context("Failed to fsync lockfile")
|
||||
}) {
|
||||
Ok(()) => LockCreationResult::Created {
|
||||
new_lock_contents: contents,
|
||||
file: lock_file,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(nix::errno::Errno::EAGAIN) => {
|
||||
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
|
||||
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
},
|
||||
Err(e) => LockCreationResult::CreationFailed(e),
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
|
||||
}
|
||||
);
|
||||
match res {
|
||||
Ok(()) => Ok(UnwrittenLockFile {
|
||||
path: lock_file_path.to_owned(),
|
||||
file: lock_file,
|
||||
}),
|
||||
Err(EAGAIN) => anyhow::bail!("file is already locked"),
|
||||
Err(e) => Err(e).context("flock error"),
|
||||
}
|
||||
}
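Putting the two steps above together, a typical caller first grabs the flock and then overwrites the file contents while holding it; a minimal usage sketch (the path and contents are illustrative):

let unwritten = lock_file::create_exclusive(Path::new("/tmp/myservice.lock"))?;
let guard = unwritten.write_content("lock holder info".to_string())?;
// Dropping `guard` releases the flock; leak it (e.g. via std::mem::forget) to hold
// the lock for the rest of the process lifetime.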
|
||||
|
||||
/// Returned by [`read_and_hold_lock_file`].
|
||||
/// Check out the [`pid_file`] module for what the variants mean
|
||||
/// and potential caveats if the lock files are used to store PIDs.
|
||||
pub enum LockFileRead {
|
||||
/// No file exists at the given path.
|
||||
NotExist,
|
||||
/// No other process held the lock file, so we grabbed an flock
|
||||
/// on it and read its contents.
|
||||
/// Release the flock by dropping the [`LockFileGuard`].
|
||||
NotHeldByAnyProcess(LockFileGuard, String),
|
||||
/// The file exists but another process was holding an flock on it.
|
||||
LockedByOtherProcess {
|
||||
not_locked_file: fs::File,
|
||||
content: String,
|
||||
},
|
||||
}
|
||||
|
||||
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
|
||||
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
|
||||
/// Check the [`LockFileRead`] variants for details.
|
||||
pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
|
||||
let res = fs::OpenOptions::new().read(true).open(path);
|
||||
let mut lock_file = match res {
|
||||
Ok(f) => f,
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
|
||||
_ => return Err(e).context("open lock file"),
|
||||
},
|
||||
};
|
||||
let res = fcntl::flock(
|
||||
lock_file.as_raw_fd(),
|
||||
fcntl::FlockArg::LockExclusiveNonblock,
|
||||
);
|
||||
// We need the content regardless of lock success / failure.
|
||||
// But, read it after flock so that, if it succeeded, the content is consistent.
|
||||
let mut content = String::new();
|
||||
lock_file
|
||||
.read_to_string(&mut content)
|
||||
.context("read lock file")?;
|
||||
match res {
|
||||
Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
|
||||
LockFileGuard(lock_file),
|
||||
content,
|
||||
)),
|
||||
Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
|
||||
not_locked_file: lock_file,
|
||||
content,
|
||||
}),
|
||||
Err(e) => Err(e).context("flock error"),
|
||||
}
|
||||
}
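A sketch of consuming the result, matching on the three LockFileRead variants returned above (the path is illustrative):

match lock_file::read_and_hold_lock_file(Path::new("/tmp/myservice.lock"))? {
    LockFileRead::NotExist => println!("no lock file"),
    LockFileRead::NotHeldByAnyProcess(_guard, content) => {
        // We now hold the flock ourselves until `_guard` is dropped.
        println!("lock file exists but is unlocked, previous content: {content}");
    }
    LockFileRead::LockedByOtherProcess { content, .. } => {
        println!("lock file held by another process, content: {content}");
    }
}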
165
libs/utils/src/pid_file.rs
Normal file
@@ -0,0 +1,165 @@
|
||||
//! Abstraction to create & read pidfiles.
|
||||
//!
|
||||
//! A pidfile is a file in the filesystem that stores a process's PID.
|
||||
//! Its purpose is to implement a singleton behavior where only
|
||||
//! one process of some "kind" is supposed to be running at a given time.
|
||||
//! The "kind" is identified by the pidfile.
|
||||
//!
|
||||
//! During process startup, the process that is supposed to be a singleton
|
||||
//! must [claim][`claim_for_current_process`] the pidfile first.
|
||||
//! If that is unsuccessful, the process must not act as the singleton, i.e.,
|
||||
//! it must not access any of the resources that only the singleton may access.
|
||||
//!
|
||||
//! A common need is to signal a running singleton process, e.g., to make
|
||||
//! it shut down and exit.
|
||||
//! For that, we have to [`read`] the pidfile. The result of the `read` operation
|
||||
//! tells us if there is any singleton process, and if so, what PID it has.
|
||||
//! We can then proceed to signal it, although some caveats still apply.
|
||||
//! Read the function-level documentation of [`read`] for that.
|
||||
//!
|
||||
//! ## Never Remove Pidfiles
|
||||
//!
|
||||
//! It would be natural to assume that the process who claimed the pidfile
|
||||
//! should remove it upon exit to avoid leaving a stale pidfile in place.
|
||||
//! However, we already have a reliable way to detect staleness of the pidfile,
|
||||
//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
|
||||
//!
|
||||
//! And further, removing pidfiles would introduce a **catastrophic race condition**
|
||||
//! where two processes are running that are supposed to be singletons.
|
||||
//! Suppose we were to remove our pidfile during process shutdown.
|
||||
//! Here is how the race plays out:
|
||||
//! - Suppose we have a service called `myservice` with pidfile `myservice.pid`.
|
||||
//! - Process `A` starts to shut down.
|
||||
//! - Process `B` is just starting up
|
||||
//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` the file
|
||||
//! - It blocks on `flock`
|
||||
//! - Process `A` removes the pidfile as the last step of its shutdown procedure
|
||||
//! - `unlink("myservice.pid")`
|
||||
//! - Process `A` exits
|
||||
//! - This releases its `flock` and unblocks `B`
|
||||
//! - Process `B` still has the file descriptor for `myservice.pid` open
|
||||
//! - Process `B` writes its PID into `myservice.pid`.
|
||||
//! - But the `myservice.pid` file has been unlinked, so there is no `myservice.pid`
//! in the directory.
|
||||
//! - Process `C` starts
|
||||
//! - It `open("myservice.pid", O_WRONLY|O_CREAT)` which creates a new file (new inode)
|
||||
//! - It `flock`s the file, which, since it's a different file, does not block
|
||||
//! - It writes its PID into the file
|
||||
//!
|
||||
//! At this point, `B` and `C` are running, which is hazardous.
|
||||
//! Morale of the story: don't unlink pidfiles, ever.
|
||||
|
||||
use std::{ops::Deref, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::unistd::Pid;
|
||||
|
||||
use crate::lock_file::{self, LockFileRead};
|
||||
|
||||
/// Keeps a claim on a pidfile alive until it is dropped.
|
||||
/// Returned by [`claim_for_current_process`].
|
||||
#[must_use]
|
||||
pub struct PidFileGuard(lock_file::LockFileGuard);
|
||||
|
||||
impl Deref for PidFileGuard {
|
||||
type Target = lock_file::LockFileGuard;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to claim `path` as a pidfile for the current process.
|
||||
///
|
||||
/// If another process has already claimed the pidfile, and it is still running,
|
||||
/// this function returns an error.
|
||||
/// Otherwise, the function `flock`s the file and updates its contents to the
|
||||
/// current process's PID.
|
||||
/// If the update fails, the flock is released and an error returned.
|
||||
/// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
|
||||
///
|
||||
/// ### Maintaining A Claim
|
||||
///
|
||||
/// It is the caller's responsibility to maintain the claim.
|
||||
/// The claim ends as soon as the returned guard object is dropped.
|
||||
/// To maintain the claim for the remaining lifetime of the current process,
|
||||
/// use [`std::mem::forget`] or similar.
|
||||
pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
|
||||
let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
|
||||
// if any of the next steps fail, we drop the file descriptor and thereby release the lock
|
||||
let guard = unwritten_lock_file
|
||||
.write_content(Pid::this().to_string())
|
||||
.context("write pid to lock file")?;
|
||||
Ok(PidFileGuard(guard))
|
||||
}
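A sketch of the startup pattern described in the module comment above: claim the pidfile before acting as the singleton, and keep the claim alive for the whole process lifetime (the path is illustrative):

let guard = pid_file::claim_for_current_process(Path::new("/var/run/myservice.pid"))?;
// The claim ends when `guard` is dropped, so leak it to keep the flock until exit.
std::mem::forget(guard);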
|
||||
|
||||
/// Returned by [`read`].
pub enum PidFileRead {
    /// No file exists at the given path.
    NotExist,
    /// The given pidfile is currently not claimed by any process.
    /// To determine this, the [`read`] operation acquired
    /// an exclusive flock on the file. The lock is still held and responsibility
    /// to release it is returned through the guard object.
    /// Until it is released, other [`claim_for_current_process`] or [`read`] calls
    /// will fail.
    ///
    /// ### Caveats
    ///
    /// Do not unlink the pidfile from the filesystem. See module comment for why.
    NotHeldByAnyProcess(PidFileGuard),
    /// The given pidfile is still claimed by another process whose PID is given
    /// as part of this variant.
    ///
    /// ### Caveats
    ///
    /// 1. The other process might exit at any time, turning the given PID stale.
    /// 2. There is a small window in which `claim_for_current_process` has already
    ///    locked the file but not yet updated its contents. [`read`] will return
    ///    this variant here, but with the old file contents, i.e., a stale PID.
    ///
    /// The kernel is free to recycle a PID once it has been `wait(2)`ed upon by
    /// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
    /// system call on it, bears the risk of killing an unrelated process.
    /// This is an inherent limitation of using pidfiles.
    /// The only race-free solution is to have a supervisor process with a lifetime
    /// that exceeds that of all of its child processes (e.g., `runit`, `supervisord`).
    LockedByOtherProcess(Pid),
}

/// Try to read the file at the given path as a pidfile that was previously created
/// through [`claim_for_current_process`].
///
/// On success, this function returns a [`PidFileRead`].
/// Check its docs for a description of the meaning of its different variants.
pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
    let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
    let ret = match res {
        LockFileRead::NotExist => PidFileRead::NotExist,
        LockFileRead::NotHeldByAnyProcess(guard, _) => {
            PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
        }
        LockFileRead::LockedByOtherProcess {
            not_locked_file: _not_locked_file,
            content,
        } => {
            // XXX the read races with the write in claim_pid_file_for_pid().
            // But pids are smaller than a page, so the kernel page cache will lock for us.
            // The only problem is that we might get the old contents here.
            // Can only fix that by implementing some scheme that downgrades the
            // exclusive lock to shared lock in claim_pid_file_for_pid().
            PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
        }
    };
    Ok(ret)
}

fn parse_pidfile_content(content: &str) -> anyhow::Result<Pid> {
    let pid: i32 = content
        .parse()
        .map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
    if pid < 1 {
        anyhow::bail!("bad value in pidfile '{pid}'");
    }
    Ok(Pid::from_raw(pid))
}
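To make the intended call pattern concrete, here is a minimal usage sketch of the module above (the pidfile path, the `claim`/`status` helpers, and the error handling are hypothetical; it assumes the `utils` crate from this repo plus `anyhow` and `nix` as dependencies). The real pageserver call site appears further down in this diff.

```rust
use std::path::Path;

use utils::pid_file::{self, PidFileRead};

/// Called once at daemon startup: claim the pidfile and then keep the flock
/// for the whole lifetime of the process.
fn claim(pidfile: &Path) -> anyhow::Result<()> {
    let guard = pid_file::claim_for_current_process(pidfile)?;
    std::mem::forget(guard); // never drop the guard, and never unlink the pidfile
    Ok(())
}

/// Run from a *different* process (e.g. a "status" subcommand): inspect the pidfile.
fn status(pidfile: &Path) -> anyhow::Result<()> {
    match pid_file::read(pidfile)? {
        PidFileRead::NotExist => println!("not running"),
        PidFileRead::NotHeldByAnyProcess(_guard) => println!("pidfile exists but is unclaimed"),
        PidFileRead::LockedByOtherProcess(pid) => println!("running as PID {pid}"),
    }
    Ok(())
}

fn main() -> anyhow::Result<()> {
    claim(Path::new("/tmp/myservice.pid"))
}
```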
libs/utils/src/sentry_init.rs (new file, 27 lines)
@@ -0,0 +1,27 @@
use sentry::ClientInitGuard;
use std::borrow::Cow;
use std::env;

pub use sentry::release_name;

#[must_use]
pub fn init_sentry(
    release_name: Option<Cow<'static, str>>,
    extra_options: &[(&str, &str)],
) -> Option<ClientInitGuard> {
    let dsn = env::var("SENTRY_DSN").ok()?;

    let guard = sentry::init((
        dsn,
        sentry::ClientOptions {
            release: release_name,
            ..Default::default()
        },
    ));
    sentry::configure_scope(|scope| {
        for &(key, value) in extra_options {
            scope.set_extra(key, value.into());
        }
    });
    Some(guard)
}
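For reference, a minimal sketch of how this helper is meant to be called from a binary, mirroring the `init_sentry(release_name!(), ...)` call that appears in the pageserver `main.rs` hunk below (the `node_id` value here is just an illustrative extra option):

```rust
use utils::sentry_init::{init_sentry, release_name};

fn main() {
    // Does nothing and returns None unless SENTRY_DSN is set in the environment.
    // Keep the guard alive for the whole process so queued events get flushed on exit.
    let _sentry_guard = init_sentry(release_name!(), &[("node_id", "42")]);

    // ... rest of the program ...
}
```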
@@ -18,7 +18,7 @@ async-stream = "0.3"
async-trait = "0.1"
byteorder = "1.4.3"
bytes = "1.0.1"
chrono = "0.4.19"
chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
clap = { version = "4.0", features = ["string"] }
close_fds = "0.3.2"
const_format = "0.2.21"

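A note on the chrono change above: with `default-features = false`, the `clock` feature has to be named explicitly because `Utc::now()` lives behind it. A tiny sketch of the call this keeps working (assuming only `chrono` with the `clock` feature as a dependency):

```rust
use chrono::Utc;

fn main() {
    // Utc::now() requires chrono's "clock" feature when default features are disabled.
    println!("{}", Utc::now().to_rfc3339());
}
```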
pageserver/benches/README.md (new file, 12 lines)
@@ -0,0 +1,12 @@
# Pageserver Benchmarks

## How to run

To run all benchmarks:
`cargo bench`

To run a specific file:
`cargo bench --bench bench_layer_map`

To run a specific function:
`cargo bench --bench bench_layer_map -- real_map_uniform_queries`
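For orientation, a criterion benchmark target driven by `cargo bench` generally looks like the sketch below. This is a generic skeleton, not the actual contents of `bench_layer_map.rs`; the `lookup` function and its data are made up for illustration.

```rust
use criterion::{criterion_group, criterion_main, Criterion};

// Stand-in for the code under test; the real benchmarks exercise the layer map.
fn lookup(keys: &[u64], needle: u64) -> bool {
    keys.binary_search(&needle).is_ok()
}

fn real_map_uniform_queries(c: &mut Criterion) {
    let keys: Vec<u64> = (0..10_000).collect();
    c.bench_function("real_map_uniform_queries", |b| b.iter(|| lookup(&keys, 4_242)));
}

criterion_group!(benches, real_map_uniform_queries);
criterion_main!(benches);
```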
File diff suppressed because it is too large.
@@ -431,7 +431,7 @@ fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
struct Request {
    key: Key,
    lsn: Lsn,
    base_img: Option<Bytes>,
    base_img: Option<(Lsn, Bytes)>,
    records: Vec<(Lsn, NeonWalRecord)>,
    pg_version: u32,
}

pageserver/benches/large-layer-map-layernames.txt (new file, 5651 lines; diff suppressed because it is too large)
pageserver/benches/odd-brook-layernames.txt (new file, 26690 lines; diff suppressed because it is too large)
@@ -11,8 +11,8 @@
//!
//! Example use:
//! ```
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
@@ -25,6 +25,8 @@ use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::io::{self, BufRead};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::{
|
||||
collections::{BTreeMap, BTreeSet},
|
||||
ops::Range,
|
||||
@@ -65,7 +67,11 @@ fn main() -> Result<()> {
|
||||
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
|
||||
let stdin = io::stdin();
|
||||
for line in stdin.lock().lines() {
|
||||
let range = parse_filename(&line.unwrap());
|
||||
let line = line.unwrap();
|
||||
let line = PathBuf::from_str(&line).unwrap();
|
||||
let filename = line.file_name().unwrap();
|
||||
let filename = filename.to_str().unwrap();
|
||||
let range = parse_filename(filename);
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
use anyhow::{anyhow, Context};
|
||||
use clap::{Arg, ArgAction, Command};
|
||||
use fail::FailScenario;
|
||||
use nix::unistd::Pid;
|
||||
use tracing::*;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
@@ -23,9 +22,10 @@ use pageserver::{
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
lock_file, logging,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
sentry_init::{init_sentry, release_name},
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
};
|
||||
@@ -85,6 +85,9 @@ fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
|
||||
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
|
||||
@@ -216,28 +219,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
let lock_file =
|
||||
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -292,15 +280,23 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| {
|
||||
GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config)
|
||||
})
|
||||
.map(GenericRemoteStorage::from_config)
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
{
|
||||
let _rt_guard = BACKGROUND_RUNTIME.enter();
|
||||
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
|
||||
};
|
||||
|
||||
let (init_result_sender, init_result_receiver) =
|
||||
std::sync::mpsc::channel::<anyhow::Result<()>>();
|
||||
let storage_for_spawn = remote_storage.clone();
|
||||
let _handler = BACKGROUND_RUNTIME.spawn(async move {
|
||||
let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
|
||||
init_result_sender.send(result)
|
||||
});
|
||||
match init_result_receiver.recv() {
|
||||
Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
|
||||
Err(_sender_dropped_err) => {
|
||||
anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
|
||||
}
|
||||
}
|
||||
|
||||
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use remote_storage::{RemotePath, RemoteStorageConfig};
|
||||
use std::env;
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -27,7 +27,9 @@ use utils::{
|
||||
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
@@ -331,10 +333,6 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?;
|
||||
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
@@ -370,7 +368,9 @@ impl PageServerConfigBuilder {
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints,
|
||||
broker_endpoints: self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?,
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
@@ -402,6 +402,10 @@ impl PageServerConf {
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
@@ -450,6 +454,28 @@ impl PageServerConf {
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Files on the remote storage are stored with paths relative to the workdir.
/// That path includes both the tenant and timeline ids, which makes the remote storage path unique.
///
/// Errors if the provided path does not start with pageserver's workdir.
|
||||
pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
|
||||
local_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
local_path, self.workdir
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Turns storage remote path of a file into its local path.
|
||||
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
|
||||
remote_path.with_base(&self.workdir)
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -486,7 +512,7 @@ impl PageServerConf {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
let mut t_conf = TenantConfOpt::default();
|
||||
|
||||
for (key, item) in toml.iter() {
|
||||
match key {
|
||||
@@ -617,6 +643,12 @@ impl PageServerConf {
|
||||
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
|
||||
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
|
||||
}
|
||||
if let Some(trace_read_requests) = item.get("trace_read_requests") {
|
||||
t_conf.trace_read_requests =
|
||||
Some(trace_read_requests.as_bool().with_context(|| {
|
||||
"configure option trace_read_requests is not a bool".to_string()
|
||||
})?);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
@@ -1016,6 +1048,35 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_tenant_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
let trace_read_requests = true;
|
||||
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
|
||||
[tenant_config]
|
||||
trace_read_requests = {trace_read_requests}"#,
|
||||
pg_distrib_dir.display(),
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
|
||||
assert_eq!(
|
||||
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
|
||||
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
|
||||
let tempdir_path = tempdir.path();
|
||||
|
||||
|
||||
@@ -274,6 +274,7 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
post:
|
||||
description: Schedules attach operation to happen in the background for given tenant
|
||||
responses:
|
||||
@@ -325,7 +326,9 @@ paths:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Detach local tenant
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
|
||||
Files on the remote storage are not affected.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant detached
|
||||
@@ -354,6 +357,92 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory.
|
||||
Files on local disk and remote storage are not affected.
|
||||
|
||||
Future pageserver restarts won't load the data back until `load` is called on such tenant.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: |
|
||||
Schedules an operation that attempts to load a tenant from the local disk and
|
||||
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
|
||||
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully; check the tenant status to ensure the load succeeded.
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -659,7 +748,6 @@ components:
|
||||
- tenant_id
|
||||
- last_record_lsn
|
||||
- disk_consistent_lsn
|
||||
- awaits_download
|
||||
- state
|
||||
- latest_gc_cutoff_lsn
|
||||
properties:
|
||||
@@ -702,8 +790,6 @@ components:
|
||||
format: hex
|
||||
last_received_msg_ts:
|
||||
type: integer
|
||||
awaits_download:
|
||||
type: boolean
|
||||
state:
|
||||
type: string
|
||||
latest_gc_cutoff_lsn:
|
||||
|
||||
@@ -3,9 +3,7 @@ use std::sync::Arc;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::task::JoinError;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{
|
||||
@@ -82,12 +80,11 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
fn build_timeline_info(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
let mut info = build_timeline_info_common(tenant_state, timeline)?;
|
||||
let mut info = build_timeline_info_common(timeline)?;
|
||||
if include_non_incremental_logical_size {
|
||||
info.current_logical_size_non_incremental =
|
||||
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
|
||||
@@ -99,10 +96,7 @@ fn build_timeline_info(
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(
|
||||
tenant_state: TenantState,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||
let guard = timeline.last_received_wal.lock().unwrap();
|
||||
@@ -154,10 +148,6 @@ fn build_timeline_info_common(
|
||||
|
||||
state,
|
||||
|
||||
// XXX bring back tracking of downloads per timeline, or, introduce
|
||||
// an 'Attaching' state for the timeline and get rid of this field.
|
||||
awaits_download: tenant_state == TenantState::Attaching,
|
||||
|
||||
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatility
|
||||
// with the control plane.
|
||||
local: LocalTimelineInfo {
|
||||
@@ -189,7 +179,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.new_timeline_id
|
||||
.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -200,7 +192,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline)
|
||||
let timeline_info = build_timeline_info_common(&new_timeline)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
@@ -217,26 +209,29 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let _entered = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
let response_data = async {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let (tenant_state, timelines) = {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
(tenant.current_state(), tenant.list_timelines())
|
||||
};
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for timeline in timelines {
|
||||
let timeline_info = build_timeline_info(
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
)
|
||||
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
response_data.push(timeline_info);
|
||||
}
|
||||
|
||||
response_data.push(timeline_info);
|
||||
Ok(response_data)
|
||||
}
|
||||
.instrument(info_span!("timeline_list", tenant = %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -281,20 +276,15 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_info = async {
|
||||
let (tenant_state, timeline) = tokio::task::spawn_blocking(move || {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok((
|
||||
tenant.current_state(),
|
||||
tenant.get_timeline(timeline_id, false),
|
||||
))
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline = timeline.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
tenant_state,
|
||||
&timeline,
|
||||
include_non_incremental_logical_size,
|
||||
include_non_incremental_physical_size,
|
||||
@@ -322,6 +312,7 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||
|
||||
let timeline = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = match timeline
|
||||
@@ -347,13 +338,13 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
if let Some(remote_storage) = &state.remote_storage {
|
||||
// FIXME: distinguish between "Tenant already exists" and other errors
|
||||
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage)
|
||||
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
|
||||
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
} else {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"attach_tenant is possible because pageserver was configured without remote storage"
|
||||
"attach_tenant is not possible because pageserver was configured without remote storage"
|
||||
)));
|
||||
}
|
||||
|
||||
@@ -392,23 +383,49 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
|
||||
.instrument(info_span!("load", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
tenant_mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
|
||||
.await
|
||||
// FIXME: Errors from `ignore_tenant` can be caused by both user and internal errors.
|
||||
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
tenant_mgr::list_tenants()
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
let response_data = tenant_mgr::list_tenants()
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
id: *id,
|
||||
state: *state,
|
||||
current_physical_size: None,
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
})
|
||||
.collect::<Vec<TenantInfo>>();
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -417,9 +434,8 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant_info = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
let tenant_info = async {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -428,17 +444,15 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
let tenant_info = TenantInfo {
|
||||
Ok(TenantInfo {
|
||||
id: tenant_id,
|
||||
state,
|
||||
current_physical_size: Some(current_physical_size),
|
||||
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
|
||||
};
|
||||
|
||||
Ok::<_, anyhow::Error>(tenant_info)
|
||||
})
|
||||
})
|
||||
}
|
||||
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, tenant_info)
|
||||
@@ -448,7 +462,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// this can be long operation, it currently is not backed by any request coalescing or similar
|
||||
let inputs = tenant
|
||||
@@ -565,22 +581,19 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
.map(TenantId::from)
|
||||
.unwrap_or_else(TenantId::generate);
|
||||
|
||||
let new_tenant = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let state = get_state(&request);
|
||||
let state = get_state(&request);
|
||||
|
||||
tenant_mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
let new_tenant = tenant_mgr::create_tenant(
|
||||
state.conf,
|
||||
tenant_conf,
|
||||
target_tenant_id,
|
||||
state.remote_storage.clone(),
|
||||
)
|
||||
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
Ok(match new_tenant {
|
||||
Some(tenant) => {
|
||||
@@ -671,17 +684,13 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
|
||||
.await
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -728,7 +737,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
|
||||
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
|
||||
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
|
||||
let gc_result = wait_task_done
|
||||
.await
|
||||
.context("wait for gc task")
|
||||
@@ -745,7 +754,9 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
@@ -764,7 +775,9 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
@@ -838,6 +851,8 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
|
||||
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
|
||||
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
|
||||
@@ -10,7 +10,8 @@ pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod repository;
|
||||
pub mod storage_sync;
|
||||
pub mod storage_sync2;
|
||||
pub use storage_sync2 as storage_sync;
|
||||
pub mod task_mgr;
|
||||
pub mod tenant;
|
||||
pub mod tenant_config;
|
||||
@@ -124,6 +125,13 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, which expects the ignored tenant to be properly loaded
|
||||
/// into pageserver's memory before being ignored.
|
||||
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
|
||||
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
|
||||
|
||||
pub fn is_temporary(path: &Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
|
||||
|
||||
@@ -315,6 +315,7 @@ impl PageServerHandler {
|
||||
|
||||
let copy_data_bytes = match msg? {
|
||||
Some(FeMessage::CopyData(bytes)) => bytes,
|
||||
Some(FeMessage::Terminate) => break,
|
||||
Some(m) => {
|
||||
bail!("unexpected message: {m:?} during COPY");
|
||||
}
|
||||
@@ -940,7 +941,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
|
||||
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
|
||||
Ok(wait_result) => wait_result
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
|
||||
@@ -1,38 +0,0 @@
|
||||
//! Helper functions to delete files from remote storage with a RemoteStorage
|
||||
use anyhow::Context;
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
pub(super) async fn delete_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
local_layer_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
fail::fail_point!("before-delete-layer", |_| {
|
||||
anyhow::bail!("failpoint before-delete-layer")
|
||||
});
|
||||
debug!(
|
||||
"Deleting layer from remote storage: {:?}",
|
||||
local_layer_path.display()
|
||||
);
|
||||
|
||||
let storage_path = storage
|
||||
.remote_object_id(local_layer_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// XXX: If the deletion fails because the object already didn't exist,
|
||||
// it would be good to just issue a warning but consider it success.
|
||||
// https://github.com/neondatabase/neon/issues/2934
|
||||
storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -79,6 +79,13 @@
|
||||
//! - We rely on read-after write consistency in the remote storage.
|
||||
//! - Layer files are immutable
|
||||
//!
|
||||
//! NB: Pageserver assumes that it has exclusive write access to the tenant in remote
|
||||
//! storage. Different tenants can be attached to different pageservers, but if the
|
||||
//! same tenant is attached to two pageservers at the same time, they will overwrite
|
||||
//! each other's index file updates, and confusion will ensue. There's no interlock or
|
||||
//! mechanism to detect that in the pageserver; we rely on the control plane to ensure
|
||||
//! that that doesn't happen.
|
||||
//!
|
||||
//! ## Implementation Note
|
||||
//!
|
||||
//! The *actual* remote state lags behind the *desired* remote state while
|
||||
@@ -145,6 +152,10 @@
|
||||
//!
|
||||
//! # Downloads (= Tenant Attach)
|
||||
//!
|
||||
//! In addition to the upload queue, [`RemoteTimelineClient`] has functions for
|
||||
//! downloading files from the remote storage. Downloads are performed immediately,
|
||||
//! independently of the uploads.
|
||||
//!
|
||||
//! When we attach a tenant, we perform the following steps:
|
||||
//! - create `Tenant` object in `TenantState::Attaching` state
|
||||
//! - List timelines that are present in remote storage, and download their remote [`IndexPart`]s
|
||||
@@ -174,60 +185,6 @@
|
||||
//! in remote storage.
|
||||
//! But note that we don't test any of this right now.
|
||||
//!
|
||||
//!
|
||||
//! # RANDOM NOTES FROM THE PAST (TODO: DELETE / DEDUP WITH CONTENT ABOVE)
|
||||
//!
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported, multiple pageservers can be separated within the same storage
//! (e.g. by using different directories in the local filesystem external storage), but that is entirely up to the storage implementation and not covered by the trait API.
|
||||
//!
|
||||
//! * the sync tasks may not be processed immediately after submission: if they error and get re-enqueued, their execution might be backed off to ensure the error cap is not exceeded too fast.
|
||||
//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
|
||||
//!
|
||||
//! Uploads are queued and executed in the background and in parallel, enforcing the ordering rules.
|
||||
//! Downloads are performed immediately, and independently of the uploads.
|
||||
//!
|
||||
//! Deletion happens only after a successful upload; otherwise the compaction output might make the timeline inconsistent until both tasks are fully processed without errors.
|
||||
//! Upload and download update the remote data (in-memory index and S3 json index part file) only after every layer is successfully synchronized, while the deletion task
//! does the opposite: it requires the remote data to be updated first, so that the deleted blob files become invisible to the pageserver.
|
||||
//!
|
||||
//! FIXME: how is the initial list of remote files created now? Update this paragraph
|
||||
//! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via downloading and merging the index data for all timelines,
|
||||
//! present locally.
|
||||
//! It's enough to poll such timelines' remote state once on startup only, due to an agreement that only one pageserver at a time has exclusive
//! write access to the remote portion of the timelines that are attached to that pageserver.
|
||||
//! The index state is used to issue initial sync tasks, if needed:
|
||||
//! * all timelines with local state behind the remote get download tasks scheduled.
|
||||
//! Such timelines are considered "remote" before the download succeeds, so a number of operations (gc, checkpoints) on that timeline are unavailable
|
||||
//! before up-to-date layers and metadata file are downloaded locally.
|
||||
//! * all newer local state gets scheduled for upload, such timelines are "local" and fully operational
|
||||
//! * remote timelines not present locally are unknown to pageserver, but can be downloaded on a separate request
|
||||
//!
|
||||
//! Then, the index is shared across the pageserver under the [`RemoteIndex`] guard to ensure proper synchronization.
//! The remote index gets updated after every remote storage change (after an upload), just like the remote index part files.
|
||||
//!
|
||||
//! Remote timeline contains a set of layer files, created during checkpoint(s) and the serialized [`IndexPart`] file with timeline metadata and all remote layer paths inside.
|
||||
//! Those paths are used instead of the `S3 list` command to avoid its slowness and expensiveness for large numbers of files.
//! If the index part does not contain some file path but it's present remotely, such a file is invisible to the pageserver and ignored.
|
||||
//! Among other tasks, the index is used to prevent invalid uploads and non-existing downloads on demand, refer to [`index`] for more details.
|
||||
//!
|
||||
//! FIXME: update this paragraph
|
||||
//! Index construction is currently the only place where the storage sync can return an [`Err`] to the user.
|
||||
//! New sync tasks are accepted via [`schedule_layer_upload`], [`schedule_layer_download`] and [`schedule_layer_delete`] functions.
|
||||
//! After the initial state is loaded into memory and the loop starts, any further [`Err`] results do not stop the loop, but rather
|
||||
//! reschedule the same task, with possibly less files to sync:
|
||||
//! * download tasks currently never replace existing local file with metadata file as an exception
|
||||
//! (but this is a subject to change when checksum checks are implemented: all files could get overwritten on a checksum mismatch)
|
||||
//! * download tasks carry the information about skipped archives, so resubmissions do not download successfully processed layers again
|
||||
//! * downloads do not contain any actual files to download, so that "external", sync pageserver code is able to schedule the timeline download
|
||||
//! without accessing any extra information about its files.
|
||||
//!
|
||||
//! FIXME: update this paragraph
|
||||
//! Uploads and downloads sync layer files in arbitrary order, but the local metadata (for download) and the remote index part (for upload) are updated only after all layer files are synced,
|
||||
//! to avoid having a corrupt state without the relevant layer files.
|
||||
//! Refer to [`upload`] and [`download`] for more details.
|
||||
//!
|
||||
//! Synchronization never removes any local files from pageserver workdir or remote files from the remote storage, yet there could be overwrites of the same files (index part and metadata file updates, future checksum mismatch fixes).
|
||||
//! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
|
||||
|
||||
mod delete;
|
||||
mod download;
|
||||
@@ -245,9 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use anyhow::ensure;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{error, info, warn};
|
||||
use tracing::{info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
@@ -260,7 +217,7 @@ use crate::metrics::RemoteOpKind;
|
||||
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
storage_sync::index::{LayerFileMetadata, RelativePath},
|
||||
storage_sync::index::LayerFileMetadata,
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
@@ -330,7 +287,7 @@ struct UploadQueueInitialized {
|
||||
|
||||
/// All layer files stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations
|
||||
latest_files: HashMap<RelativePath, LayerFileMetadata>,
|
||||
latest_files: HashMap<RemotePath, LayerFileMetadata>,
|
||||
|
||||
/// Metadata stored in the remote storage, taking into account all
|
||||
/// in-progress and queued operations.
|
||||
@@ -380,18 +337,18 @@ impl UploadQueue {
|
||||
|
||||
let state = UploadQueueInitialized {
|
||||
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
|
||||
latest_files: Default::default(),
|
||||
latest_files: HashMap::new(),
|
||||
latest_metadata: metadata.clone(),
|
||||
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
|
||||
// safekeepers from garbage-collecting anything.
|
||||
last_uploaded_consistent_lsn: Lsn(0),
|
||||
// what follows are boring default initializations
|
||||
task_counter: Default::default(),
|
||||
task_counter: 0,
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: Default::default(),
|
||||
queued_operations: Default::default(),
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -400,6 +357,10 @@ impl UploadQueue {
|
||||
|
||||
fn initialize_with_current_remote_index_part(
|
||||
&mut self,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
index_part: &IndexPart,
|
||||
) -> anyhow::Result<&mut UploadQueueInitialized> {
|
||||
match self {
|
||||
@@ -409,14 +370,19 @@ impl UploadQueue {
|
||||
}
|
||||
}
|
||||
|
||||
let mut files = HashMap::new();
|
||||
for path in &index_part.timeline_layers {
|
||||
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
for timeline_name in &index_part.timeline_layers {
|
||||
let local_path = timeline_path.join(timeline_name);
|
||||
let remote_timeline_path = conf.remote_path(&local_path).expect(
|
||||
"Remote timeline path and local timeline path were constructed form the same conf",
|
||||
);
|
||||
let layer_metadata = index_part
|
||||
.layer_metadata
|
||||
.get(path)
|
||||
.get(timeline_name)
|
||||
.map(LayerFileMetadata::from)
|
||||
.unwrap_or(LayerFileMetadata::MISSING);
|
||||
files.insert(path.clone(), layer_metadata);
|
||||
files.insert(remote_timeline_path, layer_metadata);
|
||||
}
|
||||
|
||||
let index_part_metadata = index_part.parse_metadata()?;
|
||||
@@ -434,8 +400,8 @@ impl UploadQueue {
|
||||
num_inprogress_layer_uploads: 0,
|
||||
num_inprogress_metadata_uploads: 0,
|
||||
num_inprogress_deletions: 0,
|
||||
inprogress_tasks: Default::default(),
|
||||
queued_operations: Default::default(),
|
||||
inprogress_tasks: HashMap::new(),
|
||||
queued_operations: VecDeque::new(),
|
||||
};
|
||||
|
||||
*self = UploadQueue::Initialized(state);
|
||||
@@ -499,7 +465,12 @@ impl RemoteTimelineClient {
|
||||
/// The given `index_part` must be the one on the remote.
|
||||
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
|
||||
let mut upload_queue = self.upload_queue.lock().unwrap();
|
||||
upload_queue.initialize_with_current_remote_index_part(index_part)?;
|
||||
upload_queue.initialize_with_current_remote_index_part(
|
||||
self.conf,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
index_part,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -553,15 +524,13 @@ impl RemoteTimelineClient {
|
||||
/// On success, returns the size of the downloaded file.
|
||||
pub async fn download_layer_file(
|
||||
&self,
|
||||
path: &RelativePath,
|
||||
remote_path: &RemotePath,
|
||||
layer_metadata: &LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let downloaded_size = download::download_layer_file(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
path,
|
||||
remote_path,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
@@ -579,13 +548,13 @@ impl RemoteTimelineClient {
|
||||
let new_metadata = LayerFileMetadata::new(downloaded_size);
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(path) {
|
||||
if let Some(upgraded) = upload_queue.latest_files.get_mut(remote_path) {
|
||||
upgraded.merge(&new_metadata);
|
||||
} else {
|
||||
// The file should exist, since we just downloaded it.
|
||||
warn!(
|
||||
"downloaded file {:?} not found in local copy of the index file",
|
||||
path
|
||||
remote_path
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -655,14 +624,9 @@ impl RemoteTimelineClient {
|
||||
"file size not initialized in metadata"
|
||||
);
|
||||
|
||||
let relative_path = RelativePath::from_local_path(
|
||||
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
path,
|
||||
)?;
|
||||
|
||||
upload_queue
|
||||
.latest_files
|
||||
.insert(relative_path, layer_metadata.clone());
|
||||
.insert(self.conf.remote_path(path)?, layer_metadata.clone());
|
||||
|
||||
let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone());
|
||||
self.update_upload_queue_unfinished_metric(1, &op);
|
||||
@@ -684,13 +648,10 @@ impl RemoteTimelineClient {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
// Convert the paths into RelativePaths, and gather other information we need.
|
||||
let mut relative_paths = Vec::with_capacity(paths.len());
|
||||
// Convert the paths into RemotePaths, and gather other information we need.
|
||||
let mut remote_paths = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
relative_paths.push(RelativePath::from_local_path(
|
||||
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
path,
|
||||
)?);
|
||||
remote_paths.push(self.conf.remote_path(path)?);
|
||||
}
|
||||
|
||||
// Deleting layers doesn't affect the values stored in TimelineMetadata,
|
||||
@@ -706,8 +667,8 @@ impl RemoteTimelineClient {
|
||||
// from latest_files, but not yet scheduled for deletion. Use a closure
|
||||
// to syntactically forbid ? or bail! calls here.
|
||||
let no_bail_here = || {
|
||||
for relative_path in relative_paths {
|
||||
upload_queue.latest_files.remove(&relative_path);
|
||||
for remote_path in remote_paths {
|
||||
upload_queue.latest_files.remove(&remote_path);
|
||||
}
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
@@ -881,14 +842,19 @@ impl RemoteTimelineClient {
|
||||
|
||||
let upload_result: anyhow::Result<()> = match &task.op {
|
||||
UploadOp::UploadLayer(ref path, ref layer_metadata) => {
|
||||
upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
)
|
||||
.await
|
||||
upload::upload_timeline_layer(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
path,
|
||||
layer_metadata,
|
||||
)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
RemoteOpFileKind::Layer,
|
||||
RemoteOpKind::Upload,
|
||||
)
|
||||
.await
|
||||
}
|
||||
UploadOp::UploadMetadata(ref index_part, _lsn) => {
|
||||
upload::upload_index_part(
|
||||
@@ -907,7 +873,7 @@ impl RemoteTimelineClient {
|
||||
.await
|
||||
}
|
||||
UploadOp::Delete(metric_file_kind, ref path) => {
|
||||
delete::delete_layer(&self.storage_impl, path)
|
||||
delete::delete_layer(self.conf, &self.storage_impl, path)
|
||||
.measure_remote_op(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
@@ -931,10 +897,20 @@ impl RemoteTimelineClient {
|
||||
Err(e) => {
|
||||
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
error!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
// uploads may fail due to rate limits (IAM, S3) or spurious network and external errors
|
||||
// such issues are relatively regular, so don't use WARN or ERROR to avoid alerting
|
||||
// people and tests until the retries are definitely causing delays.
|
||||
if retries < 3 {
|
||||
info!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
} else {
|
||||
warn!(
|
||||
"failed to perform remote task {}, will retry (attempt {}): {:?}",
|
||||
task.op, retries, e
|
||||
);
|
||||
}
|
||||
|
||||
// sleep until it's time to retry, or we're cancelled
|
||||
tokio::select! {
|
||||
@@ -999,7 +975,8 @@ impl RemoteTimelineClient {
|
||||
UploadOp::UploadMetadata(_, _) => (RemoteOpFileKind::Index, RemoteOpKind::Upload),
|
||||
UploadOp::Delete(file_kind, _) => (*file_kind, RemoteOpKind::Delete),
|
||||
UploadOp::Barrier(_) => {
|
||||
unreachable!("we execute barriers synchronously")
|
||||
// we do not account these
|
||||
return;
|
||||
}
|
||||
};
|
||||
REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS
|
||||
@@ -1125,15 +1102,11 @@ mod tests {
|
||||
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
|
||||
}
|
||||
|
||||
fn assert_file_list(a: &HashSet<RelativePath>, b: &[&str]) {
|
||||
let xx = PathBuf::from("");
|
||||
let mut avec: Vec<String> = a
|
||||
.iter()
|
||||
.map(|x| x.to_local_path(&xx).to_string_lossy().into())
|
||||
.collect();
|
||||
fn assert_file_list(a: &HashSet<String>, b: &[&str]) {
|
||||
let mut avec: Vec<&str> = a.iter().map(|a| a.as_str()).collect();
|
||||
avec.sort();
|
||||
|
||||
let mut bvec = b.to_owned();
|
||||
let mut bvec = b.to_vec();
|
||||
bvec.sort_unstable();
|
||||
|
||||
assert_eq!(avec, bvec);
|
||||
@@ -1201,8 +1174,7 @@ mod tests {
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let storage_impl =
|
||||
GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?;
|
||||
let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime,
|
||||
28
pageserver/src/storage_sync2/delete.rs
Normal file
@@ -0,0 +1,28 @@
//! Helper functions to delete files from remote storage with a RemoteStorage

use anyhow::Context;
use std::path::Path;
use tracing::debug;

use remote_storage::GenericRemoteStorage;

use crate::config::PageServerConf;

pub(super) async fn delete_layer<'a>(
    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
    local_layer_path: &'a Path,
) -> anyhow::Result<()> {
    fail::fail_point!("before-delete-layer", |_| {
        anyhow::bail!("failpoint before-delete-layer")
    });
    debug!("Deleting layer from remote storage: {local_layer_path:?}");

    let path_to_delete = conf.remote_path(local_layer_path)?;

    // XXX: If the deletion fails because the object already didn't exist,
    // it would be good to just issue a warning but consider it success.
    // https://github.com/neondatabase/neon/issues/2934
    storage.delete(&path_to_delete).await.with_context(|| {
        format!("Failed to delete remote layer from storage at {path_to_delete:?}")
    })
}
|
||||
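Note (editor's sketch, not part of the diff): the `before-delete-layer` failpoint above only fires when the fail crate's failpoints feature is enabled. Assuming plain fail-rs configuration (the pageserver's own test plumbing may differ), a test can trigger it like this:

// Activating the failpoint makes the fail_point! closure run, so delete_layer
// bails before touching remote storage; "off" deactivates it again.
fn simulate_delete_failure() -> Result<(), String> {
    fail::cfg("before-delete-layer", "return")?;
    // ... exercise the code path under test; delete_layer() now returns an error ...
    fail::cfg("before-delete-layer", "off")
}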
@@ -10,12 +10,11 @@ use tracing::debug;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::storage_sync::index::LayerFileMetadata;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::IndexPart;
|
||||
use super::RelativePath;
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
@@ -29,21 +28,10 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
|
||||
pub async fn download_layer_file<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
path: &'a RelativePath,
|
||||
remote_path: &'a RemotePath,
|
||||
layer_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<u64> {
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let local_path = path.to_local_path(&timeline_path);
|
||||
|
||||
let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_path.display()
|
||||
)
|
||||
})?;
|
||||
let local_path = conf.local_path(remote_path);
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
@@ -64,19 +52,14 @@ pub async fn download_layer_file<'a>(
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage
|
||||
.download(&layer_storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
|
||||
)
|
||||
})?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
let mut download = storage.download(remote_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
|
||||
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})?;
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
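Note (editor's sketch, not part of the diff): the durable_rename-inspired sequence referenced above boils down to "write a temp file, fsync it, rename it into place, fsync the parent directory". A simplified tokio version, with no claim to match the pageserver's exact error handling:

use std::path::Path;
use tokio::{fs, io::AsyncWriteExt};

async fn durable_write(final_path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let temp_path = final_path.with_extension("temp");

    let mut temp_file = fs::File::create(&temp_path).await?;
    temp_file.write_all(bytes).await?;
    temp_file.sync_all().await?; // 1. the data is durable under the temp name
    drop(temp_file);

    fs::rename(&temp_path, final_path).await?; // 2. atomically expose the file

    // 3. fsync the parent directory so the rename itself survives a crash
    let parent = final_path.parent().expect("final path must have a parent");
    fs::File::open(parent).await?.sync_all().await?;
    Ok(())
}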
@@ -151,12 +134,7 @@ pub async fn list_remote_timelines<'a>(
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
let tenant_storage_path = conf.remote_path(&tenant_path)?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(&tenant_storage_path))
|
||||
@@ -218,14 +196,8 @@ pub async fn download_index_part(
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let part_storage_path = storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
let part_storage_path = conf
|
||||
.remote_path(&index_part_path)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
@@ -236,20 +208,12 @@ pub async fn download_index_part(
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download an index part into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize index part file into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
format!("Failed to deserialize index part file into file {index_part_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
@@ -2,12 +2,9 @@
|
||||
//! Able to restore itself from the storage index parts that are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use anyhow::{Context, Ok};
|
||||
use remote_storage::RemotePath;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
@@ -15,33 +12,6 @@ use crate::tenant::metadata::TimelineMetadata;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result<RelativePath> {
|
||||
let relative = path.strip_prefix(timeline_path).with_context(|| {
|
||||
format!(
|
||||
"path '{}' is not relative to base '{}'",
|
||||
path.display(),
|
||||
timeline_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(Self::from_filename(relative))
|
||||
}
|
||||
|
||||
pub fn from_filename(path: &Path) -> RelativePath {
|
||||
RelativePath(path.to_string_lossy().to_string())
|
||||
}
|
||||
|
||||
pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf {
|
||||
timeline_path.join(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata gathered for each of the layer files.
|
||||
///
|
||||
/// Fields have to be `Option`s because remote [`IndexPart`]'s can be from different version, which
|
||||
@@ -97,21 +67,22 @@ pub struct IndexPart {
|
||||
#[serde(default)]
|
||||
version: usize,
|
||||
|
||||
/// Each of the layers present on remote storage.
|
||||
/// Layer names, which are stored on the remote storage.
|
||||
///
|
||||
/// Additional metadata might exist in `layer_metadata`.
|
||||
pub timeline_layers: HashSet<RelativePath>,
|
||||
pub timeline_layers: HashSet<String>,
|
||||
|
||||
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
|
||||
/// First pass is to move it to Optional and the next would be its removal
|
||||
missing_layers: Option<HashSet<String>>,
|
||||
|
||||
/// Per layer file metadata, which can be present for a present or missing layer file.
|
||||
/// Per layer file name metadata, which can be present for a present or missing layer file.
|
||||
///
|
||||
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
|
||||
/// that latest version stores.
|
||||
#[serde(default)]
|
||||
pub layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
|
||||
pub layer_metadata: HashMap<String, IndexLayerMetadata>,
|
||||
|
||||
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
|
||||
// It's duplicated here for convenience.
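Note (editor's sketch, not part of the diff): moving missing_layers to Option<HashSet<String>> relies on serde treating an absent key as None, which is what keeps older index_part.json files parseable. A minimal illustration with a made-up struct:

use serde::Deserialize;
use std::collections::HashSet;

#[derive(Debug, PartialEq, Deserialize)]
struct ExamplePart {
    version: usize,
    timeline_layers: HashSet<String>,
    // absent in old files -> deserializes to None
    missing_layers: Option<HashSet<String>>,
}

fn main() {
    let old = r#"{"version":1,"timeline_layers":["layer-a"]}"#;
    let parsed: ExamplePart = serde_json::from_str(old).unwrap();
    assert_eq!(parsed.missing_layers, None);

    let new = r#"{"version":1,"timeline_layers":["layer-a"],"missing_layers":[]}"#;
    let parsed: ExamplePart = serde_json::from_str(new).unwrap();
    assert_eq!(parsed.missing_layers, Some(HashSet::new()));
}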
|
||||
@@ -129,23 +100,29 @@ impl IndexPart {
|
||||
pub const FILE_NAME: &'static str = "index_part.json";
|
||||
|
||||
pub fn new(
|
||||
layers_and_metadata: HashMap<RelativePath, LayerFileMetadata>,
|
||||
layers_and_metadata: HashMap<RemotePath, LayerFileMetadata>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
let mut timeline_layers = HashSet::new();
|
||||
let mut layer_metadata = HashMap::new();
|
||||
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
|
||||
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
|
||||
|
||||
separate_paths_and_metadata(
|
||||
&layers_and_metadata,
|
||||
&mut timeline_layers,
|
||||
&mut layer_metadata,
|
||||
);
|
||||
for (remote_path, metadata) in &layers_and_metadata {
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
match remote_path.object_name() {
|
||||
Some(layer_name) => {
|
||||
timeline_layers.insert(layer_name.to_owned());
|
||||
layer_metadata.insert(layer_name.to_owned(), metadata);
|
||||
}
|
||||
// TODO move this to the type level: we know that every layer entry has a name
None => panic!("Layer {remote_path:?} has no file name"),
|
||||
}
|
||||
}
|
||||
|
||||
Self {
|
||||
version: Self::LATEST_VERSION,
|
||||
timeline_layers,
|
||||
missing_layers: HashSet::new(),
|
||||
missing_layers: Some(HashSet::new()),
|
||||
layer_metadata,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
@@ -171,18 +148,6 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
fn separate_paths_and_metadata(
|
||||
input: &HashMap<RelativePath, LayerFileMetadata>,
|
||||
output: &mut HashSet<RelativePath>,
|
||||
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
|
||||
) {
|
||||
for (path, metadata) in input {
|
||||
let metadata = IndexLayerMetadata::from(metadata);
|
||||
layer_metadata.insert(path.clone(), metadata);
|
||||
output.insert(path.clone());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -198,8 +163,8 @@ mod tests {
|
||||
|
||||
let expected = IndexPart {
|
||||
version: 0,
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
|
||||
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
|
||||
layer_metadata: HashMap::default(),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
@@ -226,13 +191,13 @@ mod tests {
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
|
||||
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
|
||||
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
|
||||
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
|
||||
layer_metadata: HashMap::from([
|
||||
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
|
||||
(String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"), IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}),
|
||||
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
|
||||
(String::from("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
@@ -245,4 +210,46 @@ mod tests {
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn v1_indexpart_is_parsed_with_optional_missing_layers() {
|
||||
let example = r#"{
|
||||
"version":1,
|
||||
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
|
||||
"layer_metadata":{
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
|
||||
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
|
||||
},
|
||||
"disk_consistent_lsn":"0/16960E8",
|
||||
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
|
||||
}"#;
|
||||
|
||||
let expected = IndexPart {
|
||||
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
|
||||
version: 1,
|
||||
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string()]),
|
||||
layer_metadata: HashMap::from([
|
||||
(
|
||||
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string(),
|
||||
IndexLayerMetadata {
|
||||
file_size: Some(25600000),
|
||||
}
|
||||
),
|
||||
(
|
||||
"not_a_real_layer_but_adding_coverage".to_string(),
|
||||
IndexLayerMetadata {
|
||||
// serde_json should always parse this but this might be a double with jq for
|
||||
// example.
|
||||
file_size: Some(9007199254741001),
|
||||
}
|
||||
)
|
||||
]),
|
||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
|
||||
missing_layers: None,
|
||||
};
|
||||
|
||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
||||
assert_eq!(part, expected);
|
||||
}
|
||||
}
|
||||
@@ -30,12 +30,9 @@ pub(super) async fn upload_index_part<'a>(
|
||||
let index_part_path = conf
|
||||
.metadata_path(timeline_id, tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let storage_path = conf.remote_path(&index_part_path)?;
|
||||
storage
|
||||
.upload_storage_object(
|
||||
Box::new(index_part_bytes),
|
||||
index_part_size,
|
||||
&index_part_path,
|
||||
)
|
||||
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
|
||||
}
|
||||
@@ -44,36 +41,26 @@ pub(super) async fn upload_index_part<'a>(
|
||||
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layer(
|
||||
storage: &GenericRemoteStorage,
|
||||
source_path: &Path,
|
||||
known_metadata: &LayerFileMetadata,
|
||||
pub(super) async fn upload_timeline_layer<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
source_path: &'a Path,
|
||||
known_metadata: &'a LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
fail_point!("before-upload-layer", |_| {
|
||||
bail!("failpoint before-upload-layer")
|
||||
});
|
||||
let storage_path = storage.remote_object_id(source_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
let storage_path = conf.remote_path(source_path)?;
|
||||
|
||||
let source_file = fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a source file for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})?;
|
||||
let source_file = fs::File::open(&source_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
|
||||
|
||||
let fs_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the source file metadata for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
format!("Failed to get the source file metadata for layer {source_path:?}")
|
||||
})?
|
||||
.len();
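Note (editor's sketch, not part of the diff): conf.remote_path(...) above replaces storage.remote_object_id(...); the real PageServerConf::remote_path and RemotePath types live elsewhere in this PR, but the underlying idea is a pure path computation: strip the workdir prefix from the local path and use the remainder as the storage key. Illustrative only, with hypothetical names:

use std::path::{Path, PathBuf};
use anyhow::Context;

struct ExampleConf {
    workdir: PathBuf,
}

impl ExampleConf {
    // Hypothetical stand-in for PageServerConf::remote_path
    fn remote_path(&self, local_path: &Path) -> anyhow::Result<PathBuf> {
        let relative = local_path.strip_prefix(&self.workdir).with_context(|| {
            format!(
                "local path {} is not under workdir {}",
                local_path.display(),
                self.workdir.display()
            )
        })?;
        Ok(relative.to_path_buf())
    }
}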
|
||||
|
||||
@@ -139,7 +139,7 @@ pub struct PageserverTaskId(u64);
|
||||
|
||||
/// Each task that we track is associated with a "task ID". It's just an
|
||||
/// increasing number that we assign. Note that it is different from tokio::task::Id.
|
||||
static NEXT_TASK_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
static NEXT_TASK_ID: AtomicU64 = AtomicU64::new(1);
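Note (editor's sketch, not part of the diff): AtomicU64::new is a const fn, so the Lazy wrapper removed above is unnecessary; a plain static plus fetch_add hands out unique, increasing ids:

use std::sync::atomic::{AtomicU64, Ordering};

static NEXT_ID: AtomicU64 = AtomicU64::new(1);

fn next_id() -> u64 {
    NEXT_ID.fetch_add(1, Ordering::Relaxed)
}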
|
||||
|
||||
/// Global registry of tasks
|
||||
static TASKS: Lazy<Mutex<HashMap<u64, Arc<PageServerTask>>>> =
|
||||
|
||||
@@ -441,8 +441,6 @@ struct RemoteStartupData {
|
||||
remote_metadata: TimelineMetadata,
|
||||
}
|
||||
|
||||
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
impl Tenant {
|
||||
/// Yet another helper for timeline initialization.
|
||||
/// Contains common part for `load_local_timeline` and `load_remote_timeline`
|
||||
@@ -573,7 +571,7 @@ impl Tenant {
|
||||
pub fn spawn_attach(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
) -> Arc<Tenant> {
|
||||
// XXX: Attach should provide the config, especially during tenant migration.
|
||||
// See https://github.com/neondatabase/neon/issues/1555
|
||||
@@ -586,7 +584,7 @@ impl Tenant {
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
Some(remote_storage.clone()),
|
||||
Some(remote_storage),
|
||||
));
|
||||
|
||||
// Do all the hard work in the background
|
||||
@@ -784,7 +782,7 @@ impl Tenant {
|
||||
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
|
||||
Ok(conf) => conf,
|
||||
Err(e) => {
|
||||
error!("load tenant config failed: {}", e);
|
||||
error!("load tenant config failed: {:?}", e);
|
||||
return Tenant::create_broken_tenant(conf, tenant_id);
|
||||
}
|
||||
};
|
||||
@@ -1203,10 +1201,12 @@ impl Tenant {
|
||||
// compaction runs.
|
||||
let timelines_to_compact = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
timelines
|
||||
let timelines_to_compact = timelines
|
||||
.iter()
|
||||
.map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
|
||||
.collect::<Vec<_>>()
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
timelines_to_compact
|
||||
};
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
@@ -1247,42 +1247,87 @@ impl Tenant {
|
||||
}
|
||||
|
||||
/// Removes timeline-related in-memory data
|
||||
pub fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
// in order to be retriable detach needs to be idempotent
|
||||
// (or at least to a point that each time the detach is called it can make progress)
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> {
|
||||
// Transition the timeline into TimelineState::Stopping.
|
||||
// This should prevent new operations from starting.
|
||||
let timeline = {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
anyhow::ensure!(
|
||||
!children_exist,
|
||||
"Cannot delete timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
let timeline = Arc::clone(timeline_entry.get());
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
drop(timelines);
|
||||
timeline
|
||||
};
|
||||
|
||||
let timeline = timeline_entry.get();
|
||||
timeline.set_state(TimelineState::Paused);
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
// No timeout here, GC & Compaction should be responsive to the `TimelineState::Stopping` change.
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// FIXME: Wait for all tasks, including GC and compaction, that are working on the
|
||||
// timeline, to finish.
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// by the caller.
|
||||
|
||||
let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
// XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
|
||||
// with some layers missing.
|
||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local timeline directory '{}'",
|
||||
local_timeline_directory.display()
|
||||
)
|
||||
})?;
|
||||
info!("detach removed files");
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
|
||||
timeline_entry.remove();
|
||||
drop(layer_removal_guard);
|
||||
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
let removed_timeline = timelines.remove(&timeline_id);
|
||||
if removed_timeline.is_none() {
|
||||
// This can legitimately happen if there's a concurrent call to this function.
|
||||
// T1 T2
|
||||
// lock
|
||||
// unlock
|
||||
// lock
|
||||
// unlock
|
||||
// remove files
|
||||
// lock
|
||||
// remove from map
|
||||
// unlock
|
||||
// return
|
||||
// remove files
|
||||
// lock
|
||||
// remove from map observes empty map
|
||||
// unlock
|
||||
// return
|
||||
debug!("concurrent call to this function won the race");
|
||||
}
|
||||
drop(timelines);
|
||||
|
||||
Ok(())
|
||||
}
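Note (editor's sketch, not part of the diff): the "concurrent call won the race" branch above works because HashMap::remove is naturally idempotent: the loser of the race simply sees None and can still report success. A minimal standalone illustration:

use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    let timelines = Arc::new(Mutex::new(HashMap::from([(42u64, "timeline")])));

    let handles: Vec<_> = (0..2)
        .map(|_| {
            let timelines = Arc::clone(&timelines);
            thread::spawn(move || {
                let removed = timelines.lock().unwrap().remove(&42);
                if removed.is_none() {
                    // the other caller won the race; nothing left to do
                }
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
    assert!(timelines.lock().unwrap().is_empty());
}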
|
||||
@@ -1310,10 +1355,10 @@ impl Tenant {
|
||||
"Could not activate tenant because it is in broken state"
|
||||
));
|
||||
}
|
||||
TenantState::Paused => {
|
||||
TenantState::Stopping => {
|
||||
// The tenant was detached, or system shutdown was requested, while we were
|
||||
// loading or attaching the tenant.
|
||||
info!("Tenant is already in Paused state, skipping activation");
|
||||
info!("Tenant is already in Stopping state, skipping activation");
|
||||
}
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Active;
|
||||
@@ -1339,16 +1384,16 @@ impl Tenant {
|
||||
result
|
||||
}
|
||||
|
||||
/// Change tenant status to paused, to mark that it is being shut down
|
||||
pub fn set_paused(&self) {
|
||||
/// Change tenant status to Stopping, to mark that it is being shut down
|
||||
pub fn set_stopping(&self) {
|
||||
self.state.send_modify(|current_state| {
|
||||
match *current_state {
|
||||
TenantState::Active | TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Paused;
|
||||
*current_state = TenantState::Stopping;
|
||||
|
||||
// FIXME: If the tenant is still Loading or Attaching, new timelines
|
||||
// might be created after this. That's harmless, as the Timelines
|
||||
// won't be accessible to anyone, when the Tenant is in Paused
|
||||
// won't be accessible to anyone, when the Tenant is in Stopping
|
||||
// state.
|
||||
let timelines_accessor = self.timelines.lock().unwrap();
|
||||
let not_broken_timelines = timelines_accessor
|
||||
@@ -1359,12 +1404,12 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
TenantState::Broken => {
|
||||
info!("Cannot set tenant to Paused state, it is already in Broken state");
|
||||
info!("Cannot set tenant to Stopping state, it is already in Broken state");
|
||||
}
|
||||
TenantState::Paused => {
|
||||
TenantState::Stopping => {
|
||||
// The tenant was detached, or system shutdown was requested, while we were
|
||||
// loading or attaching the tenant.
|
||||
info!("Tenant is already in Paused state");
|
||||
info!("Tenant is already in Stopping state");
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -1385,10 +1430,10 @@ impl Tenant {
|
||||
// This shouldn't happen either
|
||||
warn!("Tenant is already broken");
|
||||
}
|
||||
TenantState::Paused => {
|
||||
TenantState::Stopping => {
|
||||
// This shouldn't happen either
|
||||
*current_state = TenantState::Broken;
|
||||
warn!("Marking Paused tenant as Broken");
|
||||
warn!("Marking Stopping tenant as Broken");
|
||||
}
|
||||
TenantState::Loading | TenantState::Attaching => {
|
||||
*current_state = TenantState::Broken;
|
||||
@@ -1413,7 +1458,7 @@ impl Tenant {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken | TenantState::Paused => {
|
||||
TenantState::Broken | TenantState::Stopping => {
|
||||
// There's no chance the tenant can transition back into ::Active
|
||||
anyhow::bail!(
|
||||
"Tenant {} will not become active. Current state: {:?}",
|
||||
@@ -2047,9 +2092,10 @@ impl Tenant {
|
||||
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
|
||||
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
|
||||
// We want to run proper checkpoint before we mark timeline as available to outside world
|
||||
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
|
||||
// Flush the new layer files to disk, before we mark the timeline as available to
|
||||
// the outside world.
|
||||
//
|
||||
// Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock
|
||||
unfinished_timeline.maybe_spawn_flush_loop();
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
@@ -2057,7 +2103,7 @@ impl Tenant {
|
||||
});
|
||||
|
||||
unfinished_timeline
|
||||
.checkpoint(CheckpointConfig::Forced).await
|
||||
.checkpoint(CheckpointConfig::Flush).await
|
||||
.with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
|
||||
|
||||
let timeline = {
|
||||
@@ -2555,7 +2601,11 @@ pub mod harness {
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_conf = TenantConf::dummy_conf();
|
||||
// Disable automatic GC and compaction to make the unit tests more deterministic.
|
||||
// The tests perform them manually if needed.
|
||||
let mut tenant_conf = TenantConf::dummy_conf();
|
||||
tenant_conf.gc_period = Duration::ZERO;
|
||||
tenant_conf.compaction_period = Duration::ZERO;
|
||||
|
||||
let tenant_id = TenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
@@ -2619,7 +2669,7 @@ pub mod harness {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{ReadBufResult, PAGE_SZ};
|
||||
use bytes::Bytes;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -117,7 +116,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
static NEXT_ID: Lazy<AtomicU64> = Lazy::new(|| AtomicU64::new(1));
|
||||
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
|
||||
/// An adapter for reading a (virtual) file using the page cache.
|
||||
///
|
||||
|
||||
@@ -30,15 +30,14 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -192,8 +191,6 @@ pub struct DeltaLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
|
||||
drop_watch: Option<DropNotify>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -330,13 +327,10 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner
|
||||
.drop_watch
|
||||
.get_or_insert_with(|| DropNotify::new())
|
||||
.clone()
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -557,7 +551,6 @@ impl DeltaLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -585,7 +578,6 @@ impl DeltaLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -751,7 +743,6 @@ impl DeltaLayerWriterInner {
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
drop_watch: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
||||
@@ -26,9 +26,7 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::filename::{ImageFileName, PathOrConf};
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
@@ -36,6 +34,7 @@ use bytes::Bytes;
|
||||
use hex;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
@@ -118,8 +117,6 @@ pub struct ImageLayerInner {
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
|
||||
drop_watch: Option<DropNotify>,
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -187,13 +184,10 @@ impl Layer for ImageLayer {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner
|
||||
.drop_watch
|
||||
.get_or_insert_with(|| DropNotify::new())
|
||||
.clone()
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
@@ -357,7 +351,6 @@ impl ImageLayer {
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -385,7 +378,6 @@ impl ImageLayer {
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
drop_watch: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -540,7 +532,6 @@ impl ImageLayerWriterInner {
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
drop_watch: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
||||
@@ -10,11 +10,9 @@ use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{
|
||||
DropNotify, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::walrecord;
|
||||
use anyhow::{ensure, Result};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
@@ -174,8 +172,8 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn drop_notify(&self) -> DropNotify {
|
||||
panic!("can't delete an InMemoryLayer")
|
||||
fn delete(&self) -> Result<()> {
|
||||
bail!("can't delete an InMemoryLayer")
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
|
||||
@@ -145,31 +145,9 @@ pub trait Layer: Send + Sync {
|
||||
panic!("Not implemented")
|
||||
}
|
||||
|
||||
fn drop_notify(&self) -> DropNotify;
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete(&self) -> Result<()>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool) -> Result<()>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DropNotify(std::sync::Arc<tokio::sync::Notify>);
|
||||
|
||||
impl DropNotify {
|
||||
pub fn new() -> Self {
|
||||
DropNotify(std::sync::Arc::new(tokio::sync::Notify::new()))
|
||||
}
|
||||
|
||||
pub async fn dropped(&self) {
|
||||
self.0.notified().await
|
||||
}
|
||||
|
||||
pub fn notify_waiters(&self) {
|
||||
self.0.notify_waiters();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DropNotify {
|
||||
fn drop(&mut self) {
|
||||
self.0.notify_waiters();
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -51,6 +51,7 @@ pub struct TenantConf {
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
// How often to check if there's compaction work to be done.
|
||||
// Duration::ZERO means automatic compaction is disabled.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
@@ -61,6 +62,7 @@ pub struct TenantConf {
|
||||
// Page versions older than this are garbage collected away.
|
||||
pub gc_horizon: u64,
|
||||
// Interval at which garbage collection is triggered.
|
||||
// Duration::ZERO means automatic GC is disabled
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
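Note (editor's sketch, not part of the diff): the "Duration::ZERO disables it" convention documented above typically translates into the background loop bailing out before scheduling anything, e.g.:

use std::time::Duration;

async fn background_loop(period: Duration) {
    if period == Duration::ZERO {
        // Automatic runs disabled; work happens only on explicit request.
        // (tokio::time::interval also panics on a zero period, so this early
        // return doubles as a guard.)
        return;
    }
    let mut ticker = tokio::time::interval(period);
    loop {
        ticker.tick().await;
        // ... run one GC / compaction iteration ...
    }
}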
|
||||
@@ -183,6 +185,9 @@ impl TenantConfOpt {
|
||||
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
|
||||
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
}
|
||||
if let Some(trace_read_requests) = other.trace_read_requests {
|
||||
self.trace_read_requests = Some(trace_read_requests);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,75 +1,59 @@
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server.
|
||||
|
||||
use std::collections::hash_map;
|
||||
use std::collections::{hash_map, HashMap};
|
||||
use std::ffi::OsStr;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tokio::fs;
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::crashsafe;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::IGNORED_TENANT_FILE_NAME;
|
||||
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
use once_cell::sync::Lazy;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
|
||||
};
|
||||
use utils::id::TenantId;
|
||||
|
||||
use crate::tenant::Tenant;
|
||||
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
|
||||
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
|
||||
TENANTS
|
||||
.read()
|
||||
.expect("Failed to read() tenants lock, it got poisoned")
|
||||
}
|
||||
|
||||
pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
|
||||
TENANTS
|
||||
.write()
|
||||
.expect("Failed to write() tenants lock, it got poisoned")
|
||||
}
|
||||
}
|
||||
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
|
||||
Lazy::new(|| RwLock::new(HashMap::new()));
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the tenant once download is completed.
|
||||
pub fn init_tenant_mgr(
|
||||
#[instrument(skip(conf, remote_storage))]
|
||||
pub async fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _entered = info_span!("init_tenant_mgr").entered();
|
||||
|
||||
// Scan local filesystem for attached tenants
|
||||
let mut number_of_tenants = 0;
|
||||
let tenants_dir = conf.tenants_path();
|
||||
for dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
|
||||
{
|
||||
match &dir_entry {
|
||||
Ok(dir_entry) => {
|
||||
|
||||
let mut dir_entries = fs::read_dir(&tenants_dir)
|
||||
.await
|
||||
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
|
||||
|
||||
loop {
|
||||
match dir_entries.next_entry().await {
|
||||
Ok(None) => break,
|
||||
Ok(Some(dir_entry)) => {
|
||||
let tenant_dir_path = dir_entry.path();
|
||||
if crate::is_temporary(&tenant_dir_path) {
|
||||
info!(
|
||||
"Found temporary tenant directory, removing: {}",
|
||||
tenant_dir_path.display()
|
||||
);
|
||||
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
|
||||
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove temporary directory '{}': {:?}",
|
||||
tenant_dir_path.display(),
|
||||
@@ -77,27 +61,38 @@ pub fn init_tenant_mgr(
|
||||
);
|
||||
}
|
||||
} else {
|
||||
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
|
||||
Ok(Some(tenant)) => {
|
||||
tenants_state::write_tenants().insert(tenant.tenant_id(), tenant);
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
|
||||
})?;
|
||||
if is_empty {
|
||||
info!("removing empty tenant directory {tenant_dir_path:?}");
|
||||
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
continue;
|
||||
}
|
||||
|
||||
match schedule_local_tenant_processing(
|
||||
conf,
|
||||
&tenant_dir_path,
|
||||
remote_storage.clone(),
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
TENANTS.write().await.insert(tenant.tenant_id(), tenant);
|
||||
number_of_tenants += 1;
|
||||
}
|
||||
Ok(None) => {
|
||||
// This case happens if we crash during attach before creating the attach marker file
|
||||
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
|
||||
error!(
|
||||
"Failed to remove empty tenant directory '{}': {e:#}",
|
||||
tenant_dir_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!(
|
||||
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
|
||||
tenants_dir.display(),
|
||||
dir_entry,
|
||||
e
|
||||
);
|
||||
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -107,10 +102,7 @@ pub fn init_tenant_mgr(
|
||||
// here, the pageserver startup fails altogether, causing outage for *all*
|
||||
// tenants. That seems worse.
|
||||
error!(
|
||||
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
|
||||
dir_entry,
|
||||
tenants_dir.display(),
|
||||
e,
|
||||
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -120,34 +112,45 @@ pub fn init_tenant_mgr(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn load_local_tenant(
|
||||
pub fn schedule_local_tenant_processing(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
if !tenant_path.is_dir() {
|
||||
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
|
||||
}
|
||||
|
||||
let is_empty = tenant_path
|
||||
.is_empty_dir()
|
||||
.context("check whether tenant_path is an empty dir")?;
|
||||
if is_empty {
|
||||
info!("skipping empty tenant directory {tenant_path:?}");
|
||||
return Ok(None);
|
||||
}
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
tenant_path.is_dir(),
|
||||
"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
|
||||
);
|
||||
anyhow::ensure!(
|
||||
!crate::is_temporary(tenant_path),
|
||||
"Cannot load tenant from temporary path {tenant_path:?}"
|
||||
);
|
||||
anyhow::ensure!(
|
||||
!tenant_path.is_empty_dir().with_context(|| {
|
||||
format!("Failed to check whether {tenant_path:?} is an empty dir")
|
||||
})?,
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(OsStr::to_str)
|
||||
.unwrap_or_default()
|
||||
.parse::<TenantId>()
|
||||
.context("Could not parse tenant id out of the tenant dir name")?;
|
||||
.with_context(|| {
|
||||
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
|
||||
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
Tenant::spawn_attach(conf, tenant_id, &remote_storage)
|
||||
Tenant::spawn_attach(conf, tenant_id, remote_storage)
|
||||
} else {
|
||||
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
|
||||
Tenant::create_broken_tenant(conf, tenant_id)
|
||||
@@ -157,7 +160,7 @@ fn load_local_tenant(
|
||||
// Start loading the tenant into memory. It will initially be in Loading state.
|
||||
Tenant::spawn_load(conf, tenant_id, remote_storage)
|
||||
};
|
||||
Ok(Some(tenant))
|
||||
Ok(tenant)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -165,12 +168,12 @@ fn load_local_tenant(
|
||||
///
|
||||
pub async fn shutdown_all_tenants() {
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let mut m = TENANTS.write().await;
|
||||
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
|
||||
for (_, tenant) in m.drain() {
|
||||
if tenant.is_active() {
|
||||
// updates tenant state, forbidding new GC and compaction iterations from starting
|
||||
tenant.set_paused();
|
||||
tenant.set_stopping();
|
||||
tenants_to_shut_down.push(tenant)
|
||||
}
|
||||
}
|
||||
@@ -199,13 +202,13 @@ pub async fn shutdown_all_tenants() {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn create_tenant(
|
||||
pub async fn create_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<Option<Arc<Tenant>>> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
match TENANTS.write().await.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
debug!("tenant {tenant_id} already exists");
|
||||
Ok(None)
|
||||
@@ -215,44 +218,36 @@ pub fn create_tenant(
|
||||
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
|
||||
let tenant_directory =
|
||||
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
|
||||
let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?;
|
||||
match created_tenant {
|
||||
None => {
|
||||
// We get None in case the directory is empty.
|
||||
// This shouldn't happen here, because we just created the directory.
|
||||
// So, skip any cleanup work for now, we don't know how we reached this state.
|
||||
anyhow::bail!("we just created the tenant directory, it can't be empty");
|
||||
}
|
||||
Some(tenant) => {
|
||||
anyhow::ensure!(
|
||||
tenant_id == tenant.tenant_id(),
|
||||
"loaded created tenant has unexpected tenant id (expect {} != actual {})",
|
||||
tenant_id,
|
||||
tenant.tenant_id()
|
||||
);
|
||||
v.insert(Arc::clone(&tenant));
|
||||
Ok(Some(tenant))
|
||||
}
|
||||
}
|
||||
let created_tenant =
|
||||
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
|
||||
let created_tenant_id = created_tenant.tenant_id();
anyhow::ensure!(
tenant_id == created_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {created_tenant_id})",
|
||||
);
|
||||
v.insert(Arc::clone(&created_tenant));
|
||||
Ok(Some(created_tenant))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_tenant_config(
|
||||
pub async fn update_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf);
|
||||
get_tenant(tenant_id, true)
|
||||
.await?
|
||||
.update_tenant_config(tenant_conf);
|
||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
|
||||
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
|
||||
pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = tenants_state::read_tenants();
|
||||
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
|
||||
let m = TENANTS.read().await;
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
|
||||
@@ -288,9 +283,9 @@ pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> an
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
|
||||
info!("timeline task shutdown completed");
|
||||
match get_tenant(tenant_id, true) {
|
||||
match get_tenant(tenant_id, true).await {
|
||||
Ok(tenant) => {
|
||||
tenant.delete_timeline(timeline_id)?;
|
||||
tenant.delete_timeline(timeline_id).await?;
|
||||
}
|
||||
Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"),
|
||||
}
|
||||
@@ -302,40 +297,67 @@ pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant = match {
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
tenants_accessor.remove(&tenant_id)
|
||||
} {
|
||||
Some(tenant) => tenant,
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
};
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
|
||||
})?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
tenant.set_paused();
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
pub async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
) -> anyhow::Result<()> {
|
||||
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
std::fs::remove_file(&tenant_ignore_mark)
|
||||
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
|
||||
}
|
||||
|
||||
// If removal fails there will be no way to successfully retry detach,
|
||||
// because the tenant no longer exists in the in-memory map. And it needs to be removed from it
|
||||
// before we remove files, because it contains references to tenant
|
||||
// which references ephemeral files which are deleted on drop. So if we keep these references,
|
||||
// we will attempt to remove files which no longer exist. This can be fixed by having shutdown
|
||||
// mechanism for tenant that will clean temporary data to avoid any references to ephemeral files
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
fs::remove_dir_all(&local_tenant_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local tenant directory '{}'",
|
||||
local_tenant_directory.display()
|
||||
)
|
||||
})?;
|
||||
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
vacant_entry.insert(new_tenant);
|
||||
Ok(())
|
||||
}).await
|
||||
}
|
||||
|
||||
pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
remove_tenant_from_memory(tenant_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
fs::File::create(&ignore_mark_file)
.await
.context("Failed to create ignore mark file")
.and_then(|_| {
crashsafe::fsync_file_and_parent(&ignore_mark_file)
.context("Failed to fsync ignore mark file")
})
.with_context(|| format!("Failed to create ignore mark for tenant {tenant_id}"))?;
Ok(())
})
.await
}
|
||||
|
||||
///
|
||||
/// Get list of tenants, for the mgmt API
|
||||
///
|
||||
pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
tenants_state::read_tenants()
|
||||
pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
TENANTS
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.map(|(id, tenant)| (*id, tenant.current_state()))
|
||||
.collect()
|
||||
@@ -348,25 +370,92 @@ pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
|
||||
pub async fn attach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
) -> anyhow::Result<()> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
anyhow::ensure!(
|
||||
!tenant_path.exists(),
|
||||
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
|
||||
);
|
||||
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
vacant_entry.insert(tenant);
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn run_if_no_tenant_in_memory<F, V>(tenant_id: TenantId, run: F) -> anyhow::Result<V>
|
||||
where
|
||||
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
|
||||
{
|
||||
match TENANTS.write().await.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(e) => {
|
||||
// Cannot attach a tenant that already exists. The error message depends on
|
||||
// the state it's in.
|
||||
match e.get().current_state() {
|
||||
TenantState::Attaching => {
|
||||
anyhow::bail!("tenant {tenant_id} attach is already in progress")
|
||||
}
|
||||
current_state => {
|
||||
anyhow::bail!("tenant already exists, current state: {current_state:?}")
|
||||
}
|
||||
}
|
||||
anyhow::bail!(
|
||||
"tenant {tenant_id} already exists, state: {:?}",
|
||||
e.get().current_state()
|
||||
)
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
|
||||
v.insert(tenant);
|
||||
Ok(())
|
||||
hash_map::Entry::Vacant(v) => run(v),
|
||||
}
|
||||
}
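A minimal, hypothetical caller of run_if_no_tenant_in_memory, illustrating the vacant-entry pattern it exposes (the register_stub function and the stub tenant are illustrative only, not part of this change):

async fn register_stub(tenant_id: TenantId, stub: Arc<Tenant>) -> anyhow::Result<()> {
    run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
        // The closure only runs when no tenant with this id is in TENANTS yet;
        // otherwise the helper bails with the existing tenant's current state.
        vacant_entry.insert(stub);
        Ok(())
    })
    .await
}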
|
||||
|
||||
/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
/// Allows removing other tenant resources manually, via `tenant_cleanup`.
/// If the cleanup fails, the tenant will stay in memory in [`TenantState::Broken`] state, and another removal
/// operation would be needed to remove it.
async fn remove_tenant_from_memory<V, F>(
tenant_id: TenantId,
tenant_cleanup: F,
) -> anyhow::Result<V>
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// Tenant-wide cleanup operations may take some time (e.g. removing the entire tenant directory), so we want to
// avoid holding the lock for that entire time.
{
|
||||
let tenants_accessor = TENANTS.write().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => match tenant.current_state() {
|
||||
TenantState::Attaching
|
||||
| TenantState::Loading
|
||||
| TenantState::Broken
|
||||
| TenantState::Active => tenant.set_stopping(),
|
||||
TenantState::Stopping => {
|
||||
anyhow::bail!("Tenant {tenant_id} is stopping already")
|
||||
}
|
||||
},
|
||||
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
|
||||
}
|
||||
}
|
||||
|
||||
// shutdown all tenant and timeline tasks: gc, compaction, page service)
|
||||
// No new tasks will be started for this tenant because it's in `Stopping` state.
|
||||
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
|
||||
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
|
||||
|
||||
match tenant_cleanup
|
||||
.await
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
{
|
||||
Ok(hook_value) => {
|
||||
let mut tenants_accessor = TENANTS.write().await;
|
||||
if tenants_accessor.remove(&tenant_id).is_none() {
|
||||
warn!("Tenant {tenant_id} got removed from memory before operation finished");
|
||||
}
|
||||
Ok(hook_value)
|
||||
}
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => tenant.set_broken(),
|
||||
None => warn!("Tenant {tenant_id} got removed from memory"),
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
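A simplified sketch of the locking discipline described in the comments above: state flips happen under the TENANTS lock, the slow cleanup runs without it, and the map is only touched again to finalize (an illustrative outline under those assumptions, not a drop-in replacement):

use std::{collections::HashMap, sync::Arc};
use anyhow::Context;

async fn remove_outline<V>(
    tenants: &tokio::sync::RwLock<HashMap<TenantId, Arc<Tenant>>>,
    tenant_id: TenantId,
    tenant_cleanup: impl std::future::Future<Output = anyhow::Result<V>>,
) -> anyhow::Result<V> {
    // 1. Flip the tenant to Stopping while holding the write lock, then release it.
    tenants
        .write()
        .await
        .get(&tenant_id)
        .context("tenant not found")?
        .set_stopping();
    // 2. The slow work runs without the lock: task shutdown plus the caller's cleanup.
    task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
    match tenant_cleanup.await {
        // 3. Re-take the lock only to finalize: remove on success, mark Broken on failure.
        Ok(value) => {
            tenants.write().await.remove(&tenant_id);
            Ok(value)
        }
        Err(e) => {
            if let Some(tenant) = tenants.read().await.get(&tenant_id) {
                tenant.set_broken();
            }
            Err(e)
        }
    }
}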
|
||||
@@ -378,12 +467,12 @@ use {
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn immediate_gc(
|
||||
pub async fn immediate_gc(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
gc_req: TimelineGcRequest,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
|
||||
let guard = tenants_state::read_tenants();
|
||||
let guard = TENANTS.read().await;
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
|
||||
@@ -7,26 +7,12 @@ use std::time::Duration;
|
||||
|
||||
use crate::metrics::TENANT_TASK_EVENTS;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::{Tenant, TenantState};
|
||||
use crate::tenant_mgr;
|
||||
use tracing::*;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn start_background_loops(tenant_id: TenantId) {
|
||||
// Do not start the background loops.
|
||||
// Right now, in tests, Tenant is only created by TenantHarness,
|
||||
// and all tests that use TenantHarness assume that there are
|
||||
// no background loops that do compaction and GC. If they want it
|
||||
// to happen, they call the corresponding functions directly.
|
||||
//
|
||||
// XXX replace this with a TenantConfigRequest flag that is
|
||||
// also usable by tests, see https://github.com/neondatabase/neon/issues/2917
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
|
||||
#[cfg(not(test))]
|
||||
pub fn start_background_loops(tenant_id: TenantId) {
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
@@ -80,13 +66,17 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
// Run blocking part of the task
|
||||
|
||||
// Run compaction
|
||||
let mut sleep_duration = tenant.get_compaction_period();
|
||||
if let Err(e) = tenant.compaction_iteration().await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic compaction is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration().await {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
// Sleep
|
||||
@@ -127,15 +117,21 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
},
|
||||
};
|
||||
|
||||
// Run gc
|
||||
let gc_period = tenant.get_gc_period();
|
||||
let gc_horizon = tenant.get_gc_horizon();
|
||||
let mut sleep_duration = gc_period;
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
if sleep_duration == Duration::ZERO {
|
||||
info!("automatic GC is disabled");
|
||||
// check again in 10 seconds, in case it's been enabled again.
|
||||
sleep_duration = Duration::from_secs(10);
|
||||
} else {
|
||||
// Run gc
|
||||
if gc_horizon > 0 {
|
||||
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
|
||||
}
|
||||
}
|
||||
}
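Both loops now treat a zero period as "disabled, but keep polling so the setting can be turned back on without a restart". A condensed sketch of that pattern, assuming a tokio runtime and placeholder run_iteration / wait_duration names (not the literal pageserver loop):

// Assumed helpers: run_iteration() does one unit of work; `period` comes from the
// tenant config; `wait_duration` is the short retry interval used on errors.
async fn loop_step(period: Duration, wait_duration: Duration) {
    let mut sleep_duration = period;
    if sleep_duration == Duration::ZERO {
        // The job is disabled; check again in 10 seconds in case it was re-enabled.
        info!("periodic job is disabled");
        sleep_duration = Duration::from_secs(10);
    } else if let Err(e) = run_iteration().await {
        // On failure, retry after the shorter wait_duration instead of a full period.
        sleep_duration = wait_duration;
        error!("iteration failed, retrying in {sleep_duration:?}: {e:?}");
    }
    tokio::time::sleep(sleep_duration).await;
}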
|
||||
|
||||
@@ -159,7 +155,7 @@ async fn wait_for_active_tenant(
|
||||
wait: Duration,
|
||||
) -> ControlFlow<(), Arc<Tenant>> {
|
||||
let tenant = loop {
|
||||
match tenant_mgr::get_tenant(tenant_id, false) {
|
||||
match tenant_mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => break tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to get a tenant {tenant_id}: {e:#}");
|
||||
|
||||
@@ -214,7 +214,7 @@ async fn connection_manager_loop_step(
|
||||
match new_state {
|
||||
// we're already active as walreceiver, no need to reactivate
|
||||
TimelineState::Active => continue,
|
||||
TimelineState::Broken | TimelineState::Paused | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state),
|
||||
}
|
||||
}
|
||||
Err(_sender_dropped_error) => return ControlFlow::Break(()),
|
||||
|
||||
@@ -84,7 +84,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
@@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
@@ -156,7 +156,8 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
return Err(WalRedoError::InvalidRequest);
|
||||
}
|
||||
|
||||
let mut img: Option<Bytes> = base_img;
|
||||
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
|
||||
let mut img = base_img.map(|p| p.1);
|
||||
let mut batch_neon = can_apply_in_neon(&records[0].1);
|
||||
let mut batch_start = 0;
|
||||
for i in 1..records.len() {
|
||||
@@ -170,6 +171,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
base_img_lsn,
|
||||
&records[batch_start..i],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
@@ -189,6 +191,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
base_img_lsn,
|
||||
&records[batch_start..],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
@@ -223,11 +226,13 @@ impl PostgresRedoManager {
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img_lsn: Lsn,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
@@ -282,9 +287,12 @@ impl PostgresRedoManager {
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
|
||||
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
nbytes,
|
||||
base_img_lsn,
|
||||
lsn
|
||||
);
|
||||
let process = process_guard.take().unwrap();
|
||||
@@ -922,8 +930,7 @@ impl NoLeakChild {
|
||||
|
||||
match child.wait() {
|
||||
Ok(exit_status) => {
|
||||
// log at error level since .kill() is something we only do on errors ATM
|
||||
error!(exit_status = %exit_status, "wait successful");
|
||||
info!(exit_status = %exit_status, "wait successful");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
|
||||
|
||||
@@ -464,12 +464,12 @@ pg_init_libpagestore(void)
|
||||
NULL, NULL, NULL);
|
||||
DefineCustomIntVariable("neon.readahead_buffer_size",
|
||||
"number of prefetches to buffer",
|
||||
"This buffer is used to store prefetched data; so "
|
||||
"it is important that this buffer is at least as "
|
||||
"large as the configured value of all tablespaces' "
|
||||
"effective_io_concurrency and maintenance_io_concurrency, "
|
||||
"your sessions' values of these, and the value for "
|
||||
"seqscan_prefetch_buffers.",
|
||||
"This buffer is used to hold and manage prefetched "
|
||||
"data; so it is important that this buffer is at "
|
||||
"least as large as the configured value of all "
|
||||
"tablespaces' effective_io_concurrency and "
|
||||
"maintenance_io_concurrency, and your sessions' "
|
||||
"values for these settings.",
|
||||
&readahead_buffer_size,
|
||||
128, 16, 1024,
|
||||
PGC_USERSET,
|
||||
|
||||
@@ -242,6 +242,14 @@ PrefetchState *MyPState;
|
||||
) \
|
||||
)
|
||||
|
||||
#define ReceiveBufferNeedsCompaction() (\
|
||||
(MyPState->n_responses_buffered / 8) < ( \
|
||||
MyPState->ring_receive - \
|
||||
MyPState->ring_last - \
|
||||
MyPState->n_responses_buffered \
|
||||
) \
|
||||
)
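The same compaction heuristic spelled out as a plain function, as a sketch in Rust rather than the C macro above (receive_buffer_needs_compaction is an illustrative name):

/// True when the gaps (unused slots between ring_last and ring_receive) outnumber
/// one eighth of the responses still buffered, i.e. when compaction is worthwhile.
fn receive_buffer_needs_compaction(
    ring_last: u64,
    ring_receive: u64,
    n_responses_buffered: u64,
) -> bool {
    let gaps = ring_receive - ring_last - n_responses_buffered;
    n_responses_buffered / 8 < gaps
}

// e.g. with 64 buffered responses and 10 gaps: 64 / 8 = 8 < 10, so compaction runs.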
|
||||
|
||||
int n_prefetch_hits = 0;
|
||||
int n_prefetch_misses = 0;
|
||||
int n_prefetch_missed_caches = 0;
|
||||
@@ -249,17 +257,99 @@ int n_prefetch_dupes = 0;
|
||||
|
||||
XLogRecPtr prefetch_lsn = 0;
|
||||
|
||||
static bool compact_prefetch_buffers(void);
|
||||
static void consume_prefetch_responses(void);
|
||||
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
|
||||
static bool prefetch_read(PrefetchRequest *slot);
|
||||
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
|
||||
static bool prefetch_wait_for(uint64 ring_index);
|
||||
static void prefetch_cleanup(void);
|
||||
static void prefetch_cleanup_trailing_unused(void);
|
||||
static inline void prefetch_set_unused(uint64 ring_index);
|
||||
|
||||
static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
|
||||
ForkNumber forknum, BlockNumber blkno);
|
||||
|
||||
static bool
|
||||
compact_prefetch_buffers(void)
|
||||
{
|
||||
uint64 empty_ring_index = MyPState->ring_last;
|
||||
uint64 search_ring_index = MyPState->ring_receive;
|
||||
int n_moved = 0;
|
||||
|
||||
if (MyPState->ring_receive == MyPState->ring_last)
|
||||
return false;
|
||||
|
||||
while (search_ring_index > MyPState->ring_last)
|
||||
{
|
||||
search_ring_index--;
|
||||
if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
|
||||
{
|
||||
empty_ring_index = search_ring_index;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
* Here we have established:
* slots < search_ring_index may be unused (not scanned)
* slots >= search_ring_index and <= empty_ring_index are unused
* slots > empty_ring_index are in use, or outside our buffer's range.
*
* Therefore, there is a gap of at least one unused item between
* search_ring_index and empty_ring_index, which grows as we hit
* more unused items while moving backwards through the array.
*/
|
||||
|
||||
while (search_ring_index > MyPState->ring_last)
|
||||
{
|
||||
PrefetchRequest *source_slot;
|
||||
PrefetchRequest *target_slot;
|
||||
bool found;
|
||||
|
||||
search_ring_index--;
|
||||
|
||||
source_slot = GetPrfSlot(search_ring_index);
|
||||
|
||||
if (source_slot->status == PRFS_UNUSED)
|
||||
continue;
|
||||
|
||||
target_slot = GetPrfSlot(empty_ring_index);
|
||||
|
||||
Assert(source_slot->status == PRFS_RECEIVED);
|
||||
Assert(target_slot->status == PRFS_UNUSED);
|
||||
|
||||
target_slot->buftag = source_slot->buftag;
|
||||
target_slot->status = source_slot->status;
|
||||
target_slot->response = source_slot->response;
|
||||
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
|
||||
target_slot->my_ring_index = empty_ring_index;
|
||||
|
||||
prfh_delete(MyPState->prf_hash, source_slot);
|
||||
prfh_insert(MyPState->prf_hash, target_slot, &found);
|
||||
|
||||
Assert(!found);
|
||||
|
||||
/* Adjust the location of our known-empty slot */
|
||||
empty_ring_index--;
|
||||
|
||||
source_slot->status = PRFS_UNUSED;
|
||||
source_slot->buftag = (BufferTag) {0};
|
||||
source_slot->response = NULL;
|
||||
source_slot->my_ring_index = 0;
|
||||
source_slot->effective_request_lsn = 0;
|
||||
|
||||
n_moved++;
|
||||
}
|
||||
|
||||
if (MyPState->ring_last != empty_ring_index)
|
||||
{
|
||||
prefetch_cleanup_trailing_unused();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void
|
||||
readahead_buffer_resize(int newsize, void *extra)
|
||||
{
|
||||
@@ -267,7 +357,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
nfree = newsize;
|
||||
PrefetchState *newPState;
|
||||
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
|
||||
sizeof(PrefetchRequest) * readahead_buffer_size
|
||||
sizeof(PrefetchRequest) * newsize
|
||||
);
|
||||
|
||||
/* don't try to re-initialize if we haven't initialized yet */
|
||||
@@ -323,7 +413,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
prfh_insert(newPState->prf_hash, newslot, &found);
|
||||
|
||||
Assert(!found);
|
||||
|
||||
|
||||
switch (newslot->status)
|
||||
{
|
||||
case PRFS_UNUSED:
|
||||
@@ -370,7 +460,7 @@ consume_prefetch_responses(void)
|
||||
}
|
||||
|
||||
static void
|
||||
prefetch_cleanup(void)
|
||||
prefetch_cleanup_trailing_unused(void)
|
||||
{
|
||||
uint64 ring_index;
|
||||
PrefetchRequest *slot;
|
||||
@@ -531,7 +621,10 @@ prefetch_set_unused(uint64 ring_index)
|
||||
|
||||
/* run cleanup if we're holding back ring_last */
|
||||
if (MyPState->ring_last == ring_index)
|
||||
prefetch_cleanup();
|
||||
prefetch_cleanup_trailing_unused();
|
||||
/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
|
||||
else if (ReceiveBufferNeedsCompaction())
|
||||
compact_prefetch_buffers();
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -582,6 +675,33 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
request.req.lsn = lsn;
|
||||
prefetch_lsn = Max(prefetch_lsn, lsn);
|
||||
slot->effective_request_lsn = prefetch_lsn;
|
||||
|
||||
/*
|
||||
* Remember request LSN in the last-written LSN cache to avoid false
|
||||
* prefetch invalidations.
|
||||
*
|
||||
* Imagine what would happen without this, when you perform a large
|
||||
* sequential scan with UPDATE. The sequential scan issues a prefetch
|
||||
* request for each page in order, and every page is also dirtied. On
|
||||
* each page, the oldest page in the last-written LSN cache is evicted,
|
||||
* which advances the global last-written LSN. The pages being scanned are
|
||||
* not in the last-written cache, so each prefetch request will use the
|
||||
* global last-written LSN in the request and memorize that in the
|
||||
* slot. However, when we receive the response to the prefetch request,
|
||||
* the global last-written LSN has already moved forwards, and the
|
||||
* cross-check we make that the last-written LSN matches will fail, and we
|
||||
* discard the prefetched response unnecessarily.
|
||||
*
|
||||
* Inserting the LSN we use in the prefetch request to the last-written LSN
|
||||
* cache avoids that problem. With that, we will use the cached value in
|
||||
* the cross-check, instead of the more recent global last-written LSN value.
|
||||
*/
|
||||
SetLastWrittenLSNForBlock(
|
||||
request.req.lsn,
|
||||
slot->buftag.rnode,
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum
|
||||
);
|
||||
}
|
||||
|
||||
Assert(slot->response == NULL);
|
||||
@@ -702,20 +822,31 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
|
||||
|
||||
Assert(slot->status != PRFS_UNUSED);
|
||||
|
||||
/* We have the slot for ring_last, so that must still be in progress */
|
||||
switch (slot->status)
|
||||
/*
|
||||
* If there is good reason to run compaction on the prefetch buffers,
|
||||
* try to do that.
|
||||
*/
|
||||
if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
|
||||
{
|
||||
case PRFS_REQUESTED:
|
||||
Assert(MyPState->ring_receive == cleanup_index);
|
||||
prefetch_wait_for(cleanup_index);
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
case PRFS_RECEIVED:
|
||||
case PRFS_TAG_REMAINS:
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
default:
|
||||
pg_unreachable();
|
||||
Assert(slot->status == PRFS_UNUSED);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* We have the slot for ring_last, so that must still be in progress */
|
||||
switch (slot->status)
|
||||
{
|
||||
case PRFS_REQUESTED:
|
||||
Assert(MyPState->ring_receive == cleanup_index);
|
||||
prefetch_wait_for(cleanup_index);
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
case PRFS_RECEIVED:
|
||||
case PRFS_TAG_REMAINS:
|
||||
prefetch_set_unused(cleanup_index);
|
||||
break;
|
||||
default:
|
||||
pg_unreachable();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1102,7 +1233,7 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
}
|
||||
|
||||
static void
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN(buffer);
|
||||
|
||||
@@ -1116,7 +1247,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
* correctness, the non-logged updates are not critical. But we want to
|
||||
* have a reasonably up-to-date VM and FSM in the page server.
|
||||
*/
|
||||
if (forknum == FSM_FORKNUM && !RecoveryInProgress())
|
||||
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
|
||||
{
|
||||
/* FSM is never WAL-logged and we don't care. */
|
||||
XLogRecPtr recptr;
|
||||
@@ -1125,30 +1256,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
blocknum,
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress())
|
||||
{
|
||||
/*
|
||||
* Always WAL-log vm. We should never miss clearing visibility map
|
||||
* bits.
|
||||
*
|
||||
* TODO Is it too bad for performance? Hopefully we do not evict
|
||||
* actively used vm too often.
|
||||
*/
|
||||
XLogRecPtr recptr;
|
||||
|
||||
recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X",
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
blocknum,
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
@@ -1543,6 +1651,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
BlockNumber n_blocks = 0;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1582,7 +1691,16 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
errhint("This limit is defined by neon.max_cluster_size GUC")));
|
||||
}
|
||||
|
||||
neon_wallog_page(reln, forkNum, blkno, buffer);
|
||||
/*
* Usually Postgres doesn't extend a relation by more than one page at a time
* (leaving holes). But this rule is violated in PG-15, where CreateAndCopyRelationData
* calls smgrextend for the destination relation using the size of the source relation.
*/
|
||||
get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks);
|
||||
while (n_blocks < blkno)
|
||||
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
|
||||
|
||||
neon_wallog_page(reln, forkNum, blkno, buffer, false);
|
||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
|
||||
|
||||
lsn = PageGetLSN(buffer);
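A self-contained toy model of the back-fill introduced above: every block between the cached relation size and the newly extended block is force-logged, and the extended block itself is logged normally (extend_with_backfill is an illustrative name):

/// Toy model of the back-fill: every block index in [cached_size, blkno) gets a
/// forced write, then blkno itself gets a normal one.
fn extend_with_backfill(cached_size: u32, blkno: u32) -> Vec<(u32, bool)> {
    let mut writes = Vec::new();
    for block in cached_size..blkno {
        writes.push((block, true)); // force-logged filler page
    }
    writes.push((blkno, false)); // the page actually being extended
    writes
}

// e.g. extend_with_backfill(3, 6) == [(3, true), (4, true), (5, true), (6, false)]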
|
||||
@@ -1780,6 +1898,17 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
&request_lsn);
|
||||
slot = GetPrfSlot(ring_index);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Empty our reference to the prefetch buffer's hash entry.
|
||||
* When we wait for prefetches, the entry reference is invalidated by
|
||||
* potential updates to the hash, and when we reconnect to the
|
||||
* pageserver the prefetch we're waiting for may be dropped,
|
||||
* in which case we need to retry and take the branch above.
|
||||
*/
|
||||
entry = NULL;
|
||||
}
|
||||
|
||||
Assert(slot->my_ring_index == ring_index);
|
||||
Assert(MyPState->ring_last <= ring_index &&
|
||||
@@ -1818,7 +1947,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
/* buffer was used, clean up for later reuse */
|
||||
prefetch_set_unused(ring_index);
|
||||
prefetch_cleanup();
|
||||
prefetch_cleanup_trailing_unused();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1999,7 +2128,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
neon_wallog_page(reln, forknum, blocknum, buffer);
|
||||
neon_wallog_page(reln, forknum, blocknum, buffer, false);
|
||||
|
||||
lsn = PageGetLSN(buffer);
|
||||
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
|
||||
29 poetry.lock (generated)
@@ -525,7 +525,7 @@ typing-extensions = ">=4.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2022.9.24"
|
||||
version = "2022.12.7"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
category = "main"
|
||||
optional = false
|
||||
@@ -1248,8 +1248,8 @@ python-versions = ">=3.6"
|
||||
|
||||
[package.dependencies]
|
||||
pytest = [
|
||||
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
|
||||
{version = ">=5.0", markers = "python_version < \"3.10\""},
|
||||
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1702,8 +1702,8 @@ botocore-stubs = [
|
||||
{file = "botocore_stubs-1.27.38-py3-none-any.whl", hash = "sha256:7add7641e9a479a9c8366893bb522fd9ca3d58714201e43662a200a148a1bc38"},
|
||||
]
|
||||
certifi = [
|
||||
{file = "certifi-2022.9.24-py3-none-any.whl", hash = "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382"},
|
||||
{file = "certifi-2022.9.24.tar.gz", hash = "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14"},
|
||||
{file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
|
||||
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
|
||||
]
|
||||
cffi = [
|
||||
{file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"},
|
||||
@@ -2036,6 +2036,7 @@ psutil = [
|
||||
psycopg2-binary = [
|
||||
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
|
||||
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
|
||||
@@ -2069,6 +2070,7 @@ psycopg2-binary = [
|
||||
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
|
||||
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
|
||||
@@ -2080,6 +2082,7 @@ psycopg2-binary = [
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
|
||||
{file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
|
||||
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
|
||||
@@ -2096,18 +2099,7 @@ py = [
|
||||
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
|
||||
]
|
||||
pyasn1 = [
|
||||
{file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
|
||||
{file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
|
||||
{file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
|
||||
{file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
|
||||
{file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
|
||||
{file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
|
||||
{file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
|
||||
{file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
|
||||
{file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
|
||||
{file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
|
||||
{file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
|
||||
{file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
|
||||
{file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
|
||||
]
|
||||
pycodestyle = [
|
||||
@@ -2213,6 +2205,13 @@ pyyaml = [
|
||||
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
|
||||
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
|
||||
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
|
||||
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
|
||||
|
||||
@@ -28,6 +28,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::info;
|
||||
use utils::project_git_version;
|
||||
use utils::sentry_init::{init_sentry, release_name};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
@@ -45,6 +46,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[]);
|
||||
|
||||
let arg_matches = cli().get_matches();
|
||||
|
||||
let tls_config = match (
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
use anyhow::{bail, Context, Result};
|
||||
use clap::{value_parser, Arg, ArgAction, Command};
|
||||
use const_format::formatcp;
|
||||
use nix::unistd::Pid;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
@@ -15,7 +14,7 @@ use tokio::sync::mpsc;
|
||||
use toml_edit::Document;
|
||||
use tracing::*;
|
||||
use url::{ParseError, Url};
|
||||
use utils::lock_file;
|
||||
use utils::pid_file;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
use safekeeper::broker;
|
||||
@@ -35,11 +34,14 @@ use utils::{
|
||||
http::endpoint,
|
||||
id::NodeId,
|
||||
logging::{self, LogFormat},
|
||||
project_git_version, signals, tcp_listener,
|
||||
project_git_version,
|
||||
sentry_init::{init_sentry, release_name},
|
||||
signals, tcp_listener,
|
||||
};
|
||||
|
||||
const PID_FILE_NAME: &str = "safekeeper.pid";
|
||||
const ID_FILE_NAME: &str = "safekeeper.id";
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
@@ -133,6 +135,8 @@ fn main() -> anyhow::Result<()> {
|
||||
conf.log_format = LogFormat::from_config(log_format)?;
|
||||
}
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]);
|
||||
start_safekeeper(conf, given_id, arg_matches.get_flag("init"))
|
||||
}
|
||||
|
||||
@@ -142,28 +146,13 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
|
||||
lock_file::LockCreationResult::Created {
|
||||
new_lock_contents,
|
||||
file,
|
||||
} => {
|
||||
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
|
||||
file
|
||||
}
|
||||
lock_file::LockCreationResult::AlreadyLocked {
|
||||
existing_lock_contents,
|
||||
} => anyhow::bail!(
|
||||
"Could not lock pid file; safekeeper is already running in {:?} with PID {}",
|
||||
conf.workdir,
|
||||
existing_lock_contents
|
||||
),
|
||||
lock_file::LockCreationResult::CreationFailed(e) => {
|
||||
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
|
||||
}
|
||||
};
|
||||
let lock_file =
|
||||
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
|
||||
info!("Claimed pid file at {lock_file_path:?}");
|
||||
|
||||
// ensure that the lock file is held even if the main thread of the process panics
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
// Set or read our ID.
|
||||
set_id(&mut conf, given_id)?;
|
||||
@@ -275,10 +264,10 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
signals.handle(|signal| {
|
||||
// TODO: implement graceful shutdown with joining threads etc
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
"received {}, terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(111);
|
||||
std::process::exit(0);
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -226,6 +226,7 @@ impl ReplicationConn {
|
||||
let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn);
|
||||
|
||||
let mut wal_reader = WalReader::new(
|
||||
spg.conf.workdir.clone(),
|
||||
spg.conf.timeline_dir(&tli.ttid),
|
||||
&persisted_state,
|
||||
start_pos,
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::time::Duration;
|
||||
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::fs::File;
|
||||
use tokio::runtime::Builder;
|
||||
|
||||
@@ -151,7 +151,7 @@ async fn update_task(
|
||||
let timeline_dir = conf.timeline_dir(&ttid);
|
||||
|
||||
let handle = tokio::spawn(
|
||||
backup_task_main(ttid, timeline_dir, shutdown_rx)
|
||||
backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx)
|
||||
.instrument(info_span!("WAL backup task", ttid = %ttid)),
|
||||
);
|
||||
|
||||
@@ -182,10 +182,10 @@ async fn wal_backup_launcher_main_loop(
|
||||
|
||||
let conf_ = conf.clone();
|
||||
REMOTE_STORAGE.get_or_init(|| {
|
||||
conf_.remote_storage.as_ref().map(|c| {
|
||||
GenericRemoteStorage::from_config(conf_.workdir, c)
|
||||
.expect("failed to create remote storage")
|
||||
})
|
||||
conf_
|
||||
.remote_storage
|
||||
.as_ref()
|
||||
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
|
||||
});
|
||||
|
||||
// Presence in this map means the launcher is aware s3 offloading is needed for
|
||||
@@ -234,6 +234,7 @@ async fn wal_backup_launcher_main_loop(
|
||||
struct WalBackupTask {
|
||||
timeline: Arc<Timeline>,
|
||||
timeline_dir: PathBuf,
|
||||
workspace_dir: PathBuf,
|
||||
wal_seg_size: usize,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
}
|
||||
@@ -242,6 +243,7 @@ struct WalBackupTask {
|
||||
async fn backup_task_main(
|
||||
ttid: TenantTimelineId,
|
||||
timeline_dir: PathBuf,
|
||||
workspace_dir: PathBuf,
|
||||
mut shutdown_rx: Receiver<()>,
|
||||
) {
|
||||
info!("started");
|
||||
@@ -257,6 +259,7 @@ async fn backup_task_main(
|
||||
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
|
||||
timeline: tli,
|
||||
timeline_dir,
|
||||
workspace_dir,
|
||||
};
|
||||
|
||||
// task is spun up only when wal_seg_size is already initialized
|
||||
@@ -321,6 +324,7 @@ impl WalBackupTask {
|
||||
commit_lsn,
|
||||
self.wal_seg_size,
|
||||
&self.timeline_dir,
|
||||
&self.workspace_dir,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -353,11 +357,12 @@ pub async fn backup_lsn_range(
|
||||
end_lsn: Lsn,
|
||||
wal_seg_size: usize,
|
||||
timeline_dir: &Path,
|
||||
workspace_dir: &Path,
|
||||
) -> Result<Lsn> {
|
||||
let mut res = start_lsn;
|
||||
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
|
||||
for s in &segments {
|
||||
backup_single_segment(s, timeline_dir)
|
||||
backup_single_segment(s, timeline_dir, workspace_dir)
|
||||
.await
|
||||
.with_context(|| format!("offloading segno {}", s.seg_no))?;
|
||||
|
||||
@@ -372,11 +377,24 @@ pub async fn backup_lsn_range(
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> {
|
||||
let segment_file_name = seg.file_path(timeline_dir)?;
|
||||
async fn backup_single_segment(
|
||||
seg: &Segment,
|
||||
timeline_dir: &Path,
|
||||
workspace_dir: &Path,
|
||||
) -> Result<()> {
|
||||
let segment_file_path = seg.file_path(timeline_dir)?;
|
||||
let remote_segment_path = segment_file_path
|
||||
.strip_prefix(&workspace_dir)
|
||||
.context("Failed to strip workspace dir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
|
||||
)
|
||||
})?;
|
||||
|
||||
backup_object(&segment_file_name, seg.size()).await?;
|
||||
debug!("Backup of {} done", segment_file_name.display());
|
||||
backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?;
|
||||
debug!("Backup of {} done", segment_file_path.display());
|
||||
|
||||
Ok(())
|
||||
}
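Both the backup and read paths now derive the remote object key from the segment path relative to the safekeeper workspace directory; a self-contained sketch of that mapping (remote_key is an illustrative name, RemotePath itself is not reproduced here):

use std::path::{Path, PathBuf};

/// Remote key for a local WAL segment: its path relative to the workspace dir.
fn remote_key(workspace_dir: &Path, segment_path: &Path) -> anyhow::Result<PathBuf> {
    let rel = segment_path
        .strip_prefix(workspace_dir)
        .map_err(|e| anyhow::anyhow!("{segment_path:?} is not under {workspace_dir:?}: {e}"))?;
    Ok(rel.to_path_buf())
}

// remote_key("/data/sk".as_ref(), "/data/sk/tenant/timeline/0000000A.partial".as_ref())
// yields "tenant/timeline/0000000A.partial".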
|
||||
@@ -426,7 +444,7 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
|
||||
|
||||
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
|
||||
|
||||
async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
|
||||
async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize) -> Result<()> {
|
||||
let storage = REMOTE_STORAGE
|
||||
.get()
|
||||
.expect("failed to get remote storage")
|
||||
@@ -441,12 +459,12 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
|
||||
})?);
|
||||
|
||||
storage
|
||||
.upload_storage_object(Box::new(file), size, source_file)
|
||||
.upload_storage_object(Box::new(file), size, target_file)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn read_object(
|
||||
file_path: PathBuf,
|
||||
file_path: &RemotePath,
|
||||
offset: u64,
|
||||
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
|
||||
let storage = REMOTE_STORAGE
|
||||
@@ -455,19 +473,13 @@ pub async fn read_object(
|
||||
.as_ref()
|
||||
.context("No remote storage configured")?;
|
||||
|
||||
info!(
|
||||
"segment download about to start for local path {} at offset {}",
|
||||
file_path.display(),
|
||||
offset
|
||||
);
|
||||
info!("segment download about to start from remote path {file_path:?} at offset {offset}");
|
||||
|
||||
let download = storage
|
||||
.download_storage_object(Some((offset, None)), &file_path)
|
||||
.download_storage_object(Some((offset, None)), file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open WAL segment download stream for local path {}",
|
||||
file_path.display()
|
||||
)
|
||||
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
|
||||
})?;
|
||||
|
||||
Ok(download.download_stream)
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
//! Note that last file has `.partial` suffix, that's different from postgres.
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use remote_storage::RemotePath;
|
||||
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::pin::Pin;
|
||||
@@ -445,6 +446,7 @@ fn remove_segments_from_disk(
|
||||
}
|
||||
|
||||
pub struct WalReader {
|
||||
workdir: PathBuf,
|
||||
timeline_dir: PathBuf,
|
||||
wal_seg_size: usize,
|
||||
pos: Lsn,
|
||||
@@ -459,6 +461,7 @@ pub struct WalReader {
|
||||
|
||||
impl WalReader {
|
||||
pub fn new(
|
||||
workdir: PathBuf,
|
||||
timeline_dir: PathBuf,
|
||||
state: &SafeKeeperState,
|
||||
start_pos: Lsn,
|
||||
@@ -478,6 +481,7 @@ impl WalReader {
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
workdir,
|
||||
timeline_dir,
|
||||
wal_seg_size: state.server.wal_seg_size as usize,
|
||||
pos: start_pos,
|
||||
@@ -545,7 +549,17 @@ impl WalReader {
|
||||
|
||||
// Try to open remote file, if remote reads are enabled
|
||||
if self.enable_remote_read {
|
||||
return read_object(wal_file_path, xlogoff as u64).await;
|
||||
let remote_wal_file_path = wal_file_path
|
||||
.strip_prefix(&self.workdir)
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to resolve remote part of path {:?} for base {:?}",
|
||||
wal_file_path, self.workdir,
|
||||
)
|
||||
})?;
|
||||
return read_object(&remote_wal_file_path, xlogoff as u64).await;
|
||||
}
|
||||
|
||||
bail!("WAL segment is not found")
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Generate code to deterministic location to make finding it easier.
|
||||
tonic_build::configure()
|
||||
.out_dir("proto/") // put generated code to proto/
|
||||
.compile(&["proto/broker.proto"], &["proto/"])?;
|
||||
// Generate rust code from .proto protobuf.
|
||||
//
|
||||
// Note: we previously tried to generate the code to a deterministic location
// under proto/ to make it easy to find, but interference with cachepot would
// sometimes fail the build. Per the cargo docs, a build script shouldn't output
// anywhere but $OUT_DIR anyway.
|
||||
tonic_build::compile_protos("proto/broker.proto")
|
||||
.unwrap_or_else(|e| panic!("failed to compile protos {:?}", e));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use proto::{
|
||||
|
||||
// Code generated by protobuf.
|
||||
pub mod proto {
|
||||
include!("../proto/storage_broker.rs");
|
||||
tonic::include_proto!("storage_broker");
|
||||
}
|
||||
|
||||
pub mod metrics;
|
||||
|
||||
@@ -1119,6 +1119,14 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_ignore(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
@@ -1743,6 +1751,7 @@ class NeonPageserver(PgProtocol):
|
||||
".*Connection aborted: error communicating with the server: Connection reset by peer.*",
|
||||
".*kill_and_wait_impl.*: wait successful.*",
|
||||
".*end streaming to Some.*",
|
||||
".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down
|
||||
# safekeeper connection can fail with this, in the window between timeline creation
|
||||
# and streaming start
|
||||
".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
|
||||
@@ -1761,6 +1770,14 @@ class NeonPageserver(PgProtocol):
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
# FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885
|
||||
".*sender is dropped while join handle is still alive.*",
|
||||
# Tenant::delete_timeline() can cause any of the four following errors.
|
||||
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946
|
||||
".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed
|
||||
".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped
|
||||
".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs
|
||||
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
|
||||
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
|
||||
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
|
||||
]
|
||||
|
||||
def start(
|
||||
@@ -2528,6 +2545,7 @@ class SafekeeperTimelineStatus:
|
||||
acceptor_epoch: int
|
||||
pg_version: int
|
||||
flush_lsn: Lsn
|
||||
commit_lsn: Lsn
|
||||
timeline_start_lsn: Lsn
|
||||
backup_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
@@ -2577,6 +2595,7 @@ class SafekeeperHttpClient(requests.Session):
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
flush_lsn=Lsn(resj["flush_lsn"]),
|
||||
commit_lsn=Lsn(resj["commit_lsn"]),
|
||||
timeline_start_lsn=Lsn(resj["timeline_start_lsn"]),
|
||||
backup_lsn=Lsn(resj["backup_lsn"]),
|
||||
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
|
||||
@@ -2877,6 +2896,7 @@ def assert_no_in_progress_downloads_for_tenant(
|
||||
):
|
||||
tenant_status = pageserver_http_client.tenant_status(tenant)
|
||||
assert tenant_status["has_in_progress_downloads"] is False, tenant_status
|
||||
assert tenant_status["state"] == "Active"
|
||||
|
||||
|
||||
def remote_consistent_lsn(
|
||||
@@ -2918,6 +2938,27 @@ def wait_for_upload(
|
||||
)
|
||||
|
||||
|
||||
# Does not use `wait_until` for debugging purposes
|
||||
def wait_until_tenant_state(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
expected_state: str,
|
||||
iterations: int,
|
||||
) -> bool:
|
||||
for _ in range(iterations):
|
||||
try:
|
||||
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
|
||||
log.debug(f"Tenant {tenant_id} data: {tenant}")
|
||||
if tenant["state"] == expected_state:
|
||||
return True
|
||||
except Exception as e:
|
||||
log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
|
||||
|
||||
|
||||
def last_record_lsn(
|
||||
pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
|
||||
) -> Lsn:
|
||||
|
||||
@@ -42,7 +42,8 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
|
||||
|
||||
cur.execute("drop table t")
|
||||
cur.execute("set enable_seqscan_prefetch=on")
|
||||
cur.execute("set seqscan_prefetch_buffers=100")
|
||||
cur.execute("set effective_io_concurrency=32")
|
||||
cur.execute("set maintenance_io_concurrency=32")
|
||||
|
||||
cur.execute(f"create table t2(x integer) WITH (fillfactor={fillfactor})")
|
||||
|
||||
|
||||
@@ -12,12 +12,12 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark):
|
||||
n_iters = 10
|
||||
n_records = 100000
|
||||
|
||||
# We want to have a lot of layer files to exercise the layer map. Make
# gc_horizon and checkpoint_distance very small, so that we get a lot of small layer files.
# We want to have a lot of layer files to exercise the layer map. Disable
# GC, and make checkpoint_distance very small, so that we get a lot of small layer
# files.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
"gc_period": "100 m",
|
||||
"gc_horizon": "1048576",
|
||||
"gc_period": "0s",
|
||||
"checkpoint_distance": "8192",
|
||||
"compaction_period": "1 s",
|
||||
"compaction_threshold": "1",
|
||||
|
||||
146 test_runner/performance/test_perf_olap.py (new file)
@@ -0,0 +1,146 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import pytest
|
||||
from _pytest.mark import ParameterSet
|
||||
from fixtures.compare_fixtures import RemoteCompare
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.utils import get_self_dir
|
||||
|
||||
|
||||
@dataclass
|
||||
class LabelledQuery:
|
||||
"""An SQL query with a label for the test report."""
|
||||
|
||||
label: str
|
||||
query: str
|
||||
|
||||
|
||||
# A list of queries to run.
|
||||
# Please do not alter the label for the query, as it is used to identify it.
|
||||
# Labels for ClickBench queries match the labels in ClickBench reports
|
||||
# on https://benchmark.clickhouse.com/ (the DB size may differ).
|
||||
QUERIES: Tuple[LabelledQuery, ...] = (
|
||||
# Disable `black` formatting for the list of queries so that it's easier to read
|
||||
# fmt: off
|
||||
### ClickBench queries:
|
||||
LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"),
|
||||
LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"),
|
||||
LabelledQuery("Q2", r"SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;"),
|
||||
LabelledQuery("Q3", r"SELECT AVG(UserID) FROM hits;"),
|
||||
LabelledQuery("Q4", r"SELECT COUNT(DISTINCT UserID) FROM hits;"),
|
||||
LabelledQuery("Q5", r"SELECT COUNT(DISTINCT SearchPhrase) FROM hits;"),
|
||||
LabelledQuery("Q6", r"SELECT MIN(EventDate), MAX(EventDate) FROM hits;"),
|
||||
LabelledQuery("Q7", r"SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;"),
|
||||
LabelledQuery("Q8", r"SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q9", r"SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q10", r"SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q11", r"SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q12", r"SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q13", r"SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;"),
|
||||
LabelledQuery("Q14", r"SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q15", r"SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;"),
|
||||
LabelledQuery("Q16", r"SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;"),
|
||||
LabelledQuery("Q17", r"SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;"),
|
||||
LabelledQuery("Q18", r"SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;"),
|
||||
LabelledQuery("Q19", r"SELECT UserID FROM hits WHERE UserID = 435090932899640449;"),
|
||||
LabelledQuery("Q20", r"SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';"),
|
||||
LabelledQuery("Q21", r"SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q22", r"SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q23", r"SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;"),
|
||||
LabelledQuery("Q24", r"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;"),
|
||||
LabelledQuery("Q25", r"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;"),
|
||||
LabelledQuery("Q26", r"SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;"),
|
||||
LabelledQuery("Q27", r"SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;"),
|
||||
LabelledQuery("Q28", r"SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;"),
|
||||
LabelledQuery("Q29", r"SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;"),
|
||||
LabelledQuery("Q30", r"SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q31", r"SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q32", r"SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q33", r"SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q34", r"SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q35", r"SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;"),
|
||||
LabelledQuery("Q36", r"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;"),
|
||||
LabelledQuery("Q37", r"SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;"),
|
||||
LabelledQuery("Q38", r"SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;"),
|
||||
LabelledQuery("Q39", r"SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;"),
|
||||
LabelledQuery("Q40", r"SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;"),
|
||||
LabelledQuery("Q41", r"SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;"),
|
||||
LabelledQuery("Q42", r"SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;"),
|
||||
### Custom Neon queries:
|
||||
# I suggest using the NQ prefix (which stands for Neon Query) instead of Q
|
||||
# to not intersect with the original ClickBench queries if their list is extended.
|
||||
#
|
||||
# LabelledQuery("NQ0", r"..."),
|
||||
# LabelledQuery("NQ1", r"..."),
|
||||
# ...
|
||||
# fmt: on
|
||||
)
|
||||
|
||||
|
||||
def run_psql(env: RemoteCompare, labelled_query: LabelledQuery, times: int) -> None:
|
||||
# prepare connstr:
|
||||
# - cut out password from connstr to pass it via env
|
||||
# - add options to connstr
|
||||
password = env.pg.default_options.get("password", None)
|
||||
options = f"-cstatement_timeout=0 {env.pg.default_options.get('options', '')}"
|
||||
connstr = env.pg.connstr(password=None, options=options)
|
||||
|
||||
environ: Dict[str, str] = {}
|
||||
if password is not None:
|
||||
environ["PGPASSWORD"] = password
|
||||
|
||||
label, query = labelled_query.label, labelled_query.query
|
||||
|
||||
log.info(f"Running query {label} {times} times")
|
||||
for i in range(times):
|
||||
run = i + 1
|
||||
log.info(f"Run {run}/{times}")
|
||||
with env.zenbenchmark.record_duration(f"{label}/{run}"):
|
||||
env.pg_bin.run_capture(["psql", connstr, "-c", query], env=environ)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("query", QUERIES)
|
||||
@pytest.mark.remote_cluster
|
||||
def test_clickbench(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
"""
|
||||
An OLAP-style ClickHouse benchmark
|
||||
|
||||
Based on https://github.com/ClickHouse/ClickBench/tree/c00135ca5b6a0d86fedcdbf998fdaa8ed85c1c3b/aurora-postgresql
|
||||
The DB prepared manually in advance
|
||||
"""
|
||||
|
||||
run_psql(remote_compare, query, times=3)
|
||||
|
||||
|
||||
def tpch_queuies() -> Tuple[ParameterSet, ...]:
|
||||
"""
|
||||
A list of queries to run for the TPC-H benchmark.
|
||||
- querues in returning tuple are ordered by the query number
|
||||
- pytest parameters id is adjusted to match the query id (the numbering starts from 1)
|
||||
"""
|
||||
queries_dir = get_self_dir().parent / "performance" / "tpc-h" / "queries"
|
||||
assert queries_dir.exists(), f"TPC-H queries dir not found: {queries_dir}"
|
||||
|
||||
return tuple(
|
||||
pytest.param(LabelledQuery(f"Q{f.stem}", f.read_text()), id=f"query{f.stem}")
|
||||
for f in sorted(queries_dir.glob("*.sql"), key=lambda f: int(f.stem))
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("query", tpch_queuies())
|
||||
@pytest.mark.remote_cluster
|
||||
def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
"""
|
||||
TCP-H Benchmark
|
||||
|
||||
The DB prepared manually in advance:
|
||||
- schema: test_runner/performance/tpc-h/create-schema.sql
|
||||
- indexes: test_runner/performance/tpc-h/create-indexes.sql
|
||||
- data generated by `dbgen` program of the official TPC-H benchmark
|
||||
- `VACUUM (FREEZE, PARALLEL 0);`
|
||||
|
||||
For query generation `1669822882` is used as a seed to the RNG
|
||||
"""
|
||||
|
||||
run_psql(remote_compare, query, times=1)
|
||||
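
A standalone sketch of the ordering that tpch_queries() relies on (the path is written out literally here; the test resolves it via get_self_dir()): query files are sorted numerically by their stem, so 2.sql sorts before 10.sql and the pytest ids come out as query1, query2, ... in query-number order:

    from pathlib import Path

    queries_dir = Path("test_runner/performance/tpc-h/queries")
    for f in sorted(queries_dir.glob("*.sql"), key=lambda f: int(f.stem)):
        # e.g. "query1", "query2", ..., "query10", ... matching the ids used by test_tpch
        print(f"query{f.stem}")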

test_runner/performance/tpc-h/create-indexes.sql (new file, 43 lines)
@@ -0,0 +1,43 @@
-- Section 1.4.2.2

ALTER TABLE part ADD PRIMARY KEY (p_partkey);
ALTER TABLE supplier ADD PRIMARY KEY (s_suppkey);
ALTER TABLE partsupp ADD PRIMARY KEY (ps_partkey, ps_suppkey);
ALTER TABLE customer ADD PRIMARY KEY (c_custkey);
ALTER TABLE orders ADD PRIMARY KEY (o_orderkey);
ALTER TABLE lineitem ADD PRIMARY KEY (l_orderkey, l_linenumber);
ALTER TABLE nation ADD PRIMARY KEY (n_nationkey);
ALTER TABLE region ADD PRIMARY KEY (r_regionkey);

-- Section 1.4.2.3

CREATE INDEX ON supplier USING btree (s_nationkey);
ALTER TABLE supplier ADD FOREIGN KEY (s_nationkey) REFERENCES nation (n_nationkey);

/* IGNORE: implied by primary key */
-- CREATE INDEX ON partsupp USING btree (ps_partkey);
CREATE INDEX ON partsupp USING btree (ps_suppkey);
ALTER TABLE partsupp ADD FOREIGN KEY (ps_partkey) REFERENCES part (p_partkey);
ALTER TABLE partsupp ADD FOREIGN KEY (ps_suppkey) REFERENCES supplier (s_suppkey);

CREATE INDEX ON customer USING btree (c_nationkey);
ALTER TABLE customer ADD FOREIGN KEY (c_nationkey) REFERENCES nation (n_nationkey);

CREATE INDEX ON orders USING btree (o_custkey);
ALTER TABLE orders ADD FOREIGN KEY (o_custkey) REFERENCES customer (c_custkey);

/* IGNORE: implied by primary key */
-- CREATE INDEX ON lineitem USING btree (l_orderkey);
CREATE INDEX ON lineitem USING btree (l_partkey, l_suppkey);
CREATE INDEX ON lineitem USING btree (l_suppkey);
ALTER TABLE lineitem ADD FOREIGN KEY (l_orderkey) REFERENCES orders (o_orderkey);
ALTER TABLE lineitem ADD FOREIGN KEY (l_partkey) REFERENCES part (p_partkey);
ALTER TABLE lineitem ADD FOREIGN KEY (l_suppkey) REFERENCES supplier (s_suppkey);
ALTER TABLE lineitem ADD FOREIGN KEY (l_partkey, l_suppkey) REFERENCES partsupp (ps_partkey, ps_suppkey);

CREATE INDEX ON nation USING btree (n_regionkey);
ALTER TABLE nation ADD FOREIGN KEY (n_regionkey) REFERENCES region (r_regionkey);

-- Section 1.4.2.4

ALTER TABLE lineitem ADD CHECK (l_shipdate <= l_receiptdate);

test_runner/performance/tpc-h/create-schema.sql (new file, 69 lines)
@@ -0,0 +1,69 @@
-- Sccsid: @(#)dss.ddl 2.1.8.1
CREATE TABLE NATION ( N_NATIONKEY INTEGER NOT NULL,
                      N_NAME CHAR(25) NOT NULL,
                      N_REGIONKEY INTEGER NOT NULL,
                      N_COMMENT VARCHAR(152));

CREATE TABLE REGION ( R_REGIONKEY INTEGER NOT NULL,
                      R_NAME CHAR(25) NOT NULL,
                      R_COMMENT VARCHAR(152));

CREATE TABLE PART ( P_PARTKEY INTEGER NOT NULL,
                    P_NAME VARCHAR(55) NOT NULL,
                    P_MFGR CHAR(25) NOT NULL,
                    P_BRAND CHAR(10) NOT NULL,
                    P_TYPE VARCHAR(25) NOT NULL,
                    P_SIZE INTEGER NOT NULL,
                    P_CONTAINER CHAR(10) NOT NULL,
                    P_RETAILPRICE DECIMAL(15,2) NOT NULL,
                    P_COMMENT VARCHAR(23) NOT NULL );

CREATE TABLE SUPPLIER ( S_SUPPKEY INTEGER NOT NULL,
                        S_NAME CHAR(25) NOT NULL,
                        S_ADDRESS VARCHAR(40) NOT NULL,
                        S_NATIONKEY INTEGER NOT NULL,
                        S_PHONE CHAR(15) NOT NULL,
                        S_ACCTBAL DECIMAL(15,2) NOT NULL,
                        S_COMMENT VARCHAR(101) NOT NULL);

CREATE TABLE PARTSUPP ( PS_PARTKEY INTEGER NOT NULL,
                        PS_SUPPKEY INTEGER NOT NULL,
                        PS_AVAILQTY INTEGER NOT NULL,
                        PS_SUPPLYCOST DECIMAL(15,2) NOT NULL,
                        PS_COMMENT VARCHAR(199) NOT NULL );

CREATE TABLE CUSTOMER ( C_CUSTKEY INTEGER NOT NULL,
                        C_NAME VARCHAR(25) NOT NULL,
                        C_ADDRESS VARCHAR(40) NOT NULL,
                        C_NATIONKEY INTEGER NOT NULL,
                        C_PHONE CHAR(15) NOT NULL,
                        C_ACCTBAL DECIMAL(15,2) NOT NULL,
                        C_MKTSEGMENT CHAR(10) NOT NULL,
                        C_COMMENT VARCHAR(117) NOT NULL);

CREATE TABLE ORDERS ( O_ORDERKEY INTEGER NOT NULL,
                      O_CUSTKEY INTEGER NOT NULL,
                      O_ORDERSTATUS CHAR(1) NOT NULL,
                      O_TOTALPRICE DECIMAL(15,2) NOT NULL,
                      O_ORDERDATE DATE NOT NULL,
                      O_ORDERPRIORITY CHAR(15) NOT NULL,
                      O_CLERK CHAR(15) NOT NULL,
                      O_SHIPPRIORITY INTEGER NOT NULL,
                      O_COMMENT VARCHAR(79) NOT NULL);

CREATE TABLE LINEITEM ( L_ORDERKEY INTEGER NOT NULL,
                        L_PARTKEY INTEGER NOT NULL,
                        L_SUPPKEY INTEGER NOT NULL,
                        L_LINENUMBER INTEGER NOT NULL,
                        L_QUANTITY DECIMAL(15,2) NOT NULL,
                        L_EXTENDEDPRICE DECIMAL(15,2) NOT NULL,
                        L_DISCOUNT DECIMAL(15,2) NOT NULL,
                        L_TAX DECIMAL(15,2) NOT NULL,
                        L_RETURNFLAG CHAR(1) NOT NULL,
                        L_LINESTATUS CHAR(1) NOT NULL,
                        L_SHIPDATE DATE NOT NULL,
                        L_COMMITDATE DATE NOT NULL,
                        L_RECEIPTDATE DATE NOT NULL,
                        L_SHIPINSTRUCT CHAR(25) NOT NULL,
                        L_SHIPMODE CHAR(10) NOT NULL,
                        L_COMMENT VARCHAR(44) NOT NULL);
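
A hedged sketch of the manual preparation described in the test_tpch docstring above (the connection string is a placeholder, and the dbgen load step is only indicated, not implemented):

    import subprocess

    CONNSTR = "postgres://user:password@host/dbname"  # placeholder
    for script in ("create-schema.sql", "create-indexes.sql"):
        # Apply the schema and index scripts shipped with the test suite.
        subprocess.run(
            ["psql", CONNSTR, "-f", f"test_runner/performance/tpc-h/{script}"], check=True
        )
    # ... load the dbgen-generated data here, then freeze it:
    subprocess.run(["psql", CONNSTR, "-c", "VACUUM (FREEZE, PARALLEL 0);"], check=True)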

test_runner/performance/tpc-h/queries/1.sql (new file, 27 lines)
@@ -0,0 +1,27 @@
-- $ID$
-- TPC-H/TPC-R Pricing Summary Report Query (Q1)
-- Functional Query Definition
-- Approved February 1998


select
    l_returnflag,
    l_linestatus,
    sum(l_quantity) as sum_qty,
    sum(l_extendedprice) as sum_base_price,
    sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
    avg(l_quantity) as avg_qty,
    avg(l_extendedprice) as avg_price,
    avg(l_discount) as avg_disc,
    count(*) as count_order
from
    lineitem
where
    l_shipdate <= date '1998-12-01' - interval '89' day
group by
    l_returnflag,
    l_linestatus
order by
    l_returnflag,
    l_linestatus;

test_runner/performance/tpc-h/queries/10.sql (new file, 38 lines)
@@ -0,0 +1,38 @@
-- $ID$
-- TPC-H/TPC-R Returned Item Reporting Query (Q10)
-- Functional Query Definition
-- Approved February 1998


select
    c_custkey,
    c_name,
    sum(l_extendedprice * (1 - l_discount)) as revenue,
    c_acctbal,
    n_name,
    c_address,
    c_phone,
    c_comment
from
    customer,
    orders,
    lineitem,
    nation
where
    c_custkey = o_custkey
    and l_orderkey = o_orderkey
    and o_orderdate >= date '1993-08-01'
    and o_orderdate < date '1993-08-01' + interval '3' month
    and l_returnflag = 'R'
    and c_nationkey = n_nationkey
group by
    c_custkey,
    c_name,
    c_acctbal,
    c_phone,
    n_name,
    c_address,
    c_comment
order by
    revenue desc
limit 20;

test_runner/performance/tpc-h/queries/11.sql (new file, 34 lines)
@@ -0,0 +1,34 @@
-- $ID$
-- TPC-H/TPC-R Important Stock Identification Query (Q11)
-- Functional Query Definition
-- Approved February 1998


select
    ps_partkey,
    sum(ps_supplycost * ps_availqty) as value
from
    partsupp,
    supplier,
    nation
where
    ps_suppkey = s_suppkey
    and s_nationkey = n_nationkey
    and n_name = 'INDONESIA'
group by
    ps_partkey having
        sum(ps_supplycost * ps_availqty) > (
            select
                sum(ps_supplycost * ps_availqty) * 0.0001000000
            from
                partsupp,
                supplier,
                nation
            where
                ps_suppkey = s_suppkey
                and s_nationkey = n_nationkey
                and n_name = 'INDONESIA'
        )
order by
    value desc
;

test_runner/performance/tpc-h/queries/12.sql (new file, 35 lines)
@@ -0,0 +1,35 @@
-- $ID$
-- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12)
-- Functional Query Definition
-- Approved February 1998


select
    l_shipmode,
    sum(case
        when o_orderpriority = '1-URGENT'
            or o_orderpriority = '2-HIGH'
            then 1
        else 0
    end) as high_line_count,
    sum(case
        when o_orderpriority <> '1-URGENT'
            and o_orderpriority <> '2-HIGH'
            then 1
        else 0
    end) as low_line_count
from
    orders,
    lineitem
where
    o_orderkey = l_orderkey
    and l_shipmode in ('REG AIR', 'AIR')
    and l_commitdate < l_receiptdate
    and l_shipdate < l_commitdate
    and l_receiptdate >= date '1995-01-01'
    and l_receiptdate < date '1995-01-01' + interval '1' year
group by
    l_shipmode
order by
    l_shipmode
;

test_runner/performance/tpc-h/queries/13.sql (new file, 27 lines)
@@ -0,0 +1,27 @@
-- $ID$
-- TPC-H/TPC-R Customer Distribution Query (Q13)
-- Functional Query Definition
-- Approved February 1998


select
    c_count,
    count(*) as custdist
from
    (
        select
            c_custkey,
            count(o_orderkey)
        from
            customer left outer join orders on
                c_custkey = o_custkey
                and o_comment not like '%special%accounts%'
        group by
            c_custkey
    ) as c_orders (c_custkey, c_count)
group by
    c_count
order by
    custdist desc,
    c_count desc
;

test_runner/performance/tpc-h/queries/14.sql (new file, 20 lines)
@@ -0,0 +1,20 @@
-- $ID$
-- TPC-H/TPC-R Promotion Effect Query (Q14)
-- Functional Query Definition
-- Approved February 1998


select
    100.00 * sum(case
        when p_type like 'PROMO%'
            then l_extendedprice * (1 - l_discount)
        else 0
    end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
from
    lineitem,
    part
where
    l_partkey = p_partkey
    and l_shipdate >= date '1995-07-01'
    and l_shipdate < date '1995-07-01' + interval '1' month
;

test_runner/performance/tpc-h/queries/15.sql (new file, 40 lines)
@@ -0,0 +1,40 @@
-- $ID$
-- TPC-H/TPC-R Top Supplier Query (Q15)
-- Functional Query Definition
-- Approved February 1998

create view revenue0 (supplier_no, total_revenue) as
    select
        l_suppkey,
        sum(l_extendedprice * (1 - l_discount))
    from
        lineitem
    where
        l_shipdate >= date '1995-01-01'
        and l_shipdate < date '1995-01-01' + interval '3' month
    group by
        l_suppkey;


select
    s_suppkey,
    s_name,
    s_address,
    s_phone,
    total_revenue
from
    supplier,
    revenue0
where
    s_suppkey = supplier_no
    and total_revenue = (
        select
            max(total_revenue)
        from
            revenue0
    )
order by
    s_suppkey;

drop view revenue0
;

test_runner/performance/tpc-h/queries/16.sql (new file, 37 lines)
@@ -0,0 +1,37 @@
-- $ID$
-- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16)
-- Functional Query Definition
-- Approved February 1998


select
    p_brand,
    p_type,
    p_size,
    count(distinct ps_suppkey) as supplier_cnt
from
    partsupp,
    part
where
    p_partkey = ps_partkey
    and p_brand <> 'Brand#43'
    and p_type not like 'PROMO POLISHED%'
    and p_size in (35, 5, 42, 13, 11, 40, 50, 47)
    and ps_suppkey not in (
        select
            s_suppkey
        from
            supplier
        where
            s_comment like '%Customer%Complaints%'
    )
group by
    p_brand,
    p_type,
    p_size
order by
    supplier_cnt desc,
    p_brand,
    p_type,
    p_size
;

test_runner/performance/tpc-h/queries/17.sql (new file, 25 lines)
@@ -0,0 +1,25 @@

-- $ID$
-- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17)
-- Functional Query Definition
-- Approved February 1998


select
    sum(l_extendedprice) / 7.0 as avg_yearly
from
    lineitem,
    part
where
    p_partkey = l_partkey
    and p_brand = 'Brand#35'
    and p_container = 'JUMBO JAR'
    and l_quantity < (
        select
            0.2 * avg(l_quantity)
        from
            lineitem
        where
            l_partkey = p_partkey
    )
;

test_runner/performance/tpc-h/queries/18.sql (new file, 39 lines)
@@ -0,0 +1,39 @@
-- $ID$
-- TPC-H/TPC-R Large Volume Customer Query (Q18)
-- Function Query Definition
-- Approved February 1998


select
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice,
    sum(l_quantity)
from
    customer,
    orders,
    lineitem
where
    o_orderkey in (
        select
            l_orderkey
        from
            lineitem
        group by
            l_orderkey having
                sum(l_quantity) > 315
    )
    and c_custkey = o_custkey
    and o_orderkey = l_orderkey
group by
    c_name,
    c_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice
order by
    o_totalprice desc,
    o_orderdate
limit 100;

test_runner/performance/tpc-h/queries/19.sql (new file, 42 lines)
@@ -0,0 +1,42 @@
-- $ID$
-- TPC-H/TPC-R Discounted Revenue Query (Q19)
-- Functional Query Definition
-- Approved February 1998


select
    sum(l_extendedprice* (1 - l_discount)) as revenue
from
    lineitem,
    part
where
    (
        p_partkey = l_partkey
        and p_brand = 'Brand#41'
        and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
        and l_quantity >= 10 and l_quantity <= 10 + 10
        and p_size between 1 and 5
        and l_shipmode in ('AIR', 'AIR REG')
        and l_shipinstruct = 'DELIVER IN PERSON'
    )
    or
    (
        p_partkey = l_partkey
        and p_brand = 'Brand#52'
        and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
        and l_quantity >= 20 and l_quantity <= 20 + 10
        and p_size between 1 and 10
        and l_shipmode in ('AIR', 'AIR REG')
        and l_shipinstruct = 'DELIVER IN PERSON'
    )
    or
    (
        p_partkey = l_partkey
        and p_brand = 'Brand#14'
        and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
        and l_quantity >= 22 and l_quantity <= 22 + 10
        and p_size between 1 and 15
        and l_shipmode in ('AIR', 'AIR REG')
        and l_shipinstruct = 'DELIVER IN PERSON'
    )
;

test_runner/performance/tpc-h/queries/2.sql (new file, 50 lines)
@@ -0,0 +1,50 @@
-- $ID$
-- TPC-H/TPC-R Minimum Cost Supplier Query (Q2)
-- Functional Query Definition
-- Approved February 1998


select
    s_acctbal,
    s_name,
    n_name,
    p_partkey,
    p_mfgr,
    s_address,
    s_phone,
    s_comment
from
    part,
    supplier,
    partsupp,
    nation,
    region
where
    p_partkey = ps_partkey
    and s_suppkey = ps_suppkey
    and p_size = 39
    and p_type like '%BRASS'
    and s_nationkey = n_nationkey
    and n_regionkey = r_regionkey
    and r_name = 'MIDDLE EAST'
    and ps_supplycost = (
        select
            min(ps_supplycost)
        from
            partsupp,
            supplier,
            nation,
            region
        where
            p_partkey = ps_partkey
            and s_suppkey = ps_suppkey
            and s_nationkey = n_nationkey
            and n_regionkey = r_regionkey
            and r_name = 'MIDDLE EAST'
        )
order by
    s_acctbal desc,
    n_name,
    s_name,
    p_partkey
limit 100;

Some files were not shown because too many files have changed in this diff.