Fix the markdown rendering of the 004-durability.md RFC

Continue with pageserver startup if loading some tenants fails.
Fixes https://github.com/neondatabase/neon/issues/1664
2026-06-02 04:50:38 +00:00 · 2022-05-16 10:11:39 +03:00 · 2022-05-15 00:25:38 +03:00 · 2022-05-13 21:41:00 +03:00 · 2022-05-13 20:41:54 +03:00 · 2022-05-13 17:36:18 +02:00
148 changed files with 7559 additions and 4085 deletions
--- a/.circleci/ansible/.gitignore
+++ b/.circleci/ansible/.gitignore
@@ -1,2 +1,4 @@
 zenith_install.tar.gz
 .zenith_current_version
+neon_install.tar.gz
+.neon_current_version
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
@@ -7,7 +7,7 @@ RELEASE=${RELEASE:-false}
 # look at docker hub for latest tag for neon docker image
 if [ "${RELEASE}" = "true" ]; then
    echo "search latest relase tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
    if [ -z "${VERSION}" ]; then
        echo "no any docker tags found, exiting..."
        exit 1
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
    fi
 else
    echo "search latest dev tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1)
+    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
    if [ -z "${VERSION}" ]; then
        echo "no any docker tags found, exiting..."
        exit 1
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -15,3 +15,4 @@ console_mgmt_base_url = http://console-release.local
 bucket_name           = zenith-storage-oregon
 bucket_region         = us-west-2
 etcd_endpoints        = etcd-release.local:2379
+safekeeper_enable_s3_offload = true
--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -4,8 +4,8 @@ zenith-us-stage-ps-2 console_region_id=27

 [safekeepers]
 zenith-us-stage-sk-1 console_region_id=27
-zenith-us-stage-sk-2 console_region_id=27
 zenith-us-stage-sk-4 console_region_id=27
+zenith-us-stage-sk-5 console_region_id=27

 [storage:children]
 pageservers
@@ -16,3 +16,4 @@ console_mgmt_base_url = http://console-staging.local
 bucket_name           = zenith-staging-storage-us-east-1
 bucket_region         = us-east-1
 etcd_endpoints        = etcd-staging.local:2379
+safekeeper_enable_s3_offload = false
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -6,7 +6,7 @@ After=network.target auditd.service
 Type=simple
 User=safekeeper
 Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
 KillSignal=SIGINT
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -121,7 +121,7 @@ jobs:
            export RUSTC_WRAPPER=cachepot
            export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
            export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
-            "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
+            "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
            cachepot -s

      - save_cache:
@@ -579,13 +579,13 @@ jobs:
          name: Setup helm v3
          command: |
            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add zenithdb https://neondatabase.github.io/helm-charts
+            helm repo add neondatabase https://neondatabase.github.io/helm-charts
      - run:
          name: Re-deploy proxy
          command: |
            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-
+            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
+            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait

  deploy-release:
    docker:
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
@@ -5,8 +5,8 @@ image:
  repository: neondatabase/neon

 settings:
-  authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
-  uri: "https://console.zenith.tech/psql_session/"
+  authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
+  uri: "https://console.neon.tech/psql_session/"

 # -- Additional labels for zenith-proxy pods
 podLabels:
@@ -28,7 +28,7 @@ exposedService:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
+    external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech

 metrics:
  enabled: true
--- a/.circleci/helm-values/staging.proxy-scram.yaml
+++ b/.circleci/helm-values/staging.proxy-scram.yaml
@@ -0,0 +1,31 @@
+# Helm chart values for zenith-proxy.
+# This is a YAML-formatted file.
+
+image:
+  repository: neondatabase/neon
+
+settings:
+  authBackend: "console"
+  authEndpoint: "http://console-staging.local/management/api/v2"
+  domain: "*.cloud.stage.neon.tech"
+
+# -- Additional labels for zenith-proxy pods
+podLabels:
+  zenith_service: proxy-scram
+  zenith_env: staging
+  zenith_region: us-east-1
+  zenith_region_slug: virginia
+
+exposedService:
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: external
+    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
+    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
+    external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
+
+metrics:
+  enabled: true
+  serviceMonitor:
+    enabled: true
+    selector:
+      release: kube-prometheus-stack
--- a/.circleci/helm-values/staging.proxy.yaml
+++ b/.circleci/helm-values/staging.proxy.yaml
@@ -5,8 +5,8 @@ image:
  repository: neondatabase/neon

 settings:
-  authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
-  uri: "https://console.stage.zenith.tech/psql_session/"
+  authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
+  uri: "https://console.stage.neon.tech/psql_session/"

 # -- Additional labels for zenith-proxy pods
 podLabels:
@@ -20,7 +20,7 @@ exposedService:
    service.beta.kubernetes.io/aws-load-balancer-type: external
    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
    service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
-    external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
+    external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech

 metrics:
  enabled: true
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,6 @@ test_output/
 # Coverage
 *.profraw
 *.profdata
+
+*.key
+*.crt
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -113,6 +113,49 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"

+[[package]]
+name = "axum"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4af7447fc1214c1f3a1ace861d0216a6c8bb13965b64bbad9650f375b67689a"
+dependencies = [
+ "async-trait",
+ "axum-core",
+ "bitflags",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "itoa 1.0.1",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "serde",
+ "sync_wrapper",
+ "tokio",
+ "tower",
+ "tower-http",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3bdc19781b16e32f8a7200368a336fa4509d4b72ef15dd4e41df5290855ee1e6"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "mime",
+]
+
 [[package]]
 name = "backtrace"
 version = "0.3.64"
@@ -320,6 +363,15 @@ dependencies = [
 "textwrap 0.14.2",
 ]

+[[package]]
+name = "cmake"
+version = "0.1.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8ad8cef104ac57b68b89df3208164d228503abbdce70f6880ffa3d970e7443a"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "combine"
 version = "4.6.3"
@@ -330,6 +382,18 @@ dependencies = [
 "memchr",
 ]

+[[package]]
+name = "comfy-table"
+version = "5.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e"
+dependencies = [
+ "crossterm",
+ "strum",
+ "strum_macros",
+ "unicode-width",
+]
+
 [[package]]
 name = "compute_tools"
 version = "0.1.0"
@@ -526,6 +590,31 @@ dependencies = [
 "lazy_static",
 ]

+[[package]]
+name = "crossterm"
+version = "0.23.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2102ea4f781910f8a5b98dd061f4c2023f479ce7bb1236330099ceb5a93cf17"
+dependencies = [
+ "bitflags",
+ "crossterm_winapi",
+ "libc",
+ "mio",
+ "parking_lot 0.12.0",
+ "signal-hook",
+ "signal-hook-mio",
+ "winapi",
+]
+
+[[package]]
+name = "crossterm_winapi"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ae1b35a484aa10e07fe0638d02301c5ad24de82d310ccbd2f3693da5f09bf1c"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "crypto-common"
 version = "0.1.3"
@@ -693,9 +782,9 @@ dependencies = [

 [[package]]
 name = "etcd-client"
-version = "0.8.4"
+version = "0.9.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "585de5039d1ecce74773db49ba4e8107e42be7c2cd0b1a9e7fce27181db7b118"
+checksum = "c434d2800b273a506b82397aad2f20971636f65e47b27c027f77d498530c5954"
 dependencies = [
 "http",
 "prost",
@@ -703,9 +792,26 @@ dependencies = [
 "tokio-stream",
 "tonic",
 "tonic-build",
+ "tower",
 "tower-service",
 ]

+[[package]]
+name = "etcd_broker"
+version = "0.1.0"
+dependencies = [
+ "etcd-client",
+ "regex",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror",
+ "tokio",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "fail"
 version = "0.5.0"
@@ -990,6 +1096,12 @@ dependencies = [
 "unicode-segmentation",
 ]

+[[package]]
+name = "heck"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
+
 [[package]]
 name = "hermit-abi"
 version = "0.1.19"
@@ -1055,6 +1167,12 @@ dependencies = [
 "pin-project-lite",
 ]

+[[package]]
+name = "http-range-header"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29"
+
 [[package]]
 name = "httparse"
 version = "1.6.0"
@@ -1073,6 +1191,16 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"

+[[package]]
+name = "humantime-serde"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c"
+dependencies = [
+ "humantime",
+ "serde",
+]
+
 [[package]]
 name = "hyper"
 version = "0.14.17"
@@ -1310,6 +1438,12 @@ version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f"

+[[package]]
+name = "matchit"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb"
+
 [[package]]
 name = "md-5"
 version = "0.9.1"
@@ -1440,6 +1574,24 @@ dependencies = [
 "tempfile",
 ]

+[[package]]
+name = "neon_local"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap 3.0.14",
+ "comfy-table",
+ "control_plane",
+ "git-version",
+ "pageserver",
+ "postgres",
+ "postgres_ffi",
+ "safekeeper",
+ "serde_json",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "nix"
 version = "0.23.1"
@@ -1612,7 +1764,6 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "async-trait",
 "byteorder",
 "bytes",
 "chrono",
@@ -1623,9 +1774,11 @@ dependencies = [
 "daemonize",
 "fail",
 "futures",
+ "git-version",
 "hex",
 "hex-literal",
 "humantime",
+ "humantime-serde",
 "hyper",
 "itertools",
 "lazy_static",
@@ -1639,8 +1792,7 @@ dependencies = [
 "pprof",
 "rand",
 "regex",
- "rusoto_core",
- "rusoto_s3",
+ "remote_storage",
 "scopeguard",
 "serde",
 "serde_json",
@@ -1652,7 +1804,6 @@ dependencies = [
 "tokio",
 "tokio-postgres",
 "tokio-stream",
- "tokio-util 0.7.0",
 "toml_edit",
 "tracing",
 "url",
@@ -1911,6 +2062,16 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"

+[[package]]
+name = "prettyplease"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9e07e3a46d0771a8a06b5f4441527802830b43e679ba12f44960f48dd4c6803"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
 [[package]]
 name = "proc-macro-hack"
 version = "0.5.19"
@@ -1942,9 +2103,9 @@ dependencies = [

 [[package]]
 name = "prost"
-version = "0.9.0"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "444879275cb4fd84958b1a1d5420d15e6fcf7c235fe47f053c9c2a80aceb6001"
+checksum = "bc03e116981ff7d8da8e5c220e374587b98d294af7ba7dd7fda761158f00086f"
 dependencies = [
 "bytes",
 "prost-derive",
@@ -1952,12 +2113,14 @@ dependencies = [

 [[package]]
 name = "prost-build"
-version = "0.9.0"
+version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62941722fb675d463659e49c4f3fe1fe792ff24fe5bbaa9c08cd3b98a1c354f5"
+checksum = "65a1118354442de7feb8a2a76f3d80ef01426bd45542c8c1fdffca41a758f846"
 dependencies = [
 "bytes",
- "heck",
+ "cfg-if",
+ "cmake",
+ "heck 0.4.0",
 "itertools",
 "lazy_static",
 "log",
@@ -1972,9 +2135,9 @@ dependencies = [

 [[package]]
 name = "prost-derive"
-version = "0.9.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f9cc1a3263e07e0bf68e96268f37665207b49560d98739662cdfaae215c720fe"
+checksum = "7b670f45da57fb8542ebdbb6105a925fe571b67f9e7ed9f47a06a84e72b4e7cc"
 dependencies = [
 "anyhow",
 "itertools",
@@ -1985,9 +2148,9 @@ dependencies = [

 [[package]]
 name = "prost-types"
-version = "0.9.0"
+version = "0.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "534b7a0e836e3c482d2693070f982e39e7611da9695d4d1f5a4b186b51faef0a"
+checksum = "2d0a014229361011dc8e69c8a1ec6c2e8d0f2af7c91e3ea3f5b2170298461e68"
 dependencies = [
 "bytes",
 "prost",
@@ -2003,6 +2166,7 @@ dependencies = [
 "bytes",
 "clap 3.0.14",
 "futures",
+ "git-version",
 "hashbrown",
 "hex",
 "hmac 0.12.1",
@@ -2029,6 +2193,7 @@ dependencies = [
 "tokio-postgres",
 "tokio-postgres-rustls",
 "tokio-rustls",
+ "url",
 "utils",
 "workspace_hack",
 ]
@@ -2158,9 +2323,9 @@ dependencies = [

 [[package]]
 name = "regex"
-version = "1.5.4"
+version = "1.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
+checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
 dependencies = [
 "aho-corasick",
 "memchr",
@@ -2182,6 +2347,23 @@ version = "0.6.25"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"

+[[package]]
+name = "remote_storage"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "rusoto_core",
+ "rusoto_s3",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "tokio",
+ "tokio-util 0.7.0",
+ "tracing",
+ "workspace_hack",
+]
+
 [[package]]
 name = "remove_dir_all"
 version = "0.5.3"
@@ -2281,9 +2463,9 @@ dependencies = [

 [[package]]
 name = "rusoto_core"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b4f000e8934c1b4f70adde180056812e7ea6b1a247952db8ee98c94cd3116cc"
+checksum = "1db30db44ea73551326269adcf7a2169428a054f14faf9e1768f2163494f2fa2"
 dependencies = [
 "async-trait",
 "base64",
@@ -2306,9 +2488,9 @@ dependencies = [

 [[package]]
 name = "rusoto_credential"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a46b67db7bb66f5541e44db22b0a02fed59c9603e146db3a9e633272d3bac2f"
+checksum = "ee0a6c13db5aad6047b6a44ef023dbbc21a056b6dab5be3b79ce4283d5c02d05"
 dependencies = [
 "async-trait",
 "chrono",
@@ -2324,9 +2506,9 @@ dependencies = [

 [[package]]
 name = "rusoto_s3"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "048c2fe811a823ad5a9acc976e8bf4f1d910df719dcf44b15c3e96c5b7a51027"
+checksum = "7aae4677183411f6b0b412d66194ef5403293917d66e70ab118f07cc24c5b14d"
 dependencies = [
 "async-trait",
 "bytes",
@@ -2337,9 +2519,9 @@ dependencies = [

 [[package]]
 name = "rusoto_signature"
-version = "0.47.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6264e93384b90a747758bcc82079711eacf2e755c3a8b5091687b5349d870bcc"
+checksum = "a5ae95491c8b4847931e291b151127eccd6ff8ca13f33603eb3d0035ecb05272"
 dependencies = [
 "base64",
 "bytes",
@@ -2435,8 +2617,9 @@ dependencies = [
 "const_format",
 "crc32c",
 "daemonize",
- "etcd-client",
+ "etcd_broker",
 "fs2",
+ "git-version",
 "hex",
 "humantime",
 "hyper",
@@ -2446,8 +2629,7 @@ dependencies = [
 "postgres-protocol",
 "postgres_ffi",
 "regex",
- "rusoto_core",
- "rusoto_s3",
+ "remote_storage",
 "serde",
 "serde_json",
 "serde_with",
@@ -2652,6 +2834,17 @@ dependencies = [
 "signal-hook-registry",
 ]

+[[package]]
+name = "signal-hook-mio"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
+dependencies = [
+ "libc",
+ "mio",
+ "signal-hook",
+]
+
 [[package]]
 name = "signal-hook-registry"
 version = "1.4.0"
@@ -2741,6 +2934,25 @@ version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"

+[[package]]
+name = "strum"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
+
+[[package]]
+name = "strum_macros"
+version = "0.23.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
+dependencies = [
+ "heck 0.3.3",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn",
+]
+
 [[package]]
 name = "subtle"
 version = "2.4.1"
@@ -2772,15 +2984,21 @@ dependencies = [

 [[package]]
 name = "syn"
-version = "1.0.86"
+version = "1.0.92"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b"
+checksum = "7ff7c592601f11445996a06f8ad0c27f094a58857c2f89e97974ab9235b92c52"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-xid",
 ]

+[[package]]
+name = "sync_wrapper"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8"
+
 [[package]]
 name = "tar"
 version = "0.4.38"
@@ -3074,12 +3292,13 @@ dependencies = [

 [[package]]
 name = "tonic"
-version = "0.6.2"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ff08f4649d10a70ffa3522ca559031285d8e421d727ac85c60825761818f5d0a"
+checksum = "5be9d60db39854b30b835107500cf0aca0b0d14d6e1c3de124217c23a29c2ddb"
 dependencies = [
 "async-stream",
 "async-trait",
+ "axum",
 "base64",
 "bytes",
 "futures-core",
@@ -3095,7 +3314,7 @@ dependencies = [
 "prost-derive",
 "tokio",
 "tokio-stream",
- "tokio-util 0.6.9",
+ "tokio-util 0.7.0",
 "tower",
 "tower-layer",
 "tower-service",
@@ -3105,10 +3324,11 @@ dependencies = [

 [[package]]
 name = "tonic-build"
-version = "0.6.2"
+version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9403f1bafde247186684b230dc6f38b5cd514584e8bec1dd32514be4745fa757"
+checksum = "d9263bf4c9bfaae7317c1c2faf7f18491d2fe476f70c414b73bf5d445b00ffa1"
 dependencies = [
+ "prettyplease",
 "proc-macro2",
 "prost-build",
 "quote",
@@ -3135,6 +3355,25 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "tower-http"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e980386f06883cf4d0578d6c9178c81f68b45d77d00f2c2c1bc034b3439c2c56"
+dependencies = [
+ "bitflags",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-range-header",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "tower-layer"
 version = "0.3.1"
@@ -3576,13 +3815,22 @@ dependencies = [
 name = "workspace_hack"
 version = "0.1.0"
 dependencies = [
+ "ahash",
 "anyhow",
 "bytes",
 "chrono",
 "clap 2.34.0",
 "either",
+ "fail",
+ "futures-channel",
+ "futures-task",
+ "futures-util",
+ "generic-array",
 "hashbrown",
+ "hex",
+ "hyper",
 "indexmap",
+ "itoa 0.4.8",
 "libc",
 "log",
 "memchr",
@@ -3596,6 +3844,7 @@ dependencies = [
 "serde",
 "syn",
 "tokio",
+ "tokio-util 0.7.0",
 "tracing",
 "tracing-core",
 ]
@@ -3624,22 +3873,6 @@ dependencies = [
 "chrono",
 ]

-[[package]]
-name = "zenith"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "clap 3.0.14",
- "control_plane",
- "pageserver",
- "postgres",
- "postgres_ffi",
- "safekeeper",
- "serde_json",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "zeroize"
 version = "1.5.2"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
    "proxy",
    "safekeeper",
    "workspace_hack",
-    "zenith",
+    "neon_local",
    "libs/*",
 ]

--- a/README.md
+++ b/README.md
@@ -49,32 +49,30 @@ make -j5
 ```sh
 # Create repository in .zenith with proper paths to binaries and data
 # Later that would be responsibility of a package install script
-> ./target/debug/zenith init
-initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
-created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
-created main branch
+> ./target/debug/neon_local init
+initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
+created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
+initial timeline de200bd42b49cc1814412c7e592dd6e9 created
 pageserver init succeeded

 # start pageserver and safekeeper
-> ./target/debug/zenith start
-Starting pageserver at 'localhost:64000' in '.zenith'
+> ./target/debug/neon_local start
+Starting pageserver at '127.0.0.1:64000' in '.zenith'
 Pageserver started
-initializing for single for 7676
-Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
+initializing for sk 1 for 7676
+Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
 Safekeeper started

 # start postgres compute node
-> ./target/debug/zenith pg start main
-Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
+> ./target/debug/neon_local pg start main
+Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
-waiting for server to start.... done
-server started

 # check list of running postgres instances
-> ./target/debug/zenith pg list
-NODE	ADDRESS	TIMELINES	BRANCH NAME	LSN		STATUS
-main	127.0.0.1:55432	5b014a9e41b4b63ce1a1febc04503636	main	0/1609610	running
+> ./target/debug/neon_local pg list
+ NODE  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
+ main  127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```

 4. Now it is possible to connect to postgres and run some queries:
@@ -94,18 +92,25 @@ postgres=# select * from t;
 5. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/zenith timeline branch --branch-name migration_check
-Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
+> ./target/debug/neon_local timeline branch --branch-name migration_check
+Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'

 # check branches tree
-> ./target/debug/zenith timeline list
- main [5b014a9e41b4b63ce1a1febc04503636]
- ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
+> ./target/debug/neon_local timeline list
+(L) main [de200bd42b49cc1814412c7e592dd6e9]
+(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

 # start postgres on that branch
-> ./target/debug/zenith pg start migration_check
-Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
-waiting for server to start.... done
+> ./target/debug/neon_local pg start migration_check --branch-name migration_check
+Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
+Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres'
+
+# check the new list of running postgres instances
+> ./target/debug/neon_local pg list
+ NODE             ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
+ main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
+ migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running

 # this new postgres instance will have all the data from 'main' postgres,
 # but all modifications would not affect data in original postgres
@@ -118,12 +123,20 @@ postgres=# select * from t;

 postgres=# insert into t values(2,2);
 INSERT 0 1
+
+# check that the new change doesn't affect the 'main' postgres
+> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
+postgres=# select * from t;
+ key | value
+-----+-------
+   1 | 1
+(1 row)
 ```

 6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
   you have just started. You can stop them all with one command:
 ```sh
-> ./target/debug/zenith stop
+> ./target/debug/neon_local stop
 ```

 ## Running tests
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -136,13 +136,20 @@ pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
                xact.execute(query.as_str(), &[])?;
            }
        } else {
-            info!("role name {}", &name);
+            info!("role name: '{}'", &name);
            let mut query: String = format!("CREATE ROLE {} ", name.quote());
-            info!("role create query {}", &query);
+            info!("role create query: '{}'", &query);
            info_print!(" -> create");

            query.push_str(&role.to_pg_options());
            xact.execute(query.as_str(), &[])?;
+
+            let grant_query = format!(
+                "grant pg_read_all_data, pg_write_all_data to {}",
+                name.quote()
+            );
+            xact.execute(grant_query.as_str(), &[])?;
+            info!("role grant query: '{}'", &grant_query);
        }

        info_print!("\n");
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -63,6 +63,10 @@ pub struct LocalEnv {
    #[serde(default)]
    pub broker_endpoints: Option<String>,

+    /// A prefix to add to any key when pushing/polling etcd from a node.
+    #[serde(default)]
+    pub broker_etcd_prefix: Option<String>,
+
    pub pageserver: PageServerConf,

    #[serde(default)]
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -77,6 +77,7 @@ pub struct SafekeeperNode {
    pub pageserver: Arc<PageServerNode>,

    broker_endpoints: Option<String>,
+    broker_etcd_prefix: Option<String>,
 }

 impl SafekeeperNode {
@@ -94,6 +95,7 @@ impl SafekeeperNode {
            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
            pageserver,
            broker_endpoints: env.broker_endpoints.clone(),
+            broker_etcd_prefix: env.broker_etcd_prefix.clone(),
        }
    }

@@ -143,6 +145,9 @@ impl SafekeeperNode {
        if let Some(ref ep) = self.broker_endpoints {
            cmd.args(&["--broker-endpoints", ep]);
        }
+        if let Some(prefix) = self.broker_etcd_prefix.as_deref() {
+            cmd.args(&["--broker-etcd-prefix", prefix]);
+        }

        if !cmd.status()?.success() {
            bail!(
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -167,6 +167,9 @@ impl PageServerNode {
            );
        }

+        // echo the captured output of the init command
+        println!("{}", String::from_utf8_lossy(&init_output.stdout));
+
        Ok(initial_timeline_id)
    }

@@ -186,8 +189,6 @@ impl PageServerNode {
        );
        io::stdout().flush().unwrap();

-        let mut cmd = Command::new(self.env.pageserver_bin()?);
-
        let repo_path = self.repo_path();
        let mut args = vec!["-D", repo_path.to_str().unwrap()];

@@ -195,9 +196,11 @@ impl PageServerNode {
            args.extend(["-c", config_override]);
        }

-        fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
+        let mut cmd = Command::new(self.env.pageserver_bin()?);
+        let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
+        filled_cmd = fill_aws_secrets_vars(filled_cmd);

-        if !cmd.status()?.success() {
+        if !filled_cmd.status()?.success() {
            bail!(
                "Pageserver failed to start. See '{}' for details.",
                self.repo_path().join("pageserver.log").display()
@@ -369,6 +372,10 @@ impl PageServerNode {
                    .map(|x| x.parse::<u64>())
                    .transpose()?,
                gc_period: settings.get("gc_period").map(|x| x.to_string()),
+                image_creation_threshold: settings
+                    .get("image_creation_threshold")
+                    .map(|x| x.parse::<usize>())
+                    .transpose()?,
                pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
            })
            .send()?
@@ -405,6 +412,9 @@ impl PageServerNode {
                    .get("gc_horizon")
                    .map(|x| x.parse::<u64>().unwrap()),
                gc_period: settings.get("gc_period").map(|x| x.to_string()),
+                image_creation_threshold: settings
+                    .get("image_creation_threshold")
+                    .map(|x| x.parse::<usize>().unwrap()),
                pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
            })
            .send()?
@@ -450,3 +460,12 @@ impl PageServerNode {
        Ok(timeline_info_response)
    }
 }
+
+/// Copies the AWS credential variables from this process's environment onto `cmd`
+/// and hands `cmd` back so that further builder calls can be chained.
+///
+/// A variable that `std::env::var` fails to read is skipped, leaving `cmd` untouched for it.
+fn fill_aws_secrets_vars(cmd: &mut Command) -> &mut Command {
+    ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"]
+        .iter()
+        // Keep only the variables that are readable, paired with their values.
+        .filter_map(|env_key| std::env::var(env_key).ok().map(|value| (env_key, value)))
+        .fold(cmd, |cmd, (env_key, value)| cmd.env(env_key, value))
+}
--- a/docs/README.md
+++ b/docs/README.md
@@ -7,8 +7,8 @@
 - [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
 - [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
 - [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
- [pageserver/README](/pageserver/README) — pageserver overview.
- [postgres_ffi/README](/libs/postgres_ffi/README) — Postgres FFI overview.
+- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
+- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
 - [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [safekeeper/README](/safekeeper/README) — WAL service overview.
+- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
 - [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
--- a/docs/rfcs/004-durability.md
+++ b/docs/rfcs/004-durability.md
@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
 S3. WAL that has been archived to S3 can be removed from the
 safekeepers, so the safekeepers don't need a lot of disk space.

-
+```
                                +----------------+
                        +-----> | WAL safekeeper |
                        |       +----------------+
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
                  \
                   \
                    \
-                     \      +--------+
-					  \		|        |
-					   +-->	|   S3   |
-							|        |
-                            +--------+
-
+                     \          +--------+
+                      \         |        |
+                       +------> |   S3   |
+                                |        |
+                                +--------+

+```
 Every WAL safekeeper holds a section of WAL, and a VCL value.
 The WAL can be divided into three portions:

-
+```
                                    VCL                   LSN
                                     |                     |
                                     V                     V
 .................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
 Archived WAL       Completed WAL          In-flight WAL
-
+```

 Note that all this WAL kept in a safekeeper is a contiguous section.
 This is different from Aurora: In Aurora, there can be holes in the
--- a/docs/rfcs/016-connection-routing.md
+++ b/docs/rfcs/016-connection-routing.md
@@ -0,0 +1,151 @@
+# Dispatching a connection
+
+For each client connection, Neon service needs to authenticate the
+connection, and route it to the right PostgreSQL instance.
+
+## Authentication
+
+There are three different ways to authenticate:
+
+- anonymous; no authentication needed
+- PostgreSQL authentication
+- github single sign-on using browser
+
+In anonymous access, the user doesn't need to perform any
+authentication at all. This can be used e.g. in interactive PostgreSQL
+documentation, allowing you to run the examples very quickly. Similar
+to sqlfiddle.com.
+
+PostgreSQL authentication works the same as always. All the different
+PostgreSQL authentication options like SCRAM, kerberos, etc. are
+available. [1]
+
+The third option is to authenticate with github single sign-on. When
+you open the connection in psql, you get a link that you open with
+your browser. Opening the link redirects you to github authentication,
+and lets the connection proceed. This is also known as "Link auth" [2].
+
+
+## Routing the connection
+
+When a client starts a connection, it needs to be routed to the
+correct PostgreSQL instance. Routing can be done by the proxy, acting
+as a man-in-the-middle, or the connection can be routed at the network
+level based on the hostname or IP address.
+
+Either way, Neon needs to identify which PostgreSQL instance the
+connection should be routed to. If the instance is not already
+running, it needs to be started. Some connections always require a new
+PostgreSQL instance to be created, e.g. if you want to run a one-off
+query against a particular point-in-time.
+
+The PostgreSQL instance is identified by:
+- Neon account (possibly anonymous)
+- cluster (known as tenant in the storage?)
+- branch or snapshot name
+- timestamp (PITR)
+- primary or read-replica
+- one-off read replica
+- one-off writeable branch
+
+When you are using regular PostgreSQL authentication or anonymous
+access, the connection URL needs to contain all the information needed
+for the routing. With github single sign-on, the browser is involved
+and some details - the Neon account in particular - can be deduced
+from the authentication exchange.
+
+There are three methods for identifying the PostgreSQL instance:
+
+- Browser interaction (link auth)
+- Options in the connection URL and the domain name
+- A pre-defined endpoint, identified by domain name or IP address
+
+### Link Auth
+
+    postgres://<username>@start.neon.tech/<dbname>
+
+This gives you a link that you open in browser. Clicking the link
+performs github authentication, and the Neon account name is
+provided to the proxy behind the scenes. The proxy routes the
+connection to the primary PostgreSQL instance in cluster called
+"main", branch "main".
+
+Further ideas:
+- You could pre-define a different target for link auth
+  connections in the UI.
+- You could have a drop-down in the browser, allowing you to connect
+  to any cluster you want. Link Auth can be like Teleport.
+
+### Connection URL
+
+The connection URL looks like this:
+
+    postgres://<username>@<cluster-id>.db.neon.tech/<dbname>
+
+By default, this connects you to the primary PostgreSQL instance
+running on the "main" branch in the named cluster [3]. However, you can
+change that by specifying options in the connection URL. The following
+options are supported:
+
+| option name  | Description                                                                                       | Examples                                            |
+| ---          | ---                                                                                               | ---                                                 |
+| cluster      | Cluster name                                                                                      | cluster:myproject                                   |
+| branch       | Branch name                                                                                       | branch:main                                         |
+| timestamp    | Connect to an instance at given point-in-time.                                                    | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z |
+| lsn          | Connect to an instance at given LSN                                                               | lsn:0/12FF0420                                      |
+| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new                       |
+
+For example, to read branch 'testing' as it was on Mar 31, 2022, you could
+specify a timestamp in the connection URL [4]:
+
+    postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31
+
+Connecting with cluster name and options can be disabled in the UI. If
+disabled, you can only connect using a pre-defined endpoint.
+
+### Pre-defined Endpoint
+
+Instead of providing the cluster name, branch, and all those options
+in the connection URL, you can define a named endpoint with the same
+options.
+
+In the UI, click "create endpoint". Fill in the details:
+
+- Cluster name
+- Branch
+- timestamp or LSN
+- is this for the primary or for a read replica
+- etc.
+
+When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect:
+
+    postgres://<username>@<endpoint-id>.endpoint.neon.tech/<dbname>
+
+
+An endpoint can be assigned a static or dynamic IP address, so that
+you can connect to it with clients that don't support TLS SNI. Maybe
+bypass the proxy altogether, but that ought to be invisible to the
+user.
+
+You can limit the range of source IP addresses that are allowed to
+connect to an endpoint. An endpoint can also be exposed in an Amazon
+VPC, allowing direct connections from applications.
+
+
+# Footnotes
+
+[1] I'm not sure how feasible it is to set up something like Kerberos
+or LDAP in a cloud environment. But in principle I think we should
+allow customers to have the full power of PostgreSQL, including all
+authentication options. However, it's up to the customer to configure
+it correctly.
+
+[2] Link auth is a way both to authenticate and to route the connection.
+
+[3] This assumes that cluster-ids are globally unique, across all
+Neon accounts.
+
+[4] The syntax accepted in the connection URL is limited by libpq. The
+only way to pass arbitrary options to the server (or our proxy) is
+with the "options" keyword, and the options must be percent-encoded. I
+think the above would work, but I haven't tested it.
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -6,7 +6,6 @@ If there's no such file during `init` phase of the server, it creates the file i
 There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
 the values in the config file, if any are specified for the same key and get into the final config during init phase.

-
 ### Config example

 ```toml
@@ -35,9 +34,9 @@ Yet, it validates the config values it can (e.g. postgres install dir) and error

 Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and

-* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
+- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`

-* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
+- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`

 ### Config values

@@ -57,7 +56,7 @@ but it will trigger a checkpoint operation to get it back below the
 limit.

 `checkpoint_distance` also determines how much WAL needs to be kept
-durable in the safekeeper.  The safekeeper must have capacity to hold
+durable in the safekeeper. The safekeeper must have capacity to hold
 this much WAL, with some headroom, otherwise you can get stuck in a
 situation where the safekeeper is full and stops accepting new WAL,
 but the pageserver is not flushing out and releasing the space in the
@@ -72,7 +71,11 @@ The unit is # of bytes.

 Every `compaction_period` seconds, the page server checks if
 maintenance operations, like compaction, are needed on the layer
-files.  Default is 1 s, which should be fine.
+files. Default is 1 s, which should be fine.
+
+#### compaction_target_size
+
+File sizes for L0 delta and L1 image layers. Default is 128MB.

 #### gc_horizon

@@ -85,6 +88,14 @@ away.

 Interval at which garbage collection is triggered. Default is 100 s.

+#### image_creation_threshold
+
+L0 delta layer threshold for L1 image layer creation. Default is 3.
+
+#### pitr_interval
+
+WAL retention duration for PITR branching. Default is 30 days.
+
 #### initial_superuser_name

 Name of the initial superuser role, passed to initdb when a new tenant
@@ -151,16 +162,12 @@ bucket_region = 'eu-north-1'
 # Optional, pageserver uses entire bucket if the prefix is not specified.
 prefix_in_bucket = '/some/prefix/'

-# Access key to connect to the bucket ("login" part of the credentials)
-access_key_id = 'SOMEKEYAAAAASADSAH*#'
-
-# Secret access key to connect to the bucket ("password" part of the credentials)
-secret_access_key = 'SOMEsEcReTsd292v'
-
 # S3 API query limit to avoid getting errors/throttling from AWS.
 concurrency_limit = 100
 ```

+If IAM-based bucket access is not used, set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to provide the access credentials.
+
 ###### General remote storage configuration

 Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
@@ -171,13 +178,12 @@ Besides, there are parameters common for all types of remote storage that can be
 ```toml
 [remote_storage]
 # Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
-max_concurrent_timelines_sync = 50
+max_concurrent_syncs = 50

 # Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
 max_sync_errors = 10
 ```

-
 ## safekeeper

 TODO
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -28,7 +28,7 @@ The pageserver has a few different duties:
 - Receive WAL from the WAL service and decode it.
 - Replay WAL that's applicable to the chunks that the Page Server maintains

-For more detailed info, see `/pageserver/README`
+For more detailed info, see [/pageserver/README](/pageserver/README.md)

 `/proxy`:

@@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
 The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
 It acts as a holding area and redistribution center for recently generated WAL.

-For more detailed info, see `/safekeeper/README`
+For more detailed info, see [/safekeeper/README](/safekeeper/README.md)

 `/workspace_hack`:
 The workspace_hack crate exists only to pin down some dependencies.
--- a/libs/etcd_broker/Cargo.toml
+++ b/libs/etcd_broker/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+ name = "etcd_broker"
+ version = "0.1.0"
+ edition = "2021"
+
+ [dependencies]
+ etcd-client = "0.9.0"
+ regex = "1.4.5"
+ serde = { version = "1.0", features = ["derive"] }
+ serde_json = "1"
+ serde_with = "1.12.0"
+
+ utils = { path = "../utils" }
+ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+ tokio = "1"
+ tracing = "0.1"
+ thiserror = "1"
--- a/libs/etcd_broker/src/lib.rs
+++ b/libs/etcd_broker/src/lib.rs
@@ -0,0 +1,344 @@
+//! A set of primitives to access shared data and updates, propagated via the etcd broker (not persistent).
+//! Intended to connect services to each other, not to store their data.
+use std::{
+    collections::{hash_map, HashMap},
+    fmt::Display,
+    str::FromStr,
+};
+
+use regex::{Captures, Regex};
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+
+pub use etcd_client::*;
+
+use tokio::{sync::mpsc, task::JoinHandle};
+use tracing::*;
+use utils::{
+    lsn::Lsn,
+    zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
+};
+
+/// Timeline info published by one safekeeper, paired with that safekeeper's node id.
+/// Built by `parse_etcd_key_value`: the id comes from the etcd key, the info from the JSON value.
+#[derive(Debug, Deserialize, Serialize)]
+struct SafekeeperTimeline {
+    /// Id of the safekeeper node, parsed from the last segment of the etcd key.
+    safekeeper_id: ZNodeId,
+    /// Timeline data deserialized from the etcd value.
+    info: SkTimelineInfo,
+}
+
+/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
+///
+/// LSN fields are (de)serialized through their `Display`/`FromStr` representation
+/// (`DisplayFromStr`), and every `serde(default)` field is `None` when absent from the input.
+#[serde_as]
+#[derive(Debug, Deserialize, Serialize)]
+pub struct SkTimelineInfo {
+    /// Term of the last entry.
+    pub last_log_term: Option<u64>,
+    /// LSN of the last record.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub flush_lsn: Option<Lsn>,
+    /// Up to which LSN safekeeper regards its WAL as committed.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub commit_lsn: Option<Lsn>,
+    /// LSN up to which safekeeper offloaded WAL to s3.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub s3_wal_lsn: Option<Lsn>,
+    /// LSN of last checkpoint uploaded by pageserver.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub remote_consistent_lsn: Option<Lsn>,
+    // NOTE(review): not documented in this file; presumably the lowest LSN that peer
+    // safekeepers may still need. Confirm with the safekeeper code before relying on it.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub peer_horizon_lsn: Option<Lsn>,
+    // NOTE(review): presumably the connection string for reaching the publishing
+    // safekeeper; the format is not visible here. Confirm against the publisher.
+    #[serde(default)]
+    pub safekeeper_connection_string: Option<String>,
+}
+
+/// Errors returned by the etcd broker primitives in this module.
+#[derive(Debug, thiserror::Error)]
+pub enum BrokerError {
+    /// An error reported by the etcd client, together with a description of what was being done.
+    /// Also used when an etcd key or value cannot be represented as a string.
+    #[error("Etcd client error: {0}. Context: {1}")]
+    EtcdClient(etcd_client::Error, String),
+    /// The etcd key captures or the JSON value could not be parsed.
+    #[error("Error during parsing etcd data: {0}")]
+    ParsingError(String),
+    /// Failure inside the subscription machinery itself: reading from the watch stream
+    /// or joining the background task.
+    #[error("Internal error: {0}")]
+    InternalError(String),
+}
+
+/// A way to control the data retrieval from a certain subscription.
+///
+/// Returned by `subscribe_to_safekeeper_timeline_updates`; use `fetch_data` to read
+/// updates and `cancel` to stop the background task.
+pub struct SkTimelineSubscription {
+    /// Receiving end of the channel the background task sends update batches into:
+    /// timeline -> safekeeper -> latest info seen in one watch response.
+    safekeeper_timeline_updates:
+        mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>>,
+    /// What this subscription watches; used in error messages.
+    kind: SkTimelineSubscriptionKind,
+    /// Handle of the spawned task that reads the etcd watch stream.
+    watcher_handle: JoinHandle<Result<(), BrokerError>>,
+    /// etcd watcher used to cancel the watch.
+    watcher: Watcher,
+}
+
+impl SkTimelineSubscription {
+    /// Awaits the next batch of updates produced by the background watcher task.
+    /// The result of `recv()` on the updates channel is returned unchanged.
+    pub async fn fetch_data(
+        &mut self,
+    ) -> Option<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>> {
+        self.safekeeper_timeline_updates.recv().await
+    }
+
+    /// Stops the subscription: first cancels the etcd watch, then waits for the
+    /// background task to finish and returns that task's own result.
+    ///
+    /// Fails with `BrokerError::EtcdClient` if the watch cannot be cancelled and with
+    /// `BrokerError::InternalError` if the task cannot be joined.
+    pub async fn cancel(mut self) -> Result<(), BrokerError> {
+        if let Err(e) = self.watcher.cancel().await {
+            let context = format!(
+                "Failed to cancel timeline subscription, kind: {:?}",
+                self.kind
+            );
+            return Err(BrokerError::EtcdClient(e, context));
+        }
+
+        match self.watcher_handle.await {
+            // The task was joined; propagate whatever it finished with.
+            Ok(task_result) => task_result,
+            Err(e) => Err(BrokerError::InternalError(format!(
+                "Failed to join the timeline updates task, kind: {:?}, error: {e}",
+                self.kind
+            ))),
+        }
+    }
+}
+
+/// The subscription kind to the timeline updates from safekeeper.
+///
+/// Combines the etcd key prefix with the scope of the subscription
+/// (everything, one tenant, or one timeline).
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct SkTimelineSubscriptionKind {
+    /// Leading part of every etcd key this subscription deals with.
+    broker_prefix: String,
+    /// Scope of the subscription.
+    kind: SubscriptionKind,
+}
+
+impl SkTimelineSubscriptionKind {
+    /// Subscription to the updates of every timeline of every tenant under `broker_prefix`.
+    pub fn all(broker_prefix: String) -> Self {
+        Self {
+            broker_prefix,
+            kind: SubscriptionKind::All,
+        }
+    }
+
+    /// Subscription to the updates of all timelines of the given tenant.
+    pub fn tenant(broker_prefix: String, tenant: ZTenantId) -> Self {
+        Self {
+            broker_prefix,
+            kind: SubscriptionKind::Tenant(tenant),
+        }
+    }
+
+    /// Subscription to the updates of a single timeline.
+    pub fn timeline(broker_prefix: String, timeline: ZTenantTimelineId) -> Self {
+        Self {
+            broker_prefix,
+            kind: SubscriptionKind::Timeline(timeline),
+        }
+    }
+
+    /// Regex that recognizes the etcd keys belonging to this subscription, of the form
+    /// `{broker_prefix}/{tenant_id}/{timeline_id}/safekeeper/{safekeeper_id}`.
+    ///
+    /// Ids that are not fixed by the subscription kind are exposed as capture groups,
+    /// in key order; the safekeeper id is always the last group.
+    fn watch_regex(&self) -> Regex {
+        // `watch_key` uses the prefix verbatim, so it has to be matched literally here too.
+        let prefix = regex::escape(&self.broker_prefix);
+        match self.kind {
+            // The safekeeper id may have any number of digits, hence the `+` on the last group.
+            SubscriptionKind::All => Regex::new(&format!(
+                r"^{prefix}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$"
+            ))
+            .expect("wrong regex for 'everything' subscription"),
+            SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
+                r"^{prefix}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]]+)$"
+            ))
+            .expect("wrong regex for 'tenant' subscription"),
+            SubscriptionKind::Timeline(ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            }) => Regex::new(&format!(
+                r"^{prefix}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]]+)$"
+            ))
+            .expect("wrong regex for 'timeline' subscription"),
+        }
+    }
+
+    /// Etcd key to use for watching a certain timeline updates from safekeepers.
+    ///
+    /// The key is watched as a prefix, so it must be a leading part of every key
+    /// that `watch_regex` accepts for the same subscription kind.
+    pub fn watch_key(&self) -> String {
+        match self.kind {
+            SubscriptionKind::All => self.broker_prefix.to_string(),
+            // The timeline id sits between the tenant id and the `safekeeper` segment,
+            // so the longest prefix shared by all keys of a tenant ends after the tenant id.
+            SubscriptionKind::Tenant(tenant_id) => {
+                format!("{}/{tenant_id}/", self.broker_prefix)
+            }
+            SubscriptionKind::Timeline(ZTenantTimelineId {
+                tenant_id,
+                timeline_id,
+            }) => format!(
+                "{}/{tenant_id}/{timeline_id}/safekeeper",
+                self.broker_prefix
+            ),
+        }
+    }
+}
+
+/// Scope of a safekeeper timeline subscription, without the broker prefix.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+enum SubscriptionKind {
+    /// Get every timeline update.
+    All,
+    /// Get certain tenant timelines' updates.
+    Tenant(ZTenantId),
+    /// Get certain timeline updates.
+    Timeline(ZTenantTimelineId),
+}
+
+/// Creates a background task to poll etcd for timeline updates from safekeepers.
+/// Stops and returns `Err` on any error during etcd communication.
+/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
+/// exiting normally in such cases.
+///
+/// Every watch response is converted into one batch (timeline -> safekeeper -> info) and sent
+/// to the returned subscription. Only `Put` events are considered; keys that do not match the
+/// subscription regex are ignored and unparsable entries are logged and skipped.
+/// If one response reports the same safekeeper key more than once, the value with the
+/// highest etcd key version is kept.
+pub async fn subscribe_to_safekeeper_timeline_updates(
+    client: &mut Client,
+    subscription: SkTimelineSubscriptionKind,
+) -> Result<SkTimelineSubscription, BrokerError> {
+    info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
+
+    let (watcher, mut stream) = client
+        .watch(
+            subscription.watch_key(),
+            Some(WatchOptions::new().with_prefix()),
+        )
+        .await
+        .map_err(|e| {
+            BrokerError::EtcdClient(
+                e,
+                format!("Failed to init the watch for subscription {subscription:?}"),
+            )
+        })?;
+
+    let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
+
+    let subscription_kind = subscription.kind;
+    let regex = subscription.watch_regex();
+    let watcher_handle = tokio::spawn(async move {
+        while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
+            "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
+        )))? {
+            if resp.canceled() {
+                info!("Watch for timeline updates subscription was canceled, exiting");
+                break;
+            }
+
+            let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
+            // Keep track that the timeline data updates from etcd arrive in the right order.
+            // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
+            // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
+            //
+            // The etcd `version` is a per-key counter and every (timeline, safekeeper) pair has its own key,
+            // so the versions are tracked per pair: counters of different safekeepers are not comparable.
+            let mut timeline_etcd_versions: HashMap<(ZTenantTimelineId, ZNodeId), i64> = HashMap::new();
+
+            let events = resp.events();
+            debug!("Processing {} events", events.len());
+
+            for event in events {
+                if EventType::Put == event.event_type() {
+                    if let Some(new_etcd_kv) = event.kv() {
+                        let new_kv_version = new_etcd_kv.version();
+
+                        match parse_etcd_key_value(subscription_kind, &regex, new_etcd_kv) {
+                            Ok(Some((zttid, timeline))) => {
+                                let SafekeeperTimeline { safekeeper_id, info } = timeline;
+                                let version_key = (zttid, safekeeper_id);
+                                match timeline_updates
+                                    .entry(zttid)
+                                    .or_default()
+                                    .entry(safekeeper_id)
+                                {
+                                    hash_map::Entry::Occupied(mut o) => {
+                                        let old_etcd_kv_version = timeline_etcd_versions
+                                            .get(&version_key)
+                                            .copied()
+                                            .unwrap_or(i64::MIN);
+                                        // Replace the stored info only with a strictly newer version of the same key.
+                                        if old_etcd_kv_version < new_kv_version {
+                                            o.insert(info);
+                                            timeline_etcd_versions.insert(version_key, new_kv_version);
+                                        }
+                                    }
+                                    hash_map::Entry::Vacant(v) => {
+                                        v.insert(info);
+                                        timeline_etcd_versions.insert(version_key, new_kv_version);
+                                    }
+                                }
+                            }
+                            Ok(None) => {}
+                            Err(e) => error!("Failed to parse timeline update: {e}"),
+                        };
+                    }
+                }
+            }
+
+            if let Err(e) = timeline_updates_sender.send(timeline_updates) {
+                info!("Timeline updates sender got dropped, exiting: {e}");
+                break;
+            }
+        }
+
+        Ok(())
+    });
+
+    Ok(SkTimelineSubscription {
+        kind: subscription,
+        safekeeper_timeline_updates,
+        watcher_handle,
+        watcher,
+    })
+}
+
+/// Turns one etcd key-value pair into a timeline id and the safekeeper data stored for it.
+///
+/// Returns `Ok(None)` when the key does not match `regex`, i.e. does not belong to the
+/// subscription. Ids not fixed by `subscription_kind` are read from the regex capture
+/// groups; the value is parsed as JSON into `SkTimelineInfo`.
+///
+/// Fails with `BrokerError::EtcdClient` if the key or value is not a valid string and
+/// with `BrokerError::ParsingError` if an id or the JSON value cannot be parsed.
+fn parse_etcd_key_value(
+    subscription_kind: SubscriptionKind,
+    regex: &Regex,
+    kv: &KeyValue,
+) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
+    let key_str = kv.key_str().map_err(|e| {
+        BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
+    })?;
+    let caps = match regex.captures(key_str) {
+        Some(caps) => caps,
+        // Not one of the keys this subscription is interested in.
+        None => return Ok(None),
+    };
+
+    // The number and meaning of the capture groups depend on how much of the key is already known.
+    let (zttid, safekeeper_id) = match subscription_kind {
+        SubscriptionKind::All => {
+            let tenant_id = parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?;
+            let timeline_id = parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?;
+            let node_id = parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?;
+            (
+                ZTenantTimelineId::new(tenant_id, timeline_id),
+                ZNodeId(node_id),
+            )
+        }
+        SubscriptionKind::Tenant(tenant_id) => {
+            let timeline_id = parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?;
+            let node_id = parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?;
+            (
+                ZTenantTimelineId::new(tenant_id, timeline_id),
+                ZNodeId(node_id),
+            )
+        }
+        SubscriptionKind::Timeline(zttid) => {
+            let node_id = parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?;
+            (zttid, ZNodeId(node_id))
+        }
+    };
+
+    let info_str = kv.value_str().map_err(|e| {
+        BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
+    })?;
+    let info = serde_json::from_str(info_str).map_err(|e| {
+        BrokerError::ParsingError(format!(
+            "Failed to parse '{info_str}' as safekeeper timeline info: {e}"
+        ))
+    })?;
+
+    Ok(Some((
+        zttid,
+        SafekeeperTimeline {
+            safekeeper_id,
+            info,
+        },
+    )))
+}
+
+/// Parses the capture group number `index` of `caps` into a `T`.
+///
+/// Returns a human-readable error message if the group is absent or if its text
+/// cannot be parsed as `T`.
+fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
+where
+    T: FromStr,
+    <T as FromStr>::Err: Display,
+{
+    let capture_match = match caps.get(index) {
+        Some(found) => found.as_str(),
+        None => return Err(format!("Failed to get capture match at index {index}")),
+    };
+
+    match capture_match.parse::<T>() {
+        Ok(parsed) => Ok(parsed),
+        Err(e) => Err(format!(
+            "Failed to parse {} from {capture_match}: {e}",
+            std::any::type_name::<T>()
+        )),
+    }
+}
--- a/libs/postgres_ffi/README.md
+++ b/libs/postgres_ffi/README.md
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 #![allow(deref_nullptr)]

 use serde::{Deserialize, Serialize};
+use utils::lsn::Lsn;

 include!(concat!(env!("OUT_DIR"), "/bindings.rs"));

@@ -37,3 +38,21 @@ pub const fn transaction_id_precedes(id1: TransactionId, id2: TransactionId) ->
    let diff = id1.wrapping_sub(id2) as i32;
    diff < 0
 }
+
+// Check if the page is not yet initialized (port of the Postgres PageIsNew() macro):
+// a page is new when both bytes of pd_upper, at offsets 14 and 15 of the page header, are zero.
+pub fn page_is_new(pg: &[u8]) -> bool {
+    // `all` stops at the first non-zero byte, checking offset 14 before 15.
+    [14, 15].iter().all(|&offset| pg[offset] == 0)
+}
+
+// Extract the LSN from the page header: bytes 0..4 hold the high 32 bits and
+// bytes 4..8 the low 32 bits, each read as a little-endian u32.
+pub fn page_get_lsn(pg: &[u8]) -> Lsn {
+    let high = u32::from_le_bytes(pg[0..4].try_into().unwrap());
+    let low = u32::from_le_bytes(pg[4..8].try_into().unwrap());
+    Lsn((u64::from(high) << 32) | u64::from(low))
+}
+
+// Store the LSN into the page header: the high 32 bits go to bytes 0..4 and the
+// low 32 bits to bytes 4..8, each written as a little-endian u32.
+pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
+    let high = (lsn.0 >> 32) as u32;
+    let low = lsn.0 as u32;
+    pg[0..4].copy_from_slice(&high.to_le_bytes());
+    pg[4..8].copy_from_slice(&low.to_le_bytes());
+}
--- a/libs/postgres_ffi/src/waldecoder.rs
+++ b/libs/postgres_ffi/src/waldecoder.rs
@@ -89,7 +89,12 @@ impl WalStreamDecoder {
                    return Ok(None);
                }

-                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf);
+                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
+                    WalDecodeError {
+                        msg: format!("long header deserialization failed {}", e),
+                        lsn: self.lsn,
+                    }
+                })?;

                if hdr.std.xlp_pageaddr != self.lsn.0 {
                    return Err(WalDecodeError {
@@ -106,7 +111,12 @@ impl WalStreamDecoder {
                    return Ok(None);
                }

-                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf);
+                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
+                    WalDecodeError {
+                        msg: format!("header deserialization failed {}", e),
+                        lsn: self.lsn,
+                    }
+                })?;

                if hdr.xlp_pageaddr != self.lsn.0 {
                    return Err(WalDecodeError {
@@ -188,7 +198,13 @@ impl WalStreamDecoder {
        }

        // We now have a record in the 'recordbuf' local variable.
-        let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
+        let xlogrec =
+            XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
+                WalDecodeError {
+                    msg: format!("xlog record deserialization failed {}", e),
+                    lsn: self.lsn,
+                }
+            })?;

        let mut crc = 0;
        crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -15,7 +15,7 @@ use crate::XLogPageHeaderData;
 use crate::XLogRecord;
 use crate::XLOG_PAGE_MAGIC;

-use anyhow::{bail, Result};
+use anyhow::bail;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::BytesMut;
 use bytes::{Buf, Bytes};
@@ -28,6 +28,8 @@ use std::io::prelude::*;
 use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
 use std::time::SystemTime;
+use utils::bin_ser::DeserializeError;
+use utils::bin_ser::SerializeError;
 use utils::lsn::Lsn;

 pub const XLOG_FNAME_LEN: usize = 24;
@@ -118,11 +120,15 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
 }

 pub fn get_current_timestamp() -> TimestampTz {
+    to_pg_timestamp(SystemTime::now())
+}
+
+pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
    const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
    const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
    const SECS_PER_DAY: u64 = 86400;
    const USECS_PER_SEC: u64 = 1000000;
-    match SystemTime::now().duration_since(SystemTime::UNIX_EPOCH) {
+    match time.duration_since(SystemTime::UNIX_EPOCH) {
        Ok(n) => {
            ((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
                * USECS_PER_SEC
@@ -140,7 +146,7 @@ fn find_end_of_wal_segment(
    tli: TimeLineID,
    wal_seg_size: usize,
    start_offset: usize, // start reading at this point
-) -> Result<u32> {
+) -> anyhow::Result<u32> {
    // step back to the beginning of the page to read it in...
    let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
    let mut contlen: usize = 0;
@@ -268,7 +274,7 @@ pub fn find_end_of_wal(
    wal_seg_size: usize,
    precise: bool,
    start_lsn: Lsn, // start reading WAL at this point or later
-) -> Result<(XLogRecPtr, TimeLineID)> {
+) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
    let mut high_segno: XLogSegNo = 0;
    let mut high_tli: TimeLineID = 0;
    let mut high_ispartial = false;
@@ -350,19 +356,19 @@ pub fn main() {
 }

 impl XLogRecord {
-    pub fn from_slice(buf: &[u8]) -> XLogRecord {
+    pub fn from_slice(buf: &[u8]) -> Result<XLogRecord, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogRecord::des(buf).unwrap()
+        XLogRecord::des(buf)
    }

-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogRecord, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogRecord::des_from(&mut buf.reader()).unwrap()
+        XLogRecord::des_from(&mut buf.reader())
    }

-    pub fn encode(&self) -> Bytes {
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
        use utils::bin_ser::LeSer;
-        self.ser().unwrap().into()
+        Ok(self.ser()?.into())
    }

    // Is this record an XLOG_SWITCH record? They need some special processing,
@@ -372,35 +378,35 @@ impl XLogRecord {
 }

 impl XLogPageHeaderData {
-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogPageHeaderData {
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogPageHeaderData, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogPageHeaderData::des_from(&mut buf.reader()).unwrap()
+        XLogPageHeaderData::des_from(&mut buf.reader())
    }
 }

 impl XLogLongPageHeaderData {
-    pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogLongPageHeaderData {
+    pub fn from_bytes<B: Buf>(buf: &mut B) -> Result<XLogLongPageHeaderData, DeserializeError> {
        use utils::bin_ser::LeSer;
-        XLogLongPageHeaderData::des_from(&mut buf.reader()).unwrap()
+        XLogLongPageHeaderData::des_from(&mut buf.reader())
    }

-    pub fn encode(&self) -> Bytes {
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
        use utils::bin_ser::LeSer;
-        self.ser().unwrap().into()
+        self.ser().map(|b| b.into())
    }
 }

 pub const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();

 impl CheckPoint {
-    pub fn encode(&self) -> Bytes {
+    pub fn encode(&self) -> Result<Bytes, SerializeError> {
        use utils::bin_ser::LeSer;
-        self.ser().unwrap().into()
+        Ok(self.ser()?.into())
    }

-    pub fn decode(buf: &[u8]) -> Result<CheckPoint, anyhow::Error> {
+    pub fn decode(buf: &[u8]) -> Result<CheckPoint, DeserializeError> {
        use utils::bin_ser::LeSer;
-        Ok(CheckPoint::des(buf)?)
+        CheckPoint::des(buf)
    }

    /// Update next XID based on provided new_xid and stored epoch.
@@ -438,7 +444,7 @@ impl CheckPoint {
 // Generate new, empty WAL segment.
 // We need this segment to start compute node.
 //
-pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
+pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, SerializeError> {
    let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);

    let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
@@ -458,12 +464,12 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
        xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
    };

-    let hdr_bytes = hdr.encode();
+    let hdr_bytes = hdr.encode()?;
    seg_buf.extend_from_slice(&hdr_bytes);

    //zero out the rest of the file
    seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
-    seg_buf.freeze()
+    Ok(seg_buf.freeze())
 }

 #[cfg(test)]
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "remote_storage"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+anyhow = { version = "1.0", features = ["backtrace"] }
+tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
+tokio-util = { version = "0.7", features = ["io"] }
+tracing = "0.1.27"
+rusoto_core = "0.48"
+rusoto_s3 = "0.48"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1"
+async-trait = "0.1"
+
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
+[dev-dependencies]
+tempfile = "3.2"
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -0,0 +1,232 @@
+//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
+//! No other modules from this tree are supposed to be used directly by the external code.
+//!
+//! [`RemoteStorage`] trait is a CRUD-like generic abstraction to use for adapting external storages, with a few implementations:
+//!   * [`local_fs`] allows using the local file system as an external storage
+//!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
+//!
+mod local_fs;
+mod s3_bucket;
+
+use std::{
+    borrow::Cow,
+    collections::HashMap,
+    ffi::OsStr,
+    num::{NonZeroU32, NonZeroUsize},
+    path::{Path, PathBuf},
+};
+
+use anyhow::Context;
+use tokio::io;
+use tracing::info;
+
+pub use self::{
+    local_fs::LocalFs,
+    s3_bucket::{S3Bucket, S3ObjectKey},
+};
+
+/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
+/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
+/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
+/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
+pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
+pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
+/// Currently, sync happens with AWS S3, that has two limits on requests per second:
+/// ~200 RPS for IAM services
+/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
+/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
+/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
+pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
+
+/// Storage (potentially remote) API to manage its state.
+/// This storage tries to be unaware of any layered repository context,
+/// providing basic CRUD operations for storage files.
+#[async_trait::async_trait]
+pub trait RemoteStorage: Send + Sync {
+    /// A way to uniquely reference a file in the remote storage.
+    type RemoteObjectId;
+
+    /// Attempts to derive the storage path out of the local path, if the latter is correct.
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
+
+    /// Gets the download path of the given storage file.
+    fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
+
+    /// Lists all items the storage has right now.
+    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
+
+    /// Streams the local file contents into the remote storage entry.
+    async fn upload(
+        &self,
+        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        // S3 PUT request requires the content length to be specified,
+        // otherwise it starts to fail with the concurrent connection count increasing.
+        from_size_bytes: usize,
+        to: &Self::RemoteObjectId,
+        metadata: Option<StorageMetadata>,
+    ) -> anyhow::Result<()>;
+
+    /// Streams the remote storage entry contents into the given buffered writer.
+    /// Returns the metadata, if any was stored with the file previously.
+    async fn download(
+        &self,
+        from: &Self::RemoteObjectId,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<Option<StorageMetadata>>;
+
+    /// Streams a given byte range of the remote storage entry contents into the given buffered writer.
+    /// Returns the metadata, if any was stored with the file previously.
+    async fn download_byte_range(
+        &self,
+        from: &Self::RemoteObjectId,
+        start_inclusive: u64,
+        end_exclusive: Option<u64>,
+        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
+    ) -> anyhow::Result<Option<StorageMetadata>>;
+
+    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
+}
+
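+/// Either of the supported storage implementations ([`LocalFs`] or [`S3Bucket`]), selected by the [`RemoteStorageKind`] of the config.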
+pub enum GenericRemoteStorage {
+    Local(LocalFs),
+    S3(S3Bucket),
+}
+
+impl GenericRemoteStorage {
+    pub fn new(
+        working_directory: PathBuf,
+        storage_config: &RemoteStorageConfig,
+    ) -> anyhow::Result<Self> {
+        match &storage_config.storage {
+            RemoteStorageKind::LocalFs(root) => {
+                info!("Using fs root '{}' as a remote storage", root.display());
+                LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
+            }
+            RemoteStorageKind::AwsS3(s3_config) => {
+                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
+                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
+                S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
+            }
+        }
+    }
+}
+
+/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
+/// Immutable, cannot be changed once the file is created.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct StorageMetadata(HashMap<String, String>);
+
+fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
+    if prefix == path {
+        anyhow::bail!(
+            "Prefix and the path are equal, cannot strip: '{}'",
+            prefix.display()
+        )
+    } else {
+        path.strip_prefix(prefix).with_context(|| {
+            format!(
+                "Path '{}' is not prefixed with '{}'",
+                path.display(),
+                prefix.display(),
+            )
+        })
+    }
+}
+
+/// External backup storage configuration, enough for creating a client for that storage.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct RemoteStorageConfig {
+    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
+    pub max_concurrent_syncs: NonZeroUsize,
+    /// Max allowed errors before the sync task is considered failed and evicted.
+    pub max_sync_errors: NonZeroU32,
+    /// The storage connection configuration.
+    pub storage: RemoteStorageKind,
+}
+
+/// A kind of a remote storage to connect to, with its connection configuration.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum RemoteStorageKind {
+    /// Storage based on local file system.
+    /// Specify a root folder to place all stored files into.
+    LocalFs(PathBuf),
+    /// AWS S3 based storage, storing all files in the S3 bucket
+    /// specified by the config
+    AwsS3(S3Config),
+}
+
+/// AWS S3 bucket coordinates and request limits used to manage the bucket contents (read and write).
+#[derive(Clone, PartialEq, Eq)]
+pub struct S3Config {
+    /// Name of the bucket to connect to.
+    pub bucket_name: String,
+    /// The region where the bucket is located at.
+    pub bucket_region: String,
+    /// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
+    pub prefix_in_bucket: Option<String>,
+    /// A base URL to send S3 requests to.
+    /// By default, the endpoint is derived from a region name, assuming it's
+    /// an AWS S3 region name, erroring on wrong region name.
+    /// Endpoint provides a way to support other S3 flavors and their regions.
+    ///
+    /// Example: `http://127.0.0.1:5000`
+    pub endpoint: Option<String>,
+    /// AWS S3 has various limits on its API calls; we must not exceed those.
+    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
+    pub concurrency_limit: NonZeroUsize,
+}
+
+impl std::fmt::Debug for S3Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("S3Config")
+            .field("bucket_name", &self.bucket_name)
+            .field("bucket_region", &self.bucket_region)
+            .field("prefix_in_bucket", &self.prefix_in_bucket)
+            .field("concurrency_limit", &self.concurrency_limit)
+            .finish()
+    }
+}
+
+pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
+    let new_extension = match original_path
+        .as_ref()
+        .extension()
+        .map(OsStr::to_string_lossy)
+    {
+        Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
+        None => Cow::Borrowed(suffix),
+    };
+    original_path
+        .as_ref()
+        .with_extension(new_extension.as_ref())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_path_with_suffix_extension() {
+        let p = PathBuf::from("/foo/bar");
+        assert_eq!(
+            &path_with_suffix_extension(&p, "temp").to_string_lossy(),
+            "/foo/bar.temp"
+        );
+        let p = PathBuf::from("/foo/bar");
+        assert_eq!(
+            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            "/foo/bar.temp.temp"
+        );
+        let p = PathBuf::from("/foo/bar.baz");
+        assert_eq!(
+            &path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
+            "/foo/bar.baz.temp.temp"
+        );
+        let p = PathBuf::from("/foo/bar.baz");
+        assert_eq!(
+            &path_with_suffix_extension(&p, ".temp").to_string_lossy(),
+            "/foo/bar.baz..temp"
+        );
+    }
+}
--- a/pageserver/src/remote_storage/local_fs.rs
+++ b/pageserver/src/remote_storage/local_fs.rs
@@ -1,7 +1,7 @@
 //! Local filesystem acting as a remote storage.
-//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
+//! Multiple API users can use the same "storage" of this kind by using different storage roots.
 //!
-//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
+//! This storage is used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

 use std::{
@@ -17,16 +17,18 @@ use tokio::{
 };
 use tracing::*;

+use crate::path_with_suffix_extension;
+
 use super::{strip_path_prefix, RemoteStorage, StorageMetadata};

 pub struct LocalFs {
-    pageserver_workdir: &'static Path,
-    root: PathBuf,
+    working_directory: PathBuf,
+    storage_root: PathBuf,
 }

 impl LocalFs {
    /// Attempts to create local FS storage, along with its root directory.
-    pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
+    pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
        if !root.exists() {
            std::fs::create_dir_all(&root).with_context(|| {
                format!(
@@ -36,15 +38,15 @@ impl LocalFs {
            })?;
        }
        Ok(Self {
-            pageserver_workdir,
-            root,
+            working_directory,
+            storage_root: root,
        })
    }

    fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
        if path.is_relative() {
-            Ok(self.root.join(path))
-        } else if path.starts_with(&self.root) {
+            Ok(self.storage_root.join(path))
+        } else if path.starts_with(&self.storage_root) {
            Ok(path.to_path_buf())
        } else {
            bail!(
@@ -83,30 +85,30 @@ impl LocalFs {

 #[async_trait::async_trait]
 impl RemoteStorage for LocalFs {
-    type StoragePath = PathBuf;
+    type RemoteObjectId = PathBuf;

-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
-        Ok(self.root.join(
-            strip_path_prefix(self.pageserver_workdir, local_path)
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
+        Ok(self.storage_root.join(
+            strip_path_prefix(&self.working_directory, local_path)
                .context("local path does not belong to this storage")?,
        ))
    }

-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
-        let relative_path = strip_path_prefix(&self.root, storage_path)
+    fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
+        let relative_path = strip_path_prefix(&self.storage_root, storage_path)
            .context("local path does not belong to this storage")?;
-        Ok(self.pageserver_workdir.join(relative_path))
+        Ok(self.working_directory.join(relative_path))
    }

-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
-        get_all_files(&self.root).await
+    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        get_all_files(&self.storage_root).await
    }

    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        from_size_bytes: usize,
-        to: &Self::StoragePath,
+        to: &Self::RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let target_file_path = self.resolve_in_storage(to)?;
@@ -114,7 +116,7 @@ impl RemoteStorage for LocalFs {
        // We need this dance with sort of durable rename (without fsyncs)
        // to prevent partial uploads. This was really hit when pageserver shutdown
        // cancelled the upload and partial file was left on the fs
-        let temp_file_path = path_with_suffix_extension(&target_file_path, ".temp");
+        let temp_file_path = path_with_suffix_extension(&target_file_path, "temp");
        let mut destination = io::BufWriter::new(
            fs::OpenOptions::new()
                .write(true)
@@ -192,7 +194,7 @@ impl RemoteStorage for LocalFs {

    async fn download(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let file_path = self.resolve_in_storage(from)?;
@@ -227,9 +229,9 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn download_range(
+    async fn download_byte_range(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
@@ -286,7 +288,7 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
        let file_path = self.resolve_in_storage(path)?;
        if file_path.exists() && file_path.is_file() {
            Ok(fs::remove_file(file_path).await?)
@@ -299,15 +301,8 @@ impl RemoteStorage for LocalFs {
    }
 }

-fn path_with_suffix_extension(original_path: &Path, suffix: &str) -> PathBuf {
-    let mut extension_with_suffix = original_path.extension().unwrap_or_default().to_os_string();
-    extension_with_suffix.push(suffix);
-
-    original_path.with_extension(extension_with_suffix)
-}
-
 fn storage_metadata_path(original_path: &Path) -> PathBuf {
-    path_with_suffix_extension(original_path, ".metadata")
+    path_with_suffix_extension(original_path, "metadata")
 }

 fn get_all_files<'a, P>(
@@ -359,29 +354,30 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>

 #[cfg(test)]
 mod pure_tests {
-    use crate::{
-        layered_repository::metadata::METADATA_FILE_NAME,
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
+    use tempfile::tempdir;

    use super::*;

    #[test]
    fn storage_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("storage_path_positive")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root.clone(),
+            working_directory: workdir.clone(),
+            storage_root: storage_root.clone(),
        };

-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
-        let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
+        let local_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("file_name");
+        let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);

        assert_eq!(
            expected_path,
-            storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
-            "File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
+            storage.remote_object_id(&local_path).expect("Matching path should map to storage path normally"),
+            "File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
        );

        Ok(())
@@ -391,7 +387,7 @@ mod pure_tests {
    fn storage_path_negatives() -> anyhow::Result<()> {
        #[track_caller]
        fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
-            match storage.storage_path(mismatching_path) {
+            match storage.remote_object_id(mismatching_path) {
                Ok(wrong_path) => panic!(
                    "Expected path '{}' to error, but got storage path: {:?}",
                    mismatching_path.display(),
@@ -401,16 +397,16 @@ mod pure_tests {
            }
        }

-        let repo_harness = RepoHarness::create("storage_path_negatives")?;
+        let workdir = tempdir()?.path().to_owned();
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root,
+            working_directory: workdir.clone(),
+            storage_root,
        };

-        let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
+        let error_string = storage_path_error(&storage, &workdir);
        assert!(error_string.contains("does not belong to this storage"));
-        assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
+        assert!(error_string.contains(workdir.to_str().unwrap()));

        let mismatching_path_str = "/something/else";
        let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
@@ -419,7 +415,7 @@ mod pure_tests {
            "Error should mention wrong path"
        );
        assert!(
-            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
+            error_message.contains(workdir.to_str().unwrap()),
            "Error should mention server workdir"
        );
        assert!(error_message.contains("does not belong to this storage"));
@@ -429,29 +425,28 @@ mod pure_tests {

    #[test]
    fn local_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("local_path_positive")?;
+        let workdir = tempdir()?.path().to_owned();
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root.clone(),
+            working_directory: workdir.clone(),
+            storage_root: storage_root.clone(),
        };

        let name = "not a metadata";
-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
+        let local_path = workdir.join("timelines").join("some_timeline").join(name);
        assert_eq!(
            local_path,
            storage
-                .local_path(
-                    &storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
-                )
+                .local_path(&storage_root.join(local_path.strip_prefix(&workdir)?))
                .expect("For a valid input, valid local path should be parsed"),
            "Should be able to parse metadata out of the correctly named remote delta file"
        );

-        let local_metadata_path = repo_harness
-            .timeline_path(&TIMELINE_ID)
-            .join(METADATA_FILE_NAME);
-        let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
+        let local_metadata_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("metadata");
+        let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
        assert_eq!(
            local_metadata_path,
            storage
@@ -477,11 +472,10 @@ mod pure_tests {
            }
        }

-        let repo_harness = RepoHarness::create("local_path_negatives")?;
        let storage_root = PathBuf::from("somewhere").join("else");
        let storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root,
+            working_directory: tempdir()?.path().to_owned(),
+            storage_root,
        };

        let totally_wrong_path = "wrong_wrong_wrong";
@@ -493,16 +487,19 @@ mod pure_tests {

    #[test]
    fn download_destination_matches_original_path() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
-        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
+        let workdir = tempdir()?.path().to_owned();
+        let original_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("some name");

        let storage_root = PathBuf::from("somewhere").join("else");
        let dummy_storage = LocalFs {
-            pageserver_workdir: &repo_harness.conf.workdir,
-            root: storage_root,
+            working_directory: workdir,
+            storage_root,
        };

-        let storage_path = dummy_storage.storage_path(&original_path)?;
+        let storage_path = dummy_storage.remote_object_id(&original_path)?;
        let download_destination = dummy_storage.local_path(&storage_path)?;

        assert_eq!(
@@ -517,18 +514,17 @@ mod pure_tests {
 #[cfg(test)]
 mod fs_tests {
    use super::*;
-    use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};

    use std::{collections::HashMap, io::Write};
    use tempfile::tempdir;

    #[tokio::test]
    async fn upload_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("upload_file")?;
+        let workdir = tempdir()?.path().to_owned();
        let storage = create_storage()?;

        let (file, size) = create_file_for_upload(
-            &storage.pageserver_workdir.join("whatever"),
+            &storage.working_directory.join("whatever"),
            "whatever_contents",
        )
        .await?;
@@ -543,14 +539,14 @@ mod fs_tests {
        }
        assert!(storage.list().await?.is_empty());

-        let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1", None).await?;
+        let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
        assert_eq!(
            storage.list().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );

-        let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2", None).await?;
+        let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
        assert_eq!(
            list_files_sorted(&storage).await?,
            vec![target_path_1.clone(), target_path_2.clone()],
@@ -561,17 +557,16 @@ mod fs_tests {
    }

    fn create_storage() -> anyhow::Result<LocalFs> {
-        let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
-        let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
-        Ok(storage)
+        LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
    }

    #[tokio::test]
    async fn download_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage.download(&upload_target, &mut content_bytes).await?;
@@ -602,14 +597,15 @@ mod fs_tests {

    #[tokio::test]
    async fn download_file_range_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file_range_positive")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage
-            .download_range(&upload_target, 0, None, &mut full_range_bytes)
+            .download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
            .await?;
        assert!(
            metadata.is_none(),
@@ -625,7 +621,7 @@ mod fs_tests {
        let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let same_byte = 1_000_000_000;
        let metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                same_byte,
                Some(same_byte + 1), // exclusive end
@@ -647,7 +643,7 @@ mod fs_tests {

        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                0,
                Some(first_part_local.len() as u64),
@@ -669,7 +665,7 @@ mod fs_tests {

        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
                Some((first_part_local.len() + second_part_local.len()) as u64),
@@ -694,16 +690,17 @@ mod fs_tests {

    #[tokio::test]
    async fn download_file_range_negative() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file_range_negative")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        let start = 10000;
        let end = 234;
        assert!(start > end, "Should test an incorrect range");
        match storage
-            .download_range(&upload_target, start, Some(end), &mut io::sink())
+            .download_byte_range(&upload_target, start, Some(end), &mut io::sink())
            .await
        {
            Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -717,7 +714,7 @@ mod fs_tests {

        let non_existing_path = PathBuf::from("somewhere").join("else");
        match storage
-            .download_range(&non_existing_path, 1, Some(3), &mut io::sink())
+            .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
            .await
        {
            Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
@@ -732,10 +729,11 @@ mod fs_tests {

    #[tokio::test]
    async fn delete_file() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("delete_file")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

        storage.delete(&upload_target).await?;
        assert!(storage.list().await?.is_empty());
@@ -753,7 +751,8 @@ mod fs_tests {

    #[tokio::test]
    async fn file_with_metadata() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_file")?;
+        let workdir = tempdir()?.path().to_owned();
+
        let storage = create_storage()?;
        let upload_name = "upload_1";
        let metadata = StorageMetadata(HashMap::from([
@@ -761,7 +760,7 @@ mod fs_tests {
            ("two".to_string(), "2".to_string()),
        ]));
        let upload_target =
-            upload_dummy_file(&repo_harness, &storage, upload_name, Some(metadata.clone())).await?;
+            upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;

        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
@@ -785,7 +784,7 @@ mod fs_tests {

        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
        let partial_download_metadata = storage
-            .download_range(
+            .download_byte_range(
                &upload_target,
                0,
                Some(first_part_local.len() as u64),
@@ -810,16 +809,16 @@ mod fs_tests {
    }

    async fn upload_dummy_file(
-        harness: &RepoHarness<'_>,
+        workdir: &Path,
        storage: &LocalFs,
        name: &str,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<PathBuf> {
-        let timeline_path = harness.timeline_path(&TIMELINE_ID);
-        let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
-        let storage_path = storage.root.join(relative_timeline_path).join(name);
+        let timeline_path = workdir.join("timelines").join("some_timeline");
+        let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
+        let storage_path = storage.storage_root.join(relative_timeline_path).join(name);

-        let from_path = storage.pageserver_workdir.join(name);
+        let from_path = storage.working_directory.join(name);
        let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
        storage.upload(file, size, &storage_path, metadata).await?;
        Ok(storage_path)
--- a/pageserver/src/remote_storage/s3_bucket.rs
+++ b/pageserver/src/remote_storage/s3_bucket.rs
@@ -1,7 +1,7 @@
 //! AWS S3 storage wrapper around `rusoto` library.
 //!
 //! Respects `prefix_in_bucket` property from [`S3Config`],
-//! allowing multiple pageservers to independently work with the same S3 bucket, if
+//! allowing multiple API users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

 use std::path::{Path, PathBuf};
@@ -19,16 +19,13 @@ use tokio::{io, sync::Semaphore};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

-use crate::{
-    config::S3Config,
-    remote_storage::{strip_path_prefix, RemoteStorage},
-};
+use crate::{strip_path_prefix, RemoteStorage, S3Config};

 use super::StorageMetadata;

-const S3_FILE_SEPARATOR: char = '/';
+const S3_PREFIX_SEPARATOR: char = '/';

-#[derive(Debug, Eq, PartialEq)]
+#[derive(Debug, Eq, PartialEq, PartialOrd, Ord, Hash)]
 pub struct S3ObjectKey(String);

 impl S3ObjectKey {
@@ -36,11 +33,7 @@ impl S3ObjectKey {
        &self.0
    }

-    fn download_destination(
-        &self,
-        pageserver_workdir: &Path,
-        prefix_to_strip: Option<&str>,
-    ) -> PathBuf {
+    fn download_destination(&self, workdir: &Path, prefix_to_strip: Option<&str>) -> PathBuf {
        let path_without_prefix = match prefix_to_strip {
            Some(prefix) => self.0.strip_prefix(prefix).unwrap_or_else(|| {
                panic!(
@@ -51,9 +44,9 @@ impl S3ObjectKey {
            None => &self.0,
        };

-        pageserver_workdir.join(
+        workdir.join(
            path_without_prefix
-                .split(S3_FILE_SEPARATOR)
+                .split(S3_PREFIX_SEPARATOR)
                .collect::<PathBuf>(),
        )
    }
@@ -61,7 +54,7 @@ impl S3ObjectKey {

 /// AWS S3 storage.
 pub struct S3Bucket {
-    pageserver_workdir: &'static Path,
+    workdir: PathBuf,
    client: S3Client,
    bucket_name: String,
    prefix_in_bucket: Option<String>,
@@ -73,7 +66,7 @@ pub struct S3Bucket {

 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
+    pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
        debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
@@ -89,8 +82,11 @@ impl S3Bucket {
                .context("Failed to parse the s3 region from config")?,
        };
        let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
-        let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none()
-        {
+
+        let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
+        let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
+
+        let client = if access_key_id.is_none() && secret_access_key.is_none() {
            debug!("Using IAM-based AWS access");
            S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
        } else {
@@ -98,8 +94,8 @@ impl S3Bucket {
            S3Client::new_with(
                request_dispatcher,
                StaticProvider::new_minimal(
-                    aws_config.access_key_id.clone().unwrap_or_default(),
-                    aws_config.secret_access_key.clone().unwrap_or_default(),
+                    access_key_id.unwrap_or_default(),
+                    secret_access_key.unwrap_or_default(),
                ),
                region,
            )
@@ -107,12 +103,12 @@ impl S3Bucket {

        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
            let mut prefix = prefix;
-            while prefix.starts_with(S3_FILE_SEPARATOR) {
+            while prefix.starts_with(S3_PREFIX_SEPARATOR) {
                prefix = &prefix[1..]
            }

            let mut prefix = prefix.to_string();
-            while prefix.ends_with(S3_FILE_SEPARATOR) {
+            while prefix.ends_with(S3_PREFIX_SEPARATOR) {
                prefix.pop();
            }
            prefix
@@ -120,7 +116,7 @@ impl S3Bucket {

        Ok(Self {
            client,
-            pageserver_workdir,
+            workdir,
            bucket_name: aws_config.bucket_name.clone(),
            prefix_in_bucket,
            concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
@@ -130,24 +126,23 @@ impl S3Bucket {

 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
-    type StoragePath = S3ObjectKey;
+    type RemoteObjectId = S3ObjectKey;

-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
-        let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
+    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId> {
+        let relative_path = strip_path_prefix(&self.workdir, local_path)?;
        let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
        for segment in relative_path {
-            key.push(S3_FILE_SEPARATOR);
+            key.push(S3_PREFIX_SEPARATOR);
            key.push_str(&segment.to_string_lossy());
        }
        Ok(S3ObjectKey(key))
    }

-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
-        Ok(storage_path
-            .download_destination(self.pageserver_workdir, self.prefix_in_bucket.as_deref()))
+    fn local_path(&self, storage_path: &Self::RemoteObjectId) -> anyhow::Result<PathBuf> {
+        Ok(storage_path.download_destination(&self.workdir, self.prefix_in_bucket.as_deref()))
    }

-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
+    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
        let mut document_keys = Vec::new();

        let mut continuation_token = None;
@@ -187,7 +182,7 @@ impl RemoteStorage for S3Bucket {
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
        from_size_bytes: usize,
-        to: &Self::StoragePath,
+        to: &Self::RemoteObjectId,
        metadata: Option<StorageMetadata>,
    ) -> anyhow::Result<()> {
        let _guard = self
@@ -212,7 +207,7 @@ impl RemoteStorage for S3Bucket {

    async fn download(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let _guard = self
@@ -237,9 +232,9 @@ impl RemoteStorage for S3Bucket {
        Ok(object_output.metadata.map(StorageMetadata))
    }

-    async fn download_range(
+    async fn download_byte_range(
        &self,
-        from: &Self::StoragePath,
+        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
@@ -274,7 +269,7 @@ impl RemoteStorage for S3Bucket {
        Ok(object_output.metadata.map(StorageMetadata))
    }

-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
        let _guard = self
            .concurrency_limiter
            .acquire()
@@ -293,34 +288,30 @@ impl RemoteStorage for S3Bucket {

 #[cfg(test)]
 mod tests {
-    use crate::{
-        layered_repository::metadata::METADATA_FILE_NAME,
-        repository::repo_harness::{RepoHarness, TIMELINE_ID},
-    };
+    use tempfile::tempdir;

    use super::*;

    #[test]
    fn download_destination() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination")?;
-
-        let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
-        let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
+        let workdir = tempdir()?.path().to_owned();
+        let local_path = workdir.join("one").join("two").join("test_name");
+        let relative_path = local_path.strip_prefix(&workdir)?;

        let key = S3ObjectKey(format!(
            "{}{}",
-            S3_FILE_SEPARATOR,
+            S3_PREFIX_SEPARATOR,
            relative_path
                .iter()
                .map(|segment| segment.to_str().unwrap())
                .collect::<Vec<_>>()
-                .join(&S3_FILE_SEPARATOR.to_string()),
+                .join(&S3_PREFIX_SEPARATOR.to_string()),
        ));

        assert_eq!(
            local_path,
-            key.download_destination(&repo_harness.conf.workdir, None),
-            "Download destination should consist of s3 path joined with the pageserver workdir prefix"
+            key.download_destination(&workdir, None),
+            "Download destination should consist of s3 path joined with the workdir prefix"
        );

        Ok(())
@@ -328,24 +319,21 @@ mod tests {

    #[test]
    fn storage_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("storage_path_positive")?;
+        let workdir = tempdir()?.path().to_owned();

        let segment_1 = "matching";
        let segment_2 = "file";
-        let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
+        let local_path = &workdir.join(segment_1).join(segment_2);

-        let storage = dummy_storage(&repo_harness.conf.workdir);
+        let storage = dummy_storage(workdir);

        let expected_key = S3ObjectKey(format!(
-            "{}{SEPARATOR}{}{SEPARATOR}{}",
+            "{}{S3_PREFIX_SEPARATOR}{segment_1}{S3_PREFIX_SEPARATOR}{segment_2}",
            storage.prefix_in_bucket.as_deref().unwrap_or_default(),
-            segment_1,
-            segment_2,
-            SEPARATOR = S3_FILE_SEPARATOR,
        ));

        let actual_key = storage
-            .storage_path(local_path)
+            .remote_object_id(local_path)
            .expect("Matching path should map to S3 path normally");
        assert_eq!(
            expected_key,
@@ -360,7 +348,7 @@ mod tests {
    fn storage_path_negatives() -> anyhow::Result<()> {
        #[track_caller]
        fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
-            match storage.storage_path(mismatching_path) {
+            match storage.remote_object_id(mismatching_path) {
                Ok(wrong_key) => panic!(
                    "Expected path '{}' to error, but got S3 key: {:?}",
                    mismatching_path.display(),
@@ -370,10 +358,10 @@ mod tests {
            }
        }

-        let repo_harness = RepoHarness::create("storage_path_negatives")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
+        let workdir = tempdir()?.path().to_owned();
+        let storage = dummy_storage(workdir.clone());

-        let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
+        let error_message = storage_path_error(&storage, &workdir);
        assert!(
            error_message.contains("Prefix and the path are equal"),
            "Message '{}' does not contain the required string",
@@ -387,7 +375,7 @@ mod tests {
            "Error should mention wrong path"
        );
        assert!(
-            error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
+            error_message.contains(workdir.to_str().unwrap()),
            "Error should mention server workdir"
        );
        assert!(
@@ -401,20 +389,17 @@ mod tests {

    #[test]
    fn local_path_positive() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("local_path_positive")?;
-        let storage = dummy_storage(&repo_harness.conf.workdir);
-        let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
-        let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
+        let workdir = tempdir()?.path().to_owned();
+        let storage = dummy_storage(workdir.clone());
+        let timeline_dir = workdir.join("timelines").join("test_timeline");
+        let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;

        let s3_key = create_s3_key(
            &relative_timeline_path.join("not a metadata"),
            storage.prefix_in_bucket.as_deref(),
        );
        assert_eq!(
-            s3_key.download_destination(
-                &repo_harness.conf.workdir,
-                storage.prefix_in_bucket.as_deref()
-            ),
+            s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
            storage
                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
@@ -422,14 +407,11 @@ mod tests {
        );

        let s3_key = create_s3_key(
-            &relative_timeline_path.join(METADATA_FILE_NAME),
+            &relative_timeline_path.join("metadata"),
            storage.prefix_in_bucket.as_deref(),
        );
        assert_eq!(
-            s3_key.download_destination(
-                &repo_harness.conf.workdir,
-                storage.prefix_in_bucket.as_deref()
-            ),
+            s3_key.download_destination(&workdir, storage.prefix_in_bucket.as_deref()),
            storage
                .local_path(&s3_key)
                .expect("For a valid input, valid S3 info should be parsed"),
@@ -441,12 +423,15 @@ mod tests {

    #[test]
    fn download_destination_matches_original_path() -> anyhow::Result<()> {
-        let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
-        let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
+        let workdir = tempdir()?.path().to_owned();
+        let original_path = workdir
+            .join("timelines")
+            .join("some_timeline")
+            .join("some name");

-        let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
+        let dummy_storage = dummy_storage(workdir);

-        let key = dummy_storage.storage_path(&original_path)?;
+        let key = dummy_storage.remote_object_id(&original_path)?;
        let download_destination = dummy_storage.local_path(&key)?;

        assert_eq!(
@@ -457,9 +442,9 @@ mod tests {
        Ok(())
    }

-    fn dummy_storage(pageserver_workdir: &'static Path) -> S3Bucket {
+    fn dummy_storage(workdir: PathBuf) -> S3Bucket {
        S3Bucket {
-            pageserver_workdir,
+            workdir,
            client: S3Client::new("us-east-1".parse().unwrap()),
            bucket_name: "dummy-bucket".to_string(),
            prefix_in_bucket: Some("dummy_prefix/".to_string()),
@@ -471,7 +456,7 @@ mod tests {
        S3ObjectKey(relative_file_path.iter().fold(
            prefix.unwrap_or_default().to_string(),
            |mut path_string, segment| {
-                path_string.push(S3_FILE_SEPARATOR);
+                path_string.push(S3_PREFIX_SEPARATOR);
                path_string.push_str(segment.to_str().unwrap());
                path_string
            },
--- a/libs/utils/build.rs
+++ b/libs/utils/build.rs
@@ -1,3 +0,0 @@
-fn main() {
-    println!("cargo:rerun-if-env-changed=GIT_VERSION");
-}
--- a/libs/utils/src/http/request.rs
+++ b/libs/utils/src/http/request.rs
@@ -1,7 +1,7 @@
 use std::str::FromStr;

 use super::error::ApiError;
-use hyper::{Body, Request};
+use hyper::{body::HttpBody, Body, Request};
 use routerify::ext::RequestExt;

 pub fn get_request_param<'a>(
@@ -31,3 +31,10 @@ pub fn parse_request_param<T: FromStr>(
        ))),
    }
 }
+
+/// Returns `ApiError::BadRequest` if polling the request body yields any item, `Ok(())` otherwise.
+pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
+    // A yielded error is treated the same as a yielded data chunk: the item is discarded and the request rejected.
+    let first_item = request.body_mut().data().await;
+    first_item.map_or(Ok(()), |_| Err(ApiError::BadRequest("Unexpected request body".into())))
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -54,31 +54,44 @@ pub mod nonblock;
 // Default signal handling
 pub mod signals;

-// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
-//
-// we have several cases:
-// * building locally from git repo
-// * building in CI from git repo
-// * building in docker (either in CI or locally)
-//
-// One thing to note is that .git is not available in docker (and it is bad to include it there).
-// So everything becides docker build is covered by git_version crate.
-// For docker use environment variable to pass git version, which is then retrieved by buildscript (build.rs).
-// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
-// Git version received from environment variable used as a fallback in git_version invokation.
-// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
-// So the build script will be run only when GIT_VERSION envvar has changed.
-//
-// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
-// Caching and workspaces complicates that. In case `utils` is not
-// recompiled due to caching then version may become outdated.
-// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
-// so if we changed the index state git_version will pick that up and rerun the macro.
-//
-// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
-use git_version::git_version;
-pub const GIT_VERSION: &str = git_version!(
-    prefix = "git:",
-    fallback = concat!("git-env:", env!("GIT_VERSION")),
-    args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
-);
+/// This is a shortcut to embed the git sha into binaries and avoid copying the same build script to all packages.
+///
+/// We have several cases:
+/// * building locally from git repo
+/// * building in CI from git repo
+/// * building in docker (either in CI or locally)
+///
+/// One thing to note is that .git is not available in docker (and it is bad to include it there).
+/// So everything besides the docker build is covered by the git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
+/// The variable is taken from the build process env and retrieved here at compile time with the `env!` macro.
+/// The git version received from the environment variable is used as a fallback in the git_version invocation.
+/// To avoid running the build script on every recompilation, the rerun-if-env-changed option was used,
+/// so the build script ran only when the GIT_VERSION envvar changed. NOTE(review): confirm a build.rs still sets this option.
+///
+/// Why not use a build script to get the git commit sha directly, without a proc macro from a different crate?
+/// Caching and workspaces complicate that. In case `utils` is not
+/// recompiled due to caching, the version may become outdated.
+/// The git_version crate handles that case by introducing a dependency on .git internals via the include_bytes! macro,
+/// so if the index state changes, git_version will pick that up and rerun the macro.
+///
+/// Note that the prefix is `git:` when the version comes from git_version, and `git-env:` when it comes from the env variable.
+///
+/// #############################################################################################
+/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
+/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
+/// Yet, it seems to ignore the GIT_VERSION env variable, passed to the Docker build, even with a build.rs that contains
+/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
+/// The problem needs further investigation and a regular `const` declaration instead of a macro.
+#[macro_export]
+macro_rules! project_git_version {
+    ($const_identifier:ident) => { // identifier of the `&str` constant that the expansion declares
+        const $const_identifier: &str = git_version::git_version!(
+            prefix = "git:",
+            fallback = concat!(
+                "git-env:",
+                env!("GIT_VERSION", "Missing GIT_VERSION envvar")
+            ),
+            args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
+        );
+    };
+}
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -433,7 +433,12 @@ impl PostgresBackend {
                    // full cause of the error, not just the top-level context + its trace.
                    // We don't want to send that in the ErrorResponse though,
                    // because it's not relevant to the compute node logs.
-                    error!("query handler for '{}' failed: {:?}", query_string, e);
+                    if query_string.starts_with("callmemaybe") {
+                        // FIXME avoid printing a backtrace for tenant x not found errors until this is properly fixed
+                        error!("query handler for '{}' failed: {}", query_string, e);
+                    } else {
+                        error!("query handler for '{}' failed: {:?}", query_string, e);
+                    }
                    self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?;
                    // TODO: untangle convoluted control flow
                    if e.to_string().contains("failed to run") {
--- a/libs/utils/src/pq_proto.rs
+++ b/libs/utils/src/pq_proto.rs
@@ -503,6 +503,18 @@ impl RowDescriptor<'_> {
            formatcode: 0,
        }
    }
+
+    pub const fn text_col(name: &[u8]) -> RowDescriptor { // descriptor for a column called `name`, typed with `TEXT_OID`
+        RowDescriptor {
+            name,
+            tableoid: 0,
+            attnum: 0,
+            typoid: TEXT_OID, // presumably the Postgres `text` type OID; the constant is defined outside this hunk
+            typlen: -1, // NOTE(review): a negative size looks like "variable-width" in RowDescription — confirm
+            typmod: 0,
+            formatcode: 0,
+        }
+    }
 }

 #[derive(Debug)]
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,11 +1,9 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};

-use serde::{Deserialize, Serialize};
-
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
-#[derive(Clone, Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug)]
 pub struct VecMap<K, V>(Vec<(K, V)>);

 impl<K, V> Default for VecMap<K, V> {
--- a/libs/utils/src/zid.rs
+++ b/libs/utils/src/zid.rs
@@ -224,7 +224,7 @@ impl fmt::Display for ZTenantTimelineId {

 // Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued
 // by the console.
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Debug, Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct ZNodeId(pub u64);

--- a/neon_local/Cargo.toml
+++ b/neon_local/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "zenith"
+name = "neon_local"
 version = "0.1.0"
 edition = "2021"

@@ -7,7 +7,9 @@ edition = "2021"
 clap = "3.0"
 anyhow = "1.0"
 serde_json = "1"
+comfy-table = "5.0.1"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+git-version = "0.3.5"

 # FIXME: 'pageserver' is needed for BranchInfo. Refactor
 pageserver = { path = "../pageserver" }
--- a/neon_local/src/main.rs
+++ b/neon_local/src/main.rs
@@ -20,8 +20,8 @@ use utils::{
    auth::{Claims, Scope},
    lsn::Lsn,
    postgres_backend::AuthType,
+    project_git_version,
    zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
-    GIT_VERSION,
 };

 use pageserver::timelines::TimelineInfo;
@@ -30,6 +30,7 @@ use pageserver::timelines::TimelineInfo;
 const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1);
 const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
+project_git_version!(GIT_VERSION);

 fn default_conf() -> String {
    format!(
@@ -62,15 +63,15 @@ http_port = {safekeeper_http_port}
 struct TimelineTreeEl {
    /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call.
    pub info: TimelineInfo,
-    /// Name, recovered from zenith config mappings
+    /// Name, recovered from neon config mappings
    pub name: Option<String>,
    /// Holds all direct children of this timeline referenced using `timeline_id`.
    pub children: BTreeSet<ZTimelineId>,
 }

-// Main entry point for the 'zenith' CLI utility
+// Main entry point for the 'neon_local' CLI utility
 //
-// This utility helps to manage zenith installation. That includes following:
+// This utility helps to manage neon installation. That includes following:
 //   * Management of local postgres installations running on top of the
 //     pageserver.
 //   * Providing CLI api to the pageserver
@@ -125,12 +126,12 @@ fn main() -> Result<()> {
        .takes_value(true)
        .required(false);

-    let matches = App::new("Zenith CLI")
+    let matches = App::new("Neon CLI")
        .setting(AppSettings::ArgRequiredElseHelp)
        .version(GIT_VERSION)
        .subcommand(
            App::new("init")
-                .about("Initialize a new Zenith repository")
+                .about("Initialize a new Neon repository")
                .arg(pageserver_config_args.clone())
                .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
                .arg(
@@ -258,7 +259,7 @@ fn main() -> Result<()> {
        None => bail!("no subcommand provided"),
    };

-    // Check for 'zenith init' command first.
+    // Check for 'neon_local init' command first.
    let subcommand_result = if sub_name == "init" {
        handle_init(sub_args).map(Some)
    } else {
@@ -481,9 +482,8 @@ fn handle_init(init_match: &ArgMatches) -> Result<LocalEnv> {
    };

    let mut env =
-        LocalEnv::create_config(&toml_file).context("Failed to create zenith configuration")?;
-    env.init()
-        .context("Failed to initialize zenith repository")?;
+        LocalEnv::create_config(&toml_file).context("Failed to create neon configuration")?;
+    env.init().context("Failed to initialize neon repository")?;

    // default_tenantid was generated by the `env.init()` call above
    let initial_tenant_id = env.default_tenant_id.unwrap();
@@ -518,7 +518,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
        .collect()
 }

-fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
    let pageserver = PageServerNode::from_env(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
@@ -541,6 +541,29 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re
                "tenant {} successfully created on the pageserver",
                new_tenant_id
            );
+
+            // Create an initial timeline for the new tenant
+            let new_timeline_id = parse_timeline_id(create_match)?;
+            let timeline = pageserver
+                .timeline_create(new_tenant_id, new_timeline_id, None, None)?
+                .context(format!(
+                    "Failed to create initial timeline for tenant {new_tenant_id}"
+                ))?;
+            let new_timeline_id = timeline.timeline_id;
+            let last_record_lsn = timeline
+                .local
+                .context(format!("Failed to get last record LSN: no local timeline info for timeline {new_timeline_id}"))?
+                .last_record_lsn;
+
+            env.register_branch_mapping(
+                DEFAULT_BRANCH_NAME.to_string(),
+                new_tenant_id,
+                new_timeline_id,
+            )?;
+
+            println!(
+                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {new_tenant_id}",
+            );
        }
        Some(("config", create_match)) => {
            let tenant_id = get_tenant_id(create_match, env)?;
@@ -551,17 +574,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Re

            pageserver
                .tenant_config(tenant_id, tenant_conf)
-                .unwrap_or_else(|e| {
-                    anyhow!(
-                        "Tenant config failed for tenant with id {} : {}",
-                        tenant_id,
-                        e
-                    );
-                });
-            println!(
-                "tenant {} successfully configured on the pageserver",
-                tenant_id
-            );
+                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
+            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -665,35 +679,56 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {

            let timeline_name_mappings = env.timeline_name_mappings();

-            println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS");
+            let mut table = comfy_table::Table::new();
+
+            table.load_preset(comfy_table::presets::NOTHING);
+
+            table.set_header(&[
+                "NODE",
+                "ADDRESS",
+                "TIMELINE",
+                "BRANCH NAME",
+                "LSN",
+                "STATUS",
+            ]);
+
            for ((_, node_name), node) in cplane
                .nodes
                .iter()
                .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id)
            {
-                // FIXME: This shows the LSN at the end of the timeline. It's not the
-                // right thing to do for read-only nodes that might be anchored at an
-                // older point in time, or following but lagging behind the primary.
-                let lsn_str = timeline_infos
-                    .get(&node.timeline_id)
-                    .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
-                    .unwrap_or_else(|| "?".to_string());
+                let lsn_str = match node.lsn {
+                    None => {
+                        // -> primary node
+                        // Use the LSN at the end of the timeline.
+                        timeline_infos
+                            .get(&node.timeline_id)
+                            .and_then(|bi| bi.local.as_ref().map(|l| l.last_record_lsn.to_string()))
+                            .unwrap_or_else(|| "?".to_string())
+                    }
+                    Some(lsn) => {
+                        // -> read-only node
+                        // Use the node's LSN.
+                        lsn.to_string()
+                    }
+                };

                let branch_name = timeline_name_mappings
                    .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id))
                    .map(|name| name.as_str())
                    .unwrap_or("?");

-                println!(
-                    "{}\t{}\t{}\t{}\t{}\t{}",
-                    node_name,
-                    node.address,
-                    node.timeline_id,
+                table.add_row(&[
+                    node_name.as_str(),
+                    &node.address.to_string(),
+                    &node.timeline_id.to_string(),
                    branch_name,
-                    lsn_str,
+                    lsn_str.as_str(),
                    node.status(),
-                );
+                ]);
            }
+
+            println!("{table}");
        }
        "create" => {
            let branch_name = sub_args
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -4,8 +4,12 @@ version = "0.1.0"
 edition = "2021"

 [features]
-default = []
+# It is simpler infra-wise to have failpoints enabled by default.
+# It shouldn't affect perf in any way, because failpoints
+# are not placed in hot code paths.
+default = ["failpoints"]
 profiling = ["pprof"]
+failpoints = ["fail/failpoints"]

 [dependencies]
 chrono = "0.4.19"
@@ -21,7 +25,6 @@ lazy_static = "1.4.0"
 clap = "3.0"
 daemonize = "0.4.1"
 tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
-tokio-util = { version = "0.7", features = ["io"] }
 postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -35,6 +38,7 @@ humantime = "2.1.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "1.12.0"
+humantime-serde = "1.1.1"

 pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }

@@ -48,14 +52,12 @@ nix = "0.23"
 once_cell = "1.8.0"
 crossbeam-utils = "0.8.5"
 fail = "0.5.0"
-
-rusoto_core = "0.47"
-rusoto_s3 = "0.47"
-async-trait = "0.1"
+git-version = "0.3.5"

 postgres_ffi = { path = "../libs/postgres_ffi" }
 metrics = { path = "../libs/metrics" }
 utils = { path = "../libs/utils" }
+remote_storage = { path = "../libs/remote_storage" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }

 [dev-dependencies]
--- a/pageserver/README.md
+++ b/pageserver/README.md
@@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a

 CLI examples:
 * Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
-* AWS S3  : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`
+* AWS S3  : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`

 For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
 For local S3 installations, refer to the their documentation for name format and credentials.
@@ -155,11 +155,9 @@ or
 bucket_name = 'some-sample-bucket'
 bucket_region = 'eu-north-1'
 prefix_in_bucket = '/test_prefix/'
-access_key_id = 'SOMEKEYAAAAASADSAH*#'
-secret_access_key = 'SOMEsEcReTsd292v'
 ```

-Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
+`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

 TODO: Sharding
 --------------------
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -10,7 +10,7 @@
 //! This module is responsible for creation of such tarball
 //! from data stored in object storage.
 //!
-use anyhow::{ensure, Context, Result};
+use anyhow::{anyhow, ensure, Context, Result};
 use bytes::{BufMut, BytesMut};
 use std::fmt::Write as FmtWrite;
 use std::io;
@@ -154,9 +154,17 @@ impl<'a> Basebackup<'a> {
            let img = self
                .timeline
                .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
-            ensure!(img.len() == pg_constants::BLCKSZ as usize);

-            slru_buf.extend_from_slice(&img);
+            if slru == SlruKind::Clog {
+                ensure!(
+                    img.len() == pg_constants::BLCKSZ as usize
+                        || img.len() == pg_constants::BLCKSZ as usize + 8
+                );
+            } else {
+                ensure!(img.len() == pg_constants::BLCKSZ as usize);
+            }
+
+            slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]);
        }

        let segname = format!("{}/{:>04X}", slru.to_str(), segno);
@@ -315,7 +323,8 @@ impl<'a> Basebackup<'a> {
        let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
        let wal_file_path = format!("pg_wal/{}", wal_file_name);
        let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
-        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
+        let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
+            .map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
        ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
        self.ar.append(&header, &wal_seg[..])?;
        Ok(())
--- a/pageserver/src/bin/dump_layerfile.rs
+++ b/pageserver/src/bin/dump_layerfile.rs
@@ -7,7 +7,9 @@ use pageserver::layered_repository::dump_layerfile_from_path;
 use pageserver::page_cache;
 use pageserver::virtual_file;
 use std::path::PathBuf;
-use utils::GIT_VERSION;
+use utils::project_git_version;
+
+project_git_version!(GIT_VERSION);

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith dump_layerfile utility")
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -8,12 +8,10 @@ use anyhow::{bail, Context, Result};
 use clap::{App, Arg};
 use daemonize::Daemonize;

+use fail::FailScenario;
 use pageserver::{
    config::{defaults::*, PageServerConf},
-    http, page_cache, page_service, profiling,
-    remote_storage::{self, SyncStartupData},
-    repository::{Repository, TimelineSyncStatusUpdate},
-    tenant_mgr, thread_mgr,
+    http, page_cache, page_service, profiling, tenant_mgr, thread_mgr,
    thread_mgr::ThreadKind,
    timelines, virtual_file, LOG_FILE_NAME,
 };
@@ -22,15 +20,21 @@ use utils::{
    http::endpoint,
    logging,
    postgres_backend::AuthType,
+    project_git_version,
    shutdown::exit_now,
    signals::{self, Signal},
    tcp_listener,
    zid::{ZTenantId, ZTimelineId},
-    GIT_VERSION,
 };

+project_git_version!(GIT_VERSION);
+
 fn version() -> String {
-    format!("{} profiling:{}", GIT_VERSION, cfg!(feature = "profiling"))
+    format!(
+        "{GIT_VERSION} profiling:{} failpoints:{}",
+        cfg!(feature = "profiling"),
+        fail::has_failpoints()
+    )
 }

 fn main() -> anyhow::Result<()> {
@@ -82,8 +86,23 @@ fn main() -> anyhow::Result<()> {
                .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
                Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
        )
+        .arg(
+            Arg::new("enabled-features")
+                .long("enabled-features")
+                .takes_value(false)
+                .help("Show enabled compile time features"),
+        )
        .get_matches();

+    if arg_matches.is_present("enabled-features") {
+        let features: &[&str] = &[
+            #[cfg(feature = "failpoints")]
+            "failpoints",
+        ];
+        println!("{{\"features\": {features:?} }}");
+        return Ok(());
+    }
+
    let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
    let workdir = workdir
        .canonicalize()
@@ -164,6 +183,14 @@ fn main() -> anyhow::Result<()> {
    // as a ref.
    let conf: &'static PageServerConf = Box::leak(Box::new(conf));

+    // If failpoints are used, terminate the whole pageserver process if they are hit.
+    let scenario = FailScenario::setup();
+    if fail::has_failpoints() {
+        std::panic::set_hook(Box::new(|panic_info| {
+            eprintln!("{panic_info}"); // this hook replaces the default one, so report here
+            std::process::exit(1);
+        }));
+    }
    // Basic initialization of things that don't change after startup
    virtual_file::init(conf.max_file_descriptors);
    page_cache::init(conf.page_cache_size);
@@ -179,17 +206,19 @@ fn main() -> anyhow::Result<()> {
                cfg_file_path.display()
            )
        })?;
-        Ok(())
    } else {
-        start_pageserver(conf, daemonize).context("Failed to start pageserver")
+        start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
    }
+
+    scenario.teardown();
+    Ok(())
 }

 fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
    // Initialize logger
    let log_file = logging::init(LOG_FILE_NAME, daemonize)?;

-    info!("version: {}", GIT_VERSION);
+    info!("version: {GIT_VERSION}");

    // TODO: Check that it looks like a valid repository before going further

@@ -235,47 +264,8 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()

    let signals = signals::install_shutdown_handlers()?;

-    // Initialize repositories with locally available timelines.
-    // Timelines that are only partially available locally (remote storage has more data than this pageserver)
-    // are scheduled for download and added to the repository once download is completed.
-    let SyncStartupData {
-        remote_index,
-        local_timeline_init_statuses,
-    } = remote_storage::start_local_timeline_sync(conf)
-        .context("Failed to set up local files sync with external storage")?;
-
-    for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
-        // initialize local tenant
-        let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index)
-            .with_context(|| format!("Failed to load repo for tenant {}", tenant_id))?;
-        for (timeline_id, init_status) in local_timeline_init_statuses {
-            match init_status {
-                remote_storage::LocalTimelineInitStatus::LocallyComplete => {
-                    debug!("timeline {} for tenant {} is locally complete, registering it in repository", timeline_id, tenant_id);
-                    // Lets fail here loudly to be on the safe side.
-                    // XXX: It may be a better api to actually distinguish between repository startup
-                    //   and processing of newly downloaded timelines.
-                    repo.apply_timeline_remote_sync_status_update(
-                        timeline_id,
-                        TimelineSyncStatusUpdate::Downloaded,
-                    )
-                    .with_context(|| {
-                        format!(
-                            "Failed to bootstrap timeline {} for tenant {}",
-                            timeline_id, tenant_id
-                        )
-                    })?
-                }
-                remote_storage::LocalTimelineInitStatus::NeedsSync => {
-                    debug!(
-                        "timeline {} for tenant {} needs sync, \
-                         so skipped for adding into repository until sync is finished",
-                        tenant_id, timeline_id
-                    );
-                }
-            }
-        }
-    }
+    // start profiler (if enabled)
+    let profiler_guard = profiling::init_profiler(conf);

    // initialize authentication for incoming connections
    let auth = match &conf.auth_type {
@@ -288,8 +278,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
    };
    info!("Using auth: {:#?}", conf.auth_type);

-    // start profiler (if enabled)
-    let profiler_guard = profiling::init_profiler(conf);
+    let remote_index = tenant_mgr::init_tenant_mgr(conf)?;

    // Spawn a new thread for the http endpoint
    // bind before launching separate thread so the error reported before startup exits
@@ -299,7 +288,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
        None,
        None,
        "http_endpoint_thread",
-        false,
+        true,
        move || {
            let router = http::make_router(conf, auth_cloned, remote_index)?;
            endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
@@ -313,7 +302,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
        None,
        None,
        "libpq endpoint thread",
-        false,
+        true,
        move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
    )?;

@@ -333,7 +322,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
                signal.name()
            );
            profiling::exit_profiler(conf, &profiler_guard);
-            pageserver::shutdown_pageserver();
+            pageserver::shutdown_pageserver(0);
            unreachable!()
        }
    })
--- a/pageserver/src/bin/update_metadata.rs
+++ b/pageserver/src/bin/update_metadata.rs
@@ -6,7 +6,9 @@ use clap::{App, Arg};
 use pageserver::layered_repository::metadata::TimelineMetadata;
 use std::path::PathBuf;
 use std::str::FromStr;
-use utils::{lsn::Lsn, GIT_VERSION};
+use utils::{lsn::Lsn, project_git_version};
+
+project_git_version!(GIT_VERSION);

 fn main() -> Result<()> {
    let arg_matches = App::new("Zenith update metadata utility")
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,6 +5,7 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
+use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config};
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::{Path, PathBuf};
@@ -33,18 +34,6 @@ pub mod defaults {
    pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";

    pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
-    /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-    /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-    /// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-    /// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-    pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50;
-    pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
-    /// Currently, sync happens with AWS S3, that has two limits on requests per second:
-    /// ~200 RPS for IAM services
-    /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
-    /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
-    /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
-    pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;

    pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
    pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
@@ -75,6 +64,7 @@ pub mod defaults {

 #gc_period = '{DEFAULT_GC_PERIOD}'
 #gc_horizon = {DEFAULT_GC_HORIZON}
+#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
 #pitr_interval = '{DEFAULT_PITR_INTERVAL}'

 # [remote_storage]
@@ -314,67 +304,6 @@ impl PageServerConfigBuilder {
    }
 }

-/// External backup storage configuration, enough for creating a client for that storage.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between pageserver and the remote storage.
-    pub max_concurrent_timelines_sync: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
-    /// The storage connection configuration.
-    pub storage: RemoteStorageKind,
-}
-
-/// A kind of a remote storage to connect to, with its connection configuration.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum RemoteStorageKind {
-    /// Storage based on local file system.
-    /// Specify a root folder to place all stored files into.
-    LocalFs(PathBuf),
-    /// AWS S3 based storage, storing all files in the S3 bucket
-    /// specified by the config
-    AwsS3(S3Config),
-}
-
-/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
-#[derive(Clone, PartialEq, Eq)]
-pub struct S3Config {
-    /// Name of the bucket to connect to.
-    pub bucket_name: String,
-    /// The region where the bucket is located at.
-    pub bucket_region: String,
-    /// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
-    pub prefix_in_bucket: Option<String>,
-    /// "Login" to use when connecting to bucket.
-    /// Can be empty for cases like AWS k8s IAM
-    /// where we can allow certain pods to connect
-    /// to the bucket directly without any credentials.
-    pub access_key_id: Option<String>,
-    /// "Password" to use when connecting to bucket.
-    pub secret_access_key: Option<String>,
-    /// A base URL to send S3 requests to.
-    /// By default, the endpoint is derived from a region name, assuming it's
-    /// an AWS S3 region name, erroring on wrong region name.
-    /// Endpoint provides a way to support other S3 flavors and their regions.
-    ///
-    /// Example: `http://127.0.0.1:5000`
-    pub endpoint: Option<String>,
-    /// AWS S3 has various limits on its API calls, we need not to exceed those.
-    /// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
-    pub concurrency_limit: NonZeroUsize,
-}
-
-impl std::fmt::Debug for S3Config {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("S3Config")
-            .field("bucket_name", &self.bucket_name)
-            .field("bucket_region", &self.bucket_region)
-            .field("prefix_in_bucket", &self.prefix_in_bucket)
-            .field("concurrency_limit", &self.concurrency_limit)
-            .finish()
-    }
-}
-
 impl PageServerConf {
    //
    // Repository paths, relative to workdir.
@@ -439,7 +368,7 @@ impl PageServerConf {
                "remote_storage" => {
                    builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
                }
-                "tenant_conf" => {
+                "tenant_config" => {
                    t_conf = Self::parse_toml_tenant_conf(item)?;
                }
                "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
@@ -522,21 +451,21 @@ impl PageServerConf {
        let bucket_name = toml.get("bucket_name");
        let bucket_region = toml.get("bucket_region");

-        let max_concurrent_timelines_sync = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_timelines_sync", toml)?
-                .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC),
+        let max_concurrent_syncs = NonZeroUsize::new(
+            parse_optional_integer("max_concurrent_syncs", toml)?
+                .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
        )
-        .context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?;
+        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;

        let max_sync_errors = NonZeroU32::new(
            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
+                .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
        )
        .context("Failed to parse 'max_sync_errors' as a positive integer")?;

        let concurrency_limit = NonZeroUsize::new(
            parse_optional_integer("concurrency_limit", toml)?
-                .unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
+                .unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
        )
        .context("Failed to parse 'concurrency_limit' as a positive integer")?;

@@ -551,16 +480,6 @@ impl PageServerConf {
            (None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
                bucket_name: parse_toml_string("bucket_name", bucket_name)?,
                bucket_region: parse_toml_string("bucket_region", bucket_region)?,
-                access_key_id: toml
-                    .get("access_key_id")
-                    .map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
-                    .transpose()?,
-                secret_access_key: toml
-                    .get("secret_access_key")
-                    .map(|secret_access_key| {
-                        parse_toml_string("secret_access_key", secret_access_key)
-                    })
-                    .transpose()?,
                prefix_in_bucket: toml
                    .get("prefix_in_bucket")
                    .map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
@@ -578,7 +497,7 @@ impl PageServerConf {
        };

        Ok(RemoteStorageConfig {
-            max_concurrent_timelines_sync,
+            max_concurrent_syncs,
            max_sync_errors,
            storage,
        })
@@ -806,11 +725,11 @@ pg_distrib_dir='{}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_timelines_sync: NonZeroUsize::new(
-                        defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC
+                    max_concurrent_syncs: NonZeroUsize::new(
+                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
                    )
                    .unwrap(),
-                    max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
+                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
@@ -828,29 +747,25 @@ pg_distrib_dir='{}'
        let bucket_name = "some-sample-bucket".to_string();
        let bucket_region = "eu-north-1".to_string();
        let prefix_in_bucket = "test_prefix".to_string();
-        let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
-        let secret_access_key = "SOMEsEcReTsd292v".to_string();
        let endpoint = "http://localhost:5000".to_string();
-        let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap();
+        let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
        let max_sync_errors = NonZeroU32::new(222).unwrap();
        let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();

        let identical_toml_declarations = &[
            format!(
                r#"[remote_storage]
-max_concurrent_timelines_sync = {max_concurrent_timelines_sync}
+max_concurrent_syncs = {max_concurrent_syncs}
 max_sync_errors = {max_sync_errors}
 bucket_name = '{bucket_name}'
 bucket_region = '{bucket_region}'
 prefix_in_bucket = '{prefix_in_bucket}'
-access_key_id = '{access_key_id}'
-secret_access_key = '{secret_access_key}'
 endpoint = '{endpoint}'
 concurrency_limit = {s3_concurrency_limit}"#
            ),
            format!(
-                "remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
-                bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
+                "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
+                bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
            ),
        ];

@@ -873,13 +788,11 @@ pg_distrib_dir='{}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_timelines_sync,
+                    max_concurrent_syncs,
                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
-                        access_key_id: Some(access_key_id.clone()),
-                        secret_access_key: Some(secret_access_key.clone()),
                        prefix_in_bucket: Some(prefix_in_bucket.clone()),
                        endpoint: Some(endpoint.clone()),
                        concurrency_limit: s3_concurrency_limit,
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -31,6 +31,7 @@ pub struct TenantCreateRequest {
    pub compaction_threshold: Option<usize>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
+    pub image_creation_threshold: Option<usize>,
    pub pitr_interval: Option<String>,
 }

@@ -65,6 +66,7 @@ pub struct TenantConfigRequest {
    pub compaction_threshold: Option<usize>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
+    pub image_creation_threshold: Option<usize>,
    pub pitr_interval: Option<String>,
 }

@@ -78,6 +80,7 @@ impl TenantConfigRequest {
            compaction_threshold: None,
            gc_horizon: None,
            gc_period: None,
+            image_creation_threshold: None,
            pitr_interval: None,
        }
    }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3,17 +3,16 @@ use std::sync::Arc;
 use anyhow::{Context, Result};
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
+use remote_storage::GenericRemoteStorage;
 use tracing::*;

 use super::models::{
    StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
    TimelineCreateRequest,
 };
-use crate::config::RemoteStorageKind;
-use crate::remote_storage::{
-    download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket,
-};
 use crate::repository::Repository;
+use crate::storage_sync;
+use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
 use crate::tenant_config::TenantConfOpt;
 use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
 use crate::{config::PageServerConf, tenant_mgr, timelines};
@@ -37,11 +36,6 @@ struct State {
    remote_storage: Option<GenericRemoteStorage>,
 }

-enum GenericRemoteStorage {
-    Local(LocalFs),
-    S3(S3Bucket),
-}
-
 impl State {
    fn new(
        conf: &'static PageServerConf,
@@ -57,14 +51,7 @@ impl State {
        let remote_storage = conf
            .remote_storage_config
            .as_ref()
-            .map(|storage_config| match &storage_config.storage {
-                RemoteStorageKind::LocalFs(root) => {
-                    LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local)
-                }
-                RemoteStorageKind::AwsS3(s3_config) => {
-                    S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3)
-                }
-            })
+            .map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
            .transpose()
            .context("Failed to init generic remote storage")?;

@@ -179,43 +166,47 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
    let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);

-    let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id);
+    let (local_timeline_info, remote_timeline_info) = async {
+        // any error here will render local timeline as None
+        // XXX .in_current_span does not attach messages in spawn_blocking future to current future's span
+        let local_timeline_info = tokio::task::spawn_blocking(move || {
+            let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+            let local_timeline = {
+                repo.get_timeline(timeline_id)
+                    .as_ref()
+                    .map(|timeline| {
+                        LocalTimelineInfo::from_repo_timeline(
+                            tenant_id,
+                            timeline_id,
+                            timeline,
+                            include_non_incremental_logical_size,
+                        )
+                    })
+                    .transpose()?
+            };
+            Ok::<_, anyhow::Error>(local_timeline)
+        })
+        .await
+        .ok()
+        .and_then(|r| r.ok())
+        .flatten();

-    let (local_timeline_info, span) = tokio::task::spawn_blocking(move || {
-        let entered = span.entered();
-        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
-        let local_timeline = {
-            repo.get_timeline(timeline_id)
-                .as_ref()
-                .map(|timeline| {
-                    LocalTimelineInfo::from_repo_timeline(
-                        tenant_id,
-                        timeline_id,
-                        timeline,
-                        include_non_incremental_logical_size,
-                    )
+        let remote_timeline_info = {
+            let remote_index_read = get_state(&request).remote_index.read().await;
+            remote_index_read
+                .timeline_entry(&ZTenantTimelineId {
+                    tenant_id,
+                    timeline_id,
+                })
+                .map(|remote_entry| RemoteTimelineInfo {
+                    remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
+                    awaits_download: remote_entry.awaits_download,
                })
-                .transpose()?
        };
-        Ok::<_, anyhow::Error>((local_timeline, entered.exit()))
-    })
-    .await
-    .map_err(ApiError::from_err)??;
-
-    let remote_timeline_info = {
-        let remote_index_read = get_state(&request).remote_index.read().await;
-        remote_index_read
-            .timeline_entry(&ZTenantTimelineId {
-                tenant_id,
-                timeline_id,
-            })
-            .map(|remote_entry| RemoteTimelineInfo {
-                remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
-                awaits_download: remote_entry.awaits_download,
-            })
-    };
-
-    let _enter = span.entered();
+        (local_timeline_info, remote_timeline_info)
+    }
+    .instrument(info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id))
+    .await;

    if local_timeline_info.is_none() && remote_timeline_info.is_none() {
        return Err(ApiError::NotFound(
@@ -244,7 +235,7 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
    );

    tokio::task::spawn_blocking(move || {
-        if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
+        if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
            // TODO: maybe answer with 309 Not Modified here?
            anyhow::bail!("Timeline is already present locally")
        };
@@ -269,14 +260,14 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
        }

        remote_timeline.awaits_download = true;
-        schedule_timeline_download(tenant_id, timeline_id);
+        storage_sync::schedule_layer_download(tenant_id, timeline_id);
        return json_response(StatusCode::ACCEPTED, ());
    } else {
        // no timeline in the index, release the lock to make the potentially lengthy download opetation
        drop(index_accessor);
    }

-    let new_timeline = match try_download_shard_data(state, sync_id).await {
+    let new_timeline = match try_download_index_part_data(state, sync_id).await {
        Ok(Some(mut new_timeline)) => {
            tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
                .await
@@ -305,35 +296,32 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
        }
        None => index_accessor.add_timeline_entry(sync_id, new_timeline),
    }
-    schedule_timeline_download(tenant_id, timeline_id);
+    storage_sync::schedule_layer_download(tenant_id, timeline_id);
    json_response(StatusCode::ACCEPTED, ())
 }

-async fn try_download_shard_data(
+async fn try_download_index_part_data(
    state: &State,
    sync_id: ZTenantTimelineId,
 ) -> anyhow::Result<Option<RemoteTimeline>> {
-    let shard = match state.remote_storage.as_ref() {
+    let index_part = match state.remote_storage.as_ref() {
        Some(GenericRemoteStorage::Local(local_storage)) => {
-            download_index_part(state.conf, local_storage, sync_id).await
+            storage_sync::download_index_part(state.conf, local_storage, sync_id).await
        }
        Some(GenericRemoteStorage::S3(s3_storage)) => {
-            download_index_part(state.conf, s3_storage, sync_id).await
+            storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
        }
        None => return Ok(None),
    }
-    .with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?;
+    .with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;

    let timeline_path = state
        .conf
        .timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
-    RemoteTimeline::from_index_part(&timeline_path, shard)
+    RemoteTimeline::from_index_part(&timeline_path, index_part)
        .map(Some)
        .with_context(|| {
-            format!(
-                "Failed to convert index shard into remote timeline for timeline {}",
-                sync_id
-            )
+            format!("Failed to convert index part into remote timeline for timeline {sync_id}")
        })
 }

@@ -347,8 +335,8 @@ async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body
        let _enter =
            info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
                .entered();
-        let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
-        repo.detach_timeline(timeline_id)
+        let state = get_state(&request);
+        tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
    })
    .await
    .map_err(ApiError::from_err)??;
@@ -365,7 +353,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
        crate::tenant_mgr::list_tenants()
    })
    .await
-    .map_err(ApiError::from_err)??;
+    .map_err(ApiError::from_err)?;

    json_response(StatusCode::OK, response_data)
 }
@@ -377,12 +365,13 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    let request_data: TenantCreateRequest = json_request(&mut request).await?;
    let remote_index = get_state(&request).remote_index.clone();

-    let mut tenant_conf: TenantConfOpt = Default::default();
+    let mut tenant_conf = TenantConfOpt::default();
    if let Some(gc_period) = request_data.gc_period {
        tenant_conf.gc_period =
            Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
    }
    tenant_conf.gc_horizon = request_data.gc_horizon;
+    tenant_conf.image_creation_threshold = request_data.image_creation_threshold;

    if let Some(pitr_interval) = request_data.pitr_interval {
        tenant_conf.pitr_interval =
@@ -430,6 +419,7 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
            Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
    }
    tenant_conf.gc_horizon = request_data.gc_horizon;
+    tenant_conf.image_creation_threshold = request_data.image_creation_threshold;

    if let Some(pitr_interval) = request_data.pitr_interval {
        tenant_conf.pitr_interval =
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -274,7 +274,7 @@ fn import_control_file<R: Repository>(

    // Extract the checkpoint record and import it separately.
    let pg_control = ControlFileData::decode(&buffer)?;
-    let checkpoint_bytes = pg_control.checkPointCopy.encode();
+    let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
    modification.put_checkpoint(checkpoint_bytes)?;

    Ok(pg_control)
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -20,8 +20,8 @@ use tracing::*;

 use std::cmp::{max, min, Ordering};
 use std::collections::hash_map::Entry;
-use std::collections::BTreeSet;
 use std::collections::HashMap;
+use std::collections::{BTreeSet, HashSet};
 use std::fs;
 use std::fs::{File, OpenOptions};
 use std::io::Write;
@@ -34,10 +34,9 @@ use std::time::{Duration, Instant, SystemTime};
 use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
 use crate::config::PageServerConf;
 use crate::keyspace::KeySpace;
+use crate::storage_sync::index::RemoteIndex;
 use crate::tenant_config::{TenantConf, TenantConfOpt};

-use crate::page_cache;
-use crate::remote_storage::{schedule_timeline_checkpoint_upload, RemoteIndex};
 use crate::repository::{
    GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter,
 };
@@ -48,6 +47,7 @@ use crate::virtual_file::VirtualFile;
 use crate::walreceiver::IS_WAL_RECEIVER;
 use crate::walredo::WalRedoManager;
 use crate::CheckpointConfig;
+use crate::{page_cache, storage_sync};

 use metrics::{
    register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec,
@@ -74,6 +74,7 @@ pub mod metadata;
 mod par_fsync;
 mod storage_layer;

+use crate::pgdatadir_mapping::LsnForTimestamp;
 use delta_layer::{DeltaLayer, DeltaLayerWriter};
 use ephemeral_file::is_ephemeral_file;
 use filename::{DeltaFileName, ImageFileName};
@@ -81,6 +82,7 @@ use image_layer::{ImageLayer, ImageLayerWriter};
 use inmemory_layer::InMemoryLayer;
 use layer_map::LayerMap;
 use layer_map::SearchResult;
+use postgres_ffi::xlog_utils::to_pg_timestamp;
 use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};

 // re-export this function so that page_cache.rs can use it.
@@ -89,7 +91,7 @@ pub use crate::layered_repository::ephemeral_file::writeback as writeback_epheme
 // Metrics collected on operations on the storage repository.
 lazy_static! {
    static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_storage_time",
+        "pageserver_storage_operations_seconds",
        "Time spent on storage operations",
        &["operation", "tenant_id", "timeline_id"]
    )
@@ -99,8 +101,8 @@ lazy_static! {
 // Metrics collected on operations on the storage repository.
 lazy_static! {
    static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_getpage_reconstruct_time",
-        "Time spent on storage operations",
+        "pageserver_getpage_reconstruct_seconds",
+        "Time spent in reconstruct_value",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
@@ -108,13 +110,13 @@ lazy_static! {

 lazy_static! {
    static ref MATERIALIZED_PAGE_CACHE_HIT: IntCounterVec = register_int_counter_vec!(
-        "materialize_page_cache_hits",
+        "pageserver_materialized_cache_hits_total",
        "Number of cache hits from materialized page cache",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
    static ref WAIT_LSN_TIME: HistogramVec = register_histogram_vec!(
-        "wait_lsn_time",
+        "pageserver_wait_lsn_seconds",
        "Time spent waiting for WAL to arrive",
        &["tenant_id", "timeline_id"]
    )
@@ -134,12 +136,12 @@ lazy_static! {
 // or in testing they estimate how much we would upload if we did.
 lazy_static! {
    static ref NUM_PERSISTENT_FILES_CREATED: IntCounter = register_int_counter!(
-        "pageserver_num_persistent_files_created",
+        "pageserver_created_persistent_files_total",
        "Number of files created that are meant to be uploaded to cloud storage",
    )
    .expect("failed to define a metric");
    static ref PERSISTENT_BYTES_WRITTEN: IntCounter = register_int_counter!(
-        "pageserver_persistent_bytes_written",
+        "pageserver_written_persistent_bytes_total",
        "Total bytes written that are meant to be uploaded to cloud storage",
    )
    .expect("failed to define a metric");
@@ -161,7 +163,7 @@ pub struct LayeredRepository {
    // This is necessary to allow global config updates.
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

-    tenantid: ZTenantId,
+    tenant_id: ZTenantId,
    timelines: Mutex<HashMap<ZTimelineId, LayeredTimelineEntry>>,
    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
@@ -223,10 +225,10 @@ impl Repository for LayeredRepository {
        let mut timelines = self.timelines.lock().unwrap();

        // Create the timeline directory, and write initial metadata to file.
-        crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
+        crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?;

        let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
-        Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
+        Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?;

        let timeline = LayeredTimeline::new(
            self.conf,
@@ -234,7 +236,7 @@ impl Repository for LayeredRepository {
            metadata,
            None,
            timelineid,
-            self.tenantid,
+            self.tenant_id,
            Arc::clone(&self.walredo_mgr),
            self.upload_layers,
        );
@@ -283,7 +285,7 @@ impl Repository for LayeredRepository {
        };

        // create a new timeline directory
-        let timelinedir = self.conf.timeline_path(&dst, &self.tenantid);
+        let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id);

        crashsafe_dir::create_dir(&timelinedir)?;

@@ -298,8 +300,8 @@ impl Repository for LayeredRepository {
            *src_timeline.latest_gc_cutoff_lsn.read().unwrap(),
            src_timeline.initdb_lsn,
        );
-        crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
-        Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
+        crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?;
+        Self::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
        timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata });

        info!("branched timeline {} from {} at {}", dst, src, start_lsn);
@@ -322,7 +324,7 @@ impl Repository for LayeredRepository {
            .unwrap_or_else(|| "-".to_string());

        STORAGE_TIME
-            .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str])
+            .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
            .observe_closure_duration(|| {
                self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc)
            })
@@ -342,7 +344,7 @@ impl Repository for LayeredRepository {

        for (timelineid, timeline) in &timelines_to_compact {
            let _entered =
-                info_span!("compact", timeline = %timelineid, tenant = %self.tenantid).entered();
+                info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered();
            match timeline {
                LayeredTimelineEntry::Loaded(timeline) => {
                    timeline.compact()?;
@@ -383,27 +385,33 @@ impl Repository for LayeredRepository {

        for (timelineid, timeline) in &timelines_to_compact {
            let _entered =
-                info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered();
+                info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenant_id)
+                    .entered();
            timeline.checkpoint(CheckpointConfig::Flush)?;
        }

        Ok(())
    }

-    // Detaches the timeline from the repository.
-    fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
+    fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
        let mut timelines = self.timelines.lock().unwrap();
-        if timelines.remove(&timeline_id).is_none() {
-            bail!("cannot detach timeline that is not available locally");
-        }
+        // check no child timelines, because detach will remove files, which will break child branches
+        // FIXME this can still be violated because we do not guarantee
+        //   that all ancestors are downloaded/attached to the same pageserver
+        let num_children = timelines
+            .iter()
+            .filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id))
+            .count();

-        // Release the lock to shutdown and remove the files without holding it
-        drop(timelines);
-        // shutdown the timeline (this shuts down the walreceiver)
-        thread_mgr::shutdown_threads(None, Some(self.tenantid), Some(timeline_id));
+        ensure!(
+            num_children == 0,
+            "Cannot detach timeline which has child timelines"
+        );

-        // remove timeline files (maybe avoid this for ease of debugging if something goes wrong)
-        fs::remove_dir_all(self.conf.timeline_path(&timeline_id, &self.tenantid))?;
+        ensure!(
+            timelines.remove(&timeline_id).is_some(),
+            "Cannot detach timeline {timeline_id} that is not available locally"
+        );
        Ok(())
    }

@@ -422,7 +430,7 @@ impl Repository for LayeredRepository {
                    Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
                    Entry::Vacant(entry) => {
                        // we need to get metadata of a timeline, another option is to pass it along with Downloaded status
-                        let metadata = Self::load_metadata(self.conf, timeline_id, self.tenantid).context("failed to load local metadata")?;
+                        let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
                        // finally we make newly downloaded timeline visible to repository
                        entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
                    },
@@ -449,7 +457,7 @@ enum LayeredTimelineEntry {
 impl LayeredTimelineEntry {
    fn timeline_id(&self) -> ZTimelineId {
        match self {
-            LayeredTimelineEntry::Loaded(timeline) => timeline.timelineid,
+            LayeredTimelineEntry::Loaded(timeline) => timeline.timeline_id,
            LayeredTimelineEntry::Unloaded { id, .. } => *id,
        }
    }
@@ -535,6 +543,13 @@ impl LayeredRepository {
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

+    pub fn get_image_creation_threshold(&self) -> usize {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .image_creation_threshold
+            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
+    }
+
    pub fn get_pitr_interval(&self) -> Duration {
        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
@@ -547,7 +562,7 @@ impl LayeredRepository {

        tenant_conf.update(&new_tenant_conf);

-        LayeredRepository::persist_tenant_config(self.conf, self.tenantid, *tenant_conf)?;
+        LayeredRepository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?;
        Ok(())
    }

@@ -602,21 +617,17 @@ impl LayeredRepository {

    fn load_local_timeline(
        &self,
-        timelineid: ZTimelineId,
+        timeline_id: ZTimelineId,
        timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
    ) -> anyhow::Result<Arc<LayeredTimeline>> {
-        let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)
+        let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
            .context("failed to load metadata")?;
        let disk_consistent_lsn = metadata.disk_consistent_lsn();

        let ancestor = metadata
            .ancestor_timeline()
            .map(|ancestor_timeline_id| {
-                trace!(
-                    "loading {}'s ancestor {}",
-                    timelineid,
-                    &ancestor_timeline_id
-                );
+                trace!("loading {timeline_id}'s ancestor {}", &ancestor_timeline_id);
                self.get_timeline_load_internal(ancestor_timeline_id, timelines)
            })
            .transpose()
@@ -630,8 +641,8 @@ impl LayeredRepository {
            Arc::clone(&self.tenant_conf),
            metadata,
            ancestor,
-            timelineid,
-            self.tenantid,
+            timeline_id,
+            self.tenant_id,
            Arc::clone(&self.walredo_mgr),
            self.upload_layers,
        );
@@ -646,12 +657,12 @@ impl LayeredRepository {
        conf: &'static PageServerConf,
        tenant_conf: TenantConfOpt,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
-        tenantid: ZTenantId,
+        tenant_id: ZTenantId,
        remote_index: RemoteIndex,
        upload_layers: bool,
    ) -> LayeredRepository {
        LayeredRepository {
-            tenantid,
+            tenant_id,
            conf,
            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
            timelines: Mutex::new(HashMap::new()),
@@ -690,7 +701,7 @@ impl LayeredRepository {
        let mut tenant_conf: TenantConfOpt = Default::default();
        for (key, item) in toml.iter() {
            match key {
-                "tenant_conf" => {
+                "tenant_config" => {
                    tenant_conf = PageServerConf::parse_toml_tenant_conf(item)?;
                }
                _ => bail!("unrecognized pageserver option '{}'", key),
@@ -712,7 +723,7 @@ impl LayeredRepository {
        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.

-# [tenant_config]
+[tenant_config]
 "#
        .to_string();

@@ -728,7 +739,7 @@ impl LayeredRepository {
    }

    /// Save timeline metadata to file
-    fn save_metadata(
+    pub fn save_metadata(
        conf: &'static PageServerConf,
        timelineid: ZTimelineId,
        tenantid: ZTenantId,
@@ -763,17 +774,6 @@ impl LayeredRepository {
        Ok(())
    }

-    fn load_metadata(
-        conf: &'static PageServerConf,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
-    ) -> Result<TimelineMetadata> {
-        let path = metadata_path(conf, timelineid, tenantid);
-        info!("loading metadata from {}", path.display());
-        let metadata_bytes = std::fs::read(&path)?;
-        TimelineMetadata::from_bytes(&metadata_bytes)
-    }
-
    //
    // How garbage collection works:
    //
@@ -806,7 +806,7 @@ impl LayeredRepository {
        checkpoint_before_gc: bool,
    ) -> Result<GcResult> {
        let _span_guard =
-            info_span!("gc iteration", tenant = %self.tenantid, timeline = ?target_timelineid)
+            info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timelineid)
                .entered();
        let mut totals: GcResult = Default::default();
        let now = Instant::now();
@@ -890,14 +890,18 @@ impl LayeredRepository {
        totals.elapsed = now.elapsed();
        Ok(totals)
    }
+
+    pub fn tenant_id(&self) -> ZTenantId {
+        self.tenant_id
+    }
 }

 pub struct LayeredTimeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<RwLock<TenantConfOpt>>,

-    tenantid: ZTenantId,
-    timelineid: ZTimelineId,
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,

    layers: RwLock<LayerMap>,

@@ -1155,6 +1159,13 @@ impl LayeredTimeline {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    fn get_image_creation_threshold(&self) -> usize {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .image_creation_threshold
+            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
+    }
+
    /// Open a Timeline handle.
    ///
    /// Loads the metadata for the timeline into memory, but not the layer map.
@@ -1164,50 +1175,50 @@ impl LayeredTimeline {
        tenant_conf: Arc<RwLock<TenantConfOpt>>,
        metadata: TimelineMetadata,
        ancestor: Option<LayeredTimelineEntry>,
-        timelineid: ZTimelineId,
-        tenantid: ZTenantId,
+        timeline_id: ZTimelineId,
+        tenant_id: ZTenantId,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        upload_layers: bool,
    ) -> LayeredTimeline {
        let reconstruct_time_histo = RECONSTRUCT_TIME
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();
        let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();
        let flush_time_histo = STORAGE_TIME
            .get_metric_with_label_values(&[
                "layer flush",
-                &tenantid.to_string(),
-                &timelineid.to_string(),
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
            ])
            .unwrap();
        let compact_time_histo = STORAGE_TIME
            .get_metric_with_label_values(&[
                "compact",
-                &tenantid.to_string(),
-                &timelineid.to_string(),
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
            ])
            .unwrap();
        let create_images_time_histo = STORAGE_TIME
            .get_metric_with_label_values(&[
                "create images",
-                &tenantid.to_string(),
-                &timelineid.to_string(),
+                &tenant_id.to_string(),
+                &timeline_id.to_string(),
            ])
            .unwrap();
        let last_record_gauge = LAST_RECORD_LSN
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();
        let wait_lsn_time_histo = WAIT_LSN_TIME
-            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
            .unwrap();

        LayeredTimeline {
            conf,
            tenant_conf,
-            timelineid,
-            tenantid,
+            timeline_id,
+            tenant_id,
            layers: RwLock::new(LayerMap::default()),

            walredo_mgr,
@@ -1259,7 +1270,7 @@ impl LayeredTimeline {

        // Scan timeline directory and create ImageFileName and DeltaFilename
        // structs representing all files on disk
-        let timeline_path = self.conf.timeline_path(&self.timelineid, &self.tenantid);
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);

        for direntry in fs::read_dir(timeline_path)? {
            let direntry = direntry?;
@@ -1271,7 +1282,7 @@ impl LayeredTimeline {
                if imgfilename.lsn > disk_consistent_lsn {
                    warn!(
                        "found future image layer {} on timeline {} disk_consistent_lsn is {}",
-                        imgfilename, self.timelineid, disk_consistent_lsn
+                        imgfilename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(direntry.path())?;
@@ -1279,7 +1290,7 @@ impl LayeredTimeline {
                }

                let layer =
-                    ImageLayer::new(self.conf, self.timelineid, self.tenantid, &imgfilename);
+                    ImageLayer::new(self.conf, self.timeline_id, self.tenant_id, &imgfilename);

                trace!("found layer {}", layer.filename().display());
                layers.insert_historic(Arc::new(layer));
@@ -1294,7 +1305,7 @@ impl LayeredTimeline {
                if deltafilename.lsn_range.end > disk_consistent_lsn + 1 {
                    warn!(
                        "found future delta layer {} on timeline {} disk_consistent_lsn is {}",
-                        deltafilename, self.timelineid, disk_consistent_lsn
+                        deltafilename, self.timeline_id, disk_consistent_lsn
                    );

                    rename_to_backup(direntry.path())?;
@@ -1302,7 +1313,7 @@ impl LayeredTimeline {
                }

                let layer =
-                    DeltaLayer::new(self.conf, self.timelineid, self.tenantid, &deltafilename);
+                    DeltaLayer::new(self.conf, self.timeline_id, self.tenant_id, &deltafilename);

                trace!("found layer {}", layer.filename().display());
                layers.insert_historic(Arc::new(layer));
@@ -1421,7 +1432,8 @@ impl LayeredTimeline {

            let layers = timeline.layers.read().unwrap();

-            // Check the open and frozen in-memory layers first
+            // Check the open and frozen in-memory layers first, in order from newest
+            // to oldest.
            if let Some(open_layer) = &layers.open_layer {
                let start_lsn = open_layer.get_lsn_range().start;
                if cont_lsn > start_lsn {
@@ -1439,7 +1451,7 @@ impl LayeredTimeline {
                    continue;
                }
            }
-            for frozen_layer in layers.frozen_layers.iter() {
+            for frozen_layer in layers.frozen_layers.iter().rev() {
                let start_lsn = frozen_layer.get_lsn_range().start;
                if cont_lsn > start_lsn {
                    //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
@@ -1483,7 +1495,7 @@ impl LayeredTimeline {
        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
        let (lsn, read_guard) =
-            cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?;
+            cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
    }
@@ -1492,12 +1504,20 @@ impl LayeredTimeline {
        let ancestor = self
            .ancestor_timeline
            .as_ref()
-            .expect("there should be an ancestor")
+            .with_context(|| {
+                format!(
+                    "Ancestor is missing. Timeline id: {} Ancestor id {:?}",
+                    self.timeline_id,
+                    self.get_ancestor_timeline_id(),
+                )
+            })?
            .ensure_loaded()
            .with_context(|| {
                format!(
-                "Cannot get the whole layer for read locked: timeline {} is not present locally",
-                self.get_ancestor_timeline_id().unwrap())
+                    "Ancestor timeline is not loaded. Timeline id: {} Ancestor id {:?}",
+                    self.timeline_id,
+                    self.get_ancestor_timeline_id(),
+                )
            })?;
        Ok(Arc::clone(ancestor))
    }
@@ -1532,12 +1552,12 @@ impl LayeredTimeline {

            trace!(
                "creating layer for write at {}/{} for record at {}",
-                self.timelineid,
+                self.timeline_id,
                start_lsn,
                lsn
            );
            let new_layer =
-                InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?;
+                InMemoryLayer::create(self.conf, self.timeline_id, self.tenant_id, start_lsn)?;
            let layer_rc = Arc::new(new_layer);

            layers.open_layer = Some(Arc::clone(&layer_rc));
@@ -1611,8 +1631,8 @@ impl LayeredTimeline {
            let self_clone = Arc::clone(self);
            thread_mgr::spawn(
                thread_mgr::ThreadKind::LayerFlushThread,
-                Some(self.tenantid),
-                Some(self.timelineid),
+                Some(self.tenant_id),
+                Some(self.timeline_id),
                "layer flush thread",
                false,
                move || self_clone.flush_frozen_layers(false),
@@ -1681,10 +1701,13 @@ impl LayeredTimeline {
        // them all in parallel.
        par_fsync::par_fsync(&[
            new_delta_path.clone(),
-            self.conf.timeline_path(&self.timelineid, &self.tenantid),
+            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
        ])?;
+        fail_point!("checkpoint-before-sync");

-        // Finally, replace the frozen in-memory layer with the new on-disk layers
+        fail_point!("flush-frozen");
+
+        // Finally, replace the frozen in-memory layer with the new on-disk layer
        {
            let mut layers = self.layers.write().unwrap();
            let l = layers.frozen_layers.pop_front();
@@ -1705,6 +1728,7 @@ impl LayeredTimeline {
        // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
        // *all* the layers, to avoid fsyncing the file multiple times.
        let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1);
+        fail_point!("checkpoint-after-sync");

        // If we were able to advance 'disk_consistent_lsn', save it to the metadata file.
        // After crash, we will restart WAL streaming and processing from that point.
@@ -1749,8 +1773,8 @@ impl LayeredTimeline {

            LayeredRepository::save_metadata(
                self.conf,
-                self.timelineid,
-                self.tenantid,
+                self.timeline_id,
+                self.tenant_id,
                &metadata,
                false,
            )?;
@@ -1759,11 +1783,11 @@ impl LayeredTimeline {
            PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());

            if self.upload_layers.load(atomic::Ordering::Relaxed) {
-                schedule_timeline_checkpoint_upload(
-                    self.tenantid,
-                    self.timelineid,
-                    new_delta_path,
-                    metadata,
+                storage_sync::schedule_layer_upload(
+                    self.tenant_id,
+                    self.timeline_id,
+                    HashSet::from([new_delta_path]),
+                    Some(metadata),
                );
            }

@@ -1814,7 +1838,8 @@ impl LayeredTimeline {
        let target_file_size = self.get_checkpoint_distance();

        // Define partitioning schema if needed
-        if let Ok(pgdir) = tenant_mgr::get_timeline_for_tenant_load(self.tenantid, self.timelineid)
+        if let Ok(pgdir) =
+            tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
        {
            let (partitioning, lsn) = pgdir.repartition(
                self.get_last_record_lsn(),
@@ -1823,11 +1848,21 @@ impl LayeredTimeline {
            let timer = self.create_images_time_histo.start_timer();
            // 2. Create new image layers for partitions that have been modified
            // "enough".
+            let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len());
            for part in partitioning.parts.iter() {
-                if self.time_for_new_image_layer(part, lsn, 3)? {
-                    self.create_image_layer(part, lsn)?;
+                if self.time_for_new_image_layer(part, lsn)? {
+                    let new_path = self.create_image_layer(part, lsn)?;
+                    layer_paths_to_upload.insert(new_path);
                }
            }
+            if self.upload_layers.load(atomic::Ordering::Relaxed) {
+                storage_sync::schedule_layer_upload(
+                    self.tenant_id,
+                    self.timeline_id,
+                    layer_paths_to_upload,
+                    None,
+                );
+            }
            timer.stop_and_record();

            // 3. Compact
@@ -1842,18 +1877,13 @@ impl LayeredTimeline {
    }

    // Is it time to create a new image layer for the given partition?
-    fn time_for_new_image_layer(
-        &self,
-        partition: &KeySpace,
-        lsn: Lsn,
-        threshold: usize,
-    ) -> Result<bool> {
+    fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<bool> {
        let layers = self.layers.read().unwrap();

        for part_range in &partition.ranges {
            let image_coverage = layers.image_coverage(part_range, lsn)?;
            for (img_range, last_img) in image_coverage {
-                let img_lsn = if let Some(ref last_img) = last_img {
+                let img_lsn = if let Some(last_img) = last_img {
                    last_img.get_lsn_range().end
                } else {
                    Lsn(0)
@@ -1865,7 +1895,7 @@ impl LayeredTimeline {
                    "range {}-{}, has {} deltas on this timeline",
                    img_range.start, img_range.end, num_deltas
                );
-                if num_deltas >= threshold {
+                if num_deltas >= self.get_image_creation_threshold() {
                    return Ok(true);
                }
            }
@@ -1874,11 +1904,11 @@ impl LayeredTimeline {
        Ok(false)
    }

-    fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> {
+    fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<PathBuf> {
        let img_range =
            partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
        let mut image_layer_writer =
-            ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?;
+            ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?;

        for range in &partition.ranges {
            let mut key = range.start;
@@ -1901,16 +1931,17 @@ impl LayeredTimeline {
        // and fsync them all in parallel.
        par_fsync::par_fsync(&[
            image_layer.path(),
-            self.conf.timeline_path(&self.timelineid, &self.tenantid),
+            self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
        ])?;

        // FIXME: Do we need to do something to upload it to remote storage here?

        let mut layers = self.layers.write().unwrap();
+        let new_path = image_layer.path();
        layers.insert_historic(Arc::new(image_layer));
        drop(layers);

-        Ok(())
+        Ok(new_path)
    }

    fn compact_level0(&self, target_file_size: u64) -> Result<()> {
@@ -1976,8 +2007,8 @@ impl LayeredTimeline {
            if writer.is_none() {
                writer = Some(DeltaLayerWriter::new(
                    self.conf,
-                    self.timelineid,
-                    self.tenantid,
+                    self.timeline_id,
+                    self.tenant_id,
                    key,
                    lsn_range.clone(),
                )?);
@@ -1995,7 +2026,7 @@ impl LayeredTimeline {
            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // also sync the directory
-            layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid));
+            layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));

            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
@@ -2005,18 +2036,38 @@ impl LayeredTimeline {
        }

        let mut layers = self.layers.write().unwrap();
+        let mut new_layer_paths = HashSet::with_capacity(new_layers.len());
        for l in new_layers {
+            new_layer_paths.insert(l.path());
            layers.insert_historic(Arc::new(l));
        }

        // Now that we have reshuffled the data to a set of new delta layers, we can
        // delete the old ones
+        let mut layer_paths_do_delete = HashSet::with_capacity(level0_deltas.len());
        for l in level0_deltas {
            l.delete()?;
-            layers.remove_historic(l.clone());
+            if let Some(path) = l.local_path() {
+                layer_paths_do_delete.insert(path);
+            }
+            layers.remove_historic(l);
        }
        drop(layers);

+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+            storage_sync::schedule_layer_upload(
+                self.tenant_id,
+                self.timeline_id,
+                new_layer_paths,
+                None,
+            );
+            storage_sync::schedule_layer_delete(
+                self.tenant_id,
+                self.timeline_id,
+                layer_paths_do_delete,
+            );
+        }
+
        Ok(())
    }

@@ -2069,17 +2120,60 @@ impl LayeredTimeline {
        let cutoff = gc_info.cutoff;
        let pitr = gc_info.pitr;

-        let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered();
+        // Calculate pitr cutoff point.
+        // If we cannot determine a cutoff LSN, be conservative and don't GC anything.
+        let mut pitr_cutoff_lsn: Lsn = *self.get_latest_gc_cutoff_lsn();
+
+        if let Ok(timeline) =
+            tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
+        {
+            // First, calculate pitr_cutoff_timestamp and then convert it to LSN.
+            // If we don't have enough data to convert to LSN,
+            // play safe and don't remove any layers.
+            if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
+                let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
+
+                match timeline.find_lsn_for_timestamp(pitr_timestamp)? {
+                    LsnForTimestamp::Present(lsn) => pitr_cutoff_lsn = lsn,
+                    LsnForTimestamp::Future(lsn) => {
+                        debug!("future({})", lsn);
+                        pitr_cutoff_lsn = cutoff;
+                    }
+                    LsnForTimestamp::Past(lsn) => {
+                        debug!("past({})", lsn);
+                    }
+                }
+                debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
+            }
+        } else if cfg!(test) {
+            // We don't have local timeline in mocked cargo tests.
+            // So, just ignore pitr_interval setting in this case.
+            pitr_cutoff_lsn = cutoff;
+        }
+
+        let new_gc_cutoff = Lsn::min(cutoff, pitr_cutoff_lsn);
+
+        // Nothing to GC. Return early.
+        if *self.get_latest_gc_cutoff_lsn() >= new_gc_cutoff {
+            info!(
+                "Nothing to GC for timeline {}. cutoff_lsn {}",
+                self.timeline_id, new_gc_cutoff
+            );
+            result.elapsed = now.elapsed()?;
+            return Ok(result);
+        }
+
+        let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %cutoff).entered();

        // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
        // See branch_timeline() for details.
-        *self.latest_gc_cutoff_lsn.write().unwrap() = cutoff;
+        *self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff;

        info!("GC starting");

        debug!("retain_lsns: {:?}", retain_lsns);

-        let mut layers_to_remove: Vec<Arc<dyn Layer>> = Vec::new();
+        let mut layers_to_remove = Vec::new();

        // Scan all on-disk layers in the timeline.
        //
@@ -2113,30 +2207,18 @@ impl LayeredTimeline {
                result.layers_needed_by_cutoff += 1;
                continue 'outer;
            }
-            // 2. It is newer than PiTR interval?
-            // We use modification time of layer file to estimate update time.
-            // This estimation is not quite precise but maintaining LSN->timestamp map seems to be overkill.
-            // It is not expected that users will need high precision here. And this estimation
-            // is conservative: modification time of file is always newer than actual time of version
-            // creation. So it is safe for users.
-            // TODO A possible "bloat" issue still persists here.
-            // If modification time changes because of layer upload/download, we will keep these files
-            // longer than necessary.
-            // https://github.com/neondatabase/neon/issues/1554
-            //
-            if let Ok(metadata) = fs::metadata(&l.filename()) {
-                let last_modified = metadata.modified()?;
-                if now.duration_since(last_modified)? < pitr {
-                    debug!(
-                        "keeping {} because it's modification time {:?} is newer than PITR {:?}",
-                        l.filename().display(),
-                        last_modified,
-                        pitr
-                    );
-                    result.layers_needed_by_pitr += 1;
-                    continue 'outer;
-                }
+
+            // 2. It is newer than PiTR cutoff point?
+            if l.get_lsn_range().end > pitr_cutoff_lsn {
+                debug!(
+                    "keeping {} because it's newer than pitr_cutoff_lsn {}",
+                    l.filename().display(),
+                    pitr_cutoff_lsn
+                );
+                result.layers_needed_by_pitr += 1;
+                continue 'outer;
            }
+
            // 3. Is it needed by a child branch?
            // NOTE With that we would keep data that
            // might be referenced by child branches forever.
@@ -2190,13 +2272,24 @@ impl LayeredTimeline {
        // Actually delete the layers from disk and remove them from the map.
        // (couldn't do this in the loop above, because you cannot modify a collection
        // while iterating it. BTreeMap::retain() would be another option)
+        let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len());
        for doomed_layer in layers_to_remove {
            doomed_layer.delete()?;
-            layers.remove_historic(doomed_layer.clone());
-
+            if let Some(path) = doomed_layer.local_path() {
+                layer_paths_to_delete.insert(path);
+            }
+            layers.remove_historic(doomed_layer);
            result.layers_removed += 1;
        }

+        if self.upload_layers.load(atomic::Ordering::Relaxed) {
+            storage_sync::schedule_layer_delete(
+                self.tenant_id,
+                self.timeline_id,
+                layer_paths_to_delete,
+            );
+        }
+
        result.elapsed = now.elapsed()?;
        Ok(result)
    }
@@ -2260,8 +2353,8 @@ impl LayeredTimeline {
                if img.len() == page_cache::PAGE_SZ {
                    let cache = page_cache::get();
                    cache.memorize_materialized_page(
-                        self.tenantid,
-                        self.timelineid,
+                        self.tenant_id,
+                        self.timeline_id,
                        key,
                        last_rec_lsn,
                        &img,
@@ -2343,6 +2436,26 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
    bail!("couldn't find an unused backup number for {:?}", path)
 }

+/// Loads the metadata of the given timeline from its metadata file.
+///
+/// The file location is resolved with `metadata_path`; the raw bytes are
+/// read with `std::fs::read` and decoded with `TimelineMetadata::from_bytes`.
+/// A failure in either step is returned with the offending path attached
+/// as error context.
+fn load_metadata(
+    conf: &'static PageServerConf,
+    timeline_id: ZTimelineId,
+    tenant_id: ZTenantId,
+) -> anyhow::Result<TimelineMetadata> {
+    let path = metadata_path(conf, timeline_id, tenant_id);
+
+    let raw = std::fs::read(&path)
+        .with_context(|| format!("Failed to read metadata bytes from path {}", path.display()))?;
+
+    let metadata = TimelineMetadata::from_bytes(&raw)
+        .with_context(|| format!("Failed to parse metadata bytes from path {}", path.display()))?;
+    Ok(metadata)
+}
+
 ///
 /// Tests that are specific to the layered storage format.
 ///
@@ -2377,9 +2490,19 @@ pub mod tests {

        let err = harness.try_load().err().expect("should fail");
        assert_eq!(err.to_string(), "failed to load local metadata");
-        assert_eq!(
-            err.source().unwrap().to_string(),
-            "metadata checksum mismatch"
+
+        let mut found_error_message = false;
+        let mut err_source = err.source();
+        while let Some(source) = err_source {
+            if source.to_string() == "metadata checksum mismatch" {
+                found_error_message = true;
+                break;
+            }
+            err_source = source.source();
+        }
+        assert!(
+            found_error_message,
+            "didn't find the corrupted metadata error"
        );

        Ok(())
--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -23,6 +23,7 @@ distribution depends on the workload: the updates could be totally random, or
 there could be a long stream of updates to a single relation when data is bulk
 loaded, for example, or something in between.

+```
 Cloud Storage                   Page Server                           Safekeeper
                        L1               L0             Memory            WAL

@@ -37,6 +38,7 @@ Cloud Storage                   Page Server                           Safekeeper
 +----+----+          +----+----+      |   |     |
 |EEEE|               |EEEE|EEEE|      +---+-----+
 +----+               +----+----+
+```

 In this illustration, WAL is received as a stream from the Safekeeper, from the
 right.  It is immediately captured by the page server and stored quickly in
@@ -47,7 +49,7 @@ the same page and relation close to each other.
 From the page server memory, whenever enough WAL has been accumulated, it is flushed
 to disk into a new L0 layer file, and the memory is released.

-When enough L0 files have been accumulated, they are merged together rand sliced
+When enough L0 files have been accumulated, they are merged together and sliced
 per key-space, producing a new set of files where each file contains a more
 narrow key range, but larger LSN range.

@@ -121,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
 a range of LSNs (or a single LSN, in case of image layers). You can think of it
 as a rectangle in the two-dimensional key-LSN space. The layer files for each
 timeline are stored in the timeline's subdirectory under
-.zenith/tenants/<tenantid>/timelines.
+`.zenith/tenants/<tenantid>/timelines`.

 There are two kinds of layer files: images, and delta layers. An image file
 contains a snapshot of all keys at a particular LSN, whereas a delta file
@@ -130,8 +132,11 @@ range of LSN.

 image file:

+```
    000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
              start key                          end key                           LSN
+```
+

 The first parts define the key range that the layer covers. See
 pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
@@ -140,8 +145,10 @@ delta file:

 Delta files are named similarly, but they cover a range of LSNs:

+```
    000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
              start key                          end key                          start LSN     end LSN
+```

 A delta file contains all the key-values in the key-range that were updated in
 the LSN range. If a key has not been modified, there is no trace of it in the
@@ -151,7 +158,9 @@ delta layer.
 A delta layer file can cover a part of the overall key space, as in the previous
 example, or the whole key range like this:

+```
    000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
+```

 A file that covers the whole key range is called a L0 file (Level 0), while a
 file that covers only part of the key range is called a L1 file. The "level" of
@@ -168,7 +177,9 @@ version, and how branching and GC works is still valid.

 The full path of a delta file looks like this:

+```
    .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
+```

 For simplicity, the examples below use a simplified notation for the
 paths.  The tenant ID is left out, the timeline ID is replaced with
@@ -177,8 +188,10 @@ with a human-readable table name. The LSNs are also shorter. For
 example, a base image file at LSN 100 and a delta file between 100-200
 for 'orders' table on 'main' branch is represented like this:

+```
    main/orders_100
    main/orders_100_200
+```


 # Creating layer files
@@ -188,12 +201,14 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
 of WAL is currently at LSN 250. In this starting situation, you would
 have these files on disk:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
 	main/customers_100
 	main/customers_100_200
 	main/customers_200
+```

 In addition to those files, the recent changes between LSN 200 and the
 end of WAL at 250 are kept in memory. If the page server crashes, the
@@ -224,6 +239,7 @@ If the customers table is modified later, a new file is created for it
 at the next checkpoint. The new file will cover the "gap" from the
 last layer file, so the LSN ranges are always contiguous:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -236,6 +252,7 @@ last layer file, so the LSN ranges are always contiguous:
 	main/customers_200
 	main/customers_200_500
 	main/customers_500
+```

 ## Reading page versions

@@ -259,15 +276,18 @@ involves replaying any WAL records applicable to the page between LSNs

 Imagine that a child branch is created at LSN 250:

+```
            @250
    ----main--+-------------------------->
               \
                +---child-------------->
+```


 Then, the 'orders' table is updated differently on the 'main' and
 'child' branches. You now have this situation on disk:

+```
    main/orders_100
    main/orders_100_200
    main/orders_200
@@ -282,6 +302,7 @@ Then, the 'orders' table is updated differently on the 'main' and
    child/orders_300
    child/orders_300_400
    child/orders_400
+```

 Because the 'customers' table hasn't been modified on the child
 branch, there is no file for it there. If you request a page for it on
@@ -294,6 +315,7 @@ is linear, and the request's LSN identifies unambiguously which file
 you need to look at. For example, the history for the 'orders' table
 on the 'main' branch consists of these files:

+```
    main/orders_100
    main/orders_100_200
    main/orders_200
@@ -301,10 +323,12 @@ on the 'main' branch consists of these files:
    main/orders_300
    main/orders_300_400
    main/orders_400
+```

 And from the 'child' branch's point of view, it consists of these
 files:

+```
    main/orders_100
    main/orders_100_200
    main/orders_200
@@ -313,6 +337,7 @@ files:
    child/orders_300
    child/orders_300_400
    child/orders_400
+```

 The branch metadata includes the point where the child branch was
 created, LSN 250. If a page request comes with LSN 275, we read the
@@ -345,6 +370,7 @@ Let's look at the single branch scenario again. Imagine that the end
 of the branch is LSN 525, so that the GC horizon is currently at
 525-150 = 375

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -357,11 +383,13 @@ of the branch is LSN 525, so that the GC horizon is currently at
 	main/customers_100
 	main/customers_100_200
 	main/customers_200
+```

 We can remove the following files because the end LSNs of those files are
 older than GC horizon 375, and there are more recent layer files for the
 table:

+```
 	main/orders_100       DELETE
 	main/orders_100_200   DELETE
 	main/orders_200       DELETE
@@ -374,8 +402,9 @@ table:
 	main/customers_100      DELETE
 	main/customers_100_200  DELETE
 	main/customers_200      KEEP, NO NEWER VERSION
+```

-'main/customers_100_200' is old enough, but it cannot be
+'main/customers_200' is old enough, but it cannot be
 removed because there is no newer layer file for the table.

 Things get slightly more complicated with multiple branches. All of
@@ -384,6 +413,7 @@ retain older shapshot files that are still needed by child branches.
 For example, if child branch is created at LSN 150, and the 'customers'
 table is updated on the branch, you would have these files:

+```
 	main/orders_100        KEEP, NEEDED BY child BRANCH
 	main/orders_100_200    KEEP, NEEDED BY child BRANCH
 	main/orders_200        DELETE
@@ -398,6 +428,7 @@ table is updated on the branch, you would have these files:
 	main/customers_200       KEEP, NO NEWER VERSION
 	child/customers_150_300  DELETE
 	child/customers_300      KEEP, NO NEWER VERSION
+```

 In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
 be removed, even though they are older than the GC horizon, because
@@ -407,6 +438,7 @@ and 'main/orders_200_300' can still be removed.
 If 'orders' is modified later on the 'child' branch, we will create a
 new base image and delta file for it on the child:

+```
 	main/orders_100
 	main/orders_100_200

@@ -419,6 +451,7 @@ new base image and delta file for it on the child:
 	child/customers_300
 	child/orders_150_400
 	child/orders_400
+```

 After this, the 'main/orders_100' and 'main/orders_100_200' files could
 be removed. They are no longer needed by the child branch, because there
@@ -434,6 +467,7 @@ Describe GC and checkpoint interval settings.
 In principle, each relation can be checkpointed separately, i.e. the
 LSN ranges of the files don't need to line up. So this would be legal:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
@@ -446,6 +480,7 @@ LSN ranges of the files don't need to line up. So this would be legal:
 	main/customers_250
 	main/customers_250_500
 	main/customers_500
+```

 However, the code currently always checkpoints all relations together.
 So that situation doesn't arise in practice.
@@ -468,11 +503,13 @@ does that.  It could be useful, however, as a transient state when
 garbage collecting around branch points, or explicit recovery
 points. For example, if we start with this:

+```
 	main/orders_100
 	main/orders_100_200
 	main/orders_200
 	main/orders_200_300
 	main/orders_300
+```

 And there is a branch or explicit recovery point at LSN 150, we could
 replace 'main/orders_100_200' with 'main/orders_150' to keep a
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -38,10 +38,6 @@ use crate::walrecord;
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use serde::{Deserialize, Serialize};
-use tracing::*;
-// avoid binding to Write (conflicts with std::io::Write)
-// while being able to use std::fmt::Write's methods
-use std::fmt::Write as _;
 use std::fs;
 use std::io::{BufWriter, Write};
 use std::io::{Seek, SeekFrom};
@@ -49,6 +45,7 @@ use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+use tracing::*;

 use utils::{
    bin_ser::BeSer,
@@ -218,6 +215,10 @@ impl Layer for DeltaLayer {
        PathBuf::from(self.layer_name().to_string())
    }

+    /// Returns the value of `self.path()` wrapped in `Some`; this
+    /// implementation never returns `None`.
+    fn local_path(&self) -> Option<PathBuf> {
+        let path = self.path();
+        Some(path)
+    }
+
    fn get_value_reconstruct_data(
        &self,
        key: Key,
@@ -250,6 +251,9 @@ impl Layer for DeltaLayer {
                    return false;
                }
                let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
+                if entry_lsn < lsn_range.start {
+                    return false;
+                }
                offsets.push((entry_lsn, blob_ref.pos()));

                !blob_ref.will_init()
@@ -258,8 +262,18 @@ impl Layer for DeltaLayer {
            // Ok, 'offsets' now contains the offsets of all the entries we need to read
            let mut cursor = file.block_cursor();
            for (entry_lsn, pos) in offsets {
-                let buf = cursor.read_blob(pos)?;
-                let val = Value::des(&buf)?;
+                let buf = cursor.read_blob(pos).with_context(|| {
+                    format!(
+                        "Failed to read blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
+                let val = Value::des(&buf).with_context(|| {
+                    format!(
+                        "Failed to deserialize file blob from virtual file {}",
+                        file.file.path.display()
+                    )
+                })?;
                match val {
                    Value::Image(img) => {
                        reconstruct_state.img = Some((entry_lsn, img));
@@ -348,6 +362,28 @@ impl Layer for DeltaLayer {
        tree_reader.dump()?;

        let mut cursor = file.block_cursor();
+
+        // A subroutine to dump a single blob
+        let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
+            let buf = cursor.read_blob(blob_ref.pos())?;
+            let val = Value::des(&buf)?;
+            let desc = match val {
+                Value::Image(img) => {
+                    format!(" img {} bytes", img.len())
+                }
+                Value::WalRecord(rec) => {
+                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    format!(
+                        " rec {} bytes will_init: {} {}",
+                        buf.len(),
+                        rec.will_init(),
+                        wal_desc
+                    )
+                }
+            };
+            Ok(desc)
+        };
+
        tree_reader.visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
@@ -356,34 +392,10 @@ impl Layer for DeltaLayer {
                let key = DeltaKey::extract_key_from_buf(delta_key);
                let lsn = DeltaKey::extract_lsn_from_buf(delta_key);

-                let mut desc = String::new();
-                match cursor.read_blob(blob_ref.pos()) {
-                    Ok(buf) => {
-                        let val = Value::des(&buf);
-                        match val {
-                            Ok(Value::Image(img)) => {
-                                write!(&mut desc, " img {} bytes", img.len()).unwrap();
-                            }
-                            Ok(Value::WalRecord(rec)) => {
-                                let wal_desc = walrecord::describe_wal_record(&rec);
-                                write!(
-                                    &mut desc,
-                                    " rec {} bytes will_init: {} {}",
-                                    buf.len(),
-                                    rec.will_init(),
-                                    wal_desc
-                                )
-                                .unwrap();
-                            }
-                            Err(err) => {
-                                write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap();
-                            }
-                        }
-                    }
-                    Err(err) => {
-                        write!(&mut desc, " READ ERROR: {}", err).unwrap();
-                    }
-                }
+                let desc = match dump_blob(blob_ref) {
+                    Ok(desc) => desc,
+                    Err(err) => format!("ERROR: {}", err),
+                };
                println!("  key {} at {}: {}", key, lsn, desc);
                true
            },
--- a/pageserver/src/layered_repository/disk_btree.rs
+++ b/pageserver/src/layered_repository/disk_btree.rs
@@ -11,7 +11,6 @@
 //! - page-oriented
 //!
 //! TODO:
-//! - better errors (e.g. with thiserror?)
 //! - maybe something like an Adaptive Radix Tree would be more efficient?
 //! - the values stored by image and delta layers are offsets into the file,
 //!   and they are in monotonically increasing order. Prefix compression would
@@ -19,11 +18,12 @@
 //! - An Iterator interface would be more convenient for the callers than the
 //!   'visit' function
 //!
-use anyhow;
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
 use hex;
-use std::cmp::Ordering;
+use std::{cmp::Ordering, io, result};
+use thiserror::Error;
+use tracing::error;

 use crate::layered_repository::block_io::{BlockReader, BlockWriter};

@@ -86,6 +86,23 @@ impl Value {
    }
 }

+#[derive(Error, Debug)]
+pub enum DiskBtreeError {
+    #[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)]
+    AppendOverflow(u64),
+
+    #[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")]
+    UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> },
+
+    #[error("Could not push to new leaf node")]
+    FailedToPushToNewLeafNode,
+
+    #[error("IoError: {0}")]
+    Io(#[from] io::Error),
+}
+
+pub type Result<T> = result::Result<T, DiskBtreeError>;
+
 /// This is the on-disk representation.
 struct OnDiskNode<'a, const L: usize> {
    // Fixed-width fields
@@ -106,12 +123,12 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
    ///
    /// Interpret a PAGE_SZ page as a node.
    ///
-    fn deparse(buf: &[u8]) -> OnDiskNode<L> {
+    fn deparse(buf: &[u8]) -> Result<OnDiskNode<L>> {
        let mut cursor = std::io::Cursor::new(buf);
-        let num_children = cursor.read_u16::<BE>().unwrap();
-        let level = cursor.read_u8().unwrap();
-        let prefix_len = cursor.read_u8().unwrap();
-        let suffix_len = cursor.read_u8().unwrap();
+        let num_children = cursor.read_u16::<BE>()?;
+        let level = cursor.read_u8()?;
+        let prefix_len = cursor.read_u8()?;
+        let suffix_len = cursor.read_u8()?;

        let mut off = cursor.position();
        let prefix_off = off as usize;
@@ -129,7 +146,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
        let keys = &buf[keys_off..keys_off + keys_len];
        let values = &buf[values_off..values_off + values_len];

-        OnDiskNode {
+        Ok(OnDiskNode {
            num_children,
            level,
            prefix_len,
@@ -137,7 +154,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
            prefix,
            keys,
            values,
-        }
+        })
    }

    ///
@@ -149,7 +166,11 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
        Value::from_slice(value_slice)
    }

-    fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result<usize, usize> {
+    fn binary_search(
+        &self,
+        search_key: &[u8; L],
+        keybuf: &mut [u8],
+    ) -> result::Result<usize, usize> {
        let mut size = self.num_children as usize;
        let mut low = 0;
        let mut high = size;
@@ -209,7 +230,7 @@ where
    ///
    /// Read the value for given key. Returns the value, or None if it doesn't exist.
    ///
-    pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result<Option<u64>> {
+    pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
        let mut result: Option<u64> = None;
        self.visit(search_key, VisitDirection::Forwards, |key, value| {
            if key == search_key {
@@ -230,7 +251,7 @@ where
        search_key: &[u8; L],
        dir: VisitDirection,
        mut visitor: V,
-    ) -> anyhow::Result<bool>
+    ) -> Result<bool>
    where
        V: FnMut(&[u8], u64) -> bool,
    {
@@ -243,7 +264,7 @@ where
        search_key: &[u8; L],
        dir: VisitDirection,
        visitor: &mut V,
-    ) -> anyhow::Result<bool>
+    ) -> Result<bool>
    where
        V: FnMut(&[u8], u64) -> bool,
    {
@@ -260,11 +281,11 @@ where
        search_key: &[u8; L],
        dir: VisitDirection,
        visitor: &mut V,
-    ) -> anyhow::Result<bool>
+    ) -> Result<bool>
    where
        V: FnMut(&[u8], u64) -> bool,
    {
-        let node = OnDiskNode::deparse(node_buf);
+        let node = OnDiskNode::deparse(node_buf)?;
        let prefix_len = node.prefix_len as usize;
        let suffix_len = node.suffix_len as usize;

@@ -369,15 +390,15 @@ where
    }

    #[allow(dead_code)]
-    pub fn dump(&self) -> anyhow::Result<()> {
+    pub fn dump(&self) -> Result<()> {
        self.dump_recurse(self.root_blk, &[], 0)
    }

-    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> {
+    fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
        let blk = self.reader.read_blk(self.start_blk + blknum)?;
        let buf: &[u8] = blk.as_ref();

-        let node = OnDiskNode::<L>::deparse(buf);
+        let node = OnDiskNode::<L>::deparse(buf)?;

        print!("{:indent$}", "", indent = depth * 2);
        println!(
@@ -442,17 +463,24 @@ where
        }
    }

-    pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> {
-        assert!(value <= MAX_VALUE);
+    pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> {
+        if value > MAX_VALUE {
+            return Err(DiskBtreeError::AppendOverflow(value));
+        }
        if let Some(last_key) = &self.last_key {
-            assert!(key > last_key, "unsorted input");
+            if key <= last_key {
+                return Err(DiskBtreeError::UnsortedInput {
+                    key: key.as_slice().into(),
+                    last_key: last_key.as_slice().into(),
+                });
+            }
        }
        self.last_key = Some(*key);

-        Ok(self.append_internal(key, Value::from_u64(value))?)
+        self.append_internal(key, Value::from_u64(value))
    }

-    fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> {
+    fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> {
        // Try to append to the current leaf buffer
        let last = self.stack.last_mut().unwrap();
        let level = last.level;
@@ -476,14 +504,15 @@ where
        // key to it.
        let mut last = BuildNode::new(level);
        if !last.push(key, value) {
-            panic!("could not push to new leaf node");
+            return Err(DiskBtreeError::FailedToPushToNewLeafNode);
        }
+
        self.stack.push(last);

        Ok(())
    }

-    fn flush_node(&mut self) -> Result<(), std::io::Error> {
+    fn flush_node(&mut self) -> Result<()> {
        let last = self.stack.pop().unwrap();
        let buf = last.pack();
        let downlink_key = last.first_key();
@@ -505,7 +534,7 @@ where
    /// (In the image and delta layers, it is stored in the beginning of the file,
    /// in the summary header)
    ///
-    pub fn finish(mut self) -> Result<(u32, W), std::io::Error> {
+    pub fn finish(mut self) -> Result<(u32, W)> {
        // flush all levels, except the root.
        while self.stack.len() > 1 {
            self.flush_node()?;
@@ -692,14 +721,14 @@ mod tests {
    impl BlockReader for TestDisk {
        type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;

-        fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
+        fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
            Ok(std::rc::Rc::new(buf))
        }
    }
    impl BlockWriter for &mut TestDisk {
-        fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
+        fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
            let blknum = self.blocks.len();
            self.blocks.push(buf);
            Ok(blknum as u32)
@@ -707,7 +736,7 @@ mod tests {
    }

    #[test]
-    fn basic() -> anyhow::Result<()> {
+    fn basic() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);

@@ -788,7 +817,7 @@ mod tests {
    }

    #[test]
-    fn lots_of_keys() -> anyhow::Result<()> {
+    fn lots_of_keys() -> Result<()> {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);

@@ -882,7 +911,7 @@ mod tests {
    }

    #[test]
-    fn random_data() -> anyhow::Result<()> {
+    fn random_data() -> Result<()> {
        // Generate random keys with exponential distribution, to
        // exercise the prefix compression
        const NUM_KEYS: usize = 100000;
@@ -927,21 +956,27 @@ mod tests {
    }

    #[test]
-    #[should_panic(expected = "unsorted input")]
    fn unsorted_input() {
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk);

        let _ = writer.append(b"ba", 1);
        let _ = writer.append(b"bb", 2);
-        let _ = writer.append(b"aa", 3);
+        let err = writer.append(b"aa", 3).expect_err("should've failed");
+        match err {
+            DiskBtreeError::UnsortedInput { key, last_key } => {
+                assert_eq!(key.as_ref(), b"aa".as_slice());
+                assert_eq!(last_key.as_ref(), b"bb".as_slice());
+            }
+            _ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"),
+        }
    }

    ///
    /// This test contains a particular data set, see disk_btree_test_data.rs
    ///
    #[test]
-    fn particular_data() -> anyhow::Result<()> {
+    fn particular_data() -> Result<()> {
        // Build a tree from it
        let mut disk = TestDisk::new();
        let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -125,6 +125,10 @@ impl Layer for ImageLayer {
        PathBuf::from(self.layer_name().to_string())
    }

+    fn local_path(&self) -> Option<PathBuf> {
+        Some(self.path())
+    }
+
    fn get_tenant_id(&self) -> ZTenantId {
        self.tenantid
    }
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -85,6 +85,10 @@ impl Layer for InMemoryLayer {
        ))
    }

+    fn local_path(&self) -> Option<PathBuf> {
+        None
+    }
+
    fn get_tenant_id(&self) -> ZTenantId {
        self.tenantid
    }
@@ -207,7 +211,7 @@ impl Layer for InMemoryLayer {
                        write!(&mut desc, " img {} bytes", img.len())?;
                    }
                    Ok(Value::WalRecord(rec)) => {
-                        let wal_desc = walrecord::describe_wal_record(&rec);
+                        let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
                        write!(
                            &mut desc,
                            " rec {} bytes will_init: {} {}",
--- a/pageserver/src/layered_repository/layer_map.rs
+++ b/pageserver/src/layered_repository/layer_map.rs
@@ -43,10 +43,13 @@ pub struct LayerMap {
    pub next_open_layer_at: Option<Lsn>,

    ///
-    /// The frozen layer, if any, contains WAL older than the current 'open_layer'
-    /// or 'next_open_layer_at', but newer than any historic layer. The frozen
-    /// layer is during checkpointing, when an InMemoryLayer is being written out
-    /// to disk.
+    /// Frozen layers, if any. Frozen layers are in-memory layers that
+    /// are no longer added to, but haven't been written out to disk
+    /// yet. They contain WAL older than the current 'open_layer' or
+    /// 'next_open_layer_at', but newer than any historic layer.
+    /// The frozen layers are in order from oldest to newest, so that
+    /// the newest one is in the 'back' of the VecDeque, and the oldest
+    /// in the 'front'.
    ///
    pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,

@@ -129,17 +132,15 @@ impl LayerMap {
                // this layer contains the requested point in the key/lsn space.
                // No need to search any further
                trace!(
-                    "found layer {} for request on {} at {}",
+                    "found layer {} for request on {key} at {end_lsn}",
                    l.filename().display(),
-                    key,
-                    end_lsn
                );
                latest_delta.replace(Arc::clone(l));
                break;
            }
            // this layer's end LSN is smaller than the requested point. If there's
            // nothing newer, this is what we need to return. Remember this.
-            if let Some(ref old_candidate) = latest_delta {
+            if let Some(old_candidate) = &latest_delta {
                if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
                    latest_delta.replace(Arc::clone(l));
                }
@@ -149,10 +150,8 @@ impl LayerMap {
        }
        if let Some(l) = latest_delta {
            trace!(
-                "found (old) layer {} for request on {} at {}",
+                "found (old) layer {} for request on {key} at {end_lsn}",
                l.filename().display(),
-                key,
-                end_lsn
            );
            let lsn_floor = std::cmp::max(
                Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
@@ -163,17 +162,13 @@ impl LayerMap {
                layer: l,
            }))
        } else if let Some(l) = latest_img {
-            trace!(
-                "found img layer and no deltas for request on {} at {}",
-                key,
-                end_lsn
-            );
+            trace!("found img layer and no deltas for request on {key} at {end_lsn}");
            Ok(Some(SearchResult {
                lsn_floor: latest_img_lsn.unwrap(),
                layer: l,
            }))
        } else {
-            trace!("no layer found for request on {} at {}", key, end_lsn);
+            trace!("no layer found for request on {key} at {end_lsn}");
            Ok(None)
        }
    }
@@ -191,7 +186,6 @@ impl LayerMap {
    ///
    /// This should be called when the corresponding file on disk has been deleted.
    ///
-    #[allow(dead_code)]
    pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
        let len_before = self.historic_layers.len();

@@ -250,7 +244,7 @@ impl LayerMap {
        }
    }

-    pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
+    pub fn iter_historic_layers(&self) -> impl Iterator<Item = &Arc<dyn Layer>> {
        self.historic_layers.iter()
    }

--- a/pageserver/src/layered_repository/storage_layer.rs
+++ b/pageserver/src/layered_repository/storage_layer.rs
@@ -105,6 +105,9 @@ pub trait Layer: Send + Sync {
    /// log messages, even though they're never not on disk.)
    fn filename(&self) -> PathBuf;

+    /// If a layer has a corresponding file on a local filesystem, return its absolute path.
+    fn local_path(&self) -> Option<PathBuf>;
+
    ///
    /// Return data needed to reconstruct given page at LSN.
    ///
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -9,8 +9,8 @@ pub mod page_service;
 pub mod pgdatadir_mapping;
 pub mod profiling;
 pub mod reltag;
-pub mod remote_storage;
 pub mod repository;
+pub mod storage_sync;
 pub mod tenant_config;
 pub mod tenant_mgr;
 pub mod tenant_threads;
@@ -45,7 +45,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

 lazy_static! {
    static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
-        "pageserver_live_connections_count",
+        "pageserver_live_connections",
        "Number of live network connections",
        &["pageserver_connection_kind"]
    )
@@ -67,7 +67,7 @@ pub type RepositoryImpl = LayeredRepository;

 pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;

-pub fn shutdown_pageserver() {
+pub fn shutdown_pageserver(exit_code: i32) {
    // Shut down the libpq endpoint thread. This prevents new connections from
    // being accepted.
    thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
@@ -94,5 +94,5 @@ pub fn shutdown_pageserver() {
    thread_mgr::shutdown_threads(None, None, None);

    info!("Shut down successfully completed");
-    std::process::exit(0);
+    std::process::exit(exit_code);
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -19,7 +19,6 @@ use std::net::TcpListener;
 use std::str;
 use std::str::FromStr;
 use std::sync::{Arc, RwLockReadGuard};
-use std::time::Duration;
 use tracing::*;
 use utils::{
    auth::{self, Claims, JwtAuth, Scope},
@@ -31,7 +30,7 @@ use utils::{

 use crate::basebackup;
 use crate::config::{PageServerConf, ProfilingConfig};
-use crate::pgdatadir_mapping::DatadirTimeline;
+use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
 use crate::profiling::profpoint_start;
 use crate::reltag::RelTag;
 use crate::repository::Repository;
@@ -42,12 +41,16 @@ use crate::thread_mgr::ThreadKind;
 use crate::walreceiver;
 use crate::CheckpointConfig;
 use metrics::{register_histogram_vec, HistogramVec};
+use postgres_ffi::xlog_utils::to_pg_timestamp;
+
+use postgres_ffi::pg_constants;

 // Wrapped in libpq CopyData
 enum PagestreamFeMessage {
    Exists(PagestreamExistsRequest),
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
+    DbSize(PagestreamDbSizeRequest),
 }

 // Wrapped in libpq CopyData
@@ -56,6 +59,7 @@ enum PagestreamBeMessage {
    Nblocks(PagestreamNblocksResponse),
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
+    DbSize(PagestreamDbSizeResponse),
 }

 #[derive(Debug)]
@@ -80,6 +84,13 @@ struct PagestreamGetPageRequest {
    blkno: u32,
 }

+#[derive(Debug)]
+struct PagestreamDbSizeRequest {
+    latest: bool,
+    lsn: Lsn,
+    dbnode: u32,
+}
+
 #[derive(Debug)]
 struct PagestreamExistsResponse {
    exists: bool,
@@ -100,6 +111,11 @@ struct PagestreamErrorResponse {
    message: String,
 }

+#[derive(Debug)]
+struct PagestreamDbSizeResponse {
+    db_size: i64,
+}
+
 impl PagestreamFeMessage {
    fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
        // TODO these gets can fail
@@ -141,6 +157,11 @@ impl PagestreamFeMessage {
                },
                blkno: body.get_u32(),
            })),
+            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
+                latest: body.get_u8() != 0,
+                lsn: Lsn::from(body.get_u64()),
+                dbnode: body.get_u32(),
+            })),
            _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
        }
    }
@@ -171,6 +192,10 @@ impl PagestreamBeMessage {
                bytes.put(resp.message.as_bytes());
                bytes.put_u8(0); // null terminator
            }
+            Self::DbSize(resp) => {
+                bytes.put_u8(104); /* tag from pagestore_client.h */
+                bytes.put_i64(resp.db_size);
+            }
        }

        bytes.into()
@@ -300,7 +325,7 @@ const TIME_BUCKETS: &[f64] = &[

 lazy_static! {
    static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_smgr_query_time",
+        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling",
        &["smgr_query_type", "tenant_id", "timeline_id"],
        TIME_BUCKETS.into()
@@ -326,7 +351,7 @@ impl PageServerHandler {
        let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();

        // Check that the timeline exists
-        let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+        let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
            .context("Cannot load local timeline")?;

        /* switch client to COPYBOTH */
@@ -366,6 +391,11 @@ impl PageServerHandler {
                                .observe_closure_duration(|| {
                                    self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
                                }),
+                            PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME
+                                .with_label_values(&["get_db_size", &tenant_id, &timeline_id])
+                                .observe_closure_duration(|| {
+                                    self.handle_db_size_request(timeline.as_ref(), &req)
+                                }),
                        };

                        let response = response.unwrap_or_else(|e| {
@@ -486,6 +516,32 @@ impl PageServerHandler {
        }))
    }

+    fn handle_db_size_request<R: Repository>(
+        &self,
+        timeline: &DatadirTimeline<R>,
+        req: &PagestreamDbSizeRequest,
+    ) -> Result<PagestreamBeMessage> {
+        let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
+        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
+        let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
+
+        let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
+        let mut total_blocks: i64 = 0;
+
+        for rel in all_rels {
+            if rel.forknum == 0 {
+                let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0);
+                total_blocks += n_blocks as i64;
+            }
+        }
+
+        let db_size = total_blocks * pg_constants::BLCKSZ as i64;
+
+        Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
+            db_size,
+        }))
+    }
+
    fn handle_get_page_at_lsn_request<R: Repository>(
        &self,
        timeline: &DatadirTimeline<R>,
@@ -522,7 +578,7 @@ impl PageServerHandler {
        info!("starting");

        // check that the timeline exists
-        let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+        let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
            .context("Cannot load local timeline")?;
        let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
@@ -656,7 +712,7 @@ impl postgres_backend::Handler for PageServerHandler {
                info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();

            // Check that the timeline exists
-            tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+            tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
                .context("Cannot load local timeline")?;

            walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;
@@ -667,7 +723,10 @@ impl postgres_backend::Handler for PageServerHandler {
            // on connect
            pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else if query_string.starts_with("failpoints ") {
+            ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support");
+
            let (_, failpoints) = query_string.split_at("failpoints ".len());
+
            for failpoint in failpoints.split(';') {
                if let Some((name, actions)) = failpoint.split_once('=') {
                    info!("cfg failpoint: {} {}", name, actions);
@@ -691,6 +750,7 @@ impl postgres_backend::Handler for PageServerHandler {
                RowDescriptor::int8_col(b"compaction_threshold"),
                RowDescriptor::int8_col(b"gc_horizon"),
                RowDescriptor::int8_col(b"gc_period"),
+                RowDescriptor::int8_col(b"image_creation_threshold"),
                RowDescriptor::int8_col(b"pitr_interval"),
            ]))?
            .write_message_noflush(&BeMessage::DataRow(&[
@@ -705,6 +765,7 @@ impl postgres_backend::Handler for PageServerHandler {
                Some(repo.get_compaction_threshold().to_string().as_bytes()),
                Some(repo.get_gc_horizon().to_string().as_bytes()),
                Some(repo.get_gc_period().as_secs().to_string().as_bytes()),
+                Some(repo.get_image_creation_threshold().to_string().as_bytes()),
                Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()),
            ]))?
            .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
@@ -734,7 +795,9 @@ impl postgres_backend::Handler for PageServerHandler {
                .unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;

            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-            let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?;
+            // Use tenant's pitr setting
+            let pitr = repo.get_pitr_interval();
+            let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"layers_total"),
                RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
@@ -768,7 +831,7 @@ impl postgres_backend::Handler for PageServerHandler {

            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
-            let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+            let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
                .context("Couldn't load timeline")?;
            timeline.tline.compact()?;

@@ -787,7 +850,7 @@ impl postgres_backend::Handler for PageServerHandler {
            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;

-            let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
+            let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
                .context("Cannot load local timeline")?;

            timeline.tline.checkpoint(CheckpointConfig::Forced)?;
@@ -800,6 +863,33 @@ impl postgres_backend::Handler for PageServerHandler {

            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
+        } else if query_string.starts_with("get_lsn_by_timestamp ") {
+            // Locate the LSN of the last transaction whose timestamp is less than or equal to the specified one
+            // TODO lazy static
+            let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$")
+                .unwrap();
+            let caps = re
+                .captures(query_string)
+                .with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?;
+
+            let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
+            let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
+            let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
+                .context("Cannot load local timeline")?;
+
+            let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?;
+            let timestamp_pg = to_pg_timestamp(timestamp);
+
+            pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
+                b"lsn",
+            )]))?;
+            let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? {
+                LsnForTimestamp::Present(lsn) => format!("{}", lsn),
+                LsnForTimestamp::Future(_lsn) => "future".into(),
+                LsnForTimestamp::Past(_lsn) => "past".into(),
+            };
+            pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
+            pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
        } else {
            bail!("unknown command");
        }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,6 +13,7 @@ use crate::repository::{Repository, Timeline};
 use crate::walrecord::ZenithWalRecord;
 use anyhow::{bail, ensure, Result};
 use bytes::{Buf, Bytes};
+use postgres_ffi::xlog_utils::TimestampTz;
 use postgres_ffi::{pg_constants, Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use std::collections::{HashMap, HashSet};
@@ -45,6 +46,13 @@ where
    current_logical_size: AtomicIsize,
 }

+#[derive(Debug)]
+pub enum LsnForTimestamp {
+    Present(Lsn),
+    Future(Lsn),
+    Past(Lsn),
+}
+
 impl<R: Repository> DatadirTimeline<R> {
    pub fn new(tline: Arc<R::Timeline>, repartition_threshold: u64) -> Self {
        DatadirTimeline {
@@ -202,6 +210,106 @@ impl<R: Repository> DatadirTimeline<R> {
        Ok(exists)
    }

+    /// Locate LSN, such that all transactions that committed before
+    /// 'search_timestamp' are visible, but nothing newer is.
+    ///
+    /// This is not exact. Commit timestamps are not guaranteed to be ordered,
+    /// so it's not well defined which LSN you get if there were multiple commits
+    /// "in flight" at that point in time. Returns `Present` on a match, `Future(last record LSN)` or
+    /// `Past(GC cutoff LSN)` if all commits seen are older or newer, and an error if none were seen.
+    pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
+        let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn();
+        let min_lsn = *gc_cutoff_lsn_guard;
+        let max_lsn = self.tline.get_last_record_lsn();
+
+        // LSNs are always 8-byte aligned. low/mid/high represent the
+        // LSN divided by 8.
+        let mut low = min_lsn.0 / 8;
+        let mut high = max_lsn.0 / 8 + 1;
+
+        let mut found_smaller = false;
+        let mut found_larger = false;
+        while low < high {
+            // cannot overflow, high and low are both smaller than u64::MAX / 2
+            let mid = (high + low) / 2;
+
+            let cmp = self.is_latest_commit_timestamp_ge_than(
+                search_timestamp,
+                Lsn(mid * 8),
+                &mut found_smaller,
+                &mut found_larger,
+            )?;
+
+            if cmp {
+                high = mid;
+            } else {
+                low = mid + 1;
+            }
+        }
+        match (found_smaller, found_larger) {
+            (false, false) => {
+                // This can happen if no commit records have been processed yet, e.g.
+                // just after importing a cluster.
+                bail!("no commit timestamps found");
+            }
+            (true, false) => {
+                // Didn't find any commit timestamps larger than the request
+                Ok(LsnForTimestamp::Future(max_lsn))
+            }
+            (false, true) => {
+                // Didn't find any commit timestamps smaller than the request: report the oldest retained LSN
+                Ok(LsnForTimestamp::Past(min_lsn))
+            }
+            (true, true) => {
+                // low is the LSN of the first commit record *after* the search_timestamp,
+                // Back off by one to get to the point just before the commit.
+                //
+                // FIXME: it would be better to get the LSN of the previous commit.
+                // Otherwise, if you restore to the returned LSN, the database will
+                // include physical changes from later commits that will be marked
+                // as aborted, and will need to be vacuumed away.
+                Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
+            }
+        }
+    }
+
+    ///
+    /// Subroutine of find_lsn_for_timestamp(). Returns true, if any CLOG page read at
+    /// LSN 'probe_lsn' carries a commit timestamp that is >= 'search_timestamp'.
+    ///
+    /// Additionally, sets 'found_smaller'/'found_larger', if encounters any commits
+    /// with a smaller/larger timestamp. The flags are only ever set here, never cleared.
+    ///
+    fn is_latest_commit_timestamp_ge_than(
+        &self,
+        search_timestamp: TimestampTz,
+        probe_lsn: Lsn,
+        found_smaller: &mut bool,
+        found_larger: &mut bool,
+    ) -> Result<bool> {
+        for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
+            let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
+            for blknum in (0..nblocks).rev() {
+                let clog_page =
+                    self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
+
+                if clog_page.len() == pg_constants::BLCKSZ as usize + 8 { // pages without the 8-byte trailer are skipped
+                    let mut timestamp_bytes = [0u8; 8];
+                    timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]);
+                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes); // trailer is decoded as big-endian
+
+                    if timestamp >= search_timestamp {
+                        *found_larger = true;
+                        return Ok(true); // stop scanning at the first hit
+                    } else {
+                        *found_smaller = true;
+                    }
+                }
+            }
+        }
+        Ok(false)
+    }
+
    /// Get a list of SLRU segments
    pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
        // fetch directory entry
--- a/pageserver/src/remote_storage.rs
+++ b/pageserver/src/remote_storage.rs
@@ -1,394 +0,0 @@
-//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
-//! This particular module serves as a public API border between pageserver and the internal storage machinery.
-//! No other modules from this tree are supposed to be used directly by the external code.
-//!
-//! There are a few components the storage machinery consists of:
-//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
-//!     * [`local_fs`] allows to use local file system as an external storage
-//!     * [`s3_bucket`] uses AWS S3 bucket as an external storage
-//!
-//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
-//! Synchronization internals are split into submodules
-//!     * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
-//!     * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
-//!
-//! * public API via to interact with the external world:
-//!     * [`start_local_timeline_sync`] to launch a background async loop to handle the synchronization
-//!     * [`schedule_timeline_checkpoint_upload`] and [`schedule_timeline_download`] to enqueue a new upload and download tasks,
-//!       to be processed by the async loop
-//!
-//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
-//!
-//! +------------------------+                                    +--------->-------+
-//! |                        |  - - - (init async loop) - - - ->  |                 |
-//! |                        |                                    |                 |
-//! |                        |  ------------------------------->  |      async      |
-//! |       pageserver       |    (enqueue timeline sync task)    | upload/download |
-//! |                        |                                    |      loop       |
-//! |                        |  <-------------------------------  |                 |
-//! |                        |  (apply new timeline sync states)  |                 |
-//! +------------------------+                                    +---------<-------+
-//!                                                                         |
-//!                                                                         |
-//!                                          CRUD layer file operations     |
-//!                                     (upload/download/delete/list, etc.) |
-//!                                                                         V
-//!                                                            +------------------------+
-//!                                                            |                        |
-//!                                                            | [`RemoteStorage`] impl |
-//!                                                            |                        |
-//!                                                            | pageserver assumes it  |
-//!                                                            | owns exclusive write   |
-//!                                                            | access to this storage |
-//!                                                            +------------------------+
-//!
-//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop uninitialised, if configured so.
-//! The loop inits the storage connection and checks the remote files stored.
-//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
-//! Based on the remote storage data, the sync logic immediately schedules sync tasks for local timelines and reports about remote only timelines to pageserver, so it can
-//! query their downloads later if they are accessed.
-//!
-//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
-//! If the storage sync loop was successfully started before, pageserver schedules the new checkpoint file uploads after every checkpoint.
-//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
-//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
-//!
-//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
-//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
-//! Such submissions happen in two cases:
-//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
-//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
-//!
-//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits.
-//!
-//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file).
-//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
-//! by the storage upload, if enabled.
-//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files.
-//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
-//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
-//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
-//! when the newer image is downloaded
-//!
-//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure.
-//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
-//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download.
-//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
-//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files.
-//!
-//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
-//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only,
-//! when a new timeline is scheduled for the download.
-//!
-//! NOTES:
-//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
-//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API.
-//!
-//! * the sync tasks may not processed immediately after the submission: if they error and get re-enqueued, their execution might be backed off to ensure error cap is not exceeded too fast.
-//! The sync queue processing also happens in batches, so the sync tasks can wait in the queue for some time.
-
-mod local_fs;
-mod s3_bucket;
-mod storage_sync;
-
-use std::{
-    collections::{HashMap, HashSet},
-    ffi, fs,
-    path::{Path, PathBuf},
-};
-
-use anyhow::{bail, Context};
-use tokio::io;
-use tracing::{debug, error, info};
-
-pub use self::{
-    local_fs::LocalFs,
-    s3_bucket::S3Bucket,
-    storage_sync::{
-        download_index_part,
-        index::{IndexPart, RemoteIndex, RemoteTimeline},
-        schedule_timeline_checkpoint_upload, schedule_timeline_download,
-    },
-};
-use crate::{
-    config::{PageServerConf, RemoteStorageKind},
-    layered_repository::{
-        ephemeral_file::is_ephemeral_file,
-        metadata::{TimelineMetadata, METADATA_FILE_NAME},
-    },
-};
-use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
-
-/// A timeline status to share with pageserver's sync counterpart,
-/// after comparing local and remote timeline state.
-#[derive(Clone, Copy, Debug)]
-pub enum LocalTimelineInitStatus {
-    /// The timeline has every remote layer present locally.
-    /// There could be some layers requiring uploading,
-    /// but this does not block the timeline from any user interaction.
-    LocallyComplete,
-    /// A timeline has some files remotely, that are not present locally and need downloading.
-    /// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
-    /// so the data needs to be downloaded first before the timeline can be used.
-    NeedsSync,
-}
-
-type LocalTimelineInitStatuses = HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>;
-
-/// A structure to combine all synchronization data to share with pageserver after a successful sync loop initialization.
-/// Successful initialization includes a case when sync loop is not started, in which case the startup data is returned still,
-/// to simplify the received code.
-pub struct SyncStartupData {
-    pub remote_index: RemoteIndex,
-    pub local_timeline_init_statuses: LocalTimelineInitStatuses,
-}
-
-/// Based on the config, initiates the remote storage connection and starts a separate thread
-/// that ensures that pageserver and the remote storage are in sync with each other.
-/// If no external configuration connection given, no thread or storage initialization is done.
-/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
-pub fn start_local_timeline_sync(
-    config: &'static PageServerConf,
-) -> anyhow::Result<SyncStartupData> {
-    let local_timeline_files = local_tenant_timeline_files(config)
-        .context("Failed to collect local tenant timeline files")?;
-
-    match &config.remote_storage_config {
-        Some(storage_config) => match &storage_config.storage {
-            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{}' as a remote storage", root.display());
-                storage_sync::spawn_storage_sync_thread(
-                    config,
-                    local_timeline_files,
-                    LocalFs::new(root.clone(), &config.workdir)?,
-                    storage_config.max_concurrent_timelines_sync,
-                    storage_config.max_sync_errors,
-                )
-            },
-            RemoteStorageKind::AwsS3(s3_config) => {
-                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
-                    s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                storage_sync::spawn_storage_sync_thread(
-                    config,
-                    local_timeline_files,
-                    S3Bucket::new(s3_config, &config.workdir)?,
-                    storage_config.max_concurrent_timelines_sync,
-                    storage_config.max_sync_errors,
-                )
-            },
-        }
-        .context("Failed to spawn the storage sync thread"),
-        None => {
-            info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
-            let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
-            for (ZTenantTimelineId { tenant_id, timeline_id }, _) in
-                local_timeline_files
-            {
-                local_timeline_init_statuses
-                    .entry(tenant_id)
-                    .or_default()
-                    .insert(timeline_id, LocalTimelineInitStatus::LocallyComplete);
-            }
-            Ok(SyncStartupData {
-                local_timeline_init_statuses,
-                remote_index: RemoteIndex::empty(),
-            })
-        }
-    }
-}
-
-fn local_tenant_timeline_files(
-    config: &'static PageServerConf,
-) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
-    let mut local_tenant_timeline_files = HashMap::new();
-    let tenants_dir = config.tenants_path();
-    for tenants_dir_entry in fs::read_dir(&tenants_dir)
-        .with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
-    {
-        match &tenants_dir_entry {
-            Ok(tenants_dir_entry) => {
-                match collect_timelines_for_tenant(config, &tenants_dir_entry.path()) {
-                    Ok(collected_files) => {
-                        local_tenant_timeline_files.extend(collected_files.into_iter())
-                    }
-                    Err(e) => error!(
-                        "Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
-                        tenants_dir.display(),
-                        tenants_dir_entry,
-                        e
-                    ),
-                }
-            }
-            Err(e) => error!(
-                "Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
-                tenants_dir_entry,
-                tenants_dir.display(),
-                e
-            ),
-        }
-    }
-
-    Ok(local_tenant_timeline_files)
-}
-
-fn collect_timelines_for_tenant(
-    config: &'static PageServerConf,
-    tenant_path: &Path,
-) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
-    let mut timelines = HashMap::new();
-    let tenant_id = tenant_path
-        .file_name()
-        .and_then(ffi::OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<ZTenantId>()
-        .context("Could not parse tenant id out of the tenant dir name")?;
-    let timelines_dir = config.timelines_path(&tenant_id);
-
-    for timelines_dir_entry in fs::read_dir(&timelines_dir).with_context(|| {
-        format!(
-            "Failed to list timelines dir entry for tenant {}",
-            tenant_id
-        )
-    })? {
-        match timelines_dir_entry {
-            Ok(timelines_dir_entry) => {
-                let timeline_path = timelines_dir_entry.path();
-                match collect_timeline_files(&timeline_path) {
-                    Ok((timeline_id, metadata, timeline_files)) => {
-                        timelines.insert(
-                            ZTenantTimelineId {
-                                tenant_id,
-                                timeline_id,
-                            },
-                            (metadata, timeline_files),
-                        );
-                    }
-                    Err(e) => error!(
-                        "Failed to process timeline dir contents at '{}', reason: {:?}",
-                        timeline_path.display(),
-                        e
-                    ),
-                }
-            }
-            Err(e) => error!(
-                "Failed to list timelines for entry tenant {}, reason: {:?}",
-                tenant_id, e
-            ),
-        }
-    }
-
-    Ok(timelines)
-}
-
-// discover timeline files and extract timeline metadata
-//  NOTE: ephemeral files are excluded from the list
-fn collect_timeline_files(
-    timeline_dir: &Path,
-) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
-    let mut timeline_files = HashSet::new();
-    let mut timeline_metadata_path = None;
-
-    let timeline_id = timeline_dir
-        .file_name()
-        .and_then(ffi::OsStr::to_str)
-        .unwrap_or_default()
-        .parse::<ZTimelineId>()
-        .context("Could not parse timeline id out of the timeline dir name")?;
-    let timeline_dir_entries =
-        fs::read_dir(&timeline_dir).context("Failed to list timeline dir contents")?;
-    for entry in timeline_dir_entries {
-        let entry_path = entry.context("Failed to list timeline dir entry")?.path();
-        if entry_path.is_file() {
-            if entry_path.file_name().and_then(ffi::OsStr::to_str) == Some(METADATA_FILE_NAME) {
-                timeline_metadata_path = Some(entry_path);
-            } else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
-                debug!("skipping ephemeral file {}", entry_path.display());
-                continue;
-            } else {
-                timeline_files.insert(entry_path);
-            }
-        }
-    }
-
-    let timeline_metadata_path = match timeline_metadata_path {
-        Some(path) => path,
-        None => bail!("No metadata file found in the timeline directory"),
-    };
-    let metadata = TimelineMetadata::from_bytes(
-        &fs::read(&timeline_metadata_path).context("Failed to read timeline metadata file")?,
-    )
-    .context("Failed to parse timeline metadata file bytes")?;
-
-    Ok((timeline_id, metadata, timeline_files))
-}
-
-/// Storage (potentially remote) API to manage its state.
-/// This storage tries to be unaware of any layered repository context,
-/// providing basic CRUD operations for storage files.
-#[async_trait::async_trait]
-pub trait RemoteStorage: Send + Sync {
-    /// A way to uniquely reference a file in the remote storage.
-    type StoragePath;
-
-    /// Attempts to derive the storage path out of the local path, if the latter is correct.
-    fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
-
-    /// Gets the download path of the given storage file.
-    fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
-
-    /// Lists all items the storage has right now.
-    async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
-
-    /// Streams the local file contents into remote into the remote storage entry.
-    async fn upload(
-        &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
-        // S3 PUT request requires the content length to be specified,
-        // otherwise it starts to fail with the concurrent connection count increasing.
-        from_size_bytes: usize,
-        to: &Self::StoragePath,
-        metadata: Option<StorageMetadata>,
-    ) -> anyhow::Result<()>;
-
-    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
-    /// Returns the metadata, if any was stored with the file previously.
-    async fn download(
-        &self,
-        from: &Self::StoragePath,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
-
-    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
-    /// Returns the metadata, if any was stored with the file previously.
-    async fn download_range(
-        &self,
-        from: &Self::StoragePath,
-        start_inclusive: u64,
-        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
-
-    async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
-}
-
-/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
-/// Immutable, cannot be changed once the file is created.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct StorageMetadata(HashMap<String, String>);
-
-fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
-    if prefix == path {
-        anyhow::bail!(
-            "Prefix and the path are equal, cannot strip: '{}'",
-            prefix.display()
-        )
-    } else {
-        path.strip_prefix(prefix).with_context(|| {
-            format!(
-                "Path '{}' is not prefixed with '{}'",
-                path.display(),
-                prefix.display(),
-            )
-        })
-    }
-}
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
--- a/pageserver/src/remote_storage/storage_sync/delete.rs
+++ b/pageserver/src/remote_storage/storage_sync/delete.rs
@@ -0,0 +1,223 @@
+//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage.
+
+use anyhow::Context;
+use futures::stream::{FuturesUnordered, StreamExt};
+use tracing::{debug, error, info};
+use utils::zid::ZTenantTimelineId;
+
+use crate::remote_storage::{
+    storage_sync::{SyncQueue, SyncTask},
+    RemoteStorage,
+};
+
+use super::{LayersDeletion, SyncData};
+
+/// Attempts to remove the timeline layers from the remote storage, returning `true` only if every layer got deleted.
+/// If the deletion metadata was not registered before, or any layer fails to delete, the task is re-enqueued and `false` is returned.
+pub(super) async fn delete_timeline_layers<'a, P, S>(
+    storage: &'a S,
+    sync_queue: &SyncQueue,
+    sync_id: ZTenantTimelineId,
+    mut delete_data: SyncData<LayersDeletion>,
+) -> bool
+where
+    P: std::fmt::Debug + Send + Sync + 'static,
+    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
+{
+    if !delete_data.data.deletion_registered {
+        error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing");
+        delete_data.retries += 1;
+        sync_queue.push(sync_id, SyncTask::Delete(delete_data));
+        return false;
+    }
+
+    if delete_data.data.layers_to_delete.is_empty() {
+        info!("No layers to delete, skipping");
+        return true;
+    }
+
+    let layers_to_delete = delete_data
+        .data
+        .layers_to_delete
+        .drain()
+        .collect::<Vec<_>>();
+    debug!("Layers to delete: {layers_to_delete:?}");
+    info!("Deleting {} timeline layers", layers_to_delete.len());
+
+    let mut delete_tasks = layers_to_delete
+        .into_iter()
+        .map(|local_layer_path| async {
+            let storage_path = match storage.storage_path(&local_layer_path).with_context(|| {
+                format!(
+                    "Failed to get the layer storage path for local path '{}'",
+                    local_layer_path.display()
+                )
+            }) {
+                Ok(path) => path,
+                Err(e) => return Err((e, local_layer_path)),
+            };
+
+            match storage.delete(&storage_path).await.with_context(|| {
+                format!(
+                    "Failed to delete remote layer from storage at '{:?}'",
+                    storage_path
+                )
+            }) {
+                Ok(()) => Ok(local_layer_path),
+                Err(e) => Err((e, local_layer_path)),
+            }
+        })
+        .collect::<FuturesUnordered<_>>();
+
+    let mut errored = false;
+    while let Some(deletion_result) = delete_tasks.next().await {
+        match deletion_result {
+            Ok(local_layer_path) => {
+                debug!(
+                    "Successfully deleted layer {} for timeline {sync_id}",
+                    local_layer_path.display()
+                );
+                delete_data.data.deleted_layers.insert(local_layer_path);
+            }
+            Err((e, local_layer_path)) => {
+                errored = true;
+                error!(
+                    "Failed to delete layer {} for timeline {sync_id}: {e:?}",
+                    local_layer_path.display()
+                );
+                delete_data.data.layers_to_delete.insert(local_layer_path); // put back for the retry
+            }
+        }
+    }
+
+    if errored {
+        debug!("Reenqueuing failed delete task for timeline {sync_id}");
+        delete_data.retries += 1;
+        sync_queue.push(sync_id, SyncTask::Delete(delete_data));
+    }
+    !errored // `true` means success, consistent with the early returns above
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::HashSet, num::NonZeroUsize};
+
+    use itertools::Itertools;
+    use tempfile::tempdir;
+    use tokio::fs;
+    use utils::lsn::Lsn;
+
+    use crate::{
+        remote_storage::{
+            storage_sync::test_utils::{create_local_timeline, dummy_metadata},
+            LocalFs,
+        },
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+    };
+
+    use super::*;
+
+    #[tokio::test]
+    async fn delete_timeline_negative() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("delete_timeline_negative")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
+        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
+        let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
+
+        let deleted = delete_timeline_layers(
+            &storage,
+            &sync_queue,
+            sync_id,
+            SyncData {
+                retries: 1,
+                data: LayersDeletion {
+                    deleted_layers: HashSet::new(),
+                    layers_to_delete: HashSet::new(),
+                    deletion_registered: false,
+                },
+            },
+        )
+        .await;
+
+        assert!(
+            !deleted,
+            "Should not start the deletion for task with delete metadata unregistered"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn delete_timeline() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("delete_timeline")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
+
+        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
+        let layer_files = ["a", "b", "c", "d"];
+        let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
+        let current_retries = 3;
+        let metadata = dummy_metadata(Lsn(0x30));
+        let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
+        let timeline_upload =
+            create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
+        for local_path in timeline_upload.layers_to_upload {
+            let remote_path = storage.storage_path(&local_path)?;
+            let remote_parent_dir = remote_path.parent().unwrap();
+            if !remote_parent_dir.exists() {
+                fs::create_dir_all(&remote_parent_dir).await?;
+            }
+            fs::copy(&local_path, &remote_path).await?;
+        }
+        assert_eq!(
+            storage
+                .list()
+                .await?
+                .into_iter()
+                .map(|remote_path| storage.local_path(&remote_path).unwrap())
+                .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
+                .sorted()
+                .collect::<Vec<_>>(),
+            layer_files
+                .iter()
+                .map(|layer_str| layer_str.to_string())
+                .sorted()
+                .collect::<Vec<_>>(),
+            "Expect to have all layer files remotely before deletion"
+        );
+
+        let deleted = delete_timeline_layers(
+            &storage,
+            &sync_queue,
+            sync_id,
+            SyncData {
+                retries: current_retries,
+                data: LayersDeletion {
+                    deleted_layers: HashSet::new(),
+                    layers_to_delete: HashSet::from([
+                        local_timeline_path.join("a"),
+                        local_timeline_path.join("c"),
+                        local_timeline_path.join("something_different"), // never uploaded by this test
+                    ]),
+                    deletion_registered: true,
+                },
+            },
+        )
+        .await;
+        assert!(!deleted, "Should report a failure: not every requested layer could be deleted");
+
+        assert_eq!(
+            storage
+                .list()
+                .await?
+                .into_iter()
+                .map(|remote_path| storage.local_path(&remote_path).unwrap())
+                .filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
+                .sorted()
+                .collect::<Vec<_>>(),
+            vec!["b".to_string(), "d".to_string()],
+            "Expect to have only non-deleted files remotely"
+        );
+
+        Ok(())
+    }
+}
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -1,5 +1,5 @@
 use crate::layered_repository::metadata::TimelineMetadata;
-use crate::remote_storage::RemoteIndex;
+use crate::storage_sync::index::RemoteIndex;
 use crate::walrecord::ZenithWalRecord;
 use crate::CheckpointConfig;
 use anyhow::{bail, Result};
@@ -259,7 +259,7 @@ pub trait Repository: Send + Sync {
    /// api's 'compact' command.
    fn compaction_iteration(&self) -> Result<()>;

-    /// detaches locally available timeline by stopping all threads and removing all the data.
+    /// detaches timeline-related in-memory data.
    fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;

    // Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
@@ -467,6 +467,7 @@ pub mod repo_harness {
                compaction_threshold: Some(tenant_conf.compaction_threshold),
                gc_horizon: Some(tenant_conf.gc_horizon),
                gc_period: Some(tenant_conf.gc_period),
+                image_creation_threshold: Some(tenant_conf.image_creation_threshold),
                pitr_interval: Some(tenant_conf.pitr_interval),
            }
        }
--- a/pageserver/src/storage_sync.rs
+++ b/pageserver/src/storage_sync.rs
--- a/pageserver/src/storage_sync/delete.rs
+++ b/pageserver/src/storage_sync/delete.rs
@@ -0,0 +1,228 @@
+//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage.
+
+use anyhow::Context;
+use futures::stream::{FuturesUnordered, StreamExt};
+use tracing::{debug, error, info};
+
+use crate::storage_sync::{SyncQueue, SyncTask};
+use remote_storage::RemoteStorage;
+use utils::zid::ZTenantTimelineId;
+
+use super::{LayersDeletion, SyncData};
+
+/// Attempts to remove the timeline layers from the remote storage.
+///
+/// The task must have `deletion_registered` set: otherwise nothing is deleted and the task is reenqueued.
+/// Layers that fail to be deleted are put back into `layers_to_delete` and the task is reenqueued with `retries` bumped.
+///
+/// Returns `true` when every scheduled layer was deleted (or none was scheduled), `false` when the task was reenqueued.
+pub(super) async fn delete_timeline_layers<'a, P, S>(
+    storage: &'a S,
+    sync_queue: &SyncQueue,
+    sync_id: ZTenantTimelineId,
+    mut delete_data: SyncData<LayersDeletion>,
+) -> bool
+where
+    P: std::fmt::Debug + Send + Sync + 'static,
+    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
+{
+    if !delete_data.data.deletion_registered {
+        error!("Cannot delete timeline layers before the deletion metadata is registered, reenqueueing");
+        delete_data.retries += 1;
+        sync_queue.push(sync_id, SyncTask::Delete(delete_data));
+        return false;
+    }
+
+    if delete_data.data.layers_to_delete.is_empty() {
+        info!("No layers to delete, skipping");
+        return true;
+    }
+
+    // Drain the set: failed deletions are inserted back below, so a reenqueued task retries only those.
+    let layers_to_delete: Vec<_> = delete_data.data.layers_to_delete.drain().collect();
+    debug!("Layers to delete: {layers_to_delete:?}");
+    info!("Deleting {} timeline layers", layers_to_delete.len());
+
+    let mut delete_tasks = layers_to_delete
+        .into_iter()
+        .map(|local_layer_path| async {
+            let storage_path =
+                match storage
+                    .remote_object_id(&local_layer_path)
+                    .with_context(|| {
+                        format!(
+                            "Failed to get the layer storage path for local path '{}'",
+                            local_layer_path.display()
+                        )
+                    }) {
+                    Ok(path) => path,
+                    Err(e) => return Err((e, local_layer_path)),
+                };
+
+            match storage.delete(&storage_path).await.with_context(|| {
+                format!(
+                    "Failed to delete remote layer from storage at '{:?}'",
+                    storage_path
+                )
+            }) {
+                Ok(()) => Ok(local_layer_path),
+                Err(e) => Err((e, local_layer_path)),
+            }
+        })
+        .collect::<FuturesUnordered<_>>();
+
+    let mut errored = false;
+    while let Some(deletion_result) = delete_tasks.next().await {
+        match deletion_result {
+            Ok(local_layer_path) => {
+                debug!(
+                    "Successfully deleted layer {} for timeline {sync_id}",
+                    local_layer_path.display()
+                );
+                delete_data.data.deleted_layers.insert(local_layer_path);
+            }
+            Err((e, local_layer_path)) => {
+                errored = true;
+                error!(
+                    "Failed to delete layer {} for timeline {sync_id}: {e:?}",
+                    local_layer_path.display()
+                );
+                delete_data.data.layers_to_delete.insert(local_layer_path);
+            }
+        }
+    }
+
+    if errored {
+        debug!("Reenqueuing failed delete task for timeline {sync_id}");
+        delete_data.retries += 1;
+        sync_queue.push(sync_id, SyncTask::Delete(delete_data));
+    }
+    // `true` signals success, consistent with the early returns above.
+    !errored
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::HashSet, num::NonZeroUsize};
+
+    use itertools::Itertools;
+    use tempfile::tempdir;
+    use tokio::fs;
+    use utils::lsn::Lsn;
+
+    use crate::{
+        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+        storage_sync::test_utils::{create_local_timeline, dummy_metadata},
+    };
+    use remote_storage::LocalFs;
+
+    use super::*;
+
+    /// Lists the sorted file names of all files currently present in the given storage.
+    async fn remote_file_names(storage: &LocalFs) -> anyhow::Result<Vec<String>> {
+        Ok(storage
+            .list()
+            .await?
+            .into_iter()
+            .map(|remote_path| storage.local_path(&remote_path).unwrap())
+            .filter_map(|local_path| Some(local_path.file_name()?.to_str()?.to_owned()))
+            .sorted()
+            .collect())
+    }
+
+    #[tokio::test]
+    async fn delete_timeline_negative() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("delete_timeline_negative")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
+        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
+        let storage = LocalFs::new(
+            tempdir()?.path().to_path_buf(),
+            harness.conf.workdir.clone(),
+        )?;
+
+        let deleted = delete_timeline_layers(
+            &storage,
+            &sync_queue,
+            sync_id,
+            SyncData {
+                retries: 1,
+                data: LayersDeletion {
+                    deleted_layers: HashSet::new(),
+                    layers_to_delete: HashSet::new(),
+                    deletion_registered: false,
+                },
+            },
+        )
+        .await;
+
+        assert!(
+            !deleted,
+            "Should not start the deletion for task with delete metadata unregistered"
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn delete_timeline() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("delete_timeline")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
+
+        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
+        let layer_files = ["a", "b", "c", "d"];
+        let storage = LocalFs::new(
+            tempdir()?.path().to_path_buf(),
+            harness.conf.workdir.clone(),
+        )?;
+        let current_retries = 3;
+        let metadata = dummy_metadata(Lsn(0x30));
+        let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
+        let timeline_upload =
+            create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
+        for local_path in timeline_upload.layers_to_upload {
+            let remote_path = storage.remote_object_id(&local_path)?;
+            let remote_parent_dir = remote_path.parent().unwrap();
+            if !remote_parent_dir.exists() {
+                fs::create_dir_all(&remote_parent_dir).await?;
+            }
+            fs::copy(&local_path, &remote_path).await?;
+        }
+        assert_eq!(
+            remote_file_names(&storage).await?,
+            layer_files
+                .iter()
+                .map(|layer_str| layer_str.to_string())
+                .sorted()
+                .collect::<Vec<_>>(),
+            "Expect to have all layer files remotely before deletion"
+        );
+
+        // Only layers that were uploaded above are scheduled, so every deletion is expected to succeed.
+        let deleted = delete_timeline_layers(
+            &storage,
+            &sync_queue,
+            sync_id,
+            SyncData {
+                retries: current_retries,
+                data: LayersDeletion {
+                    deleted_layers: HashSet::new(),
+                    layers_to_delete: HashSet::from([
+                        local_timeline_path.join("a"),
+                        local_timeline_path.join("c"),
+                    ]),
+                    deletion_registered: true,
+                },
+            },
+        )
+        .await;
+        assert!(deleted, "Should be able to delete timeline files");
+
+        assert_eq!(
+            remote_file_names(&storage).await?,
+            vec!["b".to_string(), "d".to_string()],
+            "Expect to have only non-deleted files remotely"
+        );
+
+        Ok(())
+    }
+}
--- a/pageserver/src/remote_storage/storage_sync/download.rs
+++ b/pageserver/src/remote_storage/storage_sync/download.rs
@@ -1,27 +1,28 @@
 //! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory.

-use std::fmt::Debug;
+use std::{collections::HashSet, fmt::Debug, path::Path};

 use anyhow::Context;
 use futures::stream::{FuturesUnordered, StreamExt};
-use tokio::fs;
+use remote_storage::{path_with_suffix_extension, RemoteStorage};
+use tokio::{
+    fs,
+    io::{self, AsyncWriteExt},
+};
 use tracing::{debug, error, info, warn};

 use crate::{
-    config::PageServerConf,
-    layered_repository::metadata::metadata_path,
-    remote_storage::{
-        storage_sync::{sync_queue, SyncTask},
-        RemoteStorage,
-    },
+    config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
 };
 use utils::zid::ZTenantTimelineId;

 use super::{
    index::{IndexPart, RemoteTimeline},
-    SyncData, TimelineDownload,
+    LayersDownload, SyncData, SyncQueue,
 };

+pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
+
 /// Retrieves index data from the remote storage for a given timeline.
 pub async fn download_index_part<P, S>(
    conf: &'static PageServerConf,
@@ -30,23 +31,25 @@ pub async fn download_index_part<P, S>(
 ) -> anyhow::Result<IndexPart>
 where
    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
 {
    let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
        .with_file_name(IndexPart::FILE_NAME)
        .with_extension(IndexPart::FILE_EXTENSION);
-    let part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
-        format!(
-            "Failed to get the index part storage path for local path '{}'",
-            index_part_path.display()
-        )
-    })?;
+    let part_storage_path = storage
+        .remote_object_id(&index_part_path)
+        .with_context(|| {
+            format!(
+                "Failed to get the index part storage path for local path '{}'",
+                index_part_path.display()
+            )
+        })?;
    let mut index_part_bytes = Vec::new();
    storage
        .download(&part_storage_path, &mut index_part_bytes)
        .await
        .with_context(|| {
-            format!("Failed to download an index part from storage path '{part_storage_path:?}'")
+            format!("Failed to download an index part from storage path {part_storage_path:?}")
        })?;

    let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
@@ -71,7 +74,7 @@ pub(super) enum DownloadedTimeline {
    FailedAndRescheduled,
    /// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
    /// Initial download successful.
-    Successful(SyncData<TimelineDownload>),
+    Successful(SyncData<LayersDownload>),
 }

 /// Attempts to download all given timeline's layers.
@@ -80,14 +83,16 @@ pub(super) enum DownloadedTimeline {
 ///
 /// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task.
 pub(super) async fn download_timeline_layers<'a, P, S>(
+    conf: &'static PageServerConf,
    storage: &'a S,
+    sync_queue: &'a SyncQueue,
    remote_timeline: Option<&'a RemoteTimeline>,
    sync_id: ZTenantTimelineId,
-    mut download_data: SyncData<TimelineDownload>,
+    mut download_data: SyncData<LayersDownload>,
 ) -> DownloadedTimeline
 where
    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
 {
    let remote_timeline = match remote_timeline {
        Some(remote_timeline) => {
@@ -114,6 +119,11 @@ where
    debug!("Layers to download: {layers_to_download:?}");
    info!("Downloading {} timeline layers", layers_to_download.len());

+    if layers_to_download.is_empty() {
+        info!("No layers to download after filtering, skipping");
+        return DownloadedTimeline::Successful(download_data);
+    }
+
    let mut download_tasks = layers_to_download
        .into_iter()
        .map(|layer_desination_path| async move {
@@ -124,7 +134,7 @@ where
                );
            } else {
                let layer_storage_path = storage
-                    .storage_path(&layer_desination_path)
+                    .remote_object_id(&layer_desination_path)
                    .with_context(|| {
                        format!(
                            "Failed to get the layer storage path for local path '{}'",
@@ -132,12 +142,24 @@ where
                        )
                    })?;

-                let mut destination_file = fs::File::create(&layer_desination_path)
-                    .await
-                    .with_context(|| {
+                // Perform a rename inspired by durable_rename from file_utils.c.
+                // The sequence:
+                //     write(tmp)
+                //     fsync(tmp)
+                //     rename(tmp, new)
+                //     fsync(new)
+                //     fsync(parent)
+                // For more context about durable_rename check this email from postgres mailing list:
+                // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
+                // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
+                let temp_file_path =
+                    path_with_suffix_extension(&layer_desination_path, TEMP_DOWNLOAD_EXTENSION);
+
+                let mut destination_file =
+                    fs::File::create(&temp_file_path).await.with_context(|| {
                        format!(
                            "Failed to create a destination file for layer '{}'",
-                            layer_desination_path.display()
+                            temp_file_path.display()
                        )
                    })?;

@@ -149,15 +171,55 @@ where
                            "Failed to download a layer from storage path '{layer_storage_path:?}'"
                        )
                    })?;
+
+                // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
+                // A file will not be closed immediately when it goes out of scope if there are any IO operations
+                // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
+                // you should call flush before dropping it.
+                //
+                // From the tokio code I see that it waits for pending operations to complete. There shouldn't be any because
+                // we assume that `destination_file` file is fully written, i.e. there are no pending .write(...).await operations.
+                // But for additional safety let's check/wait for any pending operations.
+                destination_file.flush().await.with_context(|| {
+                    format!(
+                        "failed to flush source file at {}",
+                        temp_file_path.display()
+                    )
+                })?;
+
+                // not using sync_data because it can lose file size update
+                destination_file.sync_all().await.with_context(|| {
+                    format!(
+                        "failed to fsync source file at {}",
+                        temp_file_path.display()
+                    )
+                })?;
+                drop(destination_file);
+
+                fail::fail_point!("remote-storage-download-pre-rename", |_| {
+                    anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
+                });
+
+                fs::rename(&temp_file_path, &layer_desination_path).await?;
+
+                fsync_path(&layer_desination_path).await.with_context(|| {
+                    format!(
+                        "Cannot fsync layer destination path {}",
+                        layer_desination_path.display(),
+                    )
+                })?;
            }
            Ok::<_, anyhow::Error>(layer_desination_path)
        })
        .collect::<FuturesUnordered<_>>();

    let mut errors_happened = false;
+    // keep files we've downloaded to remove them from layers_to_skip if directory fsync fails
+    let mut undo = HashSet::new();
    while let Some(download_result) = download_tasks.next().await {
        match download_result {
            Ok(downloaded_path) => {
+                undo.insert(downloaded_path.clone());
                download.layers_to_skip.insert(downloaded_path);
            }
            Err(e) => {
@@ -167,10 +229,28 @@ where
        }
    }

+    // fsync timeline directory which is a parent directory for downloaded files
+    let ZTenantTimelineId {
+        tenant_id,
+        timeline_id,
+    } = &sync_id;
+    let timeline_dir = conf.timeline_path(timeline_id, tenant_id);
+    if let Err(e) = fsync_path(&timeline_dir).await {
+        error!(
+            "Cannot fsync parent directory {} error {}",
+            timeline_dir.display(),
+            e
+        );
+        for item in undo {
+            download.layers_to_skip.remove(&item);
+        }
+        errors_happened = true;
+    }
+
    if errors_happened {
        debug!("Reenqueuing failed download task for timeline {sync_id}");
        download_data.retries += 1;
-        sync_queue::push(sync_id, SyncTask::Download(download_data));
+        sync_queue.push(sync_id, SyncTask::Download(download_data));
        DownloadedTimeline::FailedAndRescheduled
    } else {
        info!("Successfully downloaded all layers");
@@ -178,22 +258,27 @@ where
    }
 }

+async fn fsync_path(path: impl AsRef<Path>) -> Result<(), io::Error> { // opens `path` and calls `sync_all` on the handle; called in this file for a layer file and for the timeline directory
+    fs::File::open(path).await?.sync_all().await // an error from either the open or the sync is returned to the caller
+}
+
 #[cfg(test)]
 mod tests {
-    use std::collections::{BTreeSet, HashSet};
+    use std::{
+        collections::{BTreeSet, HashSet},
+        num::NonZeroUsize,
+    };

+    use remote_storage::{LocalFs, RemoteStorage};
    use tempfile::tempdir;
    use utils::lsn::Lsn;

    use crate::{
-        remote_storage::{
-            storage_sync::{
-                index::RelativePath,
-                test_utils::{create_local_timeline, dummy_metadata},
-            },
-            LocalFs,
-        },
        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+        storage_sync::{
+            index::RelativePath,
+            test_utils::{create_local_timeline, dummy_metadata},
+        },
    };

    use super::*;
@@ -201,9 +286,14 @@ mod tests {
    #[tokio::test]
    async fn download_timeline() -> anyhow::Result<()> {
        let harness = RepoHarness::create("download_timeline")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
+
        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
        let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
-        let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
+        let storage = LocalFs::new(
+            tempdir()?.path().to_path_buf(),
+            harness.conf.workdir.clone(),
+        )?;
        let current_retries = 3;
        let metadata = dummy_metadata(Lsn(0x30));
        let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
@@ -211,7 +301,7 @@ mod tests {
            create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;

        for local_path in timeline_upload.layers_to_upload {
-            let remote_path = storage.storage_path(&local_path)?;
+            let remote_path = storage.remote_object_id(&local_path)?;
            let remote_parent_dir = remote_path.parent().unwrap();
            if !remote_parent_dir.exists() {
                fs::create_dir_all(&remote_parent_dir).await?;
@@ -236,12 +326,14 @@ mod tests {
        );

        let download_data = match download_timeline_layers(
+            harness.conf,
            &storage,
+            &sync_queue,
            Some(&remote_timeline),
            sync_id,
            SyncData::new(
                current_retries,
-                TimelineDownload {
+                LayersDownload {
                    layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
                },
            ),
@@ -293,16 +385,19 @@ mod tests {
    #[tokio::test]
    async fn download_timeline_negatives() -> anyhow::Result<()> {
        let harness = RepoHarness::create("download_timeline_negatives")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
-        let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
+        let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;

        let empty_remote_timeline_download = download_timeline_layers(
+            harness.conf,
            &storage,
+            &sync_queue,
            None,
            sync_id,
            SyncData::new(
                0,
-                TimelineDownload {
+                LayersDownload {
                    layers_to_skip: HashSet::new(),
                },
            ),
@@ -319,12 +414,14 @@ mod tests {
            "Should not expect download for the timeline"
        );
        let already_downloading_remote_timeline_download = download_timeline_layers(
+            harness.conf,
            &storage,
+            &sync_queue,
            Some(&not_expecting_download_remote_timeline),
            sync_id,
            SyncData::new(
                0,
-                TimelineDownload {
+                LayersDownload {
                    layers_to_skip: HashSet::new(),
                },
            ),
@@ -332,7 +429,7 @@ mod tests {
        .await;
        assert!(
            matches!(
-                dbg!(already_downloading_remote_timeline_download),
+                already_downloading_remote_timeline_download,
                DownloadedTimeline::Abort,
            ),
            "Should not allow downloading for remote timeline that does not expect it"
@@ -346,7 +443,10 @@ mod tests {
        let harness = RepoHarness::create("test_download_index_part")?;
        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

-        let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
+        let storage = LocalFs::new(
+            tempdir()?.path().to_path_buf(),
+            harness.conf.workdir.clone(),
+        )?;
        let metadata = dummy_metadata(Lsn(0x30));
        let local_timeline_path = harness.timeline_path(&TIMELINE_ID);

@@ -367,7 +467,7 @@ mod tests {
            metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
                .with_file_name(IndexPart::FILE_NAME)
                .with_extension(IndexPart::FILE_EXTENSION);
-        let storage_path = storage.storage_path(&local_index_part_path)?;
+        let storage_path = storage.remote_object_id(&local_index_part_path)?;
        fs::create_dir_all(storage_path.parent().unwrap()).await?;
        fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?;

--- a/pageserver/src/remote_storage/storage_sync/index.rs
+++ b/pageserver/src/remote_storage/storage_sync/index.rs
@@ -8,7 +8,7 @@ use std::{
    sync::Arc,
 };

-use anyhow::{Context, Ok};
+use anyhow::{anyhow, Context, Ok};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use tokio::sync::RwLock;
@@ -113,7 +113,7 @@ impl RemoteTimelineIndex {
        awaits_download: bool,
    ) -> anyhow::Result<()> {
        self.timeline_entry_mut(id)
-            .ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))?
+            .ok_or_else(|| anyhow!("unknown timeline sync {id}"))?
            .awaits_download = awaits_download;
        Ok(())
    }
@@ -147,6 +147,13 @@ impl RemoteTimeline {
        self.missing_layers.extend(upload_failures.into_iter());
    }

+    pub fn remove_layers(&mut self, layers_to_remove: &HashSet<PathBuf>) { // drops the given paths from both layer collections of this remote timeline
+        self.timeline_layers
+            .retain(|layer| !layers_to_remove.contains(layer)); // keep only layers not requested for removal
+        self.missing_layers
+            .retain(|layer| !layers_to_remove.contains(layer)); // same filter for the layers recorded as missing
+    }
+
    /// Lists all layer files in the given remote timeline. Omits the metadata file.
    pub fn stored_files(&self) -> &HashSet<PathBuf> {
        &self.timeline_layers
--- a/pageserver/src/remote_storage/storage_sync/upload.rs
+++ b/pageserver/src/remote_storage/storage_sync/upload.rs
@@ -4,20 +4,19 @@ use std::{fmt::Debug, path::PathBuf};

 use anyhow::Context;
 use futures::stream::{FuturesUnordered, StreamExt};
+use remote_storage::RemoteStorage;
 use tokio::fs;
 use tracing::{debug, error, info, warn};

-use crate::{
-    config::PageServerConf,
-    layered_repository::metadata::metadata_path,
-    remote_storage::{
-        storage_sync::{index::RemoteTimeline, sync_queue, SyncTask},
-        RemoteStorage,
-    },
-};
 use utils::zid::ZTenantTimelineId;

-use super::{index::IndexPart, SyncData, TimelineUpload};
+use super::{
+    index::{IndexPart, RemoteTimeline},
+    LayersUpload, SyncData, SyncQueue,
+};
+use crate::{
+    config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
+};

 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<P, S>(
@@ -28,7 +27,7 @@ pub(super) async fn upload_index_part<P, S>(
 ) -> anyhow::Result<()>
 where
    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
 {
    let index_part_bytes = serde_json::to_vec(&index_part)
        .context("Failed to serialize index part file into bytes")?;
@@ -38,12 +37,15 @@ where
    let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
        .with_file_name(IndexPart::FILE_NAME)
        .with_extension(IndexPart::FILE_EXTENSION);
-    let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
-        format!(
-            "Failed to get the index part storage path for local path '{}'",
-            index_part_path.display()
-        )
-    })?;
+    let index_part_storage_path =
+        storage
+            .remote_object_id(&index_part_path)
+            .with_context(|| {
+                format!(
+                    "Failed to get the index part storage path for local path '{}'",
+                    index_part_path.display()
+                )
+            })?;

    storage
        .upload(
@@ -64,11 +66,7 @@ pub(super) enum UploadedTimeline {
    /// Upload failed due to some error, the upload task is rescheduled for another retry.
    FailedAndRescheduled,
    /// No issues happened during the upload, all task files were put into the remote storage.
-    Successful(SyncData<TimelineUpload>),
-    /// No failures happened during the upload, but some files were removed locally before the upload task completed
-    /// (could happen due to retries, for instance, if GC happens in the interim).
-    /// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file.
-    SuccessfulAfterLocalFsUpdate(SyncData<TimelineUpload>),
+    Successful(SyncData<LayersUpload>),
 }

 /// Attempts to upload given layer files.
@@ -77,16 +75,20 @@ pub(super) enum UploadedTimeline {
 /// On an error, bumps the retries count and reschedules the entire task.
 pub(super) async fn upload_timeline_layers<'a, P, S>(
    storage: &'a S,
+    sync_queue: &SyncQueue,
    remote_timeline: Option<&'a RemoteTimeline>,
    sync_id: ZTenantTimelineId,
-    mut upload_data: SyncData<TimelineUpload>,
+    mut upload_data: SyncData<LayersUpload>,
 ) -> UploadedTimeline
 where
    P: Debug + Send + Sync + 'static,
-    S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
+    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
 {
    let upload = &mut upload_data.data;
-    let new_upload_lsn = upload.metadata.disk_consistent_lsn();
+    let new_upload_lsn = upload
+        .metadata
+        .as_ref()
+        .map(|meta| meta.disk_consistent_lsn());

    let already_uploaded_layers = remote_timeline
        .map(|timeline| timeline.stored_files())
@@ -99,9 +101,14 @@ where
        .cloned()
        .collect::<Vec<_>>();

+    if layers_to_upload.is_empty() {
+        info!("No layers to upload after filtering, aborting");
+        return UploadedTimeline::Successful(upload_data);
+    }
+
    debug!("Layers to upload: {layers_to_upload:?}");
    info!(
-        "Uploading {} timeline layers, new lsn: {new_upload_lsn}",
+        "Uploading {} timeline layers, new lsn: {new_upload_lsn:?}",
        layers_to_upload.len(),
    );

@@ -109,7 +116,7 @@ where
        .into_iter()
        .map(|source_path| async move {
            let storage_path = storage
-                .storage_path(&source_path)
+                .remote_object_id(&source_path)
                .with_context(|| {
                    format!(
                        "Failed to get the layer storage path for local path '{}'",
@@ -156,7 +163,6 @@ where
        .collect::<FuturesUnordered<_>>();

    let mut errors_happened = false;
-    let mut local_fs_updated = false;
    while let Some(upload_result) = upload_tasks.next().await {
        match upload_result {
            Ok(uploaded_path) => {
@@ -173,7 +179,16 @@ where
                        errors_happened = true;
                        error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
                    } else {
-                        local_fs_updated = true;
+                        // We have run the upload sync task, but the file we wanted to upload is gone.
+                        // This is "fine" due to the asynchronous nature of the sync loop: it only reacts to events and might need to
+                        // retry the upload tasks, if S3 or network is down: but during this time, pageserver might still operate and
+                        // run compaction/gc threads, removing redundant files from disk.
+                        // It's not good to pause GC/compaction because of those and we would rather skip such uploads.
+                        //
+                        // Yet absence of such files might also mean that the timeline metadata file was updated (GC moves the Lsn forward, for instance).
+                        // We don't try to read a more recent version, since it could contain `disk_consistent_lsn` that does not have its upload finished yet.
+                        // This will create "missing" layers and make data inconsistent.
+                        // Instead, we only update the metadata when it was submitted in an upload task as a checkpoint result.
                        upload.layers_to_upload.remove(&source_path);
                        warn!(
                            "Missing locally a layer file {} scheduled for upload, skipping",
@@ -188,11 +203,8 @@ where
    if errors_happened {
        debug!("Reenqueuing failed upload task for timeline {sync_id}");
        upload_data.retries += 1;
-        sync_queue::push(sync_id, SyncTask::Upload(upload_data));
+        sync_queue.push(sync_id, SyncTask::Upload(upload_data));
        UploadedTimeline::FailedAndRescheduled
-    } else if local_fs_updated {
-        info!("Successfully uploaded all layers, some local layers were removed during the upload");
-        UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data)
    } else {
        info!("Successfully uploaded all layers");
        UploadedTimeline::Successful(upload_data)
@@ -206,20 +218,21 @@ enum UploadError {

 #[cfg(test)]
 mod tests {
-    use std::collections::{BTreeSet, HashSet};
+    use std::{
+        collections::{BTreeSet, HashSet},
+        num::NonZeroUsize,
+    };

+    use remote_storage::LocalFs;
    use tempfile::tempdir;
    use utils::lsn::Lsn;

    use crate::{
-        remote_storage::{
-            storage_sync::{
-                index::RelativePath,
-                test_utils::{create_local_timeline, dummy_metadata},
-            },
-            LocalFs,
-        },
        repository::repo_harness::{RepoHarness, TIMELINE_ID},
+        storage_sync::{
+            index::RelativePath,
+            test_utils::{create_local_timeline, dummy_metadata},
+        },
    };

    use super::{upload_index_part, *};
@@ -227,15 +240,21 @@ mod tests {
    #[tokio::test]
    async fn regular_layer_upload() -> anyhow::Result<()> {
        let harness = RepoHarness::create("regular_layer_upload")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

        let layer_files = ["a", "b"];
-        let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
+        let storage = LocalFs::new(
+            tempdir()?.path().to_path_buf(),
+            harness.conf.workdir.clone(),
+        )?;
        let current_retries = 3;
        let metadata = dummy_metadata(Lsn(0x30));
        let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
-        let timeline_upload =
+        let mut timeline_upload =
            create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
+        timeline_upload.metadata = None;
+
        assert!(
            storage.list().await?.is_empty(),
            "Storage should be empty before any uploads are made"
@@ -243,6 +262,7 @@ mod tests {

        let upload_result = upload_timeline_layers(
            &storage,
+            &sync_queue,
            None,
            sync_id,
            SyncData::new(current_retries, timeline_upload.clone()),
@@ -278,8 +298,8 @@ mod tests {
            "Successful upload should have all layers uploaded"
        );
        assert_eq!(
-            upload.metadata, metadata,
-            "Successful upload should not chage its metadata"
+            upload.metadata, None,
+            "Successful upload without metadata should not have it returned either"
        );

        let storage_files = storage.list().await?;
@@ -307,10 +327,11 @@ mod tests {
    #[tokio::test]
    async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> {
        let harness = RepoHarness::create("layer_upload_after_local_fs_update")?;
+        let (sync_queue, _) = SyncQueue::new(NonZeroUsize::new(100).unwrap());
        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

        let layer_files = ["a1", "b1"];
-        let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
+        let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
        let current_retries = 5;
        let metadata = dummy_metadata(Lsn(0x40));

@@ -332,6 +353,7 @@ mod tests {

        let upload_result = upload_timeline_layers(
            &storage,
+            &sync_queue,
            None,
            sync_id,
            SyncData::new(current_retries, timeline_upload.clone()),
@@ -339,7 +361,7 @@ mod tests {
        .await;

        let upload_data = match upload_result {
-            UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data,
+            UploadedTimeline::Successful(upload_data) => upload_data,
            wrong_result => panic!(
                "Expected a successful after local fs upload for timeline, but got: {wrong_result:?}"
            ),
@@ -367,7 +389,8 @@ mod tests {
            "Successful upload should have all layers uploaded"
        );
        assert_eq!(
-            upload.metadata, metadata,
+            upload.metadata,
+            Some(metadata),
            "Successful upload should not chage its metadata"
        );

@@ -397,7 +420,7 @@ mod tests {
        let harness = RepoHarness::create("test_upload_index_part")?;
        let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);

-        let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
+        let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
        let metadata = dummy_metadata(Lsn(0x40));
        let local_timeline_path = harness.timeline_path(&TIMELINE_ID);

--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -32,6 +32,7 @@ pub mod defaults {

    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
    pub const DEFAULT_GC_PERIOD: &str = "100 s";
+    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
 }

@@ -47,6 +48,7 @@ pub struct TenantConf {
    // This parameter determines L1 layer file size.
    pub compaction_target_size: u64,
    // How often to check if there's compaction work to be done.
+    #[serde(with = "humantime_serde")]
    pub compaction_period: Duration,
    // Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
@@ -56,11 +58,15 @@ pub struct TenantConf {
    // Page versions older than this are garbage collected away.
    pub gc_horizon: u64,
    // Interval at which garbage collection is triggered.
+    #[serde(with = "humantime_serde")]
    pub gc_period: Duration,
+    // Delta layer churn threshold to create L1 image layers.
+    pub image_creation_threshold: usize,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is time.
    // Page versions older than this are garbage collected away.
+    #[serde(with = "humantime_serde")]
    pub pitr_interval: Duration,
 }

@@ -70,10 +76,14 @@ pub struct TenantConf {
 pub struct TenantConfOpt {
    pub checkpoint_distance: Option<u64>,
    pub compaction_target_size: Option<u64>,
+    #[serde(with = "humantime_serde")]
    pub compaction_period: Option<Duration>,
    pub compaction_threshold: Option<usize>,
    pub gc_horizon: Option<u64>,
+    #[serde(with = "humantime_serde")]
    pub gc_period: Option<Duration>,
+    pub image_creation_threshold: Option<usize>,
+    #[serde(with = "humantime_serde")]
    pub pitr_interval: Option<Duration>,
 }

@@ -94,6 +104,9 @@ impl TenantConfOpt {
                .unwrap_or(global_conf.compaction_threshold),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
+            image_creation_threshold: self
+                .image_creation_threshold
+                .unwrap_or(global_conf.image_creation_threshold),
            pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
        }
    }
@@ -117,6 +130,9 @@ impl TenantConfOpt {
        if let Some(gc_period) = other.gc_period {
            self.gc_period = Some(gc_period);
        }
+        if let Some(image_creation_threshold) = other.image_creation_threshold {
+            self.image_creation_threshold = Some(image_creation_threshold);
+        }
        if let Some(pitr_interval) = other.pitr_interval {
            self.pitr_interval = Some(pitr_interval);
        }
@@ -136,6 +152,7 @@ impl TenantConf {
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
+            image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
            pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                .expect("cannot parse default PITR interval"),
        }
@@ -156,6 +173,7 @@ impl TenantConf {
            compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
            gc_horizon: defaults::DEFAULT_GC_HORIZON,
            gc_period: Duration::from_secs(10),
+            image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
            pitr_interval: Duration::from_secs(60 * 60),
        }
    }
--- a/pageserver/src/tenant_mgr.rs
+++ b/pageserver/src/tenant_mgr.rs
@@ -3,8 +3,10 @@

 use crate::config::PageServerConf;
 use crate::layered_repository::LayeredRepository;
-use crate::remote_storage::RemoteIndex;
+use crate::pgdatadir_mapping::DatadirTimeline;
 use crate::repository::{Repository, TimelineSyncStatusUpdate};
+use crate::storage_sync::index::RemoteIndex;
+use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
 use crate::tenant_config::TenantConfOpt;
 use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
@@ -12,26 +14,54 @@ use crate::timelines;
 use crate::timelines::CreateRepo;
 use crate::walredo::PostgresRedoManager;
 use crate::{DatadirTimelineImpl, RepositoryImpl};
-use anyhow::{Context, Result};
-use lazy_static::lazy_static;
+use anyhow::{bail, Context};
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::fmt;
-use std::sync::{Arc, Mutex, MutexGuard};
+use std::sync::Arc;
 use tracing::*;
+
 use utils::zid::{ZTenantId, ZTimelineId};

-lazy_static! {
-    static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
+mod tenants_state {
+    use std::{
+        collections::HashMap,
+        sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
+    };
+
+    use utils::zid::ZTenantId;
+
+    use crate::tenant_mgr::Tenant;
+
+    lazy_static::lazy_static! {
+        static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
+    }
+
+    pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
+        TENANTS
+            .read()
+            .expect("Failed to read() tenants lock, it got poisoned")
+    }
+
+    pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<ZTenantId, Tenant>> {
+        TENANTS
+            .write()
+            .expect("Failed to write() tenants lock, it got poisoned")
+    }
 }

 struct Tenant {
    state: TenantState,
+    /// Contains in-memory state, including the timelines that might not yet be flushed to disk or loaded from disk.
    repo: Arc<RepositoryImpl>,
-
-    timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
+    /// Timelines, located locally in the pageserver's datadir.
+    /// Timelines can be removed entirely by the `detach` operation only.
+    ///
+    /// Local timelines have more metadata that's loaded into memory,
+    /// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`].
+    local_timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
 }

 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -48,6 +78,9 @@ pub enum TenantState {
    // The local disk might have some newer files that don't exist in cloud storage yet.
    // The tenant cannot be accessed anymore for any reason, but graceful shutdown.
    Stopping,
+
+    // Something went wrong loading the tenant state
+    Broken,
 }

 impl fmt::Display for TenantState {
@@ -56,47 +89,37 @@ impl fmt::Display for TenantState {
            TenantState::Active => f.write_str("Active"),
            TenantState::Idle => f.write_str("Idle"),
            TenantState::Stopping => f.write_str("Stopping"),
+            TenantState::Broken => f.write_str("Broken"),
        }
    }
 }

-fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
-    TENANTS.lock().unwrap()
-}
+/// Initialize repositories with locally available timelines.
+/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
+/// are scheduled for download and added to the repository once download is completed.
+pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIndex> {
+    let SyncStartupData {
+        remote_index,
+        local_timeline_init_statuses,
+    } = storage_sync::start_local_timeline_sync(conf)
+        .context("Failed to set up local files sync with external storage")?;

-// Sets up wal redo manager and repository for tenant. Reduces code duplication.
-// Used during pageserver startup, or when new tenant is attached to pageserver.
-pub fn load_local_repo(
-    conf: &'static PageServerConf,
-    tenant_id: ZTenantId,
-    remote_index: &RemoteIndex,
-) -> Result<Arc<RepositoryImpl>> {
-    let mut m = access_tenants();
-    let tenant = m.entry(tenant_id).or_insert_with(|| {
-        // Set up a WAL redo manager, for applying WAL records.
-        let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
-
-        // Set up an object repository, for actual data storage.
-        let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
-            conf,
-            Default::default(),
-            Arc::new(walredo_mgr),
-            tenant_id,
-            remote_index.clone(),
-            conf.remote_storage_config.is_some(),
-        ));
-        Tenant {
-            state: TenantState::Idle,
-            repo,
-            timelines: HashMap::new(),
+    for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
+        if let Err(err) =
+            init_local_repository(conf, tenant_id, local_timeline_init_statuses, &remote_index)
+        {
+            // Report the error, but continue with the startup for other tenants. An error
+            // loading a tenant is serious, but it's better to complete the startup and
+            // serve other tenants, than fail completely.
+            error!("Failed to initialize local tenant {tenant_id}: {:?}", err);
+            let mut m = tenants_state::write_tenants();
+            if let Some(tenant) = m.get_mut(&tenant_id) {
+                tenant.state = TenantState::Broken;
+            }
        }
-    });
+    }

-    // Restore tenant config
-    let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
-    tenant.repo.update_tenant_config(tenant_conf)?;
-
-    Ok(Arc::clone(&tenant.repo))
+    Ok(remote_index)
 }

 /// Updates tenants' repositories, changing their timelines state in memory.
@@ -113,32 +136,21 @@ pub fn apply_timeline_sync_status_updates(
        "Applying sync status updates for {} timelines",
        sync_status_updates.len()
    );
-    trace!("Sync status updates: {:?}", sync_status_updates);
+    debug!("Sync status updates: {sync_status_updates:?}");

-    for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates {
+    for (tenant_id, status_updates) in sync_status_updates {
        let repo = match load_local_repo(conf, tenant_id, remote_index) {
            Ok(repo) => repo,
            Err(e) => {
-                error!(
-                    "Failed to load repo for tenant {} Error: {:#}",
-                    tenant_id, e
-                );
+                error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",);
                continue;
            }
        };
-
-        for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates {
-            match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update)
-            {
-                Ok(_) => debug!(
-                    "successfully applied timeline sync status update: {} -> {}",
-                    timeline_id, timeline_sync_status_update
-                ),
-                Err(e) => error!(
-                    "Failed to apply timeline sync status update for tenant {}. timeline {} update {} Error: {:#}",
-                    tenant_id, timeline_id, timeline_sync_status_update, e
-                ),
-            }
+        match apply_timeline_remote_sync_status_updates(&repo, status_updates) {
+            Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"),
+            Err(e) => error!(
+                "Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}"
+            ),
        }
    }
 }
@@ -147,11 +159,16 @@ pub fn apply_timeline_sync_status_updates(
 /// Shut down all tenants. This runs as part of pageserver shutdown.
 ///
 pub fn shutdown_all_tenants() {
-    let mut m = access_tenants();
+    let mut m = tenants_state::write_tenants();
    let mut tenantids = Vec::new();
    for (tenantid, tenant) in m.iter_mut() {
-        tenant.state = TenantState::Stopping;
-        tenantids.push(*tenantid)
+        match tenant.state {
+            TenantState::Active | TenantState::Idle | TenantState::Stopping => {
+                tenant.state = TenantState::Stopping;
+                tenantids.push(*tenantid)
+            }
+            TenantState::Broken => {}
+        }
    }
    drop(m);

@@ -167,22 +184,16 @@ pub fn shutdown_all_tenants() {
    // should be no more activity in any of the repositories.
    //
    // On error, log it but continue with the shutdown for other tenants.
-    for tenantid in tenantids {
-        debug!("shutdown tenant {}", tenantid);
-        match get_repository_for_tenant(tenantid) {
+    for tenant_id in tenantids {
+        debug!("shutdown tenant {tenant_id}");
+        match get_repository_for_tenant(tenant_id) {
            Ok(repo) => {
                if let Err(err) = repo.checkpoint() {
-                    error!(
-                        "Could not checkpoint tenant {} during shutdown: {:?}",
-                        tenantid, err
-                    );
+                    error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
                }
            }
            Err(err) => {
-                error!(
-                    "Could not get repository for tenant {} during shutdown: {:?}",
-                    tenantid, err
-                );
+                error!("Could not get repository for tenant {tenant_id} during shutdown: {err:?}");
            }
        }
    }
@@ -191,20 +202,20 @@ pub fn shutdown_all_tenants() {
 pub fn create_tenant_repository(
    conf: &'static PageServerConf,
    tenant_conf: TenantConfOpt,
-    tenantid: ZTenantId,
+    tenant_id: ZTenantId,
    remote_index: RemoteIndex,
-) -> Result<Option<ZTenantId>> {
-    match access_tenants().entry(tenantid) {
+) -> anyhow::Result<Option<ZTenantId>> {
+    match tenants_state::write_tenants().entry(tenant_id) {
        Entry::Occupied(_) => {
-            debug!("tenant {} already exists", tenantid);
+            debug!("tenant {tenant_id} already exists");
            Ok(None)
        }
        Entry::Vacant(v) => {
-            let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
+            let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
            let repo = timelines::create_repo(
                conf,
                tenant_conf,
-                tenantid,
+                tenant_id,
                CreateRepo::Real {
                    wal_redo_manager,
                    remote_index,
@@ -213,36 +224,39 @@ pub fn create_tenant_repository(
            v.insert(Tenant {
                state: TenantState::Idle,
                repo,
-                timelines: HashMap::new(),
+                local_timelines: HashMap::new(),
            });
-            Ok(Some(tenantid))
+            Ok(Some(tenant_id))
        }
    }
 }

-pub fn update_tenant_config(tenant_conf: TenantConfOpt, tenantid: ZTenantId) -> Result<()> {
-    info!("configuring tenant {}", tenantid);
-    let repo = get_repository_for_tenant(tenantid)?;
+pub fn update_tenant_config(
+    tenant_conf: TenantConfOpt,
+    tenant_id: ZTenantId,
+) -> anyhow::Result<()> {
+    info!("configuring tenant {tenant_id}");
+    let repo = get_repository_for_tenant(tenant_id)?;

    repo.update_tenant_config(tenant_conf)?;
    Ok(())
 }

 pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
-    Some(access_tenants().get(&tenantid)?.state)
+    Some(tenants_state::read_tenants().get(&tenantid)?.state)
 }

 ///
 /// Change the state of a tenant to Active and launch its compactor and GC
 /// threads. If the tenant was already in Active state or Stopping, does nothing.
 ///
-pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> {
-    let mut m = access_tenants();
+pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
+    let mut m = tenants_state::write_tenants();
    let tenant = m
        .get_mut(&tenant_id)
-        .with_context(|| format!("Tenant not found for id {}", tenant_id))?;
+        .with_context(|| format!("Tenant not found for id {tenant_id}"))?;

-    info!("activating tenant {}", tenant_id);
+    info!("activating tenant {tenant_id}");

    match tenant.state {
        // If the tenant is already active, nothing to do.
@@ -255,7 +269,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> {
                Some(tenant_id),
                None,
                "Compactor thread",
-                true,
+                false,
                move || crate::tenant_threads::compact_loop(tenant_id),
            )?;

@@ -264,16 +278,13 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> {
                Some(tenant_id),
                None,
                "GC thread",
-                true,
+                false,
                move || crate::tenant_threads::gc_loop(tenant_id),
            )
-            .with_context(|| format!("Failed to launch GC thread for tenant {}", tenant_id));
+            .with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));

            if let Err(e) = &gc_spawn_result {
-                error!(
-                    "Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}",
-                    tenant_id, e
-                );
+                error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
                thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
                return gc_spawn_result;
            }
@@ -283,43 +294,89 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> Result<()> {
        TenantState::Stopping => {
            // don't re-activate it if it's being stopped
        }
+
+        TenantState::Broken => {
+            // cannot activate
+        }
    }
    Ok(())
 }

-pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryImpl>> {
-    let m = access_tenants();
+pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<RepositoryImpl>> {
+    let m = tenants_state::read_tenants();
    let tenant = m
-        .get(&tenantid)
-        .with_context(|| format!("Tenant {} not found", tenantid))?;
+        .get(&tenant_id)
+        .with_context(|| format!("Tenant {tenant_id} not found"))?;

    Ok(Arc::clone(&tenant.repo))
 }

-// Retrieve timeline for tenant. Load it into memory if it is not already loaded
-pub fn get_timeline_for_tenant_load(
-    tenantid: ZTenantId,
-    timelineid: ZTimelineId,
-) -> Result<Arc<DatadirTimelineImpl>> {
-    let mut m = access_tenants();
+/// Retrieves local timeline for tenant.
+/// Loads it into memory if it is not already loaded.
+pub fn get_local_timeline_with_load(
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
+) -> anyhow::Result<Arc<DatadirTimelineImpl>> {
+    let mut m = tenants_state::write_tenants();
    let tenant = m
-        .get_mut(&tenantid)
-        .with_context(|| format!("Tenant {} not found", tenantid))?;
+        .get_mut(&tenant_id)
+        .with_context(|| format!("Tenant {tenant_id} not found"))?;

-    if let Some(page_tline) = tenant.timelines.get(&timelineid) {
+    if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) {
        return Ok(Arc::clone(page_tline));
    }
-    // First access to this timeline. Create a DatadirTimeline wrapper for it
-    let tline = tenant
-        .repo
-        .get_timeline_load(timelineid)
-        .with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?;

-    let repartition_distance = tenant.repo.get_checkpoint_distance() / 10;
+    let page_tline = new_local_timeline(&tenant.repo, timeline_id)
+        .with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?;
+    tenant
+        .local_timelines
+        .insert(timeline_id, Arc::clone(&page_tline));
+    Ok(page_tline)
+}

-    let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance));
+pub fn detach_timeline(
+    conf: &'static PageServerConf,
+    tenant_id: ZTenantId,
+    timeline_id: ZTimelineId,
+) -> anyhow::Result<()> {
+    // shutdown the timeline threads (this shuts down the walreceiver)
+    thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
+
+    match tenants_state::write_tenants().get_mut(&tenant_id) {
+        Some(tenant) => {
+            tenant
+                .repo
+                .detach_timeline(timeline_id)
+                .context("Failed to detach inmem tenant timeline")?;
+            tenant.local_timelines.remove(&timeline_id);
+        }
+        None => bail!("Tenant {tenant_id} not found in local tenant state"),
+    }
+
+    let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id);
+    std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
+        format!(
+            "Failed to remove local timeline directory '{}'",
+            local_timeline_directory.display()
+        )
+    })?;
+
+    Ok(())
+}
+
+fn new_local_timeline(
+    repo: &RepositoryImpl,
+    timeline_id: ZTimelineId,
+) -> anyhow::Result<Arc<DatadirTimeline<LayeredRepository>>> {
+    let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| {
+        format!("Inmem timeline {timeline_id} not found in tenant's repository")
+    })?;
+    let repartition_distance = repo.get_checkpoint_distance() / 10;
+    let page_tline = Arc::new(DatadirTimelineImpl::new(
+        inmem_timeline,
+        repartition_distance,
+    ));
    page_tline.init_logical_size()?;
-    tenant.timelines.insert(timelineid, Arc::clone(&page_tline));
    Ok(page_tline)
 }

@@ -331,15 +388,121 @@ pub struct TenantInfo {
    pub state: TenantState,
 }

-pub fn list_tenants() -> Result<Vec<TenantInfo>> {
-    access_tenants()
+pub fn list_tenants() -> Vec<TenantInfo> {
+    tenants_state::read_tenants()
        .iter()
-        .map(|v| {
-            let (id, tenant) = v;
-            Ok(TenantInfo {
-                id: *id,
-                state: tenant.state,
-            })
+        .map(|(id, tenant)| TenantInfo {
+            id: *id,
+            state: tenant.state,
        })
        .collect()
 }
+
+/// Loads the repository of a local tenant and registers in it every timeline reported as locally complete.
+/// Timelines that still need a sync are only logged here and are not added to the status updates.
+/// Fails if the repository cannot be loaded or if applying the collected status updates fails.
+fn init_local_repository(
+    conf: &'static PageServerConf,
+    tenant_id: ZTenantId,
+    local_timeline_init_statuses: HashMap<ZTimelineId, LocalTimelineInitStatus>,
+    remote_index: &RemoteIndex,
+) -> anyhow::Result<()> {
+    // initialize local tenant
+    let repo = load_local_repo(conf, tenant_id, remote_index)
+        .with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
+
+    let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
+    for (timeline_id, init_status) in local_timeline_init_statuses {
+        match init_status {
+            LocalTimelineInitStatus::LocallyComplete => {
+                debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
+                status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
+            }
+            LocalTimelineInitStatus::NeedsSync => {
+                debug!("timeline {timeline_id} for tenant {tenant_id} needs sync, so skipped for adding into repository until sync is finished");
+            }
+        }
+    }
+
+    // Let's fail here loudly to be on the safe side.
+    // XXX: It may be a better api to actually distinguish between repository startup
+    //   and processing of newly downloaded timelines.
+    apply_timeline_remote_sync_status_updates(&repo, status_updates)
+        .with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?;
+    Ok(())
+}
+
+fn apply_timeline_remote_sync_status_updates(
+    repo: &LayeredRepository,
+    status_updates: HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
+) -> anyhow::Result<()> {
+    let mut registration_queue = Vec::with_capacity(status_updates.len());
+
+    // first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
+    for (timeline_id, status_update) in status_updates {
+        repo.apply_timeline_remote_sync_status_update(timeline_id, status_update)
+            .with_context(|| {
+                format!("Failed to load timeline {timeline_id} into in-memory repository")
+            })?;
+        match status_update {
+            TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id),
+        }
+    }
+
+    for timeline_id in registration_queue {
+        let tenant_id = repo.tenant_id();
+        match tenants_state::write_tenants().get_mut(&tenant_id) {
+            Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
+                Entry::Occupied(_) => {
+                    bail!("Local timeline {timeline_id} already registered")
+                }
+                Entry::Vacant(v) => {
+                    v.insert(new_local_timeline(repo, timeline_id).with_context(|| {
+                        format!("Failed to register new local timeline for tenant {tenant_id}")
+                    })?);
+                }
+            },
+            None => bail!(
+                "Tenant {} not found in local tenant state",
+                repo.tenant_id()
+            ),
+        }
+    }
+
+    Ok(())
+}
+
+// Sets up wal redo manager and repository for tenant. Reduces code duplication.
+// Used during pageserver startup, or when new tenant is attached to pageserver.
+fn load_local_repo(
+    conf: &'static PageServerConf,
+    tenant_id: ZTenantId,
+    remote_index: &RemoteIndex,
+) -> anyhow::Result<Arc<RepositoryImpl>> {
+    let mut m = tenants_state::write_tenants();
+    let tenant = m.entry(tenant_id).or_insert_with(|| {
+        // Set up a WAL redo manager, for applying WAL records.
+        let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
+
+        // Set up an object repository, for actual data storage.
+        let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
+            conf,
+            TenantConfOpt::default(),
+            Arc::new(walredo_mgr),
+            tenant_id,
+            remote_index.clone(),
+            conf.remote_storage_config.is_some(),
+        ));
+        Tenant {
+            state: TenantState::Idle,
+            repo,
+            local_timelines: HashMap::new(),
+        }
+    });
+
+    // Restore tenant config
+    let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
+    tenant.repo.update_tenant_config(tenant_conf)?;
+
+    Ok(Arc::clone(&tenant.repo))
+}
--- a/pageserver/src/thread_mgr.rs
+++ b/pageserver/src/thread_mgr.rs
@@ -130,12 +130,14 @@ struct PageServerThread {
 }

 /// Launch a new thread
+/// Note: if `shutdown_process_on_error` is set to true, a failure
+///   of the thread will lead to shutdown of the entire process.
 pub fn spawn<F>(
    kind: ThreadKind,
    tenant_id: Option<ZTenantId>,
    timeline_id: Option<ZTimelineId>,
    name: &str,
-    fail_on_error: bool,
+    shutdown_process_on_error: bool,
    f: F,
 ) -> std::io::Result<()>
 where
@@ -175,7 +177,7 @@ where
                thread_id,
                thread_rc2,
                shutdown_rx,
-                fail_on_error,
+                shutdown_process_on_error,
                f,
            )
        }) {
@@ -201,7 +203,7 @@ fn thread_wrapper<F>(
    thread_id: u64,
    thread: Arc<PageServerThread>,
    shutdown_rx: watch::Receiver<()>,
-    fail_on_error: bool,
+    shutdown_process_on_error: bool,
    f: F,
 ) where
    F: FnOnce() -> anyhow::Result<()> + Send + 'static,
@@ -221,27 +223,41 @@ fn thread_wrapper<F>(
    let result = panic::catch_unwind(AssertUnwindSafe(f));

    // Remove our entry from the global hashmap.
-    THREADS.lock().unwrap().remove(&thread_id);
+    let thread = THREADS
+        .lock()
+        .unwrap()
+        .remove(&thread_id)
+        .expect("no thread in registry");

    match result {
        Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name),
        Ok(Err(err)) => {
-            if fail_on_error {
+            if shutdown_process_on_error {
                error!(
-                    "Shutting down: thread '{}' exited with error: {:?}",
-                    thread_name, err
+                    "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    thread_name, thread.tenant_id, thread.timeline_id, err
                );
-                shutdown_pageserver();
+                shutdown_pageserver(1);
            } else {
-                error!("Thread '{}' exited with error: {:?}", thread_name, err);
+                error!(
+                    "Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    thread_name, thread.tenant_id, thread.timeline_id, err
+                );
            }
        }
        Err(err) => {
-            error!(
-                "Shutting down: thread '{}' panicked: {:?}",
-                thread_name, err
-            );
-            shutdown_pageserver();
+            if shutdown_process_on_error {
+                error!(
+                    "Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    thread_name, thread.tenant_id, thread.timeline_id, err
+                );
+                shutdown_pageserver(1);
+            } else {
+                error!(
+                    "Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    thread_name, thread.tenant_id, thread.timeline_id, err
+                );
+            }
        }
    }
 }
--- a/pageserver/src/timelines.rs
+++ b/pageserver/src/timelines.rs
@@ -2,7 +2,7 @@
 //! Timeline management code
 //

-use anyhow::{bail, Context, Result};
+use anyhow::{bail, ensure, Context, Result};
 use postgres_ffi::ControlFileData;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
@@ -23,8 +23,8 @@ use utils::{
 use crate::{
    config::PageServerConf,
    layered_repository::metadata::TimelineMetadata,
-    remote_storage::RemoteIndex,
    repository::{LocalTimelineState, Repository},
+    storage_sync::index::RemoteIndex,
    tenant_config::TenantConfOpt,
    DatadirTimeline, RepositoryImpl,
 };
@@ -106,7 +106,7 @@ impl LocalTimelineInfo {
        match repo_timeline {
            RepositoryTimeline::Loaded(_) => {
                let datadir_tline =
-                    tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id)?;
+                    tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?;
                Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size)
            }
            RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)),
@@ -152,7 +152,7 @@ pub fn init_pageserver(

    if let Some(tenant_id) = create_tenant {
        println!("initializing tenantid {}", tenant_id);
-        let repo = create_repo(conf, Default::default(), tenant_id, CreateRepo::Dummy)
+        let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy)
            .context("failed to create repo")?;
        let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
        bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
@@ -203,9 +203,11 @@ pub fn create_repo(
    };

    let repo_dir = conf.tenant_path(&tenant_id);
-    if repo_dir.exists() {
-        bail!("tenant {} directory already exists", tenant_id);
-    }
+    ensure!(
+        !repo_dir.exists(),
+        "cannot create new tenant repo: '{}' directory already exists",
+        tenant_id
+    );

    // top-level dir may exist if we are creating it through CLI
    crashsafe_dir::create_dir_all(&repo_dir)
@@ -383,7 +385,7 @@ pub(crate) fn create_timeline(
            repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
            // load the timeline into memory
            let loaded_timeline =
-                tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
+                tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
            LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false)
                .context("cannot fill timeline info")?
        }
@@ -391,7 +393,7 @@ pub(crate) fn create_timeline(
            bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
            // load the timeline into memory
            let new_timeline =
-                tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
+                tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
            LocalTimelineInfo::from_loaded_timeline(&new_timeline, false)
                .context("cannot fill timeline info")?
        }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -34,7 +34,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[

 lazy_static! {
    static ref STORAGE_IO_TIME: HistogramVec = register_histogram_vec!(
-        "pageserver_io_time",
+        "pageserver_io_operations_seconds",
        "Time spent in IO operations",
        &["operation", "tenant_id", "timeline_id"],
        STORAGE_IO_TIME_BUCKETS.into()
@@ -43,8 +43,8 @@ lazy_static! {
 }
 lazy_static! {
    static ref STORAGE_IO_SIZE: IntGaugeVec = register_int_gauge_vec!(
-        "pageserver_io_size",
-        "Amount of bytes",
+        "pageserver_io_operations_bytes_total",
+        "Total amount of bytes read/written in IO operations",
        &["operation", "tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric");
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,8 +21,10 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use anyhow::Context;
 use postgres_ffi::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
+use postgres_ffi::{page_is_new, page_set_lsn};

 use anyhow::Result;
 use bytes::{Buf, Bytes, BytesMut};
@@ -82,7 +84,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
    ) -> Result<()> {
        let mut modification = timeline.begin_modification(lsn);

-        let mut decoded = decode_wal_record(recdata);
+        let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?;
        let mut buf = decoded.record.clone();
        buf.advance(decoded.main_data_offset);

@@ -251,7 +253,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {

        // If checkpoint data was updated, store the new version in the repository
        if self.checkpoint_modified {
-            let new_checkpoint_bytes = self.checkpoint.encode();
+            let new_checkpoint_bytes = self.checkpoint.encode()?;

            modification.put_checkpoint(new_checkpoint_bytes)?;
            self.checkpoint_modified = false;
@@ -303,8 +305,14 @@ impl<'a, R: Repository> WalIngest<'a, R> {
                image.resize(image.len() + blk.hole_length as usize, 0u8);
                image.unsplit(tail);
            }
-            image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
-            image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
+            //
+            // Match the logic of XLogReadBufferForRedoExtended:
+            // The page may be uninitialized. If so, we can't set the LSN because
+            // that would corrupt the page.
+            //
+            if !page_is_new(&image) {
+                page_set_lsn(&mut image, lsn)
+            }
            assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
            self.put_rel_page_image(modification, rel, blk.blkno, image.freeze())?;
        } else {
@@ -635,7 +643,10 @@ impl<'a, R: Repository> WalIngest<'a, R> {
                    segno,
                    rpageno,
                    if is_commit {
-                        ZenithWalRecord::ClogSetCommitted { xids: page_xids }
+                        ZenithWalRecord::ClogSetCommitted {
+                            xids: page_xids,
+                            timestamp: parsed.xact_time,
+                        }
                    } else {
                        ZenithWalRecord::ClogSetAborted { xids: page_xids }
                    },
@@ -652,7 +663,10 @@ impl<'a, R: Repository> WalIngest<'a, R> {
            segno,
            rpageno,
            if is_commit {
-                ZenithWalRecord::ClogSetCommitted { xids: page_xids }
+                ZenithWalRecord::ClogSetCommitted {
+                    xids: page_xids,
+                    timestamp: parsed.xact_time,
+                }
            } else {
                ZenithWalRecord::ClogSetAborted { xids: page_xids }
            },
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -184,7 +184,7 @@ fn walreceiver_main(
    let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
        .with_context(|| format!("no repository found for tenant {}", tenant_id))?;
    let timeline =
-        tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).with_context(|| {
+        tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).with_context(|| {
            format!(
                "local timeline {} not found for tenant {}",
                timeline_id, tenant_id
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -1,6 +1,7 @@
 //!
 //! Functions for parsing WAL records.
 //!
+use anyhow::Result;
 use bytes::{Buf, Bytes};
 use postgres_ffi::pg_constants;
 use postgres_ffi::xlog_utils::{TimestampTz, XLOG_SIZE_OF_XLOG_RECORD};
@@ -9,6 +10,7 @@ use postgres_ffi::{BlockNumber, OffsetNumber};
 use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
 use serde::{Deserialize, Serialize};
 use tracing::*;
+use utils::bin_ser::DeserializeError;

 /// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
 /// around a PostgreSQL WAL record, or a custom zenith-specific "record".
@@ -24,7 +26,10 @@ pub enum ZenithWalRecord {
        flags: u8,
    },
    /// Mark transaction IDs as committed on a CLOG page
-    ClogSetCommitted { xids: Vec<TransactionId> },
+    ClogSetCommitted {
+        xids: Vec<TransactionId>,
+        timestamp: TimestampTz,
+    },
    /// Mark transaction IDs as aborted on a CLOG page
    ClogSetAborted { xids: Vec<TransactionId> },
    /// Extend multixact offsets SLRU
@@ -500,7 +505,7 @@ impl XlMultiXactTruncate {
 //      block data
 //      ...
 //      main data
-pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
+pub fn decode_wal_record(record: Bytes) -> Result<DecodedWALRecord, DeserializeError> {
    let mut rnode_spcnode: u32 = 0;
    let mut rnode_dbnode: u32 = 0;
    let mut rnode_relnode: u32 = 0;
@@ -511,7 +516,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
    // 1. Parse XLogRecord struct

    // FIXME: assume little-endian here
-    let xlogrec = XLogRecord::from_bytes(&mut buf);
+    let xlogrec = XLogRecord::from_bytes(&mut buf)?;

    trace!(
        "decode_wal_record xl_rmid = {} xl_info = {}",
@@ -739,34 +744,32 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
        assert_eq!(buf.remaining(), main_data_len as usize);
    }

-    DecodedWALRecord {
+    Ok(DecodedWALRecord {
        xl_xid: xlogrec.xl_xid,
        xl_info: xlogrec.xl_info,
        xl_rmid: xlogrec.xl_rmid,
        record,
        blocks,
        main_data_offset,
-    }
+    })
 }

 ///
 /// Build a human-readable string to describe a WAL record
 ///
 /// For debugging purposes
-pub fn describe_wal_record(rec: &ZenithWalRecord) -> String {
+pub fn describe_wal_record(rec: &ZenithWalRecord) -> Result<String, DeserializeError> {
    match rec {
-        ZenithWalRecord::Postgres { will_init, rec } => {
-            format!(
-                "will_init: {}, {}",
-                will_init,
-                describe_postgres_wal_record(rec)
-            )
-        }
-        _ => format!("{:?}", rec),
+        ZenithWalRecord::Postgres { will_init, rec } => Ok(format!(
+            "will_init: {}, {}",
+            will_init,
+            describe_postgres_wal_record(rec)?
+        )),
+        _ => Ok(format!("{:?}", rec)),
    }
 }

-fn describe_postgres_wal_record(record: &Bytes) -> String {
+fn describe_postgres_wal_record(record: &Bytes) -> Result<String, DeserializeError> {
    // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this.
    // Maybe use the postgres wal redo process, the same used for replaying WAL records?
    // Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly,
@@ -779,7 +782,7 @@ fn describe_postgres_wal_record(record: &Bytes) -> String {
    // 1. Parse XLogRecord struct

    // FIXME: assume little-endian here
-    let xlogrec = XLogRecord::from_bytes(&mut buf);
+    let xlogrec = XLogRecord::from_bytes(&mut buf)?;

    let unknown_str: String;

@@ -827,5 +830,5 @@ fn describe_postgres_wal_record(record: &Bytes) -> String {
        }
    };

-    String::from(result)
+    Ok(String::from(result))
 }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -106,16 +106,16 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
 // each tenant.
 lazy_static! {
    static ref WAL_REDO_TIME: Histogram =
-        register_histogram!("pageserver_wal_redo_time", "Time spent on WAL redo")
+        register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
            .expect("failed to define a metric");
    static ref WAL_REDO_WAIT_TIME: Histogram = register_histogram!(
-        "pageserver_wal_redo_wait_time",
+        "pageserver_wal_redo_wait_seconds",
        "Time spent waiting for access to the WAL redo process"
    )
    .expect("failed to define a metric");
    static ref WAL_REDO_RECORD_COUNTER: IntCounter = register_int_counter!(
-        "pageserver_wal_records_replayed",
-        "Number of WAL records replayed"
+        "pageserver_replayed_wal_records_total",
+        "Number of WAL records replayed in WAL redo process"
    )
    .unwrap();
 }
@@ -283,6 +283,11 @@ impl PostgresRedoManager {
        // If something went wrong, don't try to reuse the process. Kill it, and
        // next request will launch a new one.
        if result.is_err() {
+            error!(
+                "error applying {} WAL records to reconstruct page image at LSN {}",
+                records.len(),
+                lsn
+            );
            let process = process_guard.take().unwrap();
            process.kill();
        }
@@ -387,7 +392,7 @@ impl PostgresRedoManager {
            }
            // Non-relational WAL records are handled here, with custom code that has the
            // same effects as the corresponding Postgres WAL redo function.
-            ZenithWalRecord::ClogSetCommitted { xids } => {
+            ZenithWalRecord::ClogSetCommitted { xids, timestamp } => {
                let (slru_kind, segno, blknum) =
                    key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
                assert_eq!(
@@ -421,6 +426,21 @@ impl PostgresRedoManager {
                        page,
                    );
                }
+
+                // Append the timestamp
+                if page.len() == pg_constants::BLCKSZ as usize + 8 {
+                    page.truncate(pg_constants::BLCKSZ as usize);
+                }
+                if page.len() == pg_constants::BLCKSZ as usize {
+                    page.extend_from_slice(&timestamp.to_be_bytes());
+                } else {
+                    warn!(
+                        "CLOG blk {} in seg {} has invalid size {}",
+                        blknum,
+                        segno,
+                        page.len()
+                    );
+                }
            }
            ZenithWalRecord::ClogSetAborted { xids } => {
                let (slru_kind, segno, blknum) =
--- a/poetry.lock
+++ b/poetry.lock
@@ -822,7 +822,7 @@ python-versions = "*"

 [[package]]
 name = "moto"
-version = "3.0.4"
+version = "3.1.7"
 description = "A library that allows your python tests to easily mock out the boto library"
 category = "main"
 optional = false
@@ -844,6 +844,7 @@ importlib-metadata = {version = "*", markers = "python_version < \"3.8\""}
 Jinja2 = ">=2.10.1"
 jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
 MarkupSafe = "!=2.0.0a1"
+pyparsing = {version = ">=3.0.0", optional = true, markers = "extra == \"server\""}
 python-dateutil = ">=2.1,<3.0.0"
 python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
 pytz = "*"
@@ -855,7 +856,7 @@ werkzeug = "*"
 xmltodict = "*"

 [package.extras]
-all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools"]
+all = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools"]
 apigateway = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"]
 apigatewayv2 = ["PyYAML (>=5.1)"]
 appsync = ["graphql-core"]
@@ -864,14 +865,16 @@ batch = ["docker (>=2.5.1)"]
 cloudformation = ["docker (>=2.5.1)", "PyYAML (>=5.1)", "cfn-lint (>=0.4.0)"]
 cognitoidp = ["python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)"]
 ds = ["sshpubkeys (>=3.1.0)"]
+dynamodb = ["docker (>=2.5.1)"]
 dynamodb2 = ["docker (>=2.5.1)"]
 dynamodbstreams = ["docker (>=2.5.1)"]
 ec2 = ["sshpubkeys (>=3.1.0)"]
 efs = ["sshpubkeys (>=3.1.0)"]
+glue = ["pyparsing (>=3.0.0)"]
 iotdata = ["jsondiff (>=1.1.2)"]
 route53resolver = ["sshpubkeys (>=3.1.0)"]
 s3 = ["PyYAML (>=5.1)"]
-server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "setuptools", "flask", "flask-cors"]
+server = ["PyYAML (>=5.1)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "ecdsa (!=0.15)", "docker (>=2.5.1)", "graphql-core", "jsondiff (>=1.1.2)", "aws-xray-sdk (>=0.93,!=0.96)", "idna (>=2.5,<4)", "cfn-lint (>=0.4.0)", "sshpubkeys (>=3.1.0)", "pyparsing (>=3.0.0)", "setuptools", "flask", "flask-cors"]
 ssm = ["PyYAML (>=5.1)", "dataclasses"]
 xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]

@@ -1068,6 +1071,17 @@ python-versions = ">=3.6"
 py = "*"
 pytest = ">=3.10"

+[[package]]
+name = "pytest-lazy-fixture"
+version = "0.6.3"
+description = "It helps to use fixtures in pytest.mark.parametrize"
+category = "main"
+optional = false
+python-versions = "*"
+
+[package.dependencies]
+pytest = ">=3.2.5"
+
 [[package]]
 name = "pytest-xdist"
 version = "2.5.0"
@@ -1361,7 +1375,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.7"
-content-hash = "58762accad4122026c650fa43421a900546e89f9908e2268410e7b11cc8c6c4e"
+content-hash = "dc63b6e02d0ceccdc4b5616e9362c149a27fdcc6c54fda63a3b115a5b980c42e"

 [metadata.files]
 aiopg = [
@@ -1679,8 +1693,8 @@ mccabe = [
    {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
 ]
 moto = [
-    {file = "moto-3.0.4-py2.py3-none-any.whl", hash = "sha256:79646213d8438385182f4eea79e28725f94b3d0d3dc9a3eda81db47e0ebef6cc"},
-    {file = "moto-3.0.4.tar.gz", hash = "sha256:168b8a3cb4dd8a6df8e51d582761cefa9657b9f45ac7e1eb24dae394ebc9e000"},
+    {file = "moto-3.1.7-py3-none-any.whl", hash = "sha256:4ab6fb8dd150343e115d75e3dbdb5a8f850fc7236790819d7cef438c11ee6e89"},
+    {file = "moto-3.1.7.tar.gz", hash = "sha256:20607a0fd0cf6530e05ffb623ca84d3f45d50bddbcec2a33705a0cf471e71289"},
 ]
 mypy = [
    {file = "mypy-0.910-cp35-cp35m-macosx_10_9_x86_64.whl", hash = "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457"},
@@ -1855,6 +1869,10 @@ pytest-forked = [
    {file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"},
    {file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"},
 ]
+pytest-lazy-fixture = [
+    {file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"},
+    {file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"},
+]
 pytest-xdist = [
    {file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"},
    {file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"},
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2021"

 [dependencies]
 anyhow = "1.0"
+async-trait = "0.1"
 base64 = "0.13.0"
 bytes = { version = "1.0.1", features = ['serde'] }
 clap = "3.0"
@@ -31,13 +32,14 @@ thiserror = "1.0.30"
 tokio = { version = "1.17", features = ["macros"] }
 tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 tokio-rustls = "0.23.0"
+url = "2.2.2"
+git-version = "0.3.5"

 utils = { path = "../libs/utils" }
 metrics = { path = "../libs/metrics" }
 workspace_hack = { version = "0.1", path = "../workspace_hack" }

 [dev-dependencies]
-async-trait = "0.1"
 rcgen = "0.8.14"
 rstest = "0.12"
 tokio-postgres-rustls = "0.9.0"
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -0,0 +1,33 @@
+# Proxy
+
+The proxy binary accepts the `--auth-backend` CLI option, which determines the auth scheme and the cluster routing method. The following backends are currently implemented:
+
+* legacy
+  old method: when the username ends with `@zenith`, it uses md5 auth and the dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back
+* console
+  new SCRAM-based console API; uses SNI info to select the destination cluster
+* postgres
+  uses postgres to select auth secrets of existing roles. Useful for local testing
+* link
+  sends login link for all usernames
+
+## Using SNI-based routing on localhost
+
+The proxy now determines the cluster name from the subdomain: a request to `my-cluster-42.somedomain.tld` will be routed to the cluster named `my-cluster-42`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me`, which resolves to `127.0.0.1`. Now we can create a self-signed certificate and play with the proxy:
+
+```
+openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.localtest.me"
+
+```
+
+Now you can start the proxy:
+
+```
+./target/debug/proxy -c server.crt -k server.key
+```
+
+and connect to it:
+
+```
+PGSSLROOTCERT=./server.crt psql 'postgres://my-cluster-42.localtest.me:1234?sslmode=verify-full'
+```
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -1,22 +1,17 @@
 mod credentials;
-
-#[cfg(test)]
 mod flow;

-use crate::compute::DatabaseInfo;
-use crate::config::ProxyConfig;
-use crate::cplane_api::{self, CPlaneApi};
+use crate::auth_backend::{console, legacy_console, link, postgres};
+use crate::config::{AuthBackendType, ProxyConfig};
 use crate::error::UserFacingError;
 use crate::stream::PqStream;
-use crate::waiters;
+use crate::{auth_backend, compute, waiters};
+use console::ConsoleAuthError::SniMissing;
 use std::io;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
-use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};

 pub use credentials::ClientCredentials;
-
-#[cfg(test)]
 pub use flow::*;

 /// Common authentication error.
@@ -24,9 +19,11 @@ pub use flow::*;
 pub enum AuthErrorImpl {
    /// Authentication error reported by the console.
    #[error(transparent)]
-    Console(#[from] cplane_api::AuthError),
+    Console(#[from] auth_backend::AuthError),
+
+    #[error(transparent)]
+    GetAuthInfo(#[from] auth_backend::console::ConsoleAuthError),

-    #[cfg(test)]
    #[error(transparent)]
    Sasl(#[from] crate::sasl::Error),

@@ -41,19 +38,19 @@ pub enum AuthErrorImpl {

 impl AuthErrorImpl {
    pub fn auth_failed(msg: impl Into<String>) -> Self {
-        AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg))
+        AuthErrorImpl::Console(auth_backend::AuthError::auth_failed(msg))
    }
 }

 impl From<waiters::RegisterError> for AuthErrorImpl {
    fn from(e: waiters::RegisterError) -> Self {
-        AuthErrorImpl::Console(cplane_api::AuthError::from(e))
+        AuthErrorImpl::Console(auth_backend::AuthError::from(e))
    }
 }

 impl From<waiters::WaitError> for AuthErrorImpl {
    fn from(e: waiters::WaitError) -> Self {
-        AuthErrorImpl::Console(cplane_api::AuthError::from(e))
+        AuthErrorImpl::Console(auth_backend::AuthError::from(e))
    }
 }

@@ -76,112 +73,33 @@ impl UserFacingError for AuthError {
        match self.0.as_ref() {
            Console(e) => e.to_string_client(),
            MalformedPassword => self.to_string(),
+            GetAuthInfo(e) if matches!(e, SniMissing) => e.to_string(),
            _ => "Internal error".to_string(),
        }
    }
 }

-async fn handle_static(
-    host: String,
-    port: u16,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    creds: ClientCredentials,
-) -> Result<DatabaseInfo, AuthError> {
-    client
-        .write_message(&Be::AuthenticationCleartextPassword)
-        .await?;
-
-    // Read client's password bytes
-    let msg = client.read_password_message().await?;
-    let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
-
-    let db_info = DatabaseInfo {
-        host,
-        port,
-        dbname: creds.dbname.clone(),
-        user: creds.user.clone(),
-        password: Some(cleartext_password.into()),
-    };
-
-    client
-        .write_message_noflush(&Be::AuthenticationOk)?
-        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
-
-    Ok(db_info)
-}
-
-async fn handle_existing_user(
+async fn handle_user(
    config: &ProxyConfig,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
    creds: ClientCredentials,
-) -> Result<DatabaseInfo, AuthError> {
-    let psql_session_id = new_psql_session_id();
-    let md5_salt = rand::random();
-
-    client
-        .write_message(&Be::AuthenticationMD5Password(md5_salt))
-        .await?;
-
-    // Read client's password hash
-    let msg = client.read_password_message().await?;
-    let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
-
-    let cplane = CPlaneApi::new(config.auth_endpoint.clone());
-    let db_info = cplane
-        .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id)
-        .await?;
-
-    client
-        .write_message_noflush(&Be::AuthenticationOk)?
-        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
-
-    Ok(db_info)
-}
-
-async fn handle_new_user(
-    config: &ProxyConfig,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> Result<DatabaseInfo, AuthError> {
-    let psql_session_id = new_psql_session_id();
-    let greeting = hello_message(&config.redirect_uri, &psql_session_id);
-
-    let db_info = cplane_api::with_waiter(psql_session_id, |waiter| async {
-        // Give user a URL to spawn a new database
-        client
-            .write_message_noflush(&Be::AuthenticationOk)?
-            .write_message_noflush(&BeParameterStatusMessage::encoding())?
-            .write_message(&Be::NoticeResponse(&greeting))
-            .await?;
-
-        // Wait for web console response (see `mgmt`)
-        waiter.await?.map_err(AuthErrorImpl::auth_failed)
-    })
-    .await?;
-
-    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
-
-    Ok(db_info)
-}
-
-fn new_psql_session_id() -> String {
-    hex::encode(rand::random::<[u8; 8]>())
-}
-
-fn parse_password(bytes: &[u8]) -> Option<&str> {
-    std::str::from_utf8(bytes).ok()?.strip_suffix('\0')
-}
-
-fn hello_message(redirect_uri: &str, session_id: &str) -> String {
-    format!(
-        concat![
-            "☀️  Welcome to Zenith!\n",
-            "To proceed with database creation, open the following link:\n\n",
-            "    {redirect_uri}{session_id}\n\n",
-            "It needs to be done once and we will send you '.pgpass' file,\n",
-            "which will allow you to access or create ",
-            "databases without opening your web browser."
-        ],
-        redirect_uri = redirect_uri,
-        session_id = session_id,
-    )
+) -> Result<compute::NodeInfo, AuthError> {
+    match config.auth_backend {
+        AuthBackendType::LegacyConsole => {
+            legacy_console::handle_user(
+                &config.auth_endpoint,
+                &config.auth_link_uri,
+                client,
+                &creds,
+            )
+            .await
+        }
+        AuthBackendType::Console => {
+            console::handle_user(config.auth_endpoint.as_ref(), client, &creds).await
+        }
+        AuthBackendType::Postgres => {
+            postgres::handle_user(&config.auth_endpoint, client, &creds).await
+        }
+        AuthBackendType::Link => link::handle_user(config.auth_link_uri.as_ref(), client).await,
+    }
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,7 +1,7 @@
 //! User credentials used in authentication.

 use super::AuthError;
-use crate::compute::DatabaseInfo;
+use crate::compute;
 use crate::config::ProxyConfig;
 use crate::error::UserFacingError;
 use crate::stream::PqStream;
@@ -18,10 +18,22 @@ pub enum ClientCredsParseError {
 impl UserFacingError for ClientCredsParseError {}

 /// Various client credentials which we use for authentication.
-#[derive(Debug, PartialEq, Eq)]
+/// Note that we don't store any kind of client key or password here.
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ClientCredentials {
    pub user: String,
    pub dbname: String,
+
+    // New console API requires SNI info to determine the cluster name.
+    // Other Auth backends don't need it.
+    pub sni_data: Option<String>,
+}
+
+impl ClientCredentials {
+    pub fn is_existing_user(&self) -> bool {
+        // This logic will likely change in the future.
+        self.user.ends_with("@zenith")
+    }
 }

 impl TryFrom<HashMap<String, String>> for ClientCredentials {
@@ -37,7 +49,11 @@ impl TryFrom<HashMap<String, String>> for ClientCredentials {
        let user = get_param("user")?;
        let db = get_param("database")?;

-        Ok(Self { user, dbname: db })
+        Ok(Self {
+            user,
+            dbname: db,
+            sni_data: None,
+        })
    }
 }

@@ -46,21 +62,9 @@ impl ClientCredentials {
    pub async fn authenticate(
        self,
        config: &ProxyConfig,
-        client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-    ) -> Result<DatabaseInfo, AuthError> {
-        use crate::config::ClientAuthMethod::*;
-        use crate::config::RouterConfig::*;
-        match &config.router_config {
-            Static { host, port } => super::handle_static(host.clone(), *port, client, self).await,
-            Dynamic(Mixed) => {
-                if self.user.ends_with("@zenith") {
-                    super::handle_existing_user(config, client, self).await
-                } else {
-                    super::handle_new_user(config, client).await
-                }
-            }
-            Dynamic(Password) => super::handle_existing_user(config, client, self).await,
-            Dynamic(Link) => super::handle_new_user(config, client).await,
-        }
+        client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
+    ) -> Result<compute::NodeInfo, AuthError> {
+        // This method is just a convenient facade for `handle_user`
+        super::handle_user(config, client, self).await
    }
 }
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -27,19 +27,6 @@ impl AuthMethod for Scram<'_> {
    }
 }

-/// Use password-based auth in [`AuthFlow`].
-pub struct Md5(
-    /// Salt for client.
-    pub [u8; 4],
-);
-
-impl AuthMethod for Md5 {
-    #[inline(always)]
-    fn first_message(&self) -> BeMessage<'_> {
-        Be::AuthenticationMD5Password(self.0)
-    }
-}
-
 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
 pub struct AuthFlow<'a, Stream, State> {
@@ -70,19 +57,10 @@ impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
    }
 }

-/// Stream wrapper for handling simple MD5 password auth.
-impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Md5> {
-    /// Perform user authentication. Raise an error in case authentication failed.
-    #[allow(unused)]
-    pub async fn authenticate(self) -> Result<(), AuthError> {
-        unimplemented!("MD5 auth flow is yet to be implemented");
-    }
-}
-
 /// Stream wrapper for handling [SCRAM](crate::scram) auth.
 impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
    /// Perform user authentication. Raise an error in case authentication failed.
-    pub async fn authenticate(self) -> Result<(), AuthError> {
+    pub async fn authenticate(self) -> Result<scram::ScramKey, AuthError> {
        // Initial client message contains the chosen auth method's name.
        let msg = self.stream.read_password_message().await?;
        let sasl = sasl::FirstMessage::parse(&msg).ok_or(AuthErrorImpl::MalformedPassword)?;
@@ -93,10 +71,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
        }

        let secret = self.state.0;
-        sasl::SaslStream::new(self.stream, sasl.message)
+        let key = sasl::SaslStream::new(self.stream, sasl.message)
            .authenticate(scram::Exchange::new(secret, rand::random, None))
            .await?;

-        Ok(())
+        Ok(key)
    }
 }
--- a/proxy/src/auth_backend.rs
+++ b/proxy/src/auth_backend.rs
@@ -0,0 +1,31 @@
+pub mod console;
+pub mod legacy_console;
+pub mod link;
+pub mod postgres;
+
+pub use legacy_console::{AuthError, AuthErrorImpl};
+
+use crate::mgmt;
+use crate::waiters::{self, Waiter, Waiters};
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
+}
+
+/// Give caller an opportunity to wait for the cloud's reply.
+pub async fn with_waiter<R, T, E>(
+    psql_session_id: impl Into<String>,
+    action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R,
+) -> Result<T, E>
+where
+    R: std::future::Future<Output = Result<T, E>>,
+    E: From<waiters::RegisterError>,
+{
+    let waiter = CPLANE_WAITERS.register(psql_session_id.into())?;
+    action(waiter).await
+}
+
+pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), waiters::NotifyError> {
+    CPLANE_WAITERS.notify(psql_session_id, msg)
+}
--- a/proxy/src/auth_backend/console.rs
+++ b/proxy/src/auth_backend/console.rs
@@ -0,0 +1,243 @@
+//! Declaration of Cloud API V2.
+
+use crate::{
+    auth::{self, AuthFlow},
+    compute, scram,
+};
+use serde::{Deserialize, Serialize};
+use thiserror::Error;
+
+use crate::auth::ClientCredentials;
+use crate::stream::PqStream;
+
+use tokio::io::{AsyncRead, AsyncWrite};
+use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+
+#[derive(Debug, Error)]
+pub enum ConsoleAuthError {
+    // We shouldn't include the actual secret here.
+    #[error("Bad authentication secret")]
+    BadSecret,
+
+    #[error("Bad client credentials: {0:?}")]
+    BadCredentials(crate::auth::ClientCredentials),
+
+    #[error("SNI info is missing, please upgrade the postgres client library")]
+    SniMissing,
+
+    #[error("Unexpected SNI content")]
+    SniWrong,
+
+    #[error(transparent)]
+    BadUrl(#[from] url::ParseError),
+
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+
+    /// HTTP status (other than 200) returned by the console.
+    #[error("Console responded with an HTTP status: {0}")]
+    HttpStatus(reqwest::StatusCode),
+
+    #[error(transparent)]
+    Transport(#[from] reqwest::Error),
+
+    #[error("Console responded with a malformed JSON: '{0}'")]
+    MalformedResponse(#[from] serde_json::Error),
+
+    #[error("Console responded with a malformed compute address: '{0}'")]
+    MalformedComputeAddress(String),
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct GetRoleSecretResponse {
+    role_secret: String,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct GetWakeComputeResponse {
+    address: String,
+}
+
+/// Auth secret which is managed by the cloud.
+pub enum AuthInfo {
+    /// Md5 hash of user's password.
+    Md5([u8; 16]),
+    /// [SCRAM](crate::scram) authentication info.
+    Scram(scram::ServerSecret),
+}
+
+/// Compute node connection params provided by the cloud.
+/// Note how it implements serde traits, since we receive it over the wire.
+#[derive(Serialize, Deserialize, Default)]
+pub struct DatabaseInfo {
+    pub host: String,
+    pub port: u16,
+    pub dbname: String,
+    pub user: String,
+
+    /// [Cloud API V1](super::legacy_console) returns cleartext password,
+    /// but [Cloud API V2](super::console) implements [SCRAM](crate::scram)
+    /// authentication, so we can leverage this method and cope without password.
+    pub password: Option<String>,
+}
+
+// Manually implement debug to omit personal and sensitive info.
+impl std::fmt::Debug for DatabaseInfo {
+    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
+        fmt.debug_struct("DatabaseInfo")
+            .field("host", &self.host)
+            .field("port", &self.port)
+            .finish()
+    }
+}
+
+impl From<DatabaseInfo> for tokio_postgres::Config {
+    fn from(db_info: DatabaseInfo) -> Self {
+        let mut config = tokio_postgres::Config::new();
+
+        config
+            .host(&db_info.host)
+            .port(db_info.port)
+            .dbname(&db_info.dbname)
+            .user(&db_info.user);
+
+        if let Some(password) = db_info.password {
+            config.password(password);
+        }
+
+        config
+    }
+}
+
+async fn get_auth_info(
+    auth_endpoint: &str,
+    user: &str,
+    cluster: &str,
+) -> Result<AuthInfo, ConsoleAuthError> {
+    let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_get_role_secret"))?;
+
+    url.query_pairs_mut()
+        .append_pair("project", cluster)
+        .append_pair("role", user);
+
+    // TODO: use a proper logger
+    println!("cplane request: {}", url);
+
+    let resp = reqwest::get(url).await?;
+    if !resp.status().is_success() {
+        return Err(ConsoleAuthError::HttpStatus(resp.status()));
+    }
+
+    let response: GetRoleSecretResponse = serde_json::from_str(resp.text().await?.as_str())?;
+
+    scram::ServerSecret::parse(response.role_secret.as_str())
+        .map(AuthInfo::Scram)
+        .ok_or(ConsoleAuthError::BadSecret)
+}
+
+/// Wake up the compute node and return the corresponding connection info.
+async fn wake_compute(
+    auth_endpoint: &str,
+    cluster: &str,
+) -> Result<(String, u16), ConsoleAuthError> {
+    let mut url = reqwest::Url::parse(&format!("{auth_endpoint}/proxy_wake_compute"))?;
+    url.query_pairs_mut().append_pair("project", cluster);
+
+    // TODO: use a proper logger
+    println!("cplane request: {}", url);
+
+    let resp = reqwest::get(url).await?;
+    if !resp.status().is_success() {
+        return Err(ConsoleAuthError::HttpStatus(resp.status()));
+    }
+
+    let response: GetWakeComputeResponse = serde_json::from_str(resp.text().await?.as_str())?;
+    let (host, port) = response
+        .address
+        .rsplit_once(':') // split at the LAST colon so that `[::1]:5432` keeps its host intact
+        .ok_or_else(|| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?;
+    let port: u16 = port
+        .parse()
+        .map_err(|_| ConsoleAuthError::MalformedComputeAddress(response.address.clone()))?;
+
+    Ok((host.to_string(), port))
+}
+
+pub async fn handle_user(
+    auth_endpoint: &str,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    creds: &ClientCredentials,
+) -> Result<compute::NodeInfo, crate::auth::AuthError> {
+    // Determine cluster name from SNI.
+    let cluster = creds
+        .sni_data
+        .as_ref()
+        .ok_or(ConsoleAuthError::SniMissing)?
+        .split_once('.')
+        .ok_or(ConsoleAuthError::SniWrong)?
+        .0;
+
+    let user = creds.user.as_str();
+
+    // Step 1: get the auth secret
+    let auth_info = get_auth_info(auth_endpoint, user, cluster).await?;
+
+    let flow = AuthFlow::new(client);
+    let scram_keys = match auth_info {
+        AuthInfo::Md5(_) => {
+            // TODO: decide if we should support MD5 in api v2
+            return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into());
+        }
+        AuthInfo::Scram(secret) => {
+            let scram = auth::Scram(&secret);
+            Some(compute::ScramKeys {
+                client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(),
+                server_key: secret.server_key.as_bytes(),
+            })
+        }
+    };
+
+    client
+        .write_message_noflush(&Be::AuthenticationOk)?
+        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+
+    // Step 2: wake compute
+    let (host, port) = wake_compute(auth_endpoint, cluster).await?;
+
+    Ok(compute::NodeInfo {
+        db_info: DatabaseInfo {
+            host,
+            port,
+            dbname: creds.dbname.clone(),
+            user: creds.user.clone(),
+            password: None,
+        },
+        scram_keys,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn parse_db_info() -> anyhow::Result<()> {
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+            "password": "password",
+        }))?;
+
+        let _: DatabaseInfo = serde_json::from_value(json!({
+            "host": "localhost",
+            "port": 5432,
+            "dbname": "postgres",
+            "user": "john_doe",
+        }))?;
+
+        Ok(())
+    }
+}
--- a/proxy/src/auth_backend/legacy_console.rs
+++ b/proxy/src/auth_backend/legacy_console.rs
@@ -0,0 +1,206 @@
+//! Cloud API V1.
+
+use super::console::DatabaseInfo;
+
+use crate::auth::ClientCredentials;
+use crate::stream::PqStream;
+
+use crate::{compute, waiters};
+use serde::{Deserialize, Serialize};
+
+use tokio::io::{AsyncRead, AsyncWrite};
+use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+
+use thiserror::Error;
+
+use crate::error::UserFacingError;
+
+#[derive(Debug, Error)]
+pub enum AuthErrorImpl {
+    /// Authentication error reported by the console.
+    #[error("Authentication failed: {0}")]
+    AuthFailed(String),
+
+    /// HTTP status (other than 200) returned by the console.
+    #[error("Console responded with an HTTP status: {0}")]
+    HttpStatus(reqwest::StatusCode),
+
+    #[error("Console responded with a malformed JSON: {0}")]
+    MalformedResponse(#[from] serde_json::Error),
+
+    #[error(transparent)]
+    Transport(#[from] reqwest::Error),
+
+    #[error(transparent)]
+    WaiterRegister(#[from] waiters::RegisterError),
+
+    #[error(transparent)]
+    WaiterWait(#[from] waiters::WaitError),
+}
+
+#[derive(Debug, Error)]
+#[error(transparent)]
+pub struct AuthError(Box<AuthErrorImpl>);
+
+impl AuthError {
+    /// Smart constructor for authentication error reported by `mgmt`.
+    pub fn auth_failed(msg: impl Into<String>) -> Self {
+        AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into())))
+    }
+}
+
+impl<T> From<T> for AuthError
+where
+    AuthErrorImpl: From<T>,
+{
+    fn from(e: T) -> Self {
+        AuthError(Box::new(e.into()))
+    }
+}
+
+impl UserFacingError for AuthError {
+    fn to_string_client(&self) -> String {
+        use AuthErrorImpl::*;
+        match self.0.as_ref() {
+            AuthFailed(_) | HttpStatus(_) => self.to_string(),
+            _ => "Internal error".to_string(),
+        }
+    }
+}
+
+// NOTE: the order of constructors is important.
+// https://serde.rs/enum-representations.html#untagged
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(untagged)]
+enum ProxyAuthResponse {
+    Ready { conn_info: DatabaseInfo },
+    Error { error: String },
+    NotReady { ready: bool }, // TODO: get rid of `ready`
+}
+
+async fn authenticate_proxy_client(
+    auth_endpoint: &reqwest::Url,
+    creds: &ClientCredentials,
+    md5_response: &str,
+    salt: &[u8; 4],
+    psql_session_id: &str,
+) -> Result<DatabaseInfo, AuthError> {
+    let mut url = auth_endpoint.clone();
+    url.query_pairs_mut()
+        .append_pair("login", &creds.user)
+        .append_pair("database", &creds.dbname)
+        .append_pair("md5response", md5_response)
+        .append_pair("salt", &hex::encode(salt))
+        .append_pair("psql_session_id", psql_session_id);
+
+    super::with_waiter(psql_session_id, |waiter| async {
+        println!("cloud request: {}", url);
+        // TODO: leverage `reqwest::Client` to reuse connections
+        let resp = reqwest::get(url).await?;
+        if !resp.status().is_success() {
+            return Err(AuthErrorImpl::HttpStatus(resp.status()).into());
+        }
+
+        let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?;
+        println!("got auth info: #{:?}", auth_info);
+
+        use ProxyAuthResponse::*;
+        let db_info = match auth_info {
+            Ready { conn_info } => conn_info,
+            Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()),
+            NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?,
+        };
+
+        Ok(db_info)
+    })
+    .await
+}
+
+async fn handle_existing_user(
+    auth_endpoint: &reqwest::Url,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
+    creds: &ClientCredentials,
+) -> Result<crate::compute::NodeInfo, crate::auth::AuthError> {
+    let psql_session_id = super::link::new_psql_session_id();
+    let md5_salt = rand::random();
+
+    client
+        .write_message(&Be::AuthenticationMD5Password(md5_salt))
+        .await?;
+
+    // Read client's password hash
+    let msg = client.read_password_message().await?;
+    let md5_response = parse_password(&msg).ok_or(crate::auth::AuthErrorImpl::MalformedPassword)?;
+
+    let db_info = authenticate_proxy_client(
+        auth_endpoint,
+        creds,
+        md5_response,
+        &md5_salt,
+        &psql_session_id,
+    )
+    .await?;
+
+    client
+        .write_message_noflush(&Be::AuthenticationOk)?
+        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+
+    Ok(compute::NodeInfo {
+        db_info,
+        scram_keys: None,
+    })
+}
+
+pub async fn handle_user(
+    auth_endpoint: &reqwest::Url,
+    auth_link_uri: &reqwest::Url,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
+    creds: &ClientCredentials,
+) -> Result<crate::compute::NodeInfo, crate::auth::AuthError> {
+    if creds.is_existing_user() {
+        handle_existing_user(auth_endpoint, client, creds).await
+    } else {
+        super::link::handle_user(auth_link_uri.as_ref(), client).await
+    }
+}
+
+fn parse_password(bytes: &[u8]) -> Option<&str> {
+    std::str::from_utf8(bytes).ok()?.strip_suffix('\0')
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use serde_json::json;
+
+    #[test]
+    fn test_proxy_auth_response() {
+        // Ready
+        let auth: ProxyAuthResponse = serde_json::from_value(json!({
+            "ready": true,
+            "conn_info": DatabaseInfo::default(),
+        }))
+        .unwrap();
+        assert!(matches!(
+            auth,
+            ProxyAuthResponse::Ready {
+                conn_info: DatabaseInfo { .. }
+            }
+        ));
+
+        // Error
+        let auth: ProxyAuthResponse = serde_json::from_value(json!({
+            "ready": false,
+            "error": "too bad, so sad",
+        }))
+        .unwrap();
+        assert!(matches!(auth, ProxyAuthResponse::Error { .. }));
+
+        // NotReady
+        let auth: ProxyAuthResponse = serde_json::from_value(json!({
+            "ready": false,
+        }))
+        .unwrap();
+        assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
+    }
+}
--- a/proxy/src/auth_backend/link.rs
+++ b/proxy/src/auth_backend/link.rs
@@ -0,0 +1,52 @@
+use crate::{compute, stream::PqStream};
+use tokio::io::{AsyncRead, AsyncWrite};
+use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+
+fn hello_message(redirect_uri: &str, session_id: &str) -> String {
+    format!(
+        concat![
+            "☀️  Welcome to Neon!\n",
+            "To proceed with database creation, open the following link:\n\n",
+            "    {redirect_uri}{session_id}\n\n",
+            "It needs to be done once and we will send you '.pgpass' file,\n",
+            "which will allow you to access or create ",
+            "databases without opening your web browser."
+        ],
+        redirect_uri = redirect_uri,
+        session_id = session_id,
+    )
+}
+
+pub fn new_psql_session_id() -> String {
+    hex::encode(rand::random::<[u8; 8]>())
+}
+
+pub async fn handle_user(
+    redirect_uri: &str,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> Result<compute::NodeInfo, crate::auth::AuthError> {
+    let psql_session_id = new_psql_session_id();
+    let greeting = hello_message(redirect_uri, &psql_session_id);
+
+    let db_info = crate::auth_backend::with_waiter(psql_session_id, |waiter| async {
+        // Give user a URL to spawn a new database
+        client
+            .write_message_noflush(&Be::AuthenticationOk)?
+            .write_message_noflush(&BeParameterStatusMessage::encoding())?
+            .write_message(&Be::NoticeResponse(&greeting))
+            .await?;
+
+        // Wait for web console response (see `mgmt`)
+        waiter
+            .await?
+            .map_err(crate::auth::AuthErrorImpl::auth_failed)
+    })
+    .await?;
+
+    client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
+
+    Ok(compute::NodeInfo {
+        db_info,
+        scram_keys: None,
+    })
+}
--- a/proxy/src/auth_backend/postgres.rs
+++ b/proxy/src/auth_backend/postgres.rs
@@ -0,0 +1,93 @@
+//! Local mock of Cloud API V2.
+
+use super::console::{self, AuthInfo, DatabaseInfo};
+use crate::scram;
+use crate::{auth::ClientCredentials, compute};
+
+use crate::stream::PqStream;
+use tokio::io::{AsyncRead, AsyncWrite};
+use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+
+async fn get_auth_info(
+    auth_endpoint: &str,
+    creds: &ClientCredentials,
+) -> Result<AuthInfo, console::ConsoleAuthError> {
+    // We wrap `tokio_postgres::Error` because we don't want to infect the
+    // method's error type with a detail that's specific to debug mode only.
+    let io_error = |e| std::io::Error::new(std::io::ErrorKind::Other, e);
+
+    // Perhaps we could persist this connection, but then we'd have to
+    // write more code for reopening it if it got closed, which doesn't
+    // seem worth it.
+    let (client, connection) = tokio_postgres::connect(auth_endpoint, tokio_postgres::NoTls)
+        .await
+        .map_err(io_error)?;
+
+    tokio::spawn(connection);
+    let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1";
+    let rows = client
+        .query(query, &[&creds.user])
+        .await
+        .map_err(io_error)?;
+
+    match &rows[..] {
+        // We can't get a secret if there's no such user.
+        [] => Err(console::ConsoleAuthError::BadCredentials(creds.to_owned())),
+        // We shouldn't get more than one row anyway.
+        [row, ..] => {
+            let entry = row.try_get(0).map_err(io_error)?;
+            scram::ServerSecret::parse(entry)
+                .map(AuthInfo::Scram)
+                .or_else(|| {
+                    // It could be an md5 hash if it's not a SCRAM secret.
+                    let text = entry.strip_prefix("md5")?;
+                    Some(AuthInfo::Md5({
+                        let mut bytes = [0u8; 16];
+                        hex::decode_to_slice(text, &mut bytes).ok()?;
+                        bytes
+                    }))
+                })
+                // Putting the secret into this message is a security hazard!
+                .ok_or(console::ConsoleAuthError::BadSecret)
+        }
+    }
+}
+
+pub async fn handle_user(
+    auth_endpoint: &reqwest::Url,
+    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    creds: &ClientCredentials,
+) -> Result<compute::NodeInfo, crate::auth::AuthError> {
+    let auth_info = get_auth_info(auth_endpoint.as_ref(), creds).await?;
+
+    let flow = crate::auth::AuthFlow::new(client);
+    let scram_keys = match auth_info {
+        AuthInfo::Md5(_) => {
+            // TODO: decide if we should support MD5 in api v2
+            return Err(crate::auth::AuthErrorImpl::auth_failed("MD5 is not supported").into());
+        }
+        AuthInfo::Scram(secret) => {
+            let scram = crate::auth::Scram(&secret);
+            Some(compute::ScramKeys {
+                client_key: flow.begin(scram).await?.authenticate().await?.as_bytes(),
+                server_key: secret.server_key.as_bytes(),
+            })
+        }
+    };
+
+    client
+        .write_message_noflush(&Be::AuthenticationOk)?
+        .write_message_noflush(&BeParameterStatusMessage::encoding())?;
+
+    Ok(compute::NodeInfo {
+        db_info: DatabaseInfo {
+            // TODO: handle that near CLI params parsing
+            host: auth_endpoint.host_str().unwrap_or("localhost").to_owned(),
+            port: auth_endpoint.port().unwrap_or(5432),
+            dbname: creds.dbname.to_owned(),
+            user: creds.user.to_owned(),
+            password: None,
+        },
+        scram_keys,
+    })
+}
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,6 +1,6 @@
+use crate::auth_backend::console::DatabaseInfo;
 use crate::cancellation::CancelClosure;
 use crate::error::UserFacingError;
-use serde::{Deserialize, Serialize};
 use std::io;
 use std::net::SocketAddr;
 use thiserror::Error;
@@ -23,32 +23,21 @@ pub enum ConnectionError {

 impl UserFacingError for ConnectionError {}

-/// Compute node connection params.
-#[derive(Serialize, Deserialize, Default)]
-pub struct DatabaseInfo {
-    pub host: String,
-    pub port: u16,
-    pub dbname: String,
-    pub user: String,
-    pub password: Option<String>,
-}
-
-// Manually implement debug to omit personal and sensitive info
-impl std::fmt::Debug for DatabaseInfo {
-    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
-        fmt.debug_struct("DatabaseInfo")
-            .field("host", &self.host)
-            .field("port", &self.port)
-            .finish()
-    }
-}
-
 /// PostgreSQL version as [`String`].
 pub type Version = String;

-impl DatabaseInfo {
+/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
+pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
+
+/// Compute node connection params.
+pub struct NodeInfo {
+    pub db_info: DatabaseInfo,
+    pub scram_keys: Option<ScramKeys>,
+}
+
+impl NodeInfo {
    async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
-        let host_port = format!("{}:{}", self.host, self.port);
+        let host_port = format!("{}:{}", self.db_info.host, self.db_info.port);
        let socket = TcpStream::connect(host_port).await?;
        let socket_addr = socket.peer_addr()?;
        socket2::SockRef::from(&socket).set_keepalive(true)?;
@@ -63,11 +52,13 @@ impl DatabaseInfo {
            .await
            .map_err(|_| ConnectionError::FailedToConnectToCompute)?;

-        // TODO: establish a secure connection to the DB
-        let (client, conn) = tokio_postgres::Config::from(self)
-            .connect_raw(&mut socket, NoTls)
-            .await?;
+        let mut config = tokio_postgres::Config::from(self.db_info);
+        if let Some(scram_keys) = self.scram_keys {
+            config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(scram_keys));
+        }

+        // TODO: establish a secure connection to the DB
+        let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
        let version = conn
            .parameter("server_version")
            .ok_or(ConnectionError::FailedToFetchPgVersion)?
@@ -78,21 +69,3 @@ impl DatabaseInfo {
        Ok((socket, version, cancel_closure))
    }
 }
-
-impl From<DatabaseInfo> for tokio_postgres::Config {
-    fn from(db_info: DatabaseInfo) -> Self {
-        let mut config = tokio_postgres::Config::new();
-
-        config
-            .host(&db_info.host)
-            .port(db_info.port)
-            .dbname(&db_info.dbname)
-            .user(&db_info.user);
-
-        if let Some(password) = db_info.password {
-            config.password(password);
-        }
-
-        config
-    }
-}
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,65 +1,47 @@
-use anyhow::{bail, ensure, Context};
-use std::net::SocketAddr;
-use std::str::FromStr;
-use std::sync::Arc;
-
-pub type TlsConfig = Arc<rustls::ServerConfig>;
+use anyhow::{ensure, Context};
+use std::{str::FromStr, sync::Arc};

 #[non_exhaustive]
-pub enum ClientAuthMethod {
-    Password,
+pub enum AuthBackendType {
+    LegacyConsole,
+    Console,
+    Postgres,
    Link,
-
-    /// Use password auth only if username ends with "@zenith"
-    Mixed,
 }

-pub enum RouterConfig {
-    Static { host: String, port: u16 },
-    Dynamic(ClientAuthMethod),
-}
-
-impl FromStr for ClientAuthMethod {
+impl FromStr for AuthBackendType {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> anyhow::Result<Self> {
-        use ClientAuthMethod::*;
+        println!("AuthBackendType::from_str: '{}'", s);
+        use AuthBackendType::*;
        match s {
-            "password" => Ok(Password),
+            "legacy" => Ok(LegacyConsole),
+            "console" => Ok(Console),
+            "postgres" => Ok(Postgres),
            "link" => Ok(Link),
-            "mixed" => Ok(Mixed),
-            _ => bail!("Invalid option for router: `{}`", s),
+            _ => Err(anyhow::anyhow!("Invalid option for auth method: `{}`", s)),
        }
    }
 }

 pub struct ProxyConfig {
-    /// main entrypoint for users to connect to
-    pub proxy_address: SocketAddr,
+    /// TLS configuration for the proxy.
+    pub tls_config: Option<TlsConfig>,

-    /// method of assigning compute nodes
-    pub router_config: RouterConfig,
+    pub auth_backend: AuthBackendType,

-    /// internally used for status and prometheus metrics
-    pub http_address: SocketAddr,
-
-    /// management endpoint. Upon user account creation control plane
-    /// will notify us here, so that we can 'unfreeze' user session.
-    /// TODO It uses postgres protocol over TCP but should be migrated to http.
-    pub mgmt_address: SocketAddr,
-
-    /// send unauthenticated users to this URI
-    pub redirect_uri: String,
-
-    /// control plane address where we would check auth.
    pub auth_endpoint: reqwest::Url,

-    pub tls_config: Option<TlsConfig>,
+    pub auth_link_uri: reqwest::Url,
 }

-pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
+pub type TlsConfig = Arc<rustls::ServerConfig>;
+
+/// Configure TLS for the main endpoint.
+pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfig> {
    let key = {
-        let key_bytes = std::fs::read(key_path).context("SSL key file")?;
+        let key_bytes = std::fs::read(key_path).context("TLS key file")?;
        let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..])
            .context("couldn't read TLS keys")?;

@@ -68,7 +50,7 @@ pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result<TlsConfi
    };

    let cert_chain = {
-        let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
+        let cert_chain_bytes = std::fs::read(cert_path).context("TLS cert file")?;
        rustls_pemfile::certs(&mut &cert_chain_bytes[..])
            .context("couldn't read TLS certificate chain")?
            .into_iter()
--- a/proxy/src/cplane_api.rs
+++ b/proxy/src/cplane_api.rs
@@ -1,183 +0,0 @@
-use crate::auth::ClientCredentials;
-use crate::compute::DatabaseInfo;
-use crate::error::UserFacingError;
-use crate::mgmt;
-use crate::waiters::{self, Waiter, Waiters};
-use lazy_static::lazy_static;
-use serde::{Deserialize, Serialize};
-use thiserror::Error;
-
-lazy_static! {
-    static ref CPLANE_WAITERS: Waiters<mgmt::ComputeReady> = Default::default();
-}
-
-/// Give caller an opportunity to wait for cplane's reply.
-pub async fn with_waiter<R, T, E>(
-    psql_session_id: impl Into<String>,
-    action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R,
-) -> Result<T, E>
-where
-    R: std::future::Future<Output = Result<T, E>>,
-    E: From<waiters::RegisterError>,
-{
-    let waiter = CPLANE_WAITERS.register(psql_session_id.into())?;
-    action(waiter).await
-}
-
-pub fn notify(
-    psql_session_id: &str,
-    msg: Result<DatabaseInfo, String>,
-) -> Result<(), waiters::NotifyError> {
-    CPLANE_WAITERS.notify(psql_session_id, msg)
-}
-
-/// Zenith console API wrapper.
-pub struct CPlaneApi {
-    auth_endpoint: reqwest::Url,
-}
-
-impl CPlaneApi {
-    pub fn new(auth_endpoint: reqwest::Url) -> Self {
-        Self { auth_endpoint }
-    }
-}
-
-#[derive(Debug, Error)]
-pub enum AuthErrorImpl {
-    /// Authentication error reported by the console.
-    #[error("Authentication failed: {0}")]
-    AuthFailed(String),
-
-    /// HTTP status (other than 200) returned by the console.
-    #[error("Console responded with an HTTP status: {0}")]
-    HttpStatus(reqwest::StatusCode),
-
-    #[error("Console responded with a malformed JSON: {0}")]
-    MalformedResponse(#[from] serde_json::Error),
-
-    #[error(transparent)]
-    Transport(#[from] reqwest::Error),
-
-    #[error(transparent)]
-    WaiterRegister(#[from] waiters::RegisterError),
-
-    #[error(transparent)]
-    WaiterWait(#[from] waiters::WaitError),
-}
-
-#[derive(Debug, Error)]
-#[error(transparent)]
-pub struct AuthError(Box<AuthErrorImpl>);
-
-impl AuthError {
-    /// Smart constructor for authentication error reported by `mgmt`.
-    pub fn auth_failed(msg: impl Into<String>) -> Self {
-        AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into())))
-    }
-}
-
-impl<T> From<T> for AuthError
-where
-    AuthErrorImpl: From<T>,
-{
-    fn from(e: T) -> Self {
-        AuthError(Box::new(e.into()))
-    }
-}
-
-impl UserFacingError for AuthError {
-    fn to_string_client(&self) -> String {
-        use AuthErrorImpl::*;
-        match self.0.as_ref() {
-            AuthFailed(_) | HttpStatus(_) => self.to_string(),
-            _ => "Internal error".to_string(),
-        }
-    }
-}
-
-impl CPlaneApi {
-    pub async fn authenticate_proxy_client(
-        &self,
-        creds: ClientCredentials,
-        md5_response: &str,
-        salt: &[u8; 4],
-        psql_session_id: &str,
-    ) -> Result<DatabaseInfo, AuthError> {
-        let mut url = self.auth_endpoint.clone();
-        url.query_pairs_mut()
-            .append_pair("login", &creds.user)
-            .append_pair("database", &creds.dbname)
-            .append_pair("md5response", md5_response)
-            .append_pair("salt", &hex::encode(salt))
-            .append_pair("psql_session_id", psql_session_id);
-
-        with_waiter(psql_session_id, |waiter| async {
-            println!("cplane request: {}", url);
-            // TODO: leverage `reqwest::Client` to reuse connections
-            let resp = reqwest::get(url).await?;
-            if !resp.status().is_success() {
-                return Err(AuthErrorImpl::HttpStatus(resp.status()).into());
-            }
-
-            let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?;
-            println!("got auth info: #{:?}", auth_info);
-
-            use ProxyAuthResponse::*;
-            let db_info = match auth_info {
-                Ready { conn_info } => conn_info,
-                Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()),
-                NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?,
-            };
-
-            Ok(db_info)
-        })
-        .await
-    }
-}
-
-// NOTE: the order of constructors is important.
-// https://serde.rs/enum-representations.html#untagged
-#[derive(Serialize, Deserialize, Debug)]
-#[serde(untagged)]
-enum ProxyAuthResponse {
-    Ready { conn_info: DatabaseInfo },
-    Error { error: String },
-    NotReady { ready: bool }, // TODO: get rid of `ready`
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use serde_json::json;
-
-    #[test]
-    fn test_proxy_auth_response() {
-        // Ready
-        let auth: ProxyAuthResponse = serde_json::from_value(json!({
-            "ready": true,
-            "conn_info": DatabaseInfo::default(),
-        }))
-        .unwrap();
-        assert!(matches!(
-            auth,
-            ProxyAuthResponse::Ready {
-                conn_info: DatabaseInfo { .. }
-            }
-        ));
-
-        // Error
-        let auth: ProxyAuthResponse = serde_json::from_value(json!({
-            "ready": false,
-            "error": "too bad, so sad",
-        }))
-        .unwrap();
-        assert!(matches!(auth, ProxyAuthResponse::Error { .. }));
-
-        // NotReady
-        let auth: ProxyAuthResponse = serde_json::from_value(json!({
-            "ready": false,
-        }))
-        .unwrap();
-        assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
-    }
-}
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -5,34 +5,29 @@
 //! in somewhat transparent manner (again via communication with control plane API).

 mod auth;
+mod auth_backend;
 mod cancellation;
 mod compute;
 mod config;
-mod cplane_api;
 mod error;
 mod http;
 mod mgmt;
+mod parse;
 mod proxy;
+mod sasl;
+mod scram;
 mod stream;
 mod waiters;

-// Currently SCRAM is only used in tests
-#[cfg(test)]
-mod parse;
-#[cfg(test)]
-mod sasl;
-#[cfg(test)]
-mod scram;
-
 use anyhow::{bail, Context};
 use clap::{App, Arg};
 use config::ProxyConfig;
 use futures::FutureExt;
-use std::future::Future;
+use std::{future::Future, net::SocketAddr};
 use tokio::{net::TcpListener, task::JoinError};
-use utils::GIT_VERSION;
+use utils::project_git_version;

-use crate::config::{ClientAuthMethod, RouterConfig};
+project_git_version!(GIT_VERSION);

 /// Flattens `Result<Result<T>>` into `Result<T>`.
 async fn flatten_err(
@@ -44,7 +39,7 @@ async fn flatten_err(
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    metrics::set_common_metrics_prefix("zenith_proxy");
-    let arg_matches = App::new("Zenith proxy/router")
+    let arg_matches = App::new("Neon proxy/router")
        .version(GIT_VERSION)
        .arg(
            Arg::new("proxy")
@@ -55,18 +50,11 @@ async fn main() -> anyhow::Result<()> {
                .default_value("127.0.0.1:4432"),
        )
        .arg(
-            Arg::new("auth-method")
-                .long("auth-method")
+            Arg::new("auth-backend")
+                .long("auth-backend")
                .takes_value(true)
-                .help("Possible values: password | link | mixed")
-                .default_value("mixed"),
-        )
-        .arg(
-            Arg::new("static-router")
-                .short('s')
-                .long("static-router")
-                .takes_value(true)
-                .help("Route all clients to host:port"),
+                .help("Possible values: legacy | console | postgres | link")
+                .default_value("legacy"),
        )
        .arg(
            Arg::new("mgmt")
@@ -89,7 +77,7 @@ async fn main() -> anyhow::Result<()> {
                .short('u')
                .long("uri")
                .takes_value(true)
-                .help("redirect unauthenticated users to given uri")
+                .help("redirect unauthenticated users to the given uri in case of link auth")
                .default_value("http://localhost:3000/psql_session/"),
        )
        .arg(
@@ -97,77 +85,68 @@ async fn main() -> anyhow::Result<()> {
                .short('a')
                .long("auth-endpoint")
                .takes_value(true)
-                .help("API endpoint for authenticating users")
+                .help("cloud API endpoint for authenticating users")
                .default_value("http://localhost:3000/authenticate_proxy_request/"),
        )
        .arg(
-            Arg::new("ssl-key")
+            Arg::new("tls-key")
                .short('k')
-                .long("ssl-key")
+                .long("tls-key")
+                .alias("ssl-key") // backwards compatibility
                .takes_value(true)
-                .help("path to SSL key for client postgres connections"),
+                .help("path to TLS key for client postgres connections"),
        )
        .arg(
-            Arg::new("ssl-cert")
+            Arg::new("tls-cert")
                .short('c')
-                .long("ssl-cert")
+                .long("tls-cert")
+                .alias("ssl-cert") // backwards compatibility
                .takes_value(true)
-                .help("path to SSL cert for client postgres connections"),
+                .help("path to TLS cert for client postgres connections"),
        )
        .get_matches();

    let tls_config = match (
-        arg_matches.value_of("ssl-key"),
-        arg_matches.value_of("ssl-cert"),
+        arg_matches.value_of("tls-key"),
+        arg_matches.value_of("tls-cert"),
    ) {
-        (Some(key_path), Some(cert_path)) => Some(config::configure_ssl(key_path, cert_path)?),
+        (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?),
        (None, None) => None,
-        _ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
+        _ => bail!("either both or neither tls-key and tls-cert must be specified"),
    };

-    let auth_method = arg_matches.value_of("auth-method").unwrap().parse()?;
-    let router_config = match arg_matches.value_of("static-router") {
-        None => RouterConfig::Dynamic(auth_method),
-        Some(addr) => {
-            if let ClientAuthMethod::Password = auth_method {
-                let (host, port) = addr.split_once(':').unwrap();
-                RouterConfig::Static {
-                    host: host.to_string(),
-                    port: port.parse().unwrap(),
-                }
-            } else {
-                bail!("static-router requires --auth-method password")
-            }
-        }
-    };
+    let proxy_address: SocketAddr = arg_matches.value_of("proxy").unwrap().parse()?;
+    let mgmt_address: SocketAddr = arg_matches.value_of("mgmt").unwrap().parse()?;
+    let http_address: SocketAddr = arg_matches.value_of("http").unwrap().parse()?;

    let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig {
-        router_config,
-        proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
-        mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
-        http_address: arg_matches.value_of("http").unwrap().parse()?,
-        redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
-        auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
        tls_config,
+        auth_backend: arg_matches.value_of("auth-backend").unwrap().parse()?,
+        auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
+        auth_link_uri: arg_matches.value_of("uri").unwrap().parse()?,
    }));

-    println!("Version: {}", GIT_VERSION);
+    println!("Version: {GIT_VERSION}");

    // Check that we can bind to address before further initialization
-    println!("Starting http on {}", config.http_address);
-    let http_listener = TcpListener::bind(config.http_address).await?.into_std()?;
+    println!("Starting http on {}", http_address);
+    let http_listener = TcpListener::bind(http_address).await?.into_std()?;

-    println!("Starting mgmt on {}", config.mgmt_address);
-    let mgmt_listener = TcpListener::bind(config.mgmt_address).await?.into_std()?;
+    println!("Starting mgmt on {}", mgmt_address);
+    let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;

-    println!("Starting proxy on {}", config.proxy_address);
-    let proxy_listener = TcpListener::bind(config.proxy_address).await?;
+    println!("Starting proxy on {}", proxy_address);
+    let proxy_listener = TcpListener::bind(proxy_address).await?;

-    let http = tokio::spawn(http::thread_main(http_listener));
-    let proxy = tokio::spawn(proxy::thread_main(config, proxy_listener));
-    let mgmt = tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener));
+    let tasks = [
+        tokio::spawn(http::thread_main(http_listener)),
+        tokio::spawn(proxy::thread_main(config, proxy_listener)),
+        tokio::task::spawn_blocking(move || mgmt::thread_main(mgmt_listener)),
+    ]
+    .map(flatten_err);

-    let tasks = [flatten_err(http), flatten_err(proxy), flatten_err(mgmt)];
+    // This will block until all tasks have completed.
+    // Furthermore, the first one to fail will cancel the rest.
    let _: Vec<()> = futures::future::try_join_all(tasks).await?;

    Ok(())
--- a/proxy/src/mgmt.rs
+++ b/proxy/src/mgmt.rs
@@ -1,4 +1,4 @@
-use crate::{compute::DatabaseInfo, cplane_api};
+use crate::auth_backend;
 use anyhow::Context;
 use serde::Deserialize;
 use std::{
@@ -10,6 +10,8 @@ use utils::{
    pq_proto::{BeMessage, SINGLE_COL_ROWDESC},
 };

+// TODO: move all of this to auth_backend/link.rs once we ditch the legacy console backend
+
 ///
 /// Main proxy listener loop.
 ///
@@ -75,12 +77,12 @@ struct PsqlSessionResponse {

 #[derive(Deserialize)]
 enum PsqlSessionResult {
-    Success(DatabaseInfo),
+    Success(auth_backend::console::DatabaseInfo),
    Failure(String),
 }

 /// A message received by `mgmt` when a compute node is ready.
-pub type ComputeReady = Result<DatabaseInfo, String>;
+pub type ComputeReady = Result<auth_backend::console::DatabaseInfo, String>;

 impl PsqlSessionResult {
    fn into_compute_ready(self) -> ComputeReady {
@@ -111,7 +113,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R

    let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;

-    match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) {
+    match auth_backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
        Ok(()) => {
            pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
                .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -73,7 +73,7 @@ pub async fn thread_main(
 async fn handle_client(
    config: &ProxyConfig,
    cancel_map: &CancelMap,
-    stream: impl AsyncRead + AsyncWrite + Unpin,
+    stream: impl AsyncRead + AsyncWrite + Unpin + Send,
 ) -> anyhow::Result<()> {
    // The `closed` counter will increase when this future is destroyed.
    NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
@@ -144,10 +144,15 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
                }

                // Here and forth: `or_else` demands that we use a future here
-                let creds = async { params.try_into() }
+                let mut creds: auth::ClientCredentials = async { params.try_into() }
                    .or_else(|e| stream.throw_error(e))
                    .await?;

+                // Set SNI info when available
+                if let Stream::Tls { tls } = stream.get_ref() {
+                    creds.sni_data = tls.get_ref().1.sni_hostname().map(|s| s.to_owned());
+                }
+
                break Ok(Some((stream, creds)));
            }
            CancelRequest(cancel_key_data) => {
@@ -174,7 +179,7 @@ impl<S> Client<S> {
    }
 }

-impl<S: AsyncRead + AsyncWrite + Unpin> Client<S> {
+impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<S> {
    /// Let the client authenticate and connect to the designated compute node.
    async fn connect_to_db(
        self,
@@ -185,10 +190,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<S> {

        // Authenticate and connect to a compute node.
        let auth = creds.authenticate(config, &mut stream).await;
-        let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?;
+        let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;

        let (db, version, cancel_closure) =
-            db_info.connect().or_else(|e| stream.throw_error(e)).await?;
+            node.connect().or_else(|e| stream.throw_error(e)).await?;
        let cancel_key_data = session.enable_cancellation(cancel_closure);

        stream
--- a/proxy/src/sasl.rs
+++ b/proxy/src/sasl.rs
@@ -39,9 +39,20 @@ pub enum Error {
 /// A convenient result type for SASL exchange.
 pub type Result<T> = std::result::Result<T, Error>;

+/// A result of one SASL exchange.
+pub enum Step<T, R> {
+    /// We should continue exchanging messages.
+    Continue(T),
+    /// The client has been authenticated successfully.
+    Authenticated(R),
+}
+
 /// Every SASL mechanism (e.g. [SCRAM](crate::scram)) is expected to implement this trait.
 pub trait Mechanism: Sized {
+    /// What's produced as a result of successful authentication.
+    type Output;
+
    /// Produce a server challenge to be sent to the client.
    /// This is how this method is called in PostgreSQL (`libpq/sasl.h`).
-    fn exchange(self, input: &str) -> Result<(Option<Self>, String)>;
+    fn exchange(self, input: &str) -> Result<(Step<Self, Self::Output>, String)>;
 }
--- a/proxy/src/sasl/messages.rs
+++ b/proxy/src/sasl/messages.rs
@@ -49,6 +49,7 @@ impl<'a> ServerMessage<&'a str> {
        })
    }
 }
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/sasl/stream.rs
+++ b/proxy/src/sasl/stream.rs
@@ -51,18 +51,23 @@ impl<S: AsyncWrite + Unpin> SaslStream<'_, S> {
 impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
    /// Perform SASL message exchange according to the underlying algorithm
    /// until user is either authenticated or denied access.
-    pub async fn authenticate(mut self, mut mechanism: impl Mechanism) -> super::Result<()> {
+    pub async fn authenticate<M: Mechanism>(
+        mut self,
+        mut mechanism: M,
+    ) -> super::Result<M::Output> {
        loop {
            let input = self.recv().await?;
            let (moved, reply) = mechanism.exchange(input)?;
+
+            use super::Step::*;
            match moved {
-                Some(moved) => {
+                Continue(moved) => {
                    self.send(&ServerMessage::Continue(&reply)).await?;
                    mechanism = moved;
                }
-                None => {
+                Authenticated(result) => {
                    self.send(&ServerMessage::Final(&reply)).await?;
-                    return Ok(());
+                    return Ok(result);
                }
            }
        }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
chaitanya sharma	7a72812b93	Fix the markdown rendering 004-durability.md RFC	2022-05-16 10:11:39 +03:00
Heikki Linnakangas	a10cac980f	Continue with pageserver startup, if loading some tenants fail. Fixes https://github.com/neondatabase/neon/issues/1664	2022-05-15 00:25:38 +03:00
Heikki Linnakangas	081d5dac5e	Bump vendor/postgres. Includes change to reduce log noise from inmem_smgr.	2022-05-13 21:41:00 +03:00
Andrey Taranik	cded72a580	remove sk-2 from staging inventory list (#1699 )	2022-05-13 20:41:54 +03:00
Egor Suvorov	768c846eeb	Fix test_delete_force from #1653 conflicting with #1692	2022-05-13 17:36:18 +02:00
Anastasia Lubennikova	a2561f0a78	Use tenant's pitr_interval instead of hardcoded 0 in the command. Adjust python tests that use the	2022-05-13 18:32:14 +03:00
Anastasia Lubennikova	aa7c601eca	Fix pitr_interval check in GC: Use timestamp->LSN mapping instead of file modification time. Fix 'latest_gc_cutoff_lsn' - set it to the minimum of pitr_cutoff and gc_cutoff. Add new test: test_pitr_gc	2022-05-13 18:32:14 +03:00
Egor Suvorov	bf899a57d9	Safekeeper: add timeline/tenant force delete HTTP endpoints (closes #895 ) * There is no auth in Safekeeper HTTP at all currently, so simply calling `check_permission` is not enough. * There are no checks of Safekeeper still working with the data, as "still working" is blurry now: a timeline may be "active" while there are no compute nodes and all data is propagated. * Still, callmemaybe is deactivated, and timeline is removed from the internal map. It can easily sneak back in case of race conditions and implicit creations, though.	2022-05-13 15:43:52 +02:00
Egor Suvorov	07b85e7cfc	Safekeeper refactor: move callmemaybe_tx from SafekeeperPostgresBackend to Timeline	2022-05-13 15:43:52 +02:00
Egor Suvorov	22d997049c	libs/utils/http/request: add ensure_no_body	2022-05-13 15:43:52 +02:00
Kirill Bulatov	b683308791	Return GIT_VERSION back to storage binaries	2022-05-13 16:34:32 +03:00
Kirill Bulatov	51c0f9ab2b	Force git version to be up to date via decl macro	2022-05-13 16:34:32 +03:00
Stas Kelvich	0030da57a8	compute-tools: grant rw privileges to all created users	2022-05-13 11:27:00 +03:00
Kirill Bulatov	85884a1599	Disable tenant relocation python test	2022-05-13 01:26:38 +03:00
Thang Pham	ae20751724	update `ZenithCli::create_tenant` return signature (#1692 ) to include the initial timeline's ID in addition to the new tenant's ID. Context: follow-up of https://github.com/neondatabase/neon/pull/1689	2022-05-12 17:27:08 -04:00
Thang Pham	5812e26b90	Create an initial timeline on CLI tenant creation (#1689 ) Resolves #1655	2022-05-12 16:33:09 -04:00
Arthur Petukhovsky	ec8861b8cc	Fix pageserver metrics names (#1682 ) Try to follow Prometheus style-guide https://prometheus.io/docs/practices/naming/ for metrics names. More specifically: - Use `pageserver_` prefix for all pageserver metrics - Specify `_seconds` unit in time metrics - Use unit as a suffix in other cases, such as `_hits`, `_bytes`, `_records` - Use `_total` suffix for accumulating counters (note that Histograms append that suffix internally)	2022-05-12 19:53:07 +03:00
Kirill Bulatov	4538f1e1b8	Correctly operate etcd safekeeper timeline data	2022-05-12 18:47:31 +03:00
Stas Kelvich	b10ae195b7	Set vendor/postgres back to the main branch I accidentally merged postgres PR that was referencing non-main branch.	2022-05-12 15:05:49 +03:00
Alexey Kondratov	b426775aa0	Use compute-tools from the new neondatabase Docker Hub repo	2022-05-12 12:26:24 +03:00
Heikki Linnakangas	5da4f3a4df	Refactor DeltaLayer::dump() function Put most of the code in a closure that returns Result, so that we can use the ?-operator for error handling. That's simpler.	2022-05-12 10:31:04 +03:00
Konstantin Knizhnik	2bde77fced	Do not apply records with LSN smaller than LSN of cached image in del… (#1672 ) * Do not apply records with LSN smaller than LSN of cached image in delta layer * Do not apply records with LSN smaller than LSN of cached image in delta layer	2022-05-12 07:56:02 +03:00
Dhammika Pathirana	c864091035	Fix err msg typo Signed-off-by: Dhammika Pathirana <dham@neon.tech>	2022-05-11 16:13:26 -07:00
Anton Shyrabokau	20361395bb	Add zenith-us-stage-sk-5 to circleci inventory (#1665 ) Co-authored-by: Debian <admin@ip-10-0-5-32.us-west-2.compute.internal>	2022-05-11 21:36:53 +03:00
Arseny Sher	b338b5dffe	Make callmemaybe less aggressive until we fix it/migrate to bigger machines.	2022-05-11 22:16:13 +04:00
Stas Kelvich	5bd879f641	Proxy: update protocol after cluster->project rename	2022-05-11 15:50:36 +03:00
Konstantin Knizhnik	e6e883eb12	Do not set LSN for new FPI page (#1657 ) * Do not set LSN for new FPI page refer #1656 * Add page_is_new, page_get_lsn, page_set_lsn functions * Fix page_is_new implementation * Add comment from XLogReadBufferForRedoExtended	2022-05-11 15:23:17 +03:00
Heikki Linnakangas	d710dff975	Remove unnecessary Serialize/Deserialize traits from VecMap. It's never stored on disk. Let's be tidy.	2022-05-10 23:47:40 +03:00
Arseny Sher	6cb14b4200	Optionally remove WAL on safekeepers without s3 offloading. And do that on staging, until offloading is merged.	2022-05-10 22:41:02 +04:00
Thang Pham	87dfa99734	Update layered_repository README (#1659 )	2022-05-10 09:55:14 -04:00
Thang Pham	cf59b51519	Update README (Running local installation section) (#1649 )	2022-05-09 11:11:46 -04:00
Kirill Bulatov	0a7735a656	Rework remote storage sync queue, general refactoring	2022-05-07 01:33:33 +03:00
Kirill Bulatov	64a602b8f3	Delete timeline layers	2022-05-07 01:33:33 +03:00
Kirill Bulatov	10e4da3997	Rework timeline batching	2022-05-07 01:33:33 +03:00
Kirill Bulatov	de37f982db	Share the remote storage as a crate	2022-05-07 00:30:36 +03:00
Kirill Bulatov	d4e155aaa3	Librarify common etcd timeline logic	2022-05-06 22:32:57 +03:00
Arseny Sher	dd6dca9072	Bump vendor/postgres to shut down on wrong basebackup.	2022-05-06 20:07:26 +04:00
bojanserafimov	ef40e404cf	Rename zenith crate to neon_local (#1625 )	2022-05-05 19:06:53 -04:00
Sergey Melnikov	11a44eda0e	Add TLS support in scram-proxy (#1643 ) * Add TLS support in scram-proxy * Fix authEndpoint	2022-05-05 23:48:16 +03:00
Heikki Linnakangas	30a7598172	Some copy-editing.	2022-05-05 22:35:15 +03:00
Heikki Linnakangas	1ad5658d9c	Fix typos	2022-05-05 22:35:15 +03:00
Dmitry Rodionov	954859f6c5	add readme for performance tests with the current state of things	2022-05-05 22:35:15 +03:00
Andrey Taranik	4024bfe736	get_binaries script fix (#1638 ) * get_binaries script fix * minor improvement for get_binaries	2022-05-05 22:21:07 +03:00
Kirill Bulatov	2ef0e5c6ed	Do not require metadata in every upload sync task	2022-05-05 18:26:39 +03:00
Kirill Bulatov	52a7e3155e	Add local path to the Layer trait and historic layers	2022-05-05 18:26:39 +03:00
Thang Pham	ad5eaa6027	Use node's LSN for read-only nodes (#1642 ) Fixes #1410.	2022-05-05 10:53:10 -04:00
Dmitry Rodionov	0f3ec83172	avoid detach with alive branches	2022-05-05 12:54:42 +03:00
Arseny Sher	c46fe90010	Fix division by zero in WAL removal.	2022-05-05 10:41:43 +04:00
bojanserafimov	bc569dde51	Remove some unwraps from waldecoder (#1539 )	2022-05-04 17:41:05 -04:00
bojanserafimov	02e5083695	Add hot page test (#1479 )	2022-05-04 12:45:01 -04:00
Thang Pham	c4bc604e5f	Fix pg list table alignment #1633 Fixes #1628 - add [`comfy_table`](https://github.com/Nukesor/comfy-table/tree/main) and use it to construct table for `pg list` CLI command Comparison - Old: ``` NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running migration_check 127.0.0.1:55433 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running ``` - New: ``` NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS main 127.0.0.1:55432 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running migration_check 127.0.0.1:55433 3823dd05e35d71f6ccf33049de366d70 main 0/16FB140 running ```	2022-05-04 12:12:26 -04:00
Anastasia Lubennikova	b8880bfaab	Bump vendor/postgres	2022-05-04 18:14:45 +03:00
Anastasia Lubennikova	e2cf77441d	Implement pg_database_size(). In this implementation dbsize equals sum of all relation sizes, excluding shared ones.	2022-05-04 18:14:45 +03:00
Arseny Sher	b68e3b03ed	Fix control file update for `b9fd8a36ad`	2022-05-04 17:11:22 +04:00
Arseny Sher	e58c83870f	Bump vendor/postgres to send timeline_start_lsn.	2022-05-04 14:32:03 +04:00
Arseny Sher	b9fd8a36ad	Remember timeline_start_lsn and local_start_lsn on safekeeper. Make it remember when timeline starts in general and on this safekeeper in particular (the point might be later on new safekeeper replacing failed one). Bumps control file and walproposer protocol versions. While protocol is bumped, also add safekeeper node id to AcceptorProposerGreeting. ref #1561	2022-05-04 14:32:03 +04:00
Heikki Linnakangas	748c5a577b	Bump vendor/postgres. (#1616 ) Includes fix for https://github.com/neondatabase/neon/issues/1615	2022-05-04 10:54:44 +03:00
Stas Kelvich	51a0f2683b	fix scram-proxy addresses	2022-05-04 01:35:30 +03:00
Dmitry Rodionov	9dfa145c7c	tone down tenant not found error	2022-05-04 00:47:52 +03:00
Stas Kelvich	5642d0b2b8	Change shutdown_process_on_error thread spawn settings. Now the principle is the following: acceptor thread (libpq and http) errors will bring the pageserver down, but all per-tenant thread failures will be treated as an error.	2022-05-04 00:42:57 +03:00
Dmitry Rodionov	2f83f793bc	print more details when thread fails	2022-05-03 18:31:23 +03:00
Anastasia Lubennikova	2f9b17b9e5	Add simple test of pageserver recovery after crash. To cause a crash, use failpoints in checkpointer	2022-05-03 17:13:09 +03:00
Dmitry Rodionov	e7cba0b607	use thiserror instead of anyhow in disk_btree	2022-05-03 15:34:23 +03:00
Dmitry Rodionov	ff7e9a86c6	turn panic into an error with more details	2022-05-03 12:44:42 +03:00
Heikki Linnakangas	9ede38b6c4	Support finding LSN from a commit timestamp. A new `get_lsn_by_timestamp` command is added to the libpq page service API. An extra timestamp field is now stored in an extra field after each Clog page. It is the timestamp of the latest commit, among all the transactions on the Clog page. To find the overall latest commit, we need to scan all Clog pages, but this isn't a very frequent operation so that's not too bad. To find the LSN that corresponds to a timestamp, we perform a binary search. The binary search starts with min = last LSN when GC ran, and max = latest LSN on the timeline. On each iteration of the search we check if there are any commits with a higher-than-requested timestamp at that LSN. Implements github issue 1361.	2022-05-03 09:28:57 +03:00
Heikki Linnakangas	62449d6068	Bump vendor/postgres (#1573 ) This brings us the performance improvements to WAL redo from https://github.com/neondatabase/postgres/pull/144	2022-05-03 09:25:12 +03:00
Konstantin Knizhnik	baa59512b8	Traverse frozen layer in get_reconstruct_data in reverse order (#1601 ) * Traverse frozen layer in get_reconstruct_data in reverse order * Fix comments on frozen layers. Note explicitly the order that the layers are in the queue. * Add fail point to reproduce failpoint iteration error Co-authored-by: Heikki Linnakangas <heikki@neon.tech>	2022-05-03 08:07:14 +03:00
Heikki Linnakangas	87a6c4d051	RFC on connection routing and authentication. This documents how we want this to work. We're not quite there yet.	2022-05-02 23:39:06 +03:00
Stas Kelvich	801b749e1d	Set correct authEndpoint for the new proxy	2022-05-02 21:46:32 +03:00
Kirill Bulatov	5cb501c2b3	Make remote storage test less flaky	2022-05-02 20:04:48 +03:00
Dmitry Rodionov	ad25736f3a	Exit pageserver process with correct error code When we shut down pageserver due to an error (e.g. one of the important threads panicked) use 1 exit code so systemd can properly restart it	2022-05-02 19:04:45 +03:00
Stas Kelvich	9a396e1feb	Support SNI-based routing in proxy	2022-05-02 18:32:18 +03:00
Stas Kelvich	0323bb5870	[proxy] Refactor cplane API and add new console SCRAM auth API Now proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: * legacy old method, when username ends with `@zenith` it uses md5 auth dbname as the cluster name; otherwise, it sends a login link and waits for the console to call back * console new SCRAM-based console API; uses SNI info to select the destination cluster * postgres uses postgres to select auth secrets of existing roles. Useful for local testing * link sends login link for all usernames	2022-05-02 18:32:18 +03:00
Dmitry Ivanov	af0195b604	[proxy] Introduce `cloud::Api` for communication with Neon Cloud * `cloud::legacy` talks to Cloud API V1. * `cloud::api` defines Cloud API v2. * `cloud::local` mocks the Cloud API V2 using a local postgres instance. * It's possible to choose between API versions using the `--api-version` flag.	2022-05-02 18:32:18 +03:00
Dmitry Ivanov	9df8915b03	[proxy] `sasl::Mechanism` may return `Output` during exchange This is needed to forward the `ClientKey` that's required to connect the proxy to a compute. Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>	2022-05-02 18:32:18 +03:00
Dmitry Ivanov	4b1bd32e4a	Drop `Debug` impl for `ScramKey` and `ServerSecret` There's a notion that accidental misuse of those implementations might reveal authentication secrets.	2022-05-02 18:32:18 +03:00
Andrey Taranik	68ba6a58a0	authEndpoint fix	2022-05-02 17:55:13 +03:00
Andrey Taranik	8f479a712f	minor fixes in proxy deployment	2022-05-02 17:55:13 +03:00
Stas Kelvich	2477d2f9e2	Deploy standalone SCRAM proxy on staging	2022-05-02 17:55:13 +03:00
Dhammika Pathirana	992874c916	Fix update ps settings doc Signed-off-by: Dhammika Pathirana <dhammika@gmail.com>	2022-05-01 13:52:08 -07:00
Dhammika Pathirana	3128e8c75c	Fix tenant conf test Signed-off-by: Dhammika Pathirana <dhammika@gmail.com>	2022-05-01 13:13:25 -07:00
Dhammika Pathirana	f3f12db2cb	Add gc churn threshold knob (#1594 ) Signed-off-by: Dhammika Pathirana <dhammika@gmail.com>	2022-05-01 13:13:17 -07:00
Andrey Taranik	038ea4c128	proxy notice message update (#1600 )	2022-04-30 22:04:08 +03:00
Kirill Bulatov	7e1db8c8a1	Show which virtual file got the deserialization errors	2022-04-29 21:40:57 +03:00
Andrey Taranik	aa933d3961	proxy settings update for new domain (#1597 )	2022-04-29 20:05:14 +03:00
Dmitry Rodionov	67b4e38092	temporarily disable test_backpressure_received_lsn_lag	2022-04-29 15:53:56 +03:00
Dmitry Rodionov	05f8e6a050	Use fsync+rename for atomic downloads from remote storage Use failpoint in test_remote_storage to check the behavior	2022-04-29 15:53:56 +03:00
chaitanya sharma	76388abeb6	Rename READMEs with .md extension, and fix links to them. Commit `edba2e97` renamed pageserver/README to pageserver/README.md, but forgot to update links to it. Fix. Rename libs/postgres_ffi/README and safekeeper/README files to also have the .md extension, so that github can render them nicely. Quote ascii-diagram in safekeeper/README.md so that it renders correctly.	2022-04-29 14:23:42 +03:00
Kirill Bulatov	2911eb084a	Remove timeline files on detach	2022-04-29 09:19:18 +03:00
Kirill Bulatov	6cca57f95a	Properly remove from the local timeline map	2022-04-29 09:19:18 +03:00
Kirill Bulatov	4a46b01caf	Properly populate local timeline map	2022-04-29 09:19:18 +03:00
Anastasia Lubennikova	5c5c3c64f3	Fix tenant config parsing. Add a test	2022-04-28 11:49:19 +03:00