delete data

delete remote extension regression test
2026-05-17 13:10:38 +00:00 · 2023-08-25 09:07:50 -04:00 · 2023-08-25 09:07:50 -04:00
63 changed files with 998 additions and 2227 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -145,11 +145,7 @@ runs:

        if [ "${RERUN_FLAKY}" == "true" ]; then
          mkdir -p $TEST_OUTPUT
-          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \
-                                              --days 7 \
-                                              --output "$TEST_OUTPUT/flaky.json" \
-                                              --pg-version "${DEFAULT_PG_VERSION}" \
-                                              --build-type "${BUILD_TYPE}"
+          poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json"

          EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
        fi
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -752,7 +752,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.5
+      VM_BUILDER_VERSION: v0.16.3

    steps:
      - name: Checkout
@@ -775,7 +775,6 @@ jobs:
        run: |
          ./vm-builder \
            -enable-file-cache \
-            -cgroup-uid=postgres \
            -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} \
            -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}

@@ -904,7 +903,7 @@ jobs:
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
-    needs: [ tag ]
+    needs: [ promote-images, tag ]
    steps:
      - name: Set PR's status to pending and request a remote CI test
        run: |
--- a/13
+++ b/13
@@ -1,12 +1,11 @@
-/compute_tools/ @neondatabase/control-plane @neondatabase/compute
+/compute_tools/ @neondatabase/control-plane
 /control_plane/ @neondatabase/compute @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute
-/libs/remote_storage/ @neondatabase/storage
-/libs/safekeeper_api/ @neondatabase/safekeepers
-/libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/libs/postgres_ffi/ @neondatabase/compute 
+/libs/remote_storage/ @neondatabase/storage 
+/libs/safekeeper_api/ @neondatabase/safekeepers  
+/pageserver/ @neondatabase/compute @neondatabase/storage 
 /pgxn/ @neondatabase/compute
-/proxy/ @neondatabase/proxy
+/proxy/ @neondatabase/control-plane 
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5014,7 +5014,6 @@ dependencies = [
 "nix 0.26.2",
 "once_cell",
 "pin-project-lite",
- "postgres_connection",
 "pq_proto",
 "rand",
 "regex",
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -211,8 +211,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.4.tar.gz -O pgvector.tar.gz && \
+    echo "1cb70a63f8928e396474796c22a20be9f7285a8a013009deb8152445b61b72e6 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -19,10 +19,9 @@ Also `compute_ctl` spawns two separate service threads:
 - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
  last activity requests.

-If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
-`vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
-`vm-monitor` communicates with the VM autoscaling system. It coordinates
-downscaling and requests immediate upscaling under resource pressure.
+If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
+compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
+downscaling and (eventually) will request immediate upscaling under resource pressure.

 Usage example:
 ```sh
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -20,10 +20,9 @@
 //! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
 //!   last activity requests.
 //!
-//! If `AUTOSCALING` environment variable is set, `compute_ctl` will start the
-//! `vm-monitor` located in [`neon/libs/vm_monitor`]. For VM compute nodes,
-//! `vm-monitor` communicates with the VM autoscaling system. It coordinates
-//! downscaling and requests immediate upscaling under resource pressure.
+//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
+//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
+//! downscaling and (eventually) will request immediate upscaling under resource pressure.
 //!
 //! Usage example:
 //! ```sh
@@ -279,9 +278,8 @@ fn main() -> Result<()> {
            use tokio_util::sync::CancellationToken;
            use tracing::warn;
            let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
-            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
-            let cgroup = matches.get_one::<String>("cgroup");
-            let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
+            let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
+            let cgroup = matches.get_one::<String>("cgroup");

            // Only make a runtime if we need to.
            // Note: it seems like you can make a runtime in an inner scope and
@@ -314,7 +312,6 @@ fn main() -> Result<()> {
                        cgroup: cgroup.cloned(),
                        pgconnstr: file_cache_connstr.cloned(),
                        addr: vm_monitor_addr.cloned().unwrap(),
-                        file_cache_on_disk,
                    })),
                    token.clone(),
                ))
@@ -485,11 +482,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("file-cache-on-disk")
-                .long("file-cache-on-disk")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 #[test]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1,5 +1,4 @@
 use std::collections::HashMap;
-use std::env;
 use std::fs;
 use std::io::BufRead;
 use std::os::unix::fs::PermissionsExt;
@@ -176,27 +175,6 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
    }
 }

-/// If we are a VM, returns a [`Command`] that will run in the `neon-postgres`
-/// cgroup. Otherwise returns the default `Command::new(cmd)`
-///
-/// This function should be used to start postgres, as it will start it in the
-/// neon-postgres cgroup if we are a VM. This allows autoscaling to control
-/// postgres' resource usage. The cgroup will exist in VMs because vm-builder
-/// creates it during the sysinit phase of its inittab.
-fn maybe_cgexec(cmd: &str) -> Command {
-    // The cplane sets this env var for autoscaling computes.
-    // use `var_os` so we don't have to worry about the variable being valid
-    // unicode. Should never be an concern . . . but just in case
-    if env::var_os("AUTOSCALING").is_some() {
-        let mut command = Command::new("cgexec");
-        command.args(["-g", "memory:neon-postgres"]);
-        command.arg(cmd);
-        command
-    } else {
-        Command::new(cmd)
-    }
-}
-
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
@@ -473,7 +451,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = maybe_cgexec(&self.pgbin)
+        let sync_handle = Command::new(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -608,7 +586,7 @@ impl ComputeNode {

        // Start postgres
        info!("starting postgres");
-        let mut pg = maybe_cgexec(&self.pgbin)
+        let mut pg = Command::new(&self.pgbin)
            .args(["-D", pgdata])
            .spawn()
            .expect("cannot start postgres process");
@@ -636,7 +614,7 @@ impl ComputeNode {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
-        let mut pg = maybe_cgexec(&self.pgbin)
+        let mut pg = Command::new(&self.pgbin)
            .args(["-D", &self.pgdata])
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
                vec![("NEON_AUTH_TOKEN", storage_auth_token)]
--- a/docs/rfcs/025-generation-numbers.md
+++ b/docs/rfcs/025-generation-numbers.md
@@ -1,957 +0,0 @@
-# Pageserver: split-brain safety for remote storage through generation numbers
-
-## Summary
-
-A scheme of logical "generation numbers" for tenant attachment to pageservers is proposed, along with
-changes to the remote storage format to include these generation numbers in S3 keys.
-
-Using the control plane as the issuer of these generation numbers enables strong anti-split-brain
-properties in the pageserver cluster without implementing a consensus mechanism directly
-in the pageservers.
-
-## Motivation
-
-Currently, the pageserver's remote storage format does not provide a mechanism for addressing
-split brain conditions that may happen when replacing a node or when migrating
-a tenant from one pageserver to another.
-
-From a remote storage perspective, a split brain condition occurs whenever two nodes both think
-they have the same tenant attached, and both can write to S3. This can happen in the case of a
-network partition, pathologically long delays (e.g. suspended VM), or software bugs.
-
-In the current deployment model, control plane guarantees that a tenant is attached to one
-pageserver at a time, thereby ruling out split-brain conditions resulting from dual
-attachment (however, there is always the risk of a control plane bug). This control
-plane guarantee prevents robust response to failures, as if a pageserver is unresponsive
-we may not detach from it. The mechanism in this RFC fixes this, by making it safe to
-attach to a new, different pageserver even if an unresponsive pageserver may be running.
-
-Further, lack of safety during split-brain conditions blocks two important features where occasional
-split-brain conditions are part of the design assumptions:
-
- seamless tenant migration ([RFC PR](https://github.com/neondatabase/neon/pull/5029))
- automatic pageserver instance failure handling (aka "failover") (RFC TBD)
-
-### Prior art
-
- 020-pageserver-s3-coordination.md
- 023-the-state-of-pageserver-tenant-relocation.md
- 026-pageserver-s3-mvcc.md
-
-This RFC has broad similarities to the proposal to implement a MVCC scheme in
-S3 object names, but this RFC avoids a general purpose transaction scheme in
-favour of more specialized "generations" that work like a transaction ID that
-always has the same lifetime as a pageserver process or tenant attachment, whichever
-is shorter.
-
-## Requirements
-
- Accommodate storage backends with no atomic or fencing capability (i.e. work within
-  S3's limitation that there are no atomics and clients can't be fenced)
- Don't depend on any STONITH or node fencing in the compute layer (i.e. we will not
-  assume that we can reliably kill an EC2 instance and have it die)
- Scoped per-tenant, not per-pageserver; for _seamless tenant migration_, we need
-  per-tenant granularity, and for _failover_, we likely want to spread the workload
-  of the failed pageserver instance to a number of peers, rather than monolithically
-  moving the entire workload to another machine.
-  We do not rule out the latter case, but should not constrain ourselves to it.
-
-## Design Tenets
-
-These are not requirements, but are ideas that guide the following design:
-
- Avoid implementing another consensus system: we already have a strongly consistent
-  database in the control plane that can do atomic operations where needed, and we also
-  have a Paxos implementation in the safekeeper.
- Avoid locking in to specific models of how failover will work (e.g. do not assume that
-  all the tenants on a pageserver will fail over as a unit).
- Be strictly correct when it comes to data integrity. Occasional failures of availability
-  are tolerable, occasional data loss is not.
-
-## Non Goals
-
-The changes in this RFC intentionally isolate the design decision of how to define
-logical generation numbers and object storage format in a way that is somewhat flexible with
-respect to how actual orchestration of failover works.
-
-This RFC intentionally does not cover:
-
- Failure detection
- Orchestration of failover
- Standby modes to keep data ready for fast migration
- Intentional multi-writer operation on tenants (multi-writer scenarios are assumed to be transient split-brain situations).
- Sharding.
-
-The interaction between this RFC and those features is discussed in [Appendix B](#appendix-b-interoperability-with-other-features)
-
-## Impacted Components
-
-pageserver, control plane, safekeeper (optional)
-
-## Implementation Part 1: Correctness
-
-### Summary
-
- A per-tenant **generation number** is introduced to uniquely identify tenant attachments to pageserver processes.
-
-  - This generation number increments each time the control plane modifies a tenant (`Project`)'s assigned pageserver, or when the assigned pageserver restarts.
-  - the control plane is the authority for generation numbers: only it may
-    increment a generation number.
-
- **Object keys are suffixed** with the generation number
- **Safety for multiply-attached tenants** is provided by the
-  generation number in the object key: the competing pageservers will not
-  try to write to the same keys.
- **Safety in split brain for multiple nodes running with
-  the same node ID** is provided by the pageserver calling out to the control plane
-  on startup, to re-attach and thereby increment the generations of any attached tenants
- **Safety for deletions** is achieved by deferring the DELETE from S3 to a point in time where the deleting node has validated with control plane that no attachment with a higher generation has a reference to the to-be-DELETEd key.
- **The control plane is used to issue generation numbers** to avoid the need for
-  a built-in consensus system in the pageserver, although this could in principle
-  be changed without changing the storage format.
-
-### Generation numbers
-
-A generation number is associated with each tenant in the control plane,
-and each time the attachment status of the tenant changes, this is incremented.
-Changes in attachment status include:
-
- Attaching the tenant to a different pageserver
- A pageserver restarting, and "re-attaching" its tenants on startup
-
-These increments of attachment generation provide invariants we need to avoid
-split-brain issues in storage:
-
- If two pageservers have the same tenant attached, the attachments are guaranteed to have different generation numbers, because the generation would increment
-  while attaching the second one.
- If there are multiple pageservers running with the same node ID, all the attachments on all pageservers are guaranteed to have different generation numbers, because the generation would increment
-  when the second node started and re-attached its tenants.
-
-As long as the infrastructure does not transparently replace an underlying
-physical machine, we are totally safe. See the later [unsafe case](#unsafe-case-on-badly-behaved-infrastructure) section for details.
-
-### Object Key Changes
-
-#### Generation suffix
-
-All object keys (layer objects and index objects) will contain the attachment
-generation as a [suffix](#why-a-generation-suffix-rather-than-prefix).
-This suffix is the primary mechanism for protecting against split-brain situations, and
-enabling safe multi-attachment of tenants:
-
- Two pageservers running with the same node ID (e.g. after a failure, where there is
-  some rogue pageserver still running) will not try to write to the same objects, because at startup they will have re-attached tenants and thereby incremented
-  generation numbers.
- Multiple attachments (to different pageservers) of the same tenant will not try to write to the same objects, as each attachment would have a distinct generation.
-
-The generation is appended in hex format (8 byte string representing
-u32), to all our existing key names. A u32's range limit would permit
-27 restarts _per second_ over a 5 year system lifetime: orders of magnitude more than
-is realistic.
-
-The exact meaning of the generation suffix can evolve over time if necessary, for
-example if we chose to implement a failover mechanism internally to the pageservers
-rather than going via the control plane. The storage format just sees it as a number,
-with the only semantic property being that the highest numbered index is the latest.
-
-#### Index changes
-
-Since object keys now include a generation suffix, the index of these keys must also be updated. IndexPart currently stores keys and LSNs sufficient to reconstruct key names: this would be extended to store the generation as well.
-
-This will increase the size of the file, but only modestly: layers are already encoded as
-their string-ized form, so the overhead is about 10 bytes per layer. This will be less if/when
-the index storage format is migrated to a binary format from JSON.
-
-#### Visibility
-
-_This section doesn't describe code changes, but extends on the consequences of the
-object key changes given above_
-
-##### Visibility of objects to pageservers
-
-Pageservers can of course list objects in S3 at any time, but in practice their
-visible set is based on the contents of their LayerMap, which is initialized
-from the `index_part.json.???` that they load.
-
-Starting with the `index_part` from the most recent previous generation
-(see [loading index_part](#finding-the-remote-indices-for-timelines)), a pageserver
-initially has visibility of all the objects that were referenced in the loaded index.
-These objects are guaranteed to remain visible until the current generation is
-superseded, via pageservers in older generations avoiding deletions (see [deletion](#deletion)).
-
-The "most recent previous generation" is _not_ necessarily the most recent
-in terms of walltime, it is the one that is readable at the time a new generation
-starts. Consider the following sequence of a tenant being re-attached to different
-pageserver nodes:
-
- Create + attach on PS1 in generation 1
- PS1 Do some work, write out index_part.json-0001
- Attach to PS2 in generation 2
- Read index_part.json-0001
- PS2 starts doing some work...
- Attach to PS3 in generation 3
- Read index_part.json-0001
- **...PS2 finishes its work: now it writes index_part.json-0002**
- PS3 writes out index_part.json-0003
-
-In the above sequence, the ancestry of indices is:
-
-```
-0001 -> 0002
-     |
-     -> 0003
-```
-
-This is not an issue for safety: if the 0002 references some object that is
-not in 0001, then 0003 simply does not see it, and will re-do whatever
-work was required (e.g. ingesting WAL or doing compaction). Objects referenced
-by only the 0002 index will never be read by future attachment generations, and
-will eventually be cleaned up by a scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)).
-
-##### Visibility of LSNs to clients
-
-Because index_part.json is now written with a generation suffix, which data
-is visible depends on which generation the reader is operating in:
-
- If one was passively reading from S3 from outside of a pageserver, the
-  visibility of data would depend on which index_part.json-<generation> file
-  one had chosen to read from.
- If two pageservers have the same tenant attached, they may have different
-  data visible as they're independently replaying the WAL, and maintaining
-  independent LayerMaps that are written to independent index_part.json files.
-  Data does not have to be remotely committed to be visible.
- For a pageserver writing with a stale generation, historic LSNs
-  remain readable until another pageserver (with a higher generation suffix)
-  decides to execute GC deletions. At this point, we may think of the stale
-  attachment's generation as having logically ended: during its existence
-  the generation had a consistent view of the world.
- For a newly attached pageserver, its highest visible LSN may appear to
-  go backwards with respect to an earlier attachment, if that earlier
-  attachment had not uploaded all data to S3 before the new attachment.
-
-### Deletion
-
-#### Generation number validation
-
-While writes are de-conflicted by writers always using their own generation number in the key,
-deletions are slightly more challenging: if a pageserver A is isolated, and the true active node is
-pageserver B, then it is dangerous for A to do any object deletions, even of objects that it wrote
-itself, because pageserver B's metadata might reference those objects.
-
-We solve this by inserting a "generation validation" step between the write of a remote index
-that un-links a particular object from the index, and the actual deletion of the object, such
-that deletions strictly obey the following ordering:
-
-1. Write out index_part.json: this guarantees that any subsequent reader of the metadata will
-   not try and read the object we unlinked.
-2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
-3. If step 2 passes, it is safe to delete the object. Why? The check-in with control plane
-   together with our visibility rules guarantees that any later generation
-   will use either the exact `index_part.json` that we uploaded in step 1, or a successor
-   of it; not an earlier one. In both cases, the `index_part.json` doesn't reference the
-   key we are deleting anymore, so, the key is invisible to any later attachment generation.
-   Hence it's safe to delete it.
-
-Note that at step 2 we are only confirming that deletions of objects _no longer referenced
-by the specific `index_part.json` written in step 1_ are safe. If we were attempting other deletions concurrently,
-these would need their own generation validation step.
-
-If step 2 fails, we may leak the object. This is safe, but has a cost: see [scrubbing](#cleaning-up-orphan-objects-scrubbing). We may avoid this entirely outside of node
-failures, if we do proper flushing of deletions on clean shutdown and clean migration.
-
-To avoid doing a huge number of control plane requests to perform generation validation,
-validation of many tenants will be done in a single request, and deletions will be queued up
-prior to validation: see [Persistent deletion queue](#persistent-deletion-queue) for more.
-
-#### `remote_consistent_lsn` updates
-
-Remote objects are not the only kind of deletion the pageserver does: it also indirectly deletes
-WAL data, by feeding back remote_consistent_lsn to safekeepers, as a signal to the safekeepers that
-they may drop data below this LSN.
-
-For the same reasons that deletion of objects must be guarded by an attachment generation number
-validation step, updates to `remote_consistent_lsn` are subject to the same rules, using
-an ordering as follows:
-
-1. upload the index_part that covers data up to LSN `L0` to S3
-2. Call out to control plane to validate that the generation which we use for our attachment is still the latest.
-3. advance the `remote_consistent_lsn` that we advertise to the safekeepers to `L0`
-
-If step 2 fails, then the `remote_consistent_lsn` advertised
-to safekeepers will not advance again until a pageserver
-with the latest generation is ready to do so.
-
-**Note:** at step 3 we are not advertising the _latest_ remote_consistent_lsn, we are
-advertising the value in the index_part that we uploaded in step 1. This provides
-a strong ordering guarantee.
-
-Internally to the pageserver, each timeline will have two remote_consistent_lsn values: the one that
-reflects its latest write to remote storage, and the one that reflects the most
-recent validation of generation number. It is only the latter value that may
-be advertised to the outside world (i.e. to the safekeeper).
-
-The control plane remains unaware of `remote_consistent_lsn`: it only has to validate
-the freshness of generation numbers, thereby granting the pageserver permission to
-share the information with the safekeeper.
-
-For convenience, in subsequent sections and RFCs we will use "deletion" to mean both deletion
-of objects in S3, and updates to the `remote_consistent_lsn`, as updates to the remote consistent
-LSN are de-facto deletions done via the safekeeper, and both kinds of deletion are subject to
-the same generation validation requirement.
-
-### Pageserver attach/startup changes
-
-#### Attachment
-
-Calls to `/v1/tenant/{tenant_id}/attach` are augmented with an additional
-`generation` field in the body.
-
-The pageserver does not persist this: a generation is only good for the lifetime
-of a process.
-
-#### Finding the remote indices for timelines
-
-Because index files are now suffixed with generation numbers, the pageserver
-cannot always GET the remote index in one request, because it can't always
-know a-priori what the latest remote index is.
-
-Typically, the most recent generation to write an index would be our own
-generation minus 1. However, this might not be the case: the previous
-node might have started and acquired a generation number, and then crashed
-before writing out a remote index.
-
-In the general case and as a fallback, the pageserver may list all the `index_part.json`
-files for a timeline, sort them by generation, and pick the highest that is `<=`
-its current generation for this attachment. The tenant should never load an index
-with an attachment generation _newer_ than its own.
-These two rules combined ensure that objects written by later generations are never visible to earlier generations.
-
-Note that if a given attachment picks an index part from an earlier generation (say n-2), but crashes & restarts before it writes its own generation's index part, next time it tries to pick an index part there may be an index part from generation n-1.
-It would pick the n-1 index part in that case, because it's sorted higher than the previous one from generation n-2.
-So, the above rules do not guarantee determinism in selecting the index part.
-Tenants are allowed to be attached with stale attachment generations during a multiply-attached
-phase in a migration, and in this instance if the old location's pageserver restarts,
-it should not try and load the newer generation's index.
-
-To summarize, on starting a timeline, the pageserver will:
-
-1. Issue a GET for index_part.json-<my generation - 1>
-2. If 1 failed, issue a ListObjectsv2 request for index_part.json\* and
-   pick the newest.
-
-One could optimize this further by using the control plane to record specifically
-which generation most recently wrote an index_part.json, if necessary, to increase
-the probability of finding the index_part.json in one GET. One could also improve
-the chances by having pageservers proactively write out index_part.json after they
-get a new generation ID.
-
-#### Re-attachment on startup
-
-On startup, the pageserver will call out to a new control plane `/re-attach`
-API (see [Generation API](#generation-api)). This returns a list of
-tenants that should be attached to the pageserver, and their generation numbers, which
-the control plane will increment before returning.
-
-The pageserver should still scan its local disk on startup, but should _delete_
-any local content for tenants not indicated in the `/re-attach` response: their
-absence is an implicit detach operation.
-
-**Note** if a tenant is omitted from the re-attach response, its local disk content
-will be deleted. This will change in subsequent work, when the control plane gains
-the concept of a secondary/standby location: a node with local content may revert
-to this status and retain some local content.
-
-#### Cleaning up previous generations' remote indices
-
-Deletion of old indices is not necessary for correctness, although it is necessary
-to avoid the ListObjects fallback in the previous section becoming ever more expensive.
-
-Once the new attachment has written out its index_part.json, it may asynchronously clean up historic index_part.json
-objects that were found.
-
-We may choose to implement this deletion either as an explicit step after we
-write out index_part for the first time in a pageserver's lifetime, or for
-simplicity just do it periodically as part of the background scrub (see [scrubbing](#cleaning-up-orphan-objects-scrubbing));
-
-### Control Plane Changes
-
-#### Store generations for attaching tenants
-
- The `Project` table must store the generation number for use when
-  attaching the tenant to a new pageserver.
- The `/v1/tenant/:tenant_id/attach` pageserver API will require the generation number,
-  which the control plane can supply by simply incrementing the `Project`'s
-  generation number each time the tenant is attached to a different server: the same database
-  transaction that changes the assigned pageserver should also change the generation number.
-
-#### Generation API
-
-This section describes an API that could be provided directly by the control plane,
-or built as a separate microservice. In earlier parts of the RFC, when we
-discuss the control plane providing generation numbers, we are referring to this API.
-
-The API endpoints used by the pageserver to acquire and validate generation
-numbers are quite simple, and only require access to some persistent and
-linearizable storage (such as a database).
-
-Building this into the control plane is proposed as a least-effort option to exploit existing infrastructure and implement generation number issuance in the same transaction that mandates it (i.e., the transaction that updates the `Project` assignment to another pageserver).
-However, this is not mandatory: this "Generation Number Issuer" could
-be built as a microservice. In practice, we will write such a miniature service
-anyway, to enable E2E pageserver/compute testing without control plane.
-
-The endpoints required by pageservers are:
-
-##### `/re-attach`
-
- Request: `{node_id: <u32>}`
- Response:
-  - 200 `{tenants: [{id: <TenantId>, gen: <u32>}]}`
-  - 404: unknown node_id
-  - (Future: 429: flapping detected, perhaps nodes are fighting for the same node ID,
-    or perhaps this node was in a retry loop)
-  - (On unknown tenants, omit tenant from `tenants` array)
- Server behavior: query database for which tenants should be attached to this pageserver.
-  - for each tenant that should be attached, increment the attachment generation and
-    include the new generation in the response
- Client behavior:
-  - for all tenants in the response, activate with the new generation number
-  - for any local disk content _not_ referenced in the response, act as if we
-    had been asked to detach it (i.e. delete local files)
-
-**Note** the `node_id` in this request will change in future if we move to ephemeral
-node IDs, to be replaced with some correlation ID that helps the control plane realize
-if a process is running with the same storage as a previous pageserver process (e.g.
-we might use EC2 instance ID, or we might just write some UUID to the disk the first
-time we use it)
-
-##### `/validate`
-
- Request: `{'tenants': [{tenant: <tenant id>, attach_gen: <gen>}, ...]}`
- Response:
-  - 200 `{'tenants': [{tenant: <tenant id>, status: <bool>}...]}`
-  - (On unknown tenants, omit tenant from `tenants` array)
- Purpose: enable the pageserver to discover for the given attachments whether they are still the latest.
- Server behavior: this is a read-only operation: simply compare the generations in the request with
-  the generations known to the server, and set status to `true` if they match.
- Client behavior: clients must not do deletions within a tenant's remote data until they have
-  received a response indicating the generation they hold for the attachment is current.
-
-#### Use of `/load` and `/ignore` APIs
-
-Because the pageserver will be changed to only attach tenants on startup
-based on the control plane's response to a `/re-attach` request, the load/ignore
-APIs no longer make sense in their current form.
-
-The `/load` API becomes functionally equivalent to attach, and will be removed:
-any location that used `/load` before should just attach instead.
-
-The `/ignore` API is equivalent to detaching, but without deleting local files.
-
-### Timeline/Branch creation & deletion
-
-All of the previous arguments for safety have described operations within
-a timeline, where we may describe a sequence that includes updates to
-index_part.json, and where reads and writes are coming from a postgres
-endpoint (writes via the safekeeper).
-
-Creating or destroying timeline is a bit different, because writes
-are coming from the control plane.
-
-We must be safe against scenarios such as:
-
- A tenant is attached to pageserver B while pageserver A is
-  in the middle of servicing an RPC from the control plane to
-  create or delete a tenant.
- A pageserver A has been sent a timeline creation request
-  but becomes unresponsive. The tenant is attached to a
-  different pageserver B, and the timeline creation request
-  is sent there too.
-
-#### Timeline Creation
-
-If some very slow node tries to do a timeline creation _after_
-a more recent generation node has already created the timeline
-and written some data into it, that must not cause harm. This
-is provided in timeline creations by the way all the objects
-within the timeline's remote path include a generation suffix:
-a slow node in an old generation that attempts to "create" a timeline
-that already exists will just emit an index_part.json with
-an old generation suffix.
-
-Timeline IDs are never reused, so we don't have
-to worry about the case of create/delete/create cycles. If they
-were re-used during a disaster recovery "un-delete" of a timeline,
-that special case can be handled by calling out to all available pageservers
-to check that they return 404 for the timeline, and to flush their
-deletion queues in case they had any deletions pending from the
-timeline.
-
-The above makes it safe for control plane to change the assignment of
-tenant to pageserver in control plane while a timeline creation is ongoing.
-The reason is that the creation request against the new assigned pageserver
-uses a new generation number. However, care must be taken by control plane
-to ensure that a "timeline creation successful" response from some pageserver
-is checked for the pageserver's generation for that timeline's tenant still being the latest.
-If it is not the latest, the response does not constitute a successful timeline creation.
-It is acceptable to discard such responses, the scrubber will clean up the S3 state.
-It is better to issue a timeline deletion request to the stale attachment.
-
-#### Timeline Deletion
-
-Tenant/timeline deletion operations are exempt from generation validation
-on deletes, and therefore don't have to go through the same deletion
-queue as GC/compaction layer deletions. This is because once a
-delete is issued by the control plane, it is a promise that the
-control plane will keep trying until the deletion is done, so even stale
-pageservers are permitted to go ahead and delete the objects.
-
-The implications of this for control plane are:
-
- During timeline/tenant deletion, the control plane must wait for the deletion to
-  be truly complete (status 404) and also handle the case where the pageserver
-  becomes unavailable, either by waiting for a replacement with the same node_id,
-  or by re-attaching the tenant elsewhere.
-
- The control plane must persist its intent to delete
-  a timeline/tenant before issuing any RPCs, and then once it starts, it must
-  keep retrying until the tenant/timeline is gone. This is already handled
-  by using a persistent `Operation` record that is retried indefinitely.
-
-Timeline deletion may result in a special kind of object leak, where
-the latest generation attachment completes a deletion (including erasing
-all objects in the timeline path), but some slow/partitioned node is
-writing into the timeline path with a stale generation number. This would
-not be caught by any per-timeline scrubbing (see [scrubbing](#cleaning-up-orphan-objects-scrubbing)), since scrubbing happens on the
-attached pageserver, and once the timeline is deleted it isn't attached anywhere.
-This scenario should be pretty rare, and the control plane can make it even
-rarer by ensuring that if a tenant is in a multi-attached state (e.g. during
-migration), we wait for that to complete before processing the deletion. Beyond
-that, we may implement some other top-level scrub of timelines in
-an external tool, to identify any tenant/timeline paths that are not found
-in the control plane database.
-
-#### Examples
-
- Deletion, node restarts partway through:
-  - By the time we returned 202, we have written a remote delete marker
-  - Any subsequent incarnation of the same node_id will see the remote
-    delete marker and continue to process the deletion
-  - If the original pageserver is lost permanently and no replacement
-    with the same node_id is available, then the control plane must recover
-    by re-attaching the tenant to a different node.
- Creation, node becomes unresponsive partway through.
-  - Control plane will see HTTP request timeout, keep re-issuing
-    request to whoever is the latest attachment point for the tenant
-    until it succeeds.
-  - Stale nodes may be trying to execute timeline creation: they will
-    write out index_part.json files with
-    stale attachment generation: these will be eventually cleaned up
-    by the same mechanism as other old indices.
-
-### Unsafe case on badly behaved infrastructure
-
-This section is only relevant if running on a different environment
-than EC2 machines with ephemeral disks.
-
-If we ever run pageservers on infrastructure that might transparently restart
-a pageserver while leaving an old process running (e.g. a VM gets rescheduled
-without the old one being fenced), then there is a risk of corruption, when
-the control plane attaches the tenant, as follows:
-
- If the control plane sends an `/attach` request to node A, then node A dies
-  and is replaced, and the control plane retries the request without
-  incrementing that attachment ID, then it could end up with two physical nodes
-  both using the same generation number.
- This is not an issue when using EC2 instances with ephemeral storage, as long
-  as the control plane never re-uses a node ID, but it would need re-examining
-  if running on different infrastructure.
- To robustly protect against this class of issue, we would either:
-  - add a "node generation" to distinguish between different processes holding the
-    same node_id.
-  - or, dispense with static node_id entirely and issue an ephemeral ID to each
-    pageserver process when it starts.
-
-## Implementation Part 2: Optimizations
-
-### Persistent deletion queue
-
-Between writing out a new index_part.json that doesn't reference an object,
-and executing the deletion, an object passes through a window where it is
-only referenced in memory, and could be leaked if the pageserver is stopped
-uncleanly. That introduces conflicting incentives: on the one hand, we would
-like to delay and batch deletions to
-1. minimize the cost of the mandatory validations calls to control plane, and
-2. minimize cost for DeleteObjects requests.
-On the other hand we would also like to minimize leakage by executing
-deletions promptly.
-
-To resolve this, we may make the deletion queue persistent
-and then execute the deletions in the background at a later time.
-
-_Note: The deletion queue's reason for existence is optimization rather than correctness,
-so there is a lot of flexibility in exactly how it should work,
-as long as it obeys the rule to validate generations before executing deletions,
-so the following details are not essential to the overall RFC._
-
-#### Scope
-
-The deletion queue will be global per pageserver, not per-tenant. There
-are several reasons for this choice:
-
- Use the queue as a central point to coalesce validation requests to the
-  control plane: this avoids individual `Timeline` objects ever touching
-  the control plane API, and avoids them having to know the rules about
-  validating deletions. This separation of concerns will avoid burdening
-  the already many-LoC `Timeline` type with even more responsibility.
- Decouple the deletion queue from Tenant attachment lifetime: we may
-  "hibernate" an inactive tenant by tearing down its `Tenant`/`Timeline`
-  objects in the pageserver, without having to wait for deletions to be done.
- Amortize the cost of I/O for the persistent queue, instead of having many
-  tiny queues.
- Coalesce deletions into a smaller number of larger DeleteObjects calls
-
-Because of the cost of doing I/O for persistence, and the desire to coalesce
-generation validation requests across tenants, and coalesce deletions into
-larger DeleteObjects requests, there will be one deletion queue per pageserver
-rather than one per tenant. This has the added benefit that when deactivating
-a tenant, we do not have to drain their deletion queue: deletions can proceed
-for a tenant whose main `Tenant` object has been torn down.
-
-#### Flow of deletion
-
-The flow of a deletion becomes:
-
-1. Need for deletion of an object (=> layer file) is identified.
-2. Unlink the object from all the places that reference it (=> `index_part.json`).
-3. Enqueue the deletion to a persistent queue.
-   Each entry is `tenant_id, attachment_generation, S3 key`.
-4. Validate & execute in batches:
-  4.1 For a batch of entries, call into control plane.
-  4.2 For the subset of entries that passed validation, execute a `DeleteObjects` S3 DELETE request for their S3 keys.
-
-As outlined in the Part 1 on correctness, it is critical that deletions are only
-executed once the key is not referenced anywhere in S3.
-This property is obviously upheld by the scheme above.
-
-#### We Accept Object Leakage In Acceptable Circumstances
-
-If we crash in the flow above between (2) and (3), we lose track of unreferenced object.
-Further, enqueuing a single entry to the persistent queue may not be durable immediately, to amortize the cost of flushing to disk.
-This is acceptable for now, it can be caught by [the scrubber](#cleaning-up-orphan-objects-scrubbing).
-
-There are various measures we can take to improve this in the future.
-1. Cap amount of time until enqueued entry becomes durable (timeout for flush-to-disk)
-2. Proactively flush:
-    - On graceful shutdown, as we anticipate that some or
-      all of our attachments may be re-assigned while we are offline.
-    - On tenant detach.
-3. For each entry, keep track of whether it has passed (2).
-   Only admit entries to (4) once they have passed (2).
-   This requires re-writing / two queue entries (intent, commit) per deletion.
-
-The important take-away with any of the above is that it's not
-disastrous to leak objects in exceptional circumstances.
-
-#### Operations that may skip the queue
-
-Deletions of an entire timeline are [exempt](#timeline-deletion) from generation number validation. Once the
-control plane sends the deletion request, there is no requirement to retain the readability
-of any data within the timeline, and all objects within the timeline path may be deleted
-at any time from the control plane's deletion request onwards.
-
-Since deletions of smaller timelines won't have enough objects to compose a full sized
-DeleteObjects request, it is still useful to send these through the last part of the
-deletion pipeline to coalesce with other executing deletions: to enable this, the
-deletion queue should expose two input channels: one for deletions that must be
-processed in a generation-aware way, and a fast path for timeline deletions, where
-that fast path may skip validation and the persistent queue.
-
-### Cleaning up orphan objects (scrubbing)
-
-An orphan object is any object which is no longer referenced by a running node or by metadata.
-
-Examples of how orphan objects arise:
-
- A node PUTs a layer object, then crashes before it writes the
-  index_part.json that references that layer.
- A stale node carries on running for some time, and writes out an unbounded number of
-  objects while it believes itself to be the rightful writer for a tenant.
- A pageserver crashes between un-linking an object from the index, and persisting
-  the object to its deletion queue.
-
-Orphan objects are functionally harmless, but have a small cost due to S3 capacity consumed. We
-may clean them up at some time in the future, by doing a ListObjectsv2 operation and cross
-referencing with the latest metadata to identify objects which are not referenced.
-
-Scrubbing will be done only by an attached pageserver (not some third party process), and deletions requested during scrub will go through the same
-validation as all other deletions: the attachment generation must be
-fresh. This avoids the possibility of a stale pageserver incorrectly
-thinking that an object written by a newer generation is stale, and deleting
-it.
-
-It is not strictly necessary that scrubbing be done by an attached
-pageserver: it could also be done externally. However, an external
-scrubber would still require the same validation procedure that
-a pageserver's deletion queue performs, before actually erasing
-objects.
-
-## Operational impact
-
-### Availability
-
-Coordination of generation numbers via the control plane introduces a dependency for certain
-operations:
-
-1. Starting new pageservers (or activating pageservers after a restart)
-2. Executing enqueued deletions
-3. Advertising updated `remote_consistent_lsn` to enable WAL trimming
-
-Item 1. would mean that some in-place restarts that previously would have resumed service even if the control plane were
-unavailable, will now not resume service to users until the control plane is available. We could
-avoid this by having a timeout on communication with the control plane, and after some timeout,
-resume service with the previous generation numbers (assuming this was persisted to disk). However,
-this is unlikely to be needed as the control plane is already an essential & highly available component. Also, having a node re-use an old generation number would complicate
-reasoning about the system, as it would break the invariant that a generation number uniquely identifies
-a tenant's attachment to a given pageserver _process_: it would merely identify the tenant's attachment
-to the pageserver _machine_ or its _on-disk-state_.
-
-Item 2. is a non-issue operationally: it's harmless to delay deletions, the only impact of objects pending deletion is
-the S3 capacity cost.
-
-Item 3. could be an issue if safekeepers are low on disk space and the control plane is unavailable for a long time. If this became an issue,
-we could adjust the safekeeper to delete segments from local disk sooner, as soon as they're uploaded to S3, rather than waiting for
-remote_consistent_lsn to advance.
-
-For a managed service, the general approach should be to make sure we are monitoring & respond fast enough
-that control plane outages are bounded in time.
-
-There is also the fact that control plane runs in a single region.
-The latency for distant regions is not a big concern for us because all request types added by this RFC are either infrequent or not in the way of the data path.
-However, we lose region isolation for the operations listed above.
-The ongoing work to split console and control will give us per-region control plane, and all operations in this RFC can be handled by these per-region control planes.
-With that in mind, we accept the trade-offs outlined in this paragraph.
-
-We will also implement an "escape hatch" config for generation numbers, where in a major disaster outage,
-we may manually run pageservers with a hand-selected generation number, so that we can bring them online
-independently of a control plane.
-
-### Rollout
-
-Although there is coupling between components, we may deploy most of the new data plane components
-independently of the control plane: initially they can just use a static generation number.
-
-#### Phase 1
-
-The pageserver is deployed with some special config to:
-
- Always act like everything is generation 1 and do not wait for a control plane issued generation on attach
- Skip the places in deletion and remote_consistent_lsn updates where we would call into control plane
-
-#### Phase 2
-
-The control plane changes are deployed: control plane will now track and increment generation numbers.
-
-#### Phase 3
-
-The pageserver is deployed with its control-plane-dependent changes enabled: it will now require
-the control plane to service re-attach requests on startup, and handle generation
-validation requests.
-
-### On-disk backward compatibility
-
-Backward compatibility with existing data is straightforward:
-
- When reading the index, we may assume that any layer whose metadata doesn't include
-  generations will have a path without generation suffix.
- When locating the index file on attachment, we may use the "fallback" listing path
-  and if there is only an index without generation suffix, that is the one we load.
-
-It is not necessary to re-write existing layers: even new index files will be able
-to represent generation-less layers.
-
-### On-disk forward compatibility
-
-We will do a two phase rollout, probably over multiple releases because we will naturally
-have some of the read-side code ready before the overall functionality is ready:
-
-1. Deploy pageservers which understand the new index format and generation suffixes
-   in keys, but do not write objects with generation numbers in the keys.
-2. Deploy pageservers that write objects with generation numbers in the keys.
-
-Old pageservers will be oblivious to generation numbers. That means that they can't
-read objects with generation numbers in the name. This is why the
-first step must deploy the ability to read, before the second step
-starts writing them.
-
-# Frequently Asked Questions
-
-## Why a generation _suffix_ rather than _prefix_?
-
-The choice is motivated by object listing, since one can list by prefix but not
-suffix.
-
-In [finding remote indices](#finding-the-remote-indices-for-timelines), we rely
-on being able to do a prefix listing for `<tenant>/<timeline>/index_part.json*`.
-That relies on the prefix listing.
-
-The converse case of using a generation prefix and listing by generation is
-not needed: one could imagine listing by generation while scrubbing (so that
-a particular generation's layers could be scrubbed), but this is not part
-of normal operations, and the [scrubber](#cleaning-up-orphan-objects-scrubbing) probably won't work that way anyway.
-
-## Wouldn't it be simpler to have a separate deletion queue per timeline?
-
-Functionally speaking, we could. That's how RemoteTimelineClient currently works,
-but this approach does not map well to a long-lived persistent queue with
-generation validation.
-
-Anything we do per-timeline generates tiny random I/O, on a pageserver with
-tens of thousands of timelines operating: to be ready for high scale, we should:
-
- A) Amortize costs where we can (e.g. a shared deletion queue)
- B) Expect to put tenants into a quiescent state while they're not
-  busy: i.e. we shouldn't keep a tenant alive to service its deletion queue.
-
-This was discussed in the [scope](#scope) part of the deletion queue section.
-
-# Appendix A: Examples of use in high availability/failover
-
-The generation numbers proposed in this RFC are adaptable to a variety of different
-failover scenarios and models. The sections below sketch how they would work in practice.
-
-### In-place restart of a pageserver
-
-"In-place" here means that the restart is done before any other element in the system
-has taken action in response to the node being down.
-
- After restart, the node issues a re-attach request to the control plane, and
-  receives new generation numbers for all its attached tenants.
- Tenants may be activated with the generation number in the re-attach response.
- If any of its attachments were in fact stale (i.e. had been reassigned to another
-  node while this node was offline), then
-  - the re-attach response will inform the pageserver about this
-    by _not_ incrementing the generation for that attachment.
-  - This will implicitly block deletions in the tenant, but as an optimization
-    the pageserver should also proactively stop doing S3 uploads when it notices this stale-generation state.
-  - The control plane is expected to eventually detach this tenant from the
-    pageserver.
-
-If the control plane does not include a tenant in the re-attach response,
-but there is still local state for the tenant in the filesystem, the pageserver
-deletes the local state in response and does not load/activate the tenant.
-See the [earlier section on pageserver startup](#pageserver-attachstartup-changes) for details.
-Control plane can use this mechanism to clean up a pageserver that has been
-down for so long that all its tenants were migrated away before it came back
-up again and asked for re-attach.
-
-### Failure of a pageserver
-
-In this context, read "failure" as the most ambiguous possible case, where
-a pageserver is unavailable to clients and control plane, but may still be executing and talking
-to S3.
-
-#### Case A: re-attachment to other nodes
-
-1. Let's say node 0 becomes unresponsive in a cluster of three nodes 0, 1, 2.
-2. Some external mechanism notices that the node is unavailable and initiates
-   movement of all tenants attached to that node to a different node according
-   to some distribution rule.
-   In this example, it would mean incrementing the generation
-   of all tenants that were attached to node 0, as each tenant's assigned pageserver changes.
-3. A tenant which is now attached to node 1 will _also_ still be attached to node
-   0, from the perspective of node 0. Node 0 will still be using its old generation,
-   node 1 will be using a newer generation.
-4. S3 writes will continue from nodes 0 and 1: there will be an index_part.json-00000001
-   _and_ an index_part.json-00000002. Objects written under the old suffix
-   after the new attachment was created do not matter from the rest of the system's
-   perspective: the endpoints are reading from the new attachment location. Objects
-   written by node 0 are just garbage that can be cleaned up at leisure. Node 0 will
-   not do any deletions because it can't synchronize with control plane, or if it could,
-   its deletion queue processing would get errors for the validation requests.
-
-#### Case B: direct node replacement with same node_id and drive
-
-This is the scenario we would experience if running pageservers in some dynamic
-VM/container environment that would auto-replace a given node_id when it became
-unresponsive, with the node's storage supplied by some network block device
-that is attached to the replacement VM/container.
-
-1. Let's say node 0 fails, and there may be some other peers but they aren't relevant.
-2. Some external mechanism notices that the node is unavailable, and creates
-   a "new node 0" (Node 0b) which is a physically separate server. The original node 0
-   (Node 0a) may still be running, because we do not assume the environment fences nodes.
-3. On startup, node 0b re-attaches and gets higher generation numbers for
-   all tenants.
-4. S3 writes continue from nodes 0a and 0b, but the writes do not collide due to different
-   generation in the suffix, and the writes from node 0a are not visible to the rest
-   of the system because endpoints are reading only from node 0b.
-
-# Appendix B: interoperability with other features
-
-## Sharded Keyspace
-
-The design in this RFC maps neatly to a sharded keyspace design where subsets of the key space
-for a tenant are assigned to different pageservers:
-
- the "unit of work" for attachments becomes something like a TenantShard rather than a Tenant
- TenantShards get generation numbers just as Tenants do.
- Write workload (ingest, compaction) for a tenant is spread out across pageservers via
-  TenantShards, but each TenantShard still has exactly one valid writer at a time.
-
-## Read replicas
-
-_This section is about a passive reader of S3 pageserver state, not a postgres
-read replica_
-
-For historical reads to LSNs below the remote persistent LSN, any node may act as a reader at any
-time: remote data is logically immutable data, and the use of deferred deletion in this RFC helps
-mitigate the fact that remote data is not _physically_ immutable (i.e. the actual data for a given
-page moves around as compaction happens).
-
-A read replica needs to be aware of generations in remote data in order to read the latest
-metadata (find the index_part.json with the latest suffix). It may either query this
-from the control plane, or find it with a ListObjectsv2 request.
-
-## Seamless migration
-
-To make tenant migration totally seamless, we will probably want to intentionally double-attach
-a tenant briefly, serving reads from the old node while waiting for the new node to be ready.
-
-This RFC enables that double-attachment: two nodes may be attached at the same time, with the migration destination
-having a higher generation number. The old node will be able to ingest and serve reads, but not
-do any deletes. The new node's attachment must also avoid deleting layers that the old node may
-still use. A new piece of state
-will be needed for this in the control plane's definition of an attachment.
-
-## Warm secondary locations
-
-To enable faster tenant movement after a pageserver is lost, we will probably want to spend some
-disk capacity on keeping standby locations populated with local disk data.
-
-There's no conflict between this RFC and that: implementing warm secondary locations on a per-tenant basis
-would be a separate change to the control plane to store standby location(s) for a tenant. Because
-the standbys do not write to S3, they do not need to be assigned generation numbers. When a tenant is
-re-attached to a standby location, that would increment the tenant attachment generation and this
-would work the same as any other attachment change, but with a warm cache.
-
-## Ephemeral node IDs
-
-This RFC intentionally avoids changing anything fundamental about how pageservers are identified
-and registered with the control plane, to avoid coupling the implementation of pageserver split
-brain protection with more fundamental changes in the management of the pageservers.
-
-Moving to ephemeral node IDs would provide an extra layer of
-resilience in the system, as it would prevent the control plane
-accidentally attaching to two physical nodes with the same
-generation, if somehow there were two physical nodes with
-the same node IDs (currently we rely on EC2 guarantees to
-eliminate this scenario). With ephemeral node IDs, there would be
-no possibility of that happening, no matter the behavior of
-underlying infrastructure.
-
-Nothing fundamental in the pageserver's handling of generations needs to change to handle ephemeral node IDs, since we hardly use the
-`node_id` anywhere. The `/re-attach` API would be extended
-to enable the pageserver to obtain its ephemeral ID, and provide
-some correlation identifier (e.g. EC2 instance ID), to help the
-control plane re-attach tenants to the same physical server that
-previously had them attached.
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -31,8 +31,6 @@ fn lsn_invalid() -> Lsn {
 #[serde_as]
 #[derive(Debug, Clone, Deserialize, Serialize)]
 pub struct SkTimelineInfo {
-    /// Term.
-    pub term: Option<u64>,
    /// Term of the last entry.
    pub last_log_term: Option<u64>,
    /// LSN of the last record.
@@ -60,6 +58,4 @@ pub struct SkTimelineInfo {
    /// A connection string to use for WAL receiving.
    #[serde(default)]
    pub safekeeper_connstr: Option<String>,
-    #[serde(default)]
-    pub http_connstr: Option<String>,
 }
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -38,7 +38,6 @@ url.workspace = true
 uuid.workspace = true

 pq_proto.workspace = true
-postgres_connection.workspace = true
 metrics.workspace = true
 workspace_hack.workspace = true

--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -58,8 +58,6 @@ pub mod serde_regex;

 pub mod pageserver_feedback;

-pub mod postgres_client;
-
 pub mod tracing_span_assert;

 pub mod rate_limit;
--- a/libs/utils/src/postgres_client.rs
+++ b/libs/utils/src/postgres_client.rs
@@ -1,37 +0,0 @@
-//! Postgres client connection code common to other crates (safekeeper and
-//! pageserver) which depends on tenant/timeline ids and thus not fitting into
-//! postgres_connection crate.
-
-use anyhow::Context;
-use postgres_connection::{parse_host_port, PgConnectionConfig};
-
-use crate::id::TenantTimelineId;
-
-/// Create client config for fetching WAL from safekeeper on particular timeline.
-/// listen_pg_addr_str is in form host:\[port\].
-pub fn wal_stream_connection_config(
-    TenantTimelineId {
-        tenant_id,
-        timeline_id,
-    }: TenantTimelineId,
-    listen_pg_addr_str: &str,
-    auth_token: Option<&str>,
-    availability_zone: Option<&str>,
-) -> anyhow::Result<PgConnectionConfig> {
-    let (host, port) =
-        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
-    let port = port.unwrap_or(5432);
-    let mut connstr = PgConnectionConfig::new_host_port(host, port)
-        .extend_options([
-            "-c".to_owned(),
-            format!("timeline_id={}", timeline_id),
-            format!("tenant_id={}", tenant_id),
-        ])
-        .set_password(auth_token.map(|s| s.to_owned()));
-
-    if let Some(availability_zone) = availability_zone {
-        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
-    }
-
-    Ok(connstr)
-}
--- a/libs/vm_monitor/README.md
+++ b/libs/vm_monitor/README.md
@@ -16,19 +16,3 @@ in the `neon-postgres` cgroup and set its `memory.{max,high}`.
 * See also: [`neondatabase/vm-monitor`](https://github.com/neondatabase/vm-monitor/),
 where initial development of the monitor happened. The repository is no longer
 maintained but the commit history may be useful for debugging.
-
-## Structure
-
-The `vm-monitor` is loosely comprised of a few systems. These are:
-* the server: this is just a simple `axum` server that accepts requests and
-upgrades them to websocket connections. The server only allows one connection at
-a time. This means that upon receiving a new connection, the server will terminate
-and old one if it exists.
-* the filecache: a struct that allows communication with the Postgres file cache.
-On startup, we connect to the filecache and hold on to the connection for the
-entire monitor lifetime.
-* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
-listening for `memory.high` events and setting its `memory.{high,max}` values.
-* the runner: the runner marries the filecache and cgroup watcher together,
-communicating with the agent throught the `Dispatcher`, and then calling filecache
-and cgroup watcher functions as needed to upscale and downscale
--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -634,7 +634,7 @@ impl CgroupWatcher {
            .context("failed to get memory subsystem")?
            .set_mem(cgroups_rs::memory::SetMemory {
                low: None,
-                high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
+                high: Some(MaxValue::Value(bytes.min(i64::MAX as u64) as i64)),
                min: None,
                max: None,
            })
@@ -654,10 +654,8 @@ impl CgroupWatcher {
            .set_mem(cgroups_rs::memory::SetMemory {
                min: None,
                low: None,
-                high: Some(MaxValue::Value(
-                    u64::min(limits.high, i64::MAX as u64) as i64
-                )),
-                max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
+                high: Some(MaxValue::Value(limits.high.min(i64::MAX as u64) as i64)),
+                max: Some(MaxValue::Value(limits.max.min(i64::MAX as u64) as i64)),
            })
            .context("failed to set memory limits")
    }
--- a/libs/vm_monitor/src/dispatcher.rs
+++ b/libs/vm_monitor/src/dispatcher.rs
@@ -1,7 +1,7 @@
 //! Managing the websocket connection and other signals in the monitor.
 //!
 //! Contains types that manage the interaction (not data interchange, see `protocol`)
-//! between agent and monitor, allowing us to to process and send messages in a
+//! between informant and monitor, allowing us to process and send messages in a
 //! straightforward way. The dispatcher also manages that signals that come from
 //! the cgroup (requesting upscale), and the signals that go to the cgroup
 //! (notifying it of upscale).
@@ -24,16 +24,16 @@ use crate::protocol::{
 /// The central handler for all communications in the monitor.
 ///
 /// The dispatcher has two purposes:
-/// 1. Manage the connection to the agent, sending and receiving messages.
+/// 1. Manage the connection to the informant, sending and receiving messages.
 /// 2. Communicate with the cgroup manager, notifying it when upscale is received,
-///    and sending a message to the agent when the cgroup manager requests
+///    and sending a message to the informant when the cgroup manager requests
 ///    upscale.
 #[derive(Debug)]
 pub struct Dispatcher {
-    /// We read agent messages of of `source`
+    /// We read informant messages off of `source`
    pub(crate) source: SplitStream<WebSocket>,

-    /// We send messages to the agent through `sink`
+    /// We send messages to the informant through `sink`
    sink: SplitSink<WebSocket, Message>,

    /// Used to notify the cgroup when we are upscaled.
@@ -43,7 +43,7 @@ pub struct Dispatcher {
    /// we send an `UpscaleRequst` to the agent.
    pub(crate) request_upscale_events: mpsc::Receiver<()>,

-    /// The protocol version we have agreed to use with the agent. This is negotiated
+    /// The protocol version we have agreed to use with the informant. This is negotiated
    /// during the creation of the dispatcher, and should be the highest shared protocol
    /// version.
    ///
@@ -56,9 +56,9 @@ pub struct Dispatcher {
 impl Dispatcher {
    /// Creates a new dispatcher using the passed-in connection.
    ///
-    /// Performs a negotiation with the agent to determine the highest protocol
+    /// Performs a negotiation with the informant to determine the highest protocol
    /// version that both support. This consists of two steps:
-    /// 1. Wait for the agent to sent the range of protocols it supports.
+    /// 1. Wait for the informant to send the range of protocols it supports.
    /// 2. Send a protocol version that works for us as well, or an error if there
    ///    is no compatible version.
    pub async fn new(
@@ -69,7 +69,7 @@ impl Dispatcher {
        let (mut sink, mut source) = stream.split();

        // Figure out the highest protocol version we both support
-        info!("waiting for agent to send protocol version range");
+        info!("waiting for informant to send protocol version range");
        let Some(message) = source.next().await else {
            bail!("websocket connection closed while performing protocol handshake")
        };
@@ -79,7 +79,7 @@ impl Dispatcher {
        let Message::Text(message_text) = message else {
            // All messages should be in text form, since we don't do any
            // pinging/ponging. See nhooyr/websocket's implementation and the
-            // agent for more info
+            // informant/agent for more info
            bail!("received non-text message during proocol handshake: {message:?}")
        };

@@ -88,30 +88,32 @@ impl Dispatcher {
            max: PROTOCOL_MAX_VERSION,
        };

-        let agent_range: ProtocolRange = serde_json::from_str(&message_text)
+        let informant_range: ProtocolRange = serde_json::from_str(&message_text)
            .context("failed to deserialize protocol version range")?;

-        info!(range = ?agent_range, "received protocol version range");
+        info!(range = ?informant_range, "received protocol version range");

-        let highest_shared_version = match monitor_range.highest_shared_version(&agent_range) {
+        let highest_shared_version = match monitor_range.highest_shared_version(&informant_range) {
            Ok(version) => {
                sink.send(Message::Text(
                    serde_json::to_string(&ProtocolResponse::Version(version)).unwrap(),
                ))
                .await
-                .context("failed to notify agent of negotiated protocol version")?;
+                .context("failed to notify informant of negotiated protocol version")?;
                version
            }
            Err(e) => {
                sink.send(Message::Text(
                    serde_json::to_string(&ProtocolResponse::Error(format!(
                        "Received protocol version range {} which does not overlap with {}",
-                        agent_range, monitor_range
+                        informant_range, monitor_range
                    )))
                    .unwrap(),
                ))
                .await
-                .context("failed to notify agent of no overlap between protocol version ranges")?;
+                .context(
+                    "failed to notify informant of no overlap between protocol version ranges",
+                )?;
                Err(e).context("error determining suitable protocol version range")?
            }
        };
@@ -135,7 +137,7 @@ impl Dispatcher {
            .context("failed to send resources and oneshot sender across channel")
    }

-    /// Send a message to the agent.
+    /// Send a message to the informant.
    ///
    /// Although this function is small, it has one major benefit: it is the only
    /// way to send data accross the connection, and you can only pass in a proper
--- a/libs/vm_monitor/src/filecache.rs
+++ b/libs/vm_monitor/src/filecache.rs
@@ -59,8 +59,8 @@ pub struct FileCacheConfig {
    spread_factor: f64,
 }

-impl FileCacheConfig {
-    pub fn default_in_memory() -> Self {
+impl Default for FileCacheConfig {
+    fn default() -> Self {
        Self {
            in_memory: true,
            // 75 %
@@ -71,19 +71,9 @@ impl FileCacheConfig {
            spread_factor: 0.1,
        }
    }
+}

-    pub fn default_on_disk() -> Self {
-        Self {
-            in_memory: false,
-            resource_multiplier: 0.75,
-            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
-            // memory, the kernel will just evict from its page cache, rather than e.g. killing
-            // everything.
-            min_remaining_after_cache: NonZeroU64::new(256 * MiB).unwrap(),
-            spread_factor: 0.1,
-        }
-    }
-
+impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
@@ -142,11 +132,11 @@ impl FileCacheConfig {

        // Conversions to ensure we don't overflow from floating-point ops
        let size_from_spread =
-            i64::max(0, (available as f64 / (1.0 + self.spread_factor)) as i64) as u64;
+            0_i64.max((available as f64 / (1.0 + self.spread_factor)) as i64) as u64;

        let size_from_normal = (total as f64 * self.resource_multiplier) as u64;

-        let byte_size = u64::min(size_from_spread, size_from_normal);
+        let byte_size = size_from_spread.min(size_from_normal);

        // The file cache operates in units of mebibytes, so the sizes we produce should
        // be rounded to a mebibyte. We round down to be conservative.
@@ -278,7 +268,7 @@ impl FileCacheState {
            .context("failed to extract max file cache size from query result")?;

        let max_mb = max_bytes / MiB;
-        let num_mb = u64::min(num_bytes, max_bytes) / MiB;
+        let num_mb = (num_bytes / MiB).min(max_mb); // never report more than the maximum

        let capped = if num_bytes > max_bytes {
            " (capped by maximum size)"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -39,16 +39,6 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

-    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
-    /// kernel's page cache), and therefore should not count against available memory.
-    //
-    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
-    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
-    // during the switch away from an in-memory file cache, we had to default to the previous
-    // behavior.
-    #[arg(long)]
-    pub file_cache_on_disk: bool,
-
    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
@@ -156,7 +146,7 @@ pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Res

 /// Handles incoming websocket connections.
 ///
-/// If we are already to connected to an agent, we kill that old connection
+/// If we are already connected to an informant, we kill that old connection
 /// and accept the new one.
 #[tracing::instrument(name = "/monitor", skip_all, fields(?args))]
 pub async fn ws_handler(
@@ -206,10 +196,10 @@ async fn start_monitor(
            return;
        }
    };
-    info!("connected to agent");
+    info!("connected to informant");

    match monitor.run().await {
        Ok(()) => info!("monitor was killed due to new connection"),
-        Err(e) => error!(error = ?e, "monitor terminated unexpectedly"),
+        Err(e) => error!(error = ?e, "monitor terminated by itself"),
    }
 }
--- a/libs/vm_monitor/src/protocol.rs
+++ b/libs/vm_monitor/src/protocol.rs
@@ -1,13 +1,13 @@
-//! Types representing protocols and actual agent-monitor messages.
+//! Types representing protocols and actual informant-monitor messages.
 //!
 //! The pervasive use of serde modifiers throughout this module is to ease
 //! serialization on the go side. Because go does not have enums (which model
 //! messages well), it is harder to model messages, and we accomodate that with
 //! serde.
 //!
-//! *Note*: the agent sends and receives messages in different ways.
+//! *Note*: the informant sends and receives messages in different ways.
 //!
-//! The agent serializes messages in the form and then sends them. The use
+//! The informant serializes messages in the form and then sends them. The use
 //! of `#[serde(tag = "type", content = "content")]` allows us to use `Type`
 //! to determine how to deserialize `Content`.
 //! ```ignore
@@ -25,9 +25,9 @@
 //!     Id   uint64
 //! }
 //! ```
-//! After reading the type field, the agent will decode the entire message
+//! After reading the type field, the informant will decode the entire message
 //! again, this time into the correct type using the embedded fields.
-//! Because the agent cannot just extract the json contained in a certain field
+//! Because the informant cannot just extract the json contained in a certain field
 //! (it initially deserializes to `map[string]interface{}`), we keep the fields
 //! at the top level, so the entire piece of json can be deserialized into a struct,
 //! such as a `DownscaleResult`, with the `Type` and `Id` fields ignored.
@@ -37,7 +37,7 @@ use std::cmp;

 use serde::{de::Error, Deserialize, Serialize};

-/// A Message we send to the agent.
+/// A Message we send to the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct OutboundMsg {
    #[serde(flatten)]
@@ -51,31 +51,31 @@ impl OutboundMsg {
    }
 }

-/// The different underlying message types we can send to the agent.
+/// The different underlying message types we can send to the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(tag = "type")]
 pub enum OutboundMsgKind {
-    /// Indicates that the agent sent an invalid message, i.e, we couldn't
+    /// Indicates that the informant sent an invalid message, i.e, we couldn't
    /// properly deserialize it.
    InvalidMessage { error: String },
    /// Indicates that we experienced an internal error while processing a message.
    /// For example, if a cgroup operation fails while trying to handle an upscale,
    /// we return `InternalError`.
    InternalError { error: String },
-    /// Returned to the agent once we have finished handling an upscale. If the
+    /// Returned to the informant once we have finished handling an upscale. If the
    /// handling was unsuccessful, an `InternalError` will get returned instead.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    UpscaleConfirmation {},
    /// Indicates to the monitor that we are urgently requesting resources.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    UpscaleRequest {},
-    /// Returned to the agent once we have finished attempting to downscale. If
+    /// Returned to the informant once we have finished attempting to downscale. If
    /// an error occured trying to do so, an `InternalError` will get returned instead.
    /// However, if we are simply unsuccessful (for example, do to needing the resources),
    /// that gets included in the `DownscaleResult`.
    DownscaleResult {
        // FIXME for the future (once the informant is deprecated)
-        // As of the time of writing, the agent/informant version of this struct is
+        // As of the time of writing, the informant/agent version of this struct is
        // called api.DownscaleResult. This struct has uppercase fields which are
        // serialized as such. Thus, we serialize using uppercase names so we don't
        // have to make a breaking change to the agent<->informant protocol. Once
@@ -88,12 +88,12 @@ pub enum OutboundMsgKind {
        status: String,
    },
    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
-    /// agent.
+    /// informant.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    HealthCheck {},
 }

-/// A message received form the agent.
+/// A message received from the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct InboundMsg {
    #[serde(flatten)]
@@ -101,7 +101,7 @@ pub struct InboundMsg {
    pub(crate) id: usize,
 }

-/// The different underlying message types we can receive from the agent.
+/// The different underlying message types we can receive from the informant.
 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(tag = "type", content = "content")]
 pub enum InboundMsgKind {
@@ -120,14 +120,14 @@ pub enum InboundMsgKind {
    /// when done.
    DownscaleRequest { target: Resources },
    /// Part of the bidirectional heartbeat. The heartbeat is initiated by the
-    /// agent.
+    /// informant.
    /// *Note*: this is a struct variant because of the way go serializes struct{}
    HealthCheck {},
 }

 /// Represents the resources granted to a VM.
 #[derive(Serialize, Deserialize, Debug, Clone, Copy)]
-// Renamed because the agent has multiple resources types:
+// Renamed because the agent/informant has multiple resources types:
 // `Resources` (milliCPU/memory slots)
 // `Allocation` (vCPU/bytes) <- what we correspond to
 #[serde(rename(serialize = "Allocation", deserialize = "Allocation"))]
@@ -151,7 +151,7 @@ pub const PROTOCOL_MAX_VERSION: ProtocolVersion = ProtocolVersion::V1_0;
 pub struct ProtocolVersion(u8);

 impl ProtocolVersion {
-    /// Represents v1.0 of the agent<-> monitor protocol - the initial version
+    /// Represents v1.0 of the informant<-> monitor protocol - the initial version
    ///
    /// Currently the latest version.
    const V1_0: ProtocolVersion = ProtocolVersion(1);
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -1,4 +1,4 @@
-//! Exposes the `Runner`, which handles messages received from agent and
+//! Exposes the `Runner`, which handles messages received from informant and
 //! sends upscale requests.
 //!
 //! This is the "Monitor" part of the monitor binary and is the main entrypoint for
@@ -21,8 +21,8 @@ use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
 use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args, MiB};

-/// Central struct that interacts with agent, dispatcher, and cgroup to handle
-/// signals from the agent.
+/// Central struct that interacts with informant, dispatcher, and cgroup to handle
+/// signals from the informant.
 #[derive(Debug)]
 pub struct Runner {
    config: Config,
@@ -110,10 +110,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
-            let config = match args.file_cache_on_disk {
-                true => FileCacheConfig::default_on_disk(),
-                false => FileCacheConfig::default_in_memory(),
-            };
+            let config: FileCacheConfig = Default::default();
+            if !config.in_memory {
+                panic!("file cache not in-memory unimplemented")
+            }

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await
@@ -140,10 +140,7 @@ impl Runner {
            if actual_size != new_size {
                info!("file cache size actually got set to {actual_size}")
            }
-            // Mark the resources given to the file cache as reserved, but only if it's in memory.
-            if !args.file_cache_on_disk {
-                file_cache_reserved_bytes = actual_size;
-            }
+            file_cache_reserved_bytes = actual_size;

            state.filecache = Some(file_cache);
        }
@@ -230,17 +227,18 @@ impl Runner {
        let mut status = vec![];
        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented")
+            }
+
            let actual_usage = file_cache
                .set_file_cache_size(expected_file_cache_mem_usage)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
-            }
+            file_cache_mem_usage = actual_usage;
            let message = format!(
-                "set file cache size to {} MiB (in memory = {})",
-                bytes_to_mebibytes(actual_usage),
-                file_cache.config.in_memory,
+                "set file cache size to {} MiB",
+                bytes_to_mebibytes(actual_usage)
            );
            info!("downscale: {message}");
            status.push(message);
@@ -291,6 +289,10 @@ impl Runner {
        // Get the file cache's expected contribution to the memory usage
        let mut file_cache_mem_usage = 0;
        if let Some(file_cache) = &mut self.filecache {
+            if !file_cache.config.in_memory {
+                panic!("file cache not in-memory unimplemented");
+            }
+
            let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
            info!(
                target = bytes_to_mebibytes(expected_usage),
@@ -302,9 +304,6 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
-            if file_cache.config.in_memory {
-                file_cache_mem_usage = actual_usage;
-            }

            if actual_usage != expected_usage {
                warn!(
@@ -313,6 +312,7 @@ impl Runner {
                    bytes_to_mebibytes(actual_usage)
                )
            }
+            file_cache_mem_usage = actual_usage;
        }

        if let Some(cgroup) = &self.cgroup {
@@ -371,7 +371,7 @@ impl Runner {
                Ok(None)
            }
            InboundMsgKind::InternalError { error } => {
-                warn!(error, id, "agent experienced an internal error");
+                warn!(error, id, "informant experienced an internal error");
                Ok(None)
            }
            InboundMsgKind::HealthCheck {} => {
@@ -405,12 +405,10 @@ impl Runner {
                        .await
                        .context("failed to send message")?;
                }
-                // there is a message from the agent
+                // there is a message from the informant
                msg = self.dispatcher.source.next() => {
                    if let Some(msg) = msg {
-                        // Don't use 'message' as a key as the string also uses
-                        // that for its key
-                        info!(?msg, "received message");
+                        info!(?msg, "received message"); // `message` is tracing's own key
                        match msg {
                            Ok(msg) => {
                                let message: InboundMsg = match msg {
@@ -419,10 +417,8 @@ impl Runner {
                                    }
                                    other => {
                                        warn!(
-                                            // Don't use 'message' as a key as the
-                                            // string also uses that for its key
-                                            msg = ?other,
-                                            "agent should only send text messages but received different type"
+                                            msg = ?other, // `message` is tracing's own key
+                                            "informant should only send text messages but received different type"
                                        );
                                        continue
                                    },
@@ -433,7 +429,7 @@ impl Runner {
                                    Ok(None) => continue,
                                    Err(e) => {
                                        let error = e.to_string();
-                                        warn!(?error, "error handling message");
+                                        warn!(%error, "error handling message");
                                        OutboundMsg::new(
                                            OutboundMsgKind::InternalError {
                                                error
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -97,7 +97,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
 async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path)?);
-    let summary_blk = file.read_blk(0).await?;
+    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -48,7 +48,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path)?);
-    let summary_blk = file.read_blk(0).await?;
+    let summary_blk = file.read_blk(0)?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -75,12 +75,14 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+    sync::{
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+        RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError,
+    },
 };

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -160,7 +162,7 @@ struct Version {
 }

 struct Slot {
-    inner: tokio::sync::RwLock<SlotInner>,
+    inner: RwLock<SlotInner>,
    usage_count: AtomicU8,
 }

@@ -201,11 +203,6 @@ impl Slot {
            Err(usage_count) => usage_count,
        }
    }
-
-    /// Sets the usage count to a specific value.
-    fn set_usage_count(&self, count: u8) {
-        self.usage_count.store(count, Ordering::Relaxed);
-    }
 }

 pub struct PageCache {
@@ -218,9 +215,9 @@ pub struct PageCache {
    ///
    /// If you add support for caching different kinds of objects, each object kind
    /// can have a separate mapping map, next to this field.
-    materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
+    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

-    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
+    immutable_page_map: RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -236,7 +233,7 @@ pub struct PageCache {
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];
@@ -263,10 +260,9 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 /// to initialize.
 ///
 pub struct PageWriteGuard<'i> {
-    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+    inner: RwLockWriteGuard<'i, SlotInner>,

    // Are the page contents currently valid?
-    // Used to mark pages as invalid that are assigned but not yet filled with data.
    valid: bool,
 }

@@ -341,7 +337,7 @@ impl PageCache {
    /// The 'lsn' is an upper bound, this will return the latest version of
    /// the given block, but not newer than 'lsn'. Returns the actual LSN of the
    /// returned page.
-    pub async fn lookup_materialized_page(
+    pub fn lookup_materialized_page(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -361,7 +357,7 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+        if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -388,7 +384,7 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
-    pub async fn memorize_materialized_page(
+    pub fn memorize_materialized_page(
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -405,7 +401,7 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key).await? {
+        match self.lock_for_write(&cache_key)? {
            WriteBufResult::Found(write_guard) => {
                // We already had it in cache. Another thread must've put it there
                // concurrently. Check that it had the same contents that we
@@ -423,14 +419,31 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with immutable file pages.

-    pub async fn read_immutable_buf(
-        &self,
-        file_id: FileId,
-        blkno: u32,
-    ) -> anyhow::Result<ReadBufResult> {
+    pub fn read_immutable_buf(&self, file_id: FileId, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };

-        self.lock_for_read(&mut cache_key).await
+        self.lock_for_read(&mut cache_key)
+    }
+
+    /// Immediately drop all buffers belonging to the given immutable file.
+    ///
+    /// Write-locks each slot in turn (panicking if a lock is poisoned); for a
+    /// slot keyed by an `ImmutableFilePage` of `drop_file_id` it calls
+    /// `remove_mapping` and clears the slot's key. Other slots are untouched.
+    pub fn drop_buffers_for_immutable(&self, drop_file_id: FileId) {
+        for slot in self.slots.iter() {
+            let mut inner = slot.inner.write().unwrap();
+            let Some(key) = &inner.key else { continue };
+            let is_target = matches!(
+                key,
+                CacheKey::ImmutableFilePage { file_id, .. } if *file_id == drop_file_id
+            );
+            if is_target {
+                // remove mapping for old buffer
+                self.remove_mapping(key);
+                inner.key = None;
+            }
+        }
    }

    //
@@ -450,14 +463,14 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
-        if let Some(slot_idx) = self.search_mapping(cache_key).await {
+        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.read().await;
+            let inner = slot.inner.read().unwrap();
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
                return Some(PageReadGuard(inner));
@@ -498,7 +511,7 @@ impl PageCache {
    /// }
    /// ```
    ///
-    async fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
+    fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -513,7 +526,7 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key) {
                if is_first_iteration {
                    hit.inc();
                }
@@ -543,7 +556,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
+            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
                inner,
@@ -556,13 +569,13 @@ impl PageCache {
    /// found, returns None.
    ///
    /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
+    fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
            // that it's still what we expected (because we don't released the mapping
            // lock already, another thread could have evicted the page)
            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().await;
+            let inner = slot.inner.write().unwrap();
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
                return Some(PageWriteGuard { inner, valid: true });
@@ -575,10 +588,10 @@ impl PageCache {
    ///
    /// Similar to lock_for_read(), but the returned buffer is write-locked and
    /// may be modified by the caller even if it's already found in the cache.
-    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
+    fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
        loop {
            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
+            if let Some(write_guard) = self.try_lock_for_write(cache_key) {
                return Ok(WriteBufResult::Found(write_guard));
            }

@@ -604,7 +617,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
+            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
                inner,
@@ -627,7 +640,7 @@ impl PageCache {
    /// returns.  The caller is responsible for re-checking that the slot still
    /// contains the page with the same key before using it.
    ///
-    async fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
+    fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
        match cache_key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
                let map = self.materialized_page_map.read().unwrap();
@@ -759,7 +772,7 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -771,7 +784,10 @@ impl PageCache {
            if slot.dec_usage_count() == 0 {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
-                    Err(_err) => {
+                    Err(TryLockError::Poisoned(err)) => {
+                        anyhow::bail!("buffer lock was poisoned: {err:?}")
+                    }
+                    Err(TryLockError::WouldBlock) => {
                        // If we have looped through the whole buffer pool 10 times
                        // and still haven't found a victim buffer, something's wrong.
                        // Maybe all the buffers were in locked. That could happen in
@@ -800,8 +816,6 @@ impl PageCache {
    fn new(num_pages: usize) -> Self {
        assert!(num_pages > 0, "page cache size must be > 0");

-        // We use Box::leak here and into_boxed_slice to avoid leaking uninitialized
-        // memory that Vec's might contain.
        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
@@ -815,7 +829,7 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+                    inner: RwLock::new(SlotInner { key: None, buf }),
                    usage_count: AtomicU8::new(0),
                }
            })
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -114,6 +114,7 @@ pub mod block_io;
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
+pub mod manifest;
 mod span;

 pub mod metadata;
@@ -697,7 +698,10 @@ impl Tenant {
            debug!("successfully downloaded index part for timeline {timeline_id}");
            match index_part {
                MaybeDeletedIndexPart::IndexPart(index_part) => {
-                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
+                    timeline_ancestors.insert(
+                        timeline_id,
+                        index_part.parse_metadata().context("parse_metadata")?,
+                    );
                    remote_index_and_client.insert(timeline_id, (index_part, client));
                }
                MaybeDeletedIndexPart::Deleted(index_part) => {
@@ -748,7 +752,7 @@ impl Tenant {
            DeleteTimelineFlow::resume_deletion(
                Arc::clone(self),
                timeline_id,
-                &index_part.metadata,
+                &index_part.parse_metadata().context("parse_metadata")?,
                Some(remote_timeline_client),
                None,
            )
@@ -1310,7 +1314,10 @@ impl Tenant {
                        }
                    };

-                    let remote_metadata = index_part.metadata.clone();
+                    let remote_metadata = index_part
+                        .parse_metadata()
+                        .context("parse_metadata")
+                        .map_err(LoadLocalTimelineError::Load)?;
                    (
                        Some(RemoteStartupData {
                            index_part,
@@ -4100,7 +4107,7 @@ mod tests {
        let mut found_error_message = false;
        let mut err_source = err.source();
        while let Some(source) = err_source {
-            if source.to_string().contains("metadata checksum mismatch") {
+            if source.to_string() == "metadata checksum mismatch" {
                found_error_message = true;
                break;
            }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -33,7 +33,7 @@ impl<'a> BlockCursor<'a> {
        let mut blknum = (offset / PAGE_SZ as u64) as u32;
        let mut off = (offset % PAGE_SZ as u64) as usize;

-        let mut buf = self.read_blk(blknum).await?;
+        let mut buf = self.read_blk(blknum)?;

        // peek at the first byte, to determine if it's a 1- or 4-byte length
        let first_len_byte = buf[off];
@@ -49,7 +49,7 @@ impl<'a> BlockCursor<'a> {
                // it is split across two pages
                len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum)?;
                len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
                off = 4 - thislen;
            } else {
@@ -70,7 +70,7 @@ impl<'a> BlockCursor<'a> {
            if page_remain == 0 {
                // continue on next page
                blknum += 1;
-                buf = self.read_blk(blknum).await?;
+                buf = self.read_blk(blknum)?;
                off = 0;
                page_remain = PAGE_SZ;
            }
@@ -91,3 +91,71 @@ pub trait BlobWriter {
    /// which can be used to retrieve the data later.
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error>;
 }
+
+///
+/// An implementation of BlobWriter to write blobs to anything that
+/// implements std::io::Write.
+///
+pub struct WriteBlobWriter<W>
+where
+    W: std::io::Write,
+{
+    /// Destination that receives the length headers and the blob payloads.
+    inner: W,
+    /// Running write position: starts at the `start_offset` passed to `new` and
+    /// grows by the number of bytes each `write_blob` call writes.
+    offset: u64,
+}
+
+impl<W> WriteBlobWriter<W>
+where
+    W: std::io::Write,
+{
+    /// Wraps `inner`; `start_offset` is the offset that will be reported for
+    /// the first blob written.
+    pub fn new(inner: W, start_offset: u64) -> Self {
+        let offset = start_offset;
+        Self { inner, offset }
+    }
+
+    /// Returns the current write offset, i.e. the start offset plus all bytes
+    /// written through this writer so far.
+    pub fn size(&self) -> u64 {
+        self.offset
+    }
+
+    /// Consumes the writer and hands back the underlying Write object.
+    ///
+    /// NOTE: the write offset is tracked by WriteBlobWriter itself. Bytes
+    /// written to the inner object behind its back are not counted, which
+    /// leaves the tracked 'offset' out of sync. So don't do that.
+    pub fn into_inner(self) -> W {
+        let Self { inner, .. } = self;
+        inner
+    }
+}
+
+impl<W> BlobWriter for WriteBlobWriter<W>
+where
+    W: std::io::Write,
+{
+    /// Writes a length header followed by `srcbuf`, and returns the offset at
+    /// which the header starts.
+    ///
+    /// Blobs shorter than 128 bytes get a 1-byte header holding the length.
+    /// Longer blobs get a 4-byte big-endian length with the top bit set, so
+    /// anything above 0x7fff_ffff bytes is rejected with an error.
+    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
+        let blob_start = self.offset;
+        let blob_len = srcbuf.len();
+
+        let header_len: u64 = if blob_len < 128 {
+            // Short blob: the length fits in 7 bits, top bit stays clear.
+            self.inner.write_all(&[blob_len as u8])?;
+            1
+        } else if blob_len > 0x7fff_ffff {
+            return Err(Error::new(
+                ErrorKind::Other,
+                format!("blob too large ({} bytes)", srcbuf.len()),
+            ));
+        } else {
+            // Long blob: the top bit of the first byte marks the 4-byte form.
+            let mut header = (blob_len as u32).to_be_bytes();
+            header[0] |= 0x80;
+            self.inner.write_all(&header)?;
+            4
+        };
+        self.offset += header_len;
+
+        self.inner.write_all(srcbuf)?;
+        self.offset += blob_len as u64;
+        Ok(blob_start)
+    }
+}
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -39,7 +39,7 @@ pub enum BlockLease<'a> {
    PageReadGuard(PageReadGuard<'static>),
    EphemeralFileMutableTail(&'a [u8; PAGE_SZ]),
    #[cfg(test)]
-    Arc(std::sync::Arc<[u8; PAGE_SZ]>),
+    Rc(std::rc::Rc<[u8; PAGE_SZ]>),
 }

 impl From<PageReadGuard<'static>> for BlockLease<'static> {
@@ -49,9 +49,9 @@ impl From<PageReadGuard<'static>> for BlockLease<'static> {
 }

 #[cfg(test)]
-impl<'a> From<std::sync::Arc<[u8; PAGE_SZ]>> for BlockLease<'a> {
-    fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self {
-        BlockLease::Arc(value)
+impl<'a> From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease<'a> {
+    fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
+        BlockLease::Rc(value)
    }
 }

@@ -63,7 +63,7 @@ impl<'a> Deref for BlockLease<'a> {
            BlockLease::PageReadGuard(v) => v.deref(),
            BlockLease::EphemeralFileMutableTail(v) => v,
            #[cfg(test)]
-            BlockLease::Arc(v) => v.deref(),
+            BlockLease::Rc(v) => v.deref(),
        }
    }
 }
@@ -83,13 +83,13 @@ pub(crate) enum BlockReaderRef<'a> {

 impl<'a> BlockReaderRef<'a> {
    #[inline(always)]
-    async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+    fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
        use BlockReaderRef::*;
        match self {
-            FileBlockReaderVirtual(r) => r.read_blk(blknum).await,
-            FileBlockReaderFile(r) => r.read_blk(blknum).await,
-            EphemeralFile(r) => r.read_blk(blknum).await,
-            Adapter(r) => r.read_blk(blknum).await,
+            FileBlockReaderVirtual(r) => r.read_blk(blknum),
+            FileBlockReaderFile(r) => r.read_blk(blknum),
+            EphemeralFile(r) => r.read_blk(blknum),
+            Adapter(r) => r.read_blk(blknum),
            #[cfg(test)]
            TestDisk(r) => r.read_blk(blknum),
        }
@@ -134,8 +134,8 @@ impl<'a> BlockCursor<'a> {
    /// access to the contents of the page. (For the page cache, the
    /// lease object represents a lock on the buffer.)
    #[inline(always)]
-    pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.reader.read_blk(blknum).await
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.reader.read_blk(blknum)
    }
 }

@@ -150,58 +150,51 @@ pub struct FileBlockReader<F> {
    file_id: page_cache::FileId,
 }

-impl<F> FileBlockReader<F> {
+impl<F> FileBlockReader<F>
+where
+    F: FileExt,
+{
    pub fn new(file: F) -> Self {
        let file_id = page_cache::next_file_id();

        FileBlockReader { file_id, file }
    }
-}

-macro_rules! impls {
-    (FileBlockReader<$ty:ty>) => {
-        impl FileBlockReader<$ty> {
-            /// Read a page from the underlying file into given buffer.
-            fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
-                assert!(buf.len() == PAGE_SZ);
-                self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
-            }
-            /// Read a block.
-            ///
-            /// Returns a "lease" object that can be used to
-            /// access to the contents of the page. (For the page cache, the
-            /// lease object represents a lock on the buffer.)
-            pub async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-                let cache = page_cache::get();
-                loop {
-                    match cache
-                        .read_immutable_buf(self.file_id, blknum)
-                        .await
-                        .map_err(|e| {
-                            std::io::Error::new(
-                                std::io::ErrorKind::Other,
-                                format!("Failed to read immutable buf: {e:#}"),
-                            )
-                        })? {
-                        ReadBufResult::Found(guard) => break Ok(guard.into()),
-                        ReadBufResult::NotFound(mut write_guard) => {
-                            // Read the page from disk into the buffer
-                            self.fill_buffer(write_guard.deref_mut(), blknum)?;
-                            write_guard.mark_valid();
+    /// Read a page from the underlying file into given buffer.
+    fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
+        assert!(buf.len() == PAGE_SZ);
+        self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
+    }
+    /// Read a block.
+    ///
+    /// Returns a "lease" object that can be used to
+    /// access to the contents of the page. (For the page cache, the
+    /// lease object represents a lock on the buffer.)
+    pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        let cache = page_cache::get();
+        loop {
+            match cache
+                .read_immutable_buf(self.file_id, blknum)
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        format!("Failed to read immutable buf: {e:#}"),
+                    )
+                })? {
+                ReadBufResult::Found(guard) => break Ok(guard.into()),
+                ReadBufResult::NotFound(mut write_guard) => {
+                    // Read the page from disk into the buffer
+                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
+                    write_guard.mark_valid();

-                            // Swap for read lock
-                            continue;
-                        }
-                    };
+                    // Swap for read lock
+                    continue;
                }
-            }
+            };
        }
-    };
+    }
 }

-impls!(FileBlockReader<File>);
-impls!(FileBlockReader<VirtualFile>);
-
 impl BlockReader for FileBlockReader<File> {
    fn block_cursor(&self) -> BlockCursor<'_> {
        BlockCursor::new(BlockReaderRef::FileBlockReaderFile(self))
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -262,7 +262,7 @@ where
        let block_cursor = self.reader.block_cursor();
        while let Some((node_blknum, opt_iter)) = stack.pop() {
            // Locate the node.
-            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum).await?;
+            let node_buf = block_cursor.read_blk(self.start_blk + node_blknum)?;

            let node = OnDiskNode::deparse(node_buf.as_ref())?;
            let prefix_len = node.prefix_len as usize;
@@ -357,7 +357,7 @@ where
        let block_cursor = self.reader.block_cursor();

        while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() {
-            let blk = block_cursor.read_blk(self.start_blk + blknum).await?;
+            let blk = block_cursor.read_blk(self.start_blk + blknum)?;
            let buf: &[u8] = blk.as_ref();
            let node = OnDiskNode::<L>::deparse(buf)?;

@@ -704,7 +704,7 @@ pub(crate) mod tests {
        pub(crate) fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
            let mut buf = [0u8; PAGE_SZ];
            buf.copy_from_slice(&self.blocks[blknum as usize]);
-            Ok(std::sync::Arc::new(buf).into())
+            Ok(std::rc::Rc::new(buf).into())
        }
    }
    impl BlockReader for TestDisk {
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -9,6 +9,7 @@ use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
+use std::os::unix::prelude::FileExt;
 use std::path::PathBuf;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
@@ -60,14 +61,13 @@ impl EphemeralFile {
        self.len
    }

-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
            loop {
                match cache
                    .read_immutable_buf(self.page_cache_file_id, blknum)
-                    .await
                    .map_err(|e| {
                        std::io::Error::new(
                            std::io::ErrorKind::Other,
@@ -135,13 +135,10 @@ impl EphemeralFile {
                                // Pre-warm the page cache with what we just wrote.
                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
                                let cache = page_cache::get();
-                                match cache
-                                    .read_immutable_buf(
-                                        self.ephemeral_file.page_cache_file_id,
-                                        self.blknum,
-                                    )
-                                    .await
-                                {
+                                match cache.read_immutable_buf(
+                                    self.ephemeral_file.page_cache_file_id,
+                                    self.blknum,
+                                ) {
                                    Ok(page_cache::ReadBufResult::Found(_guard)) => {
                                        // This function takes &mut self, so, it shouldn't be possible to reach this point.
                                        unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
@@ -224,8 +221,9 @@ pub fn is_ephemeral_file(filename: &str) -> bool {

 impl Drop for EphemeralFile {
    fn drop(&mut self) {
-        // There might still be pages in the [`crate::page_cache`] for this file.
-        // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
+        // drop all pages from page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_immutable(self.page_cache_file_id);

        // unlink the file
        let res = std::fs::remove_file(&self.file.path);
--- a/pageserver/src/tenant/manifest.rs
+++ b/pageserver/src/tenant/manifest.rs
@@ -0,0 +1,325 @@
+//! This module contains the encoding and decoding of the local manifest file.
+//!
+//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
+//! records the state of the storage engine. It contains a snapshot of the
+//! state and all operations following that snapshot. The file begins with a
+//! header recording the MANIFEST version number. After that, it contains a snapshot.
+//! The snapshot is followed by a list of operations. Each operation is a list
+//! of records. Each record is either an addition or a removal of a layer.
+//!
+//! With MANIFEST, we can:
+//!
+//! 1. recover state quickly by reading the file, potentially boosting the
+//!    startup speed.
+//! 2. ensure all operations are atomic and avoid corruption, solving issues
+//!    like redundant image layers and preparing us for future compaction
+//!    strategies.
+//!
+//! There is also a format for storing all layer files on S3, called
+//! `index_part.json`. Compared with index_part, MANIFEST is a WAL which
+//! records all operations as logs, and therefore we can easily replay the
+//! operations when recovering from a crash, while ensuring those operations
+//! are atomic upon restart.
+//!
+//! Currently, this is not used in the system. Future refactors will ensure
+//! the storage state will be recorded in this file, and the system can be
+//! recovered from this file. This is tracked in
+//! <https://github.com/neondatabase/neon/issues/4418>
+
+use std::io::{self, Read, Write};
+
+use crate::virtual_file::VirtualFile;
+use anyhow::Result;
+use bytes::{Buf, BufMut, Bytes, BytesMut};
+use crc32c::crc32c;
+use serde::{Deserialize, Serialize};
+use tracing::log::warn;
+use utils::lsn::Lsn;
+
+use super::storage_layer::PersistentLayerDesc;
+
+/// Handle to an on-disk manifest file.
+///
+/// Obtained from [`Manifest::init`] (new file) or [`Manifest::load`] (existing
+/// file); new records are appended with [`Manifest::append_operation`].
+pub struct Manifest {
+    /// The file that headers and records are read from and appended to.
+    file: VirtualFile,
+}
+
+/// A full listing of layers, carried as the payload of [`Operation::Snapshot`].
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct Snapshot {
+    /// Descriptors of the layers that make up this snapshot.
+    pub layers: Vec<PersistentLayerDesc>,
+}
+
+/// A single layer-level change inside an [`Operation::Operation`].
+///
+/// No serde attributes are set, so serde uses its default (externally tagged)
+/// enum representation and a record looks like `{ "AddLayer": { ... } }`.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Record {
+    /// The given layer was added.
+    AddLayer(PersistentLayerDesc),
+    /// The given layer was removed.
+    RemoveLayer(PersistentLayerDesc),
+}
+
+/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
+const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
+/// Format version written by `Manifest::init`; `Manifest::load` rejects any
+/// other value with `ManifestLoadError::UnsupportedVersion`.
+const MANIFEST_VERSION: u64 = 1;
+
+/// Fixed-size header stored at the very beginning of the manifest file.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub struct ManifestHeader {
+    /// Identifies the file as a manifest; written as `MANIFEST_MAGIC_NUMBER`.
+    magic_number: u64,
+    /// Format version of the file; written as `MANIFEST_VERSION`.
+    version: u64,
+}
+
+/// Encoded size of a `ManifestHeader` in bytes (two `u64` fields).
+const MANIFEST_HEADER_LEN: usize = 16;
+
+impl ManifestHeader {
+    /// Serializes the header into `MANIFEST_HEADER_LEN` bytes: the magic
+    /// number first, then the version.
+    fn encode(&self) -> BytesMut {
+        let Self {
+            magic_number,
+            version,
+        } = self;
+        let mut out = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
+        out.put_u64(*magic_number);
+        out.put_u64(*version);
+        out
+    }
+
+    /// Parses a header from a slice of exactly `MANIFEST_HEADER_LEN` bytes.
+    ///
+    /// Panics if the slice has any other length.
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
+        // Read in the same order that `encode` writes.
+        let magic_number = buf.get_u64();
+        let version = buf.get_u64();
+        Self {
+            magic_number,
+            version,
+        }
+    }
+}
+
+/// One entry of the manifest log.
+///
+/// `Manifest::append_operation` serializes each entry to JSON and appends it
+/// to the file; `Manifest::load` returns the entries in file order.
+#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
+pub enum Operation {
+    /// A snapshot of the current state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk for this snapshot.
+    Snapshot(Snapshot, Lsn),
+    /// An atomic operation that changes the state.
+    ///
+    /// Lsn field represents the LSN that is persisted to disk after the operation is done.
+    /// This will only change when new L0 is flushed to the disk.
+    Operation(Vec<Record>, Lsn),
+}
+
+/// Prefix written in front of every record payload by `Manifest::append_data`.
+struct RecordHeader {
+    /// Length of the payload that follows the header, in bytes.
+    size: u32,
+    /// crc32c of the payload; `Manifest::load` recomputes and compares it.
+    checksum: u32,
+}
+
+/// Encoded size of a `RecordHeader` in bytes (two `u32` fields).
+const RECORD_HEADER_LEN: usize = 8;
+
+impl RecordHeader {
+    /// Serializes the header into `RECORD_HEADER_LEN` bytes: the payload size
+    /// first, then the checksum.
+    fn encode(&self) -> BytesMut {
+        let Self { size, checksum } = self;
+        let mut out = BytesMut::with_capacity(RECORD_HEADER_LEN);
+        out.put_u32(*size);
+        out.put_u32(*checksum);
+        out
+    }
+
+    /// Parses a header from a slice of exactly `RECORD_HEADER_LEN` bytes.
+    ///
+    /// Panics if the slice has any other length.
+    fn decode(mut buf: &[u8]) -> Self {
+        assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
+        // Read in the same order that `encode` writes.
+        let size = buf.get_u32();
+        let checksum = buf.get_u32();
+        Self { size, checksum }
+    }
+}
+
+/// Errors that make `Manifest::load` give up on the file entirely.
+#[derive(Debug, thiserror::Error)]
+pub enum ManifestLoadError {
+    /// The manifest header could not be accepted, e.g. the file is too short
+    /// to contain one.
+    #[error("manifest header is corrupted")]
+    CorruptedManifestHeader,
+    /// The header carries a version other than the supported one
+    /// (fields: version found, version supported).
+    #[error("unsupported manifest version: got {0}, expected {1}")]
+    UnsupportedVersion(u64, u64),
+    /// A record passed its checksum but could not be deserialized from JSON.
+    #[error("error when decoding record: {0}")]
+    DecodeRecord(serde_json::Error),
+    /// Reading the manifest file failed.
+    #[error("I/O error: {0}")]
+    Io(io::Error),
+}
+
+/// Flag returned alongside the operations by `Manifest::load`.
+///
+/// It is `true` when reading stopped early because the tail of the file was
+/// truncated or failed its checksum. The wrapped field is private, so callers
+/// outside this module read it through [`ManifestPartiallyCorrupted::is_corrupted`].
+#[must_use = "Should check if the manifest is partially corrupted"]
+pub struct ManifestPartiallyCorrupted(bool);
+
+impl ManifestPartiallyCorrupted {
+    /// Returns `true` if the manifest was found to be partially corrupted.
+    pub fn is_corrupted(&self) -> bool {
+        self.0
+    }
+}
+
+impl Manifest {
+    /// Create a new manifest by writing the manifest header and a snapshot record to the given file.
+    ///
+    /// The header is written first; the snapshot record that follows is
+    /// fsynced before this function returns.
+    pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
+        let mut manifest = Self { file };
+        manifest.append_manifest_header(ManifestHeader {
+            magic_number: MANIFEST_MAGIC_NUMBER,
+            version: MANIFEST_VERSION,
+        })?;
+        manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
+        Ok(manifest)
+    }
+
+    /// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
+    /// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
+    /// backup the current one.
+    ///
+    /// # Errors
+    ///
+    /// * `ManifestLoadError::Io` if the file cannot be read.
+    /// * `ManifestLoadError::CorruptedManifestHeader` if the file is too short for a header
+    ///   or does not start with the manifest magic number.
+    /// * `ManifestLoadError::UnsupportedVersion` if the header version is not `MANIFEST_VERSION`.
+    /// * `ManifestLoadError::DecodeRecord` if a record with a valid checksum is not valid JSON
+    ///   for an `Operation`.
+    ///
+    /// A truncated or checksum-failing tail is not an error: reading stops there, the
+    /// operations decoded so far are returned and the corruption flag is set.
+    pub fn load(
+        mut file: VirtualFile,
+    ) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
+        let mut buf = vec![];
+        file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
+
+        // Read manifest header
+        let mut buf = Bytes::from(buf);
+        if buf.remaining() < MANIFEST_HEADER_LEN {
+            return Err(ManifestLoadError::CorruptedManifestHeader);
+        }
+        let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
+        buf.advance(MANIFEST_HEADER_LEN);
+        // Check the magic number before trusting anything else in the file: a
+        // mismatch means this is not a manifest, or its header was overwritten.
+        if header.magic_number != MANIFEST_MAGIC_NUMBER {
+            return Err(ManifestLoadError::CorruptedManifestHeader);
+        }
+        if header.version != MANIFEST_VERSION {
+            return Err(ManifestLoadError::UnsupportedVersion(
+                header.version,
+                MANIFEST_VERSION,
+            ));
+        }
+
+        // Read operations
+        let mut operations = Vec::new();
+        let corrupted = loop {
+            if buf.remaining() == 0 {
+                break false;
+            }
+            if buf.remaining() < RECORD_HEADER_LEN {
+                warn!("incomplete header when decoding manifest, could be corrupted");
+                break true;
+            }
+            let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
+            let size = size as usize;
+            buf.advance(RECORD_HEADER_LEN);
+            if buf.remaining() < size {
+                warn!("incomplete data when decoding manifest, could be corrupted");
+                break true;
+            }
+            let data = &buf[..size];
+            if crc32c(data) != checksum {
+                warn!("checksum mismatch when decoding manifest, could be corrupted");
+                break true;
+            }
+            // if the following decode fails, we cannot use the manifest or safely ignore any record.
+            operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
+            buf.advance(size);
+        };
+        Ok((
+            Self { file },
+            operations,
+            ManifestPartiallyCorrupted(corrupted),
+        ))
+    }
+
+    /// Append one record (header + payload) to the file and fsync it.
+    ///
+    /// Returns an error, instead of panicking, when the payload is too large
+    /// for its length to be stored in the record header.
+    fn append_data(&mut self, data: &[u8]) -> Result<()> {
+        // The record header stores the payload length as a u32.
+        if data.len() >= u32::MAX as usize {
+            anyhow::bail!("data too large");
+        }
+        let header = RecordHeader {
+            size: data.len() as u32,
+            checksum: crc32c(data),
+        };
+        let header = header.encode();
+        self.file.write_all(&header)?;
+        self.file.write_all(data)?;
+        self.file.sync_all()?;
+        Ok(())
+    }
+
+    /// Write the encoded manifest header at the current file position.
+    ///
+    /// This does not fsync by itself; `init` follows it with a record append, which does.
+    fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
+        let encoded = header.encode();
+        self.file.write_all(&encoded)?;
+        Ok(())
+    }
+
+    /// Add an operation to the manifest. The operation will be appended to the end of the file,
+    /// and the file will fsync.
+    pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
+        // Serialize straight to bytes; the on-disk payload is the JSON text.
+        let encoded = serde_json::to_vec(&operation)?;
+        self.append_data(&encoded)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs::OpenOptions;
+
+    use crate::repository::Key;
+
+    use super::*;
+
+    /// Writes a manifest, then reopens it twice: once to check and extend it,
+    /// once more to check that the extension was persisted.
+    #[test]
+    fn test_read_manifest() {
+        let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
+        std::fs::create_dir_all(&testdir).unwrap();
+        let manifest_path = testdir.join("MANIFEST");
+
+        // Opens the already existing manifest file for reading and writing.
+        let reopen = || {
+            VirtualFile::open_with_options(
+                &manifest_path,
+                OpenOptions::new()
+                    .read(true)
+                    .write(true)
+                    .create_new(false)
+                    .truncate(false),
+            )
+            .unwrap()
+        };
+
+        let file = VirtualFile::create(&manifest_path).unwrap();
+        let [layer1, layer2, layer3, layer4] =
+            [(0, 233), (233, 2333), (2333, 23333), (23333, 233333)].map(|(start, end)| {
+                PersistentLayerDesc::new_test(Key::from_i128(start)..Key::from_i128(end))
+            });
+
+        let snapshot = Snapshot {
+            layers: vec![layer1, layer2],
+        };
+        let add_layer3 = Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1));
+        let replace_layer3 = Operation::Operation(
+            vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
+            Lsn::from(2),
+        );
+        // What the file is expected to contain after each session.
+        let mut expected = vec![
+            Operation::Snapshot(snapshot.clone(), Lsn::from(0)),
+            add_layer3.clone(),
+        ];
+
+        // Write a manifest with a snapshot and some operations
+        let mut manifest = Manifest::init(file, snapshot, Lsn::from(0)).unwrap();
+        manifest.append_operation(add_layer3).unwrap();
+        drop(manifest);
+
+        // Open the second time and write
+        let (mut manifest, operations, corrupted) = Manifest::load(reopen()).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations, expected);
+        manifest.append_operation(replace_layer3.clone()).unwrap();
+        drop(manifest);
+        expected.push(replace_layer3);
+
+        // Open the third time and verify
+        let (_manifest, operations, corrupted) = Manifest::load(reopen()).unwrap();
+        assert!(!corrupted.0);
+        assert_eq!(operations, expected);
+    }
+}
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,7 +12,7 @@ use std::fs::{File, OpenOptions};
 use std::io::{self, Write};

 use anyhow::{bail, ensure, Context};
-use serde::{de::Error, Deserialize, Serialize, Serializer};
+use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tracing::info_span;
 use utils::bin_ser::SerializeError;
@@ -232,28 +232,6 @@ impl TimelineMetadata {
    }
 }

-impl<'de> Deserialize<'de> for TimelineMetadata {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let bytes = Vec::<u8>::deserialize(deserializer)?;
-        Self::from_bytes(bytes.as_slice()).map_err(|e| D::Error::custom(format!("{e}")))
-    }
-}
-
-impl Serialize for TimelineMetadata {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        let bytes = self
-            .to_bytes()
-            .map_err(|e| serde::ser::Error::custom(format!("{e}")))?;
-        bytes.serialize(serializer)
-    }
-}
-
 /// Save timeline metadata to file
 pub fn save_metadata(
    conf: &'static PageServerConf,
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -541,7 +541,8 @@ impl RemoteTimelineClient {
        // ahead of what's _actually_ on the remote during index upload.
        upload_queue.latest_metadata = metadata.clone();

-        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+        self.schedule_index_upload(upload_queue, metadata_bytes);

        Ok(())
    }
@@ -561,7 +562,8 @@ impl RemoteTimelineClient {
        let upload_queue = guard.initialized_mut()?;

        if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-            self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+            let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+            self.schedule_index_upload(upload_queue, metadata_bytes);
        }

        Ok(())
@@ -571,7 +573,7 @@ impl RemoteTimelineClient {
    fn schedule_index_upload(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        metadata: TimelineMetadata,
+        metadata_bytes: Vec<u8>,
    ) {
        info!(
            "scheduling metadata upload with {} files ({} changed)",
@@ -584,7 +586,7 @@ impl RemoteTimelineClient {
        let index_part = IndexPart::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata,
+            metadata_bytes,
        );
        let op = UploadOp::UploadMetadata(index_part, disk_consistent_lsn);
        self.calls_unfinished_metric_begin(&op);
@@ -640,7 +642,7 @@ impl RemoteTimelineClient {

        // Deleting layers doesn't affect the values stored in TimelineMetadata,
        // so we don't need update it. Just serialize it.
-        let metadata = upload_queue.latest_metadata.clone();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;

        // Update the remote index file, removing the to-be-deleted files from the index,
        // before deleting the actual files.
@@ -651,13 +653,12 @@ impl RemoteTimelineClient {
        // to syntactically forbid ? or bail! calls here.
        let no_bail_here = || {
            for name in names {
-                if upload_queue.latest_files.remove(name).is_some() {
-                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                }
+                upload_queue.latest_files.remove(name);
+                upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
            }

            if upload_queue.latest_files_changes_since_metadata_upload_scheduled > 0 {
-                self.schedule_index_upload(upload_queue, metadata);
+                self.schedule_index_upload(upload_queue, metadata_bytes);
            }

            // schedule the actual deletions
@@ -1609,7 +1610,8 @@ mod tests {
                &layer_file_name_2.file_name(),
            ],
        );
-        assert_eq!(index_part.metadata, metadata);
+        let downloaded_metadata = index_part.parse_metadata().unwrap();
+        assert_eq!(downloaded_metadata, metadata);

        // Schedule upload and then a deletion. Check that the deletion is queued
        client
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -77,9 +77,7 @@ pub struct IndexPart {
    // private because internally we would read from metadata instead.
    #[serde_as(as = "DisplayFromStr")]
    disk_consistent_lsn: Lsn,
-
-    #[serde(rename = "metadata_bytes")]
-    pub metadata: TimelineMetadata,
+    metadata_bytes: Vec<u8>,
 }

 impl IndexPart {
@@ -97,7 +95,7 @@ impl IndexPart {
    pub fn new(
        layers_and_metadata: HashMap<LayerFileName, LayerFileMetadata>,
        disk_consistent_lsn: Lsn,
-        metadata: TimelineMetadata,
+        metadata_bytes: Vec<u8>,
    ) -> Self {
        let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
        let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
@@ -113,10 +111,14 @@ impl IndexPart {
            timeline_layers,
            layer_metadata,
            disk_consistent_lsn,
-            metadata,
+            metadata_bytes,
            deleted_at: None,
        }
    }
+
+    pub fn parse_metadata(&self) -> anyhow::Result<TimelineMetadata> {
+        TimelineMetadata::from_bytes(&self.metadata_bytes)
+    }
 }

 impl TryFrom<&UploadQueueInitialized> for IndexPart {
@@ -124,12 +126,12 @@ impl TryFrom<&UploadQueueInitialized> for IndexPart {

    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
-        let metadata = upload_queue.latest_metadata.clone();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;

        Ok(Self::new(
            upload_queue.latest_files.clone(),
            disk_consistent_lsn,
-            metadata,
+            metadata_bytes,
        ))
    }
 }
@@ -180,7 +182,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: None,
        };

@@ -199,7 +201,7 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        }"#;

        let expected = IndexPart {
@@ -217,7 +219,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: None,
        };

@@ -236,7 +238,7 @@ mod tests {
                "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 }
            },
            "disk_consistent_lsn":"0/16960E8",
-            "metadata_bytes":[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
+            "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
            "deleted_at": "2023-07-31T09:00:00.123"
        }"#;

@@ -255,7 +257,7 @@ mod tests {
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
+            metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
        };
@@ -279,7 +281,7 @@ mod tests {
            timeline_layers: HashSet::new(),
            layer_metadata: HashMap::new(),
            disk_consistent_lsn: "0/2532648".parse::<Lsn>().unwrap(),
-            metadata: TimelineMetadata::from_bytes(&[
+            metadata_bytes: [
                136, 151, 49, 208, 0, 70, 0, 4, 0, 0, 0, 0, 2, 83, 38, 72, 1, 0, 0, 0, 0, 2, 83,
                38, 32, 1, 87, 198, 240, 135, 97, 119, 45, 125, 38, 29, 155, 161, 140, 141, 255,
                210, 0, 0, 0, 0, 2, 83, 38, 72, 0, 0, 0, 0, 1, 73, 240, 192, 0, 0, 0, 0, 1, 73,
@@ -300,8 +302,8 @@ mod tests {
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0,
-            ])
-            .unwrap(),
+            ]
+            .to_vec(),
            deleted_at: None,
        };

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, Value, KEY_SIZE};
-use crate::tenant::blob_io::BlobWriter;
+use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -45,8 +45,8 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::{self, File};
-use std::io::SeekFrom;
 use std::io::{BufWriter, Write};
+use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
@@ -467,7 +467,7 @@ impl DeltaLayer {
            PathOrConf::Path(_) => None,
        };

-        let loaded = DeltaLayerInner::load(&path, summary).await?;
+        let loaded = DeltaLayerInner::load(&path, summary)?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -582,6 +582,8 @@ struct DeltaLayerWriterInner {
    lsn_range: Range<Lsn>,

    tree: DiskBtreeBuilder<BlockBuf, DELTA_KEY_SIZE>,
+
+    blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
 }

 impl DeltaLayerWriterInner {
@@ -603,6 +605,12 @@ impl DeltaLayerWriterInner {
        // FIXME: throw an error instead?
        let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);

+        let mut file = VirtualFile::create(&path)?;
+        // make room for the header block
+        file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
+        let buf_writer = BufWriter::new(file);
+        let blob_writer = WriteBlobWriter::new(buf_writer, PAGE_SZ as u64);
+
        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
        let tree_builder = DiskBtreeBuilder::new(block_buf);
@@ -615,6 +623,7 @@ impl DeltaLayerWriterInner {
            key_start,
            lsn_range,
            tree: tree_builder,
+            blob_writer,
        })
    }

@@ -623,12 +632,11 @@ impl DeltaLayerWriterInner {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+    fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
        self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
-            .await
    }

-    async fn put_value_bytes(
+    fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
@@ -636,20 +644,30 @@ impl DeltaLayerWriterInner {
        will_init: bool,
    ) -> anyhow::Result<()> {
        assert!(self.lsn_range.start <= lsn);
-        todo!("use TBD EphemeralFile superclass");
+
+        let off = self.blob_writer.write_blob(val)?;
+
+        let blob_ref = BlobRef::new(off, will_init);
+
+        let delta_key = DeltaKey::from_key_lsn(&key, lsn);
+        self.tree.append(&delta_key.0, blob_ref.0)?;
+
+        Ok(())
    }

    fn size(&self) -> u64 {
-        todo!()
+        self.blob_writer.size() + self.tree.borrow_writer().size()
    }

    ///
    /// Finish writing the delta layer.
    ///
    fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
-        let index_start_blk: u32 = todo!();
+        let index_start_blk =
+            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

-        let mut file: VirtualFile = todo!("EphemeralFile superclass needs into_inner() api");
+        let buf_writer = self.blob_writer.into_inner();
+        let mut file = buf_writer.into_inner()?;

        // Write out the index
        let (index_root_blk, block_buf) = self.tree.finish()?;
@@ -779,11 +797,11 @@ impl DeltaLayerWriter {
    ///
    /// The values must be appended in key, lsn order.
    ///
-    pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
-        self.inner.as_mut().unwrap().put_value(key, lsn, val).await
+    pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
+        self.inner.as_mut().unwrap().put_value(key, lsn, val)
    }

-    pub async fn put_value_bytes(
+    pub fn put_value_bytes(
        &mut self,
        key: Key,
        lsn: Lsn,
@@ -794,7 +812,6 @@ impl DeltaLayerWriter {
            .as_mut()
            .unwrap()
            .put_value_bytes(key, lsn, val, will_init)
-            .await
    }

    pub fn size(&self) -> u64 {
@@ -811,20 +828,25 @@ impl DeltaLayerWriter {

 impl Drop for DeltaLayerWriter {
    fn drop(&mut self) {
-        todo!("TBD EpheemralFile superclass into_inner(); => VirtualFile => remove()");
+        if let Some(inner) = self.inner.take() {
+            match inner.blob_writer.into_inner().into_inner() {
+                Ok(vfile) => vfile.remove(),
+                Err(err) => warn!(
+                    "error while flushing buffer of image layer temporary file: {}",
+                    err
+                ),
+            }
+        }
    }
 }

 impl DeltaLayerInner {
-    pub(super) async fn load(
-        path: &std::path::Path,
-        summary: Option<Summary>,
-    ) -> anyhow::Result<Self> {
+    pub(super) fn load(path: &std::path::Path, summary: Option<Summary>) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -1006,7 +1028,7 @@ impl<'a> ValueRef<'a> {
 pub(crate) struct Adapter<T>(T);

 impl<T: AsRef<DeltaLayerInner>> Adapter<T> {
-    pub(crate) async fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
-        self.0.as_ref().file.read_blk(blknum).await
+    pub(crate) fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
+        self.0.as_ref().file.read_blk(blknum)
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::repository::{Key, KEY_SIZE};
-use crate::tenant::blob_io::BlobWriter;
+use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
 use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
 use crate::tenant::storage_layer::{
@@ -42,8 +42,8 @@ use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::{self, File};
-use std::io::SeekFrom;
 use std::io::Write;
+use std::io::{Seek, SeekFrom};
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::path::{Path, PathBuf};
@@ -349,8 +349,7 @@ impl ImageLayer {
            PathOrConf::Path(_) => None,
        };

-        let loaded =
-            ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary).await?;
+        let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), expected_summary)?;

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
@@ -433,7 +432,7 @@ impl ImageLayer {
 }

 impl ImageLayerInner {
-    pub(super) async fn load(
+    pub(super) fn load(
        path: &std::path::Path,
        lsn: Lsn,
        summary: Option<Summary>,
@@ -441,7 +440,7 @@ impl ImageLayerInner {
        let file = VirtualFile::open(path)
            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
        let file = FileBlockReader::new(file);
-        let summary_blk = file.read_blk(0).await?;
+        let summary_blk = file.read_blk(0)?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;

        if let Some(mut expected_summary) = summary {
@@ -511,6 +510,7 @@ struct ImageLayerWriterInner {
    key_range: Range<Key>,
    lsn: Lsn,

+    blob_writer: WriteBlobWriter<VirtualFile>,
    tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
 }

@@ -537,6 +537,13 @@ impl ImageLayerWriterInner {
            },
        );
        info!("new image layer {}", path.display());
+        let mut file = VirtualFile::open_with_options(
+            &path,
+            std::fs::OpenOptions::new().write(true).create_new(true),
+        )?;
+        // make room for the header block
+        file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
+        let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);

        // Initialize the b-tree index builder
        let block_buf = BlockBuf::new();
@@ -550,6 +557,7 @@ impl ImageLayerWriterInner {
            key_range: key_range.clone(),
            lsn,
            tree: tree_builder,
+            blob_writer,
        };

        Ok(writer)
@@ -561,7 +569,13 @@ impl ImageLayerWriterInner {
    /// The page versions must be appended in blknum order.
    ///
    fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
-        todo!("use TBD EphemeralFile superclass that skips copying into mutable_tail");
+        ensure!(self.key_range.contains(&key));
+        let off = self.blob_writer.write_blob(img)?;
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        self.tree.append(&keybuf, off)?;
+
        Ok(())
    }

@@ -569,9 +583,10 @@ impl ImageLayerWriterInner {
    /// Finish writing the image layer.
    ///
    fn finish(self) -> anyhow::Result<ImageLayer> {
-        let index_start_blk: u32 = todo!();
+        let index_start_blk =
+            ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;

-        let mut file: VirtualFile = todo!("EphemeralFile superclass needs into_inner() api");
+        let mut file = self.blob_writer.into_inner();

        // Write out the index
        file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))?;
@@ -709,7 +724,7 @@ impl ImageLayerWriter {
 impl Drop for ImageLayerWriter {
    fn drop(&mut self) {
        if let Some(inner) = self.inner.take() {
-            todo!("TBD EpheemralFile superclass into_inner(); => VirtualFile => remove()");
+            inner.blob_writer.into_inner().remove();
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -348,7 +348,7 @@ impl InMemoryLayer {
            for (lsn, pos) in vec_map.as_slice() {
                cursor.read_blob_into_buf(*pos, &mut buf).await?;
                let will_init = Value::des(&buf)?.will_init();
-                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init).await?;
+                delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
            }
        }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -465,7 +465,7 @@ impl Timeline {
        // The cached image can be returned directly if there is no WAL between the cached image
        // and requested LSN. The cached image can also be used to reduce the amount of WAL needed
        // for redo.
-        let cached_page_img = match self.lookup_cached_page(&key, lsn).await {
+        let cached_page_img = match self.lookup_cached_page(&key, lsn) {
            Some((cached_lsn, cached_img)) => {
                match cached_lsn.cmp(&lsn) {
                    Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
@@ -494,7 +494,6 @@ impl Timeline {

        RECONSTRUCT_TIME
            .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
-            .await
    }

    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -1615,7 +1614,7 @@ impl Timeline {
        let (conf, tenant_id, timeline_id) = (self.conf, self.tenant_id, self.timeline_id);
        let span = tracing::Span::current();

-        let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
+        let (loaded_layers, needs_upload, total_physical_size) = tokio::task::spawn_blocking({
            move || {
                let _g = span.entered();
                let discovered = init::scan_timeline_dir(&timeline_path)?;
@@ -1661,7 +1660,6 @@ impl Timeline {

                let mut loaded_layers = Vec::new();
                let mut needs_upload = Vec::new();
-                let mut needs_cleanup = Vec::new();
                let mut total_physical_size = 0;

                for (name, decision) in decided {
@@ -1677,10 +1675,14 @@ impl Timeline {
                        Err(FutureLayer { local }) => {
                            if local.is_some() {
                                path.push(name.file_name());
-                                init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
+                                init::cleanup_future_layer(&path, name, disk_consistent_lsn)?;
                                path.pop();
+                            } else {
+                                // we cannot do anything for remote layers, but not continuing to
+                                // process it will leave it out of index_part.json as well.
                            }
-                            needs_cleanup.push(name);
+                            //
+                            // we do not currently schedule deletions for these.
                            continue;
                        }
                    };
@@ -1734,11 +1736,7 @@ impl Timeline {

                    loaded_layers.push(layer);
                }
-                Ok((
-                    loaded_layers,
-                    (needs_upload, needs_cleanup),
-                    total_physical_size,
-                ))
+                Ok((loaded_layers, needs_upload, total_physical_size))
            }
        })
        .await
@@ -1750,11 +1748,9 @@ impl Timeline {
        guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);

        if let Some(rtc) = self.remote_client.as_ref() {
-            let (needs_upload, needs_cleanup) = to_sync;
            for (layer, m) in needs_upload {
                rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
            }
-            rtc.schedule_layer_file_deletion(&needs_cleanup)?;
            rtc.schedule_index_upload_for_file_changes()?;
            // Tenant::create_timeline will wait for these uploads to happen before returning, or
            // on retry.
@@ -2265,15 +2261,7 @@ impl Timeline {
                        )));
                    }
                }
-                ancestor
-                    .wait_lsn(timeline.ancestor_lsn, ctx)
-                    .await
-                    .with_context(|| {
-                        format!(
-                            "wait for lsn {} on ancestor timeline_id={}",
-                            timeline.ancestor_lsn, ancestor.timeline_id
-                        )
-                    })?;
+                ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;

                timeline_owned = ancestor;
                timeline = &*timeline_owned;
@@ -2452,14 +2440,13 @@ impl Timeline {
        }
    }

-    async fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
+    fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> {
        let cache = page_cache::get();

        // FIXME: It's pointless to check the cache for things that are not 8kB pages.
        // We should look at the key to determine if it's a cacheable object
-        let (lsn, read_guard) = cache
-            .lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)
-            .await?;
+        let (lsn, read_guard) =
+            cache.lookup_materialized_page(self.tenant_id, self.timeline_id, key, lsn)?;
        let img = Bytes::from(read_guard.to_vec());
        Some((lsn, img))
    }
@@ -3599,7 +3586,7 @@ impl Timeline {
                )))
            });

-            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            writer.as_mut().unwrap().put_value(key, lsn, value)?;
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
@@ -4141,7 +4128,7 @@ impl Timeline {
    ///
    /// Reconstruct a value, using the given base image and WAL records in 'data'.
    ///
-    async fn reconstruct_value(
+    fn reconstruct_value(
        &self,
        key: Key,
        request_lsn: Lsn,
@@ -4210,7 +4197,6 @@ impl Timeline {
                            last_rec_lsn,
                            &img,
                        )
-                        .await
                        .context("Materialized page memoization failed")
                    {
                        return Err(PageReconstructError::from(e));
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -183,7 +183,7 @@ pub(super) fn cleanup_local_file_for_remote(

 pub(super) fn cleanup_future_layer(
    path: &Path,
-    name: &LayerFileName,
+    name: LayerFileName,
    disk_consistent_lsn: Lsn,
 ) -> anyhow::Result<()> {
    use LayerFileName::*;
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -31,11 +31,10 @@ use storage_broker::Streaming;
 use tokio::select;
 use tracing::*;

-use postgres_connection::PgConnectionConfig;
+use postgres_connection::{parse_host_port, PgConnectionConfig};
 use utils::backoff::{
    exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::postgres_client::wal_stream_connection_config;
 use utils::{
    id::{NodeId, TenantTimelineId},
    lsn::Lsn,
@@ -880,6 +879,33 @@ impl ReconnectReason {
    }
 }

+fn wal_stream_connection_config(
+    TenantTimelineId {
+        tenant_id,
+        timeline_id,
+    }: TenantTimelineId,
+    listen_pg_addr_str: &str,
+    auth_token: Option<&str>,
+    availability_zone: Option<&str>,
+) -> anyhow::Result<PgConnectionConfig> {
+    let (host, port) =
+        parse_host_port(listen_pg_addr_str).context("Unable to parse listen_pg_addr_str")?;
+    let port = port.unwrap_or(5432);
+    let mut connstr = PgConnectionConfig::new_host_port(host, port)
+        .extend_options([
+            "-c".to_owned(),
+            format!("timeline_id={}", timeline_id),
+            format!("tenant_id={}", tenant_id),
+        ])
+        .set_password(auth_token.map(|s| s.to_owned()));
+
+    if let Some(availability_zone) = availability_zone {
+        connstr = connstr.extend_options([format!("availability_zone={}", availability_zone)]);
+    }
+
+    Ok(connstr)
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -895,7 +921,6 @@ mod tests {
            timeline: SafekeeperTimelineInfo {
                safekeeper_id: 0,
                tenant_timeline_id: None,
-                term: 0,
                last_log_term: 0,
                flush_lsn: 0,
                commit_lsn,
@@ -904,7 +929,6 @@ mod tests {
                peer_horizon_lsn: 0,
                local_start_lsn: 0,
                safekeeper_connstr: safekeeper_connstr.to_owned(),
-                http_connstr: safekeeper_connstr.to_owned(),
                availability_zone: None,
            },
            latest_update,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -148,16 +148,17 @@ impl UploadQueue {
            );
        }

+        let index_part_metadata = index_part.parse_metadata()?;
        info!(
            "initializing upload queue with remote index_part.disk_consistent_lsn: {}",
-            index_part.metadata.disk_consistent_lsn()
+            index_part_metadata.disk_consistent_lsn()
        );

        let state = UploadQueueInitialized {
            latest_files: files,
            latest_files_changes_since_metadata_upload_scheduled: 0,
-            latest_metadata: index_part.metadata.clone(),
-            last_uploaded_consistent_lsn: index_part.metadata.disk_consistent_lsn(),
+            latest_metadata: index_part_metadata.clone(),
+            last_uploaded_consistent_lsn: index_part_metadata.disk_consistent_lsn(),
            // what follows are boring default initializations
            task_counter: 0,
            num_inprogress_layer_uploads: 0,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -13,7 +13,7 @@
 use crate::metrics::{STORAGE_IO_SIZE, STORAGE_IO_TIME};
 use once_cell::sync::OnceCell;
 use std::fs::{self, File, OpenOptions};
-use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
+use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
 use std::os::unix::fs::FileExt;
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
@@ -321,8 +321,54 @@ impl VirtualFile {
        drop(self);
        std::fs::remove_file(path).expect("failed to remove the virtual file");
    }
+}

-    pub fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
+impl Drop for VirtualFile {
+    /// If a VirtualFile is dropped, close the underlying file if it was open.
+    fn drop(&mut self) {
+        let handle = self.handle.get_mut().unwrap();
+
+        // We could check with a read-lock first, to avoid waiting on an
+        // unrelated I/O.
+        let slot = &get_open_files().slots[handle.index];
+        let mut slot_guard = slot.inner.write().unwrap();
+        if slot_guard.tag == handle.tag {
+            slot.recently_used.store(false, Ordering::Relaxed);
+            // there is also operation "close-by-replace" for closes done on eviction for
+            // comparison.
+            STORAGE_IO_TIME
+                .with_label_values(&["close"])
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
+        }
+    }
+}
+
+impl Read for VirtualFile {
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
+        let pos = self.pos;
+        let n = self.read_at(buf, pos)?;
+        self.pos += n as u64;
+        Ok(n)
+    }
+}
+
+impl Write for VirtualFile {
+    fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
+        let pos = self.pos;
+        let n = self.write_at(buf, pos)?;
+        self.pos += n as u64;
+        Ok(n)
+    }
+
+    fn flush(&mut self) -> Result<(), std::io::Error> {
+        // flush is no-op for File (at least on unix), so we don't need to do
+        // anything here either.
+        Ok(())
+    }
+}
+
+impl Seek for VirtualFile {
+    fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
        match pos {
            SeekFrom::Start(offset) => {
                self.pos = offset;
@@ -346,50 +392,10 @@ impl VirtualFile {
        }
        Ok(self.pos)
    }
+}

-    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
-    pub fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
-        while !buf.is_empty() {
-            match self.read_at(buf, offset) {
-                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::UnexpectedEof,
-                        "failed to fill whole buffer",
-                    ))
-                }
-                Ok(n) => {
-                    buf = &mut buf[n..];
-                    offset += n as u64;
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-        }
-        Ok(())
-    }
-
-    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
-        while !buf.is_empty() {
-            match self.write_at(buf, offset) {
-                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::WriteZero,
-                        "failed to write whole buffer",
-                    ));
-                }
-                Ok(n) => {
-                    buf = &buf[n..];
-                    offset += n as u64;
-                }
-                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
-            }
-        }
-        Ok(())
-    }
-
-    pub fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
+impl FileExt for VirtualFile {
+    fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
        let result = self.with_file("read", |file| file.read_at(buf, offset))?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
@@ -399,7 +405,7 @@ impl VirtualFile {
        result
    }

-    pub fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
+    fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
        let result = self.with_file("write", |file| file.write_at(buf, offset))?;
        if let Ok(size) = result {
            STORAGE_IO_SIZE
@@ -410,26 +416,6 @@ impl VirtualFile {
    }
 }

-impl Drop for VirtualFile {
-    /// If a VirtualFile is dropped, close the underlying file if it was open.
-    fn drop(&mut self) {
-        let handle = self.handle.get_mut().unwrap();
-
-        // We could check with a read-lock first, to avoid waiting on an
-        // unrelated I/O.
-        let slot = &get_open_files().slots[handle.index];
-        let mut slot_guard = slot.inner.write().unwrap();
-        if slot_guard.tag == handle.tag {
-            slot.recently_used.store(false, Ordering::Relaxed);
-            // there is also operation "close-by-replace" for closes done on eviction for
-            // comparison.
-            STORAGE_IO_TIME
-                .with_label_values(&["close"])
-                .observe_closure_duration(|| drop(slot_guard.file.take()));
-        }
-    }
-}
-
 impl OpenFiles {
    fn new(num_slots: usize) -> OpenFiles {
        let mut slots = Box::new(Vec::with_capacity(num_slots));
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -341,35 +341,21 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {

    let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);

+    // Load all timelines from disk to memory.
+    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
+
    // Keep handles to main tasks to die if any of them disappears.
    let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
        FuturesUnordered::new();

-    // Start wal backup launcher before loading timelines as we'll notify it
-    // through the channel about timelines which need offloading, not draining
-    // the channel would cause deadlock.
-    let current_thread_rt = conf
-        .current_thread_runtime
-        .then(|| Handle::try_current().expect("no runtime in main"));
-    let conf_ = conf.clone();
-    let wal_backup_handle = current_thread_rt
-        .as_ref()
-        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
-        .spawn(wal_backup::wal_backup_launcher_task_main(
-            conf_,
-            wal_backup_launcher_rx,
-        ))
-        .map(|res| ("WAL backup launcher".to_owned(), res));
-    tasks_handles.push(Box::pin(wal_backup_handle));
-
-    // Load all timelines from disk to memory.
-    GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx).await?;
-
    let conf_ = conf.clone();
    // Run everything in current thread rt, if asked.
    if conf.current_thread_runtime {
        info!("running in current thread runtime");
    }
+    let current_thread_rt = conf
+        .current_thread_runtime
+        .then(|| Handle::try_current().expect("no runtime in main"));

    let wal_service_handle = current_thread_rt
        .as_ref()
@@ -422,6 +408,17 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
        .map(|res| ("WAL remover".to_owned(), res));
    tasks_handles.push(Box::pin(wal_remover_handle));

+    let conf_ = conf.clone();
+    let wal_backup_handle = current_thread_rt
+        .as_ref()
+        .unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
+        .spawn(wal_backup::wal_backup_launcher_task_main(
+            conf_,
+            wal_backup_launcher_rx,
+        ))
+        .map(|res| ("WAL backup launcher".to_owned(), res));
+    tasks_handles.push(Box::pin(wal_backup_handle));
+
    set_build_info_metric(GIT_VERSION);

    // TODO: update tokio-stream, convert to real async Stream with
--- a/safekeeper/src/control_file_upgrade.rs
+++ b/safekeeper/src/control_file_upgrade.rs
@@ -1,6 +1,7 @@
 //! Code to deal with safekeeper control file upgrades
 use crate::safekeeper::{
-    AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermLsn,
+    AcceptorState, PersistedPeers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory,
+    TermSwitchEntry,
 };
 use anyhow::{bail, Result};
 use pq_proto::SystemId;
@@ -144,7 +145,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
        let oldstate = SafeKeeperStateV1::des(&buf[..buf.len()])?;
        let ac = AcceptorState {
            term: oldstate.acceptor_state.term,
-            term_history: TermHistory(vec![TermLsn {
+            term_history: TermHistory(vec![TermSwitchEntry {
                term: oldstate.acceptor_state.epoch,
                lsn: Lsn(0),
            }]),
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -19,7 +19,6 @@ use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::ServerInfo;
 use crate::safekeeper::Term;
 use crate::send_wal::WalSenderState;
-use crate::timeline::PeerInfo;
 use crate::{debug_dump, pull_timeline};

 use crate::timelines_global_map::TimelineDeleteForceResult;
@@ -102,7 +101,6 @@ pub struct TimelineStatus {
    pub peer_horizon_lsn: Lsn,
    #[serde_as(as = "DisplayFromStr")]
    pub remote_consistent_lsn: Lsn,
-    pub peers: Vec<PeerInfo>,
    pub walsenders: Vec<WalSenderState>,
    pub walreceivers: Vec<WalReceiverState>,
 }
@@ -142,7 +140,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        term_history,
    };

-    let conf = get_conf(&request);
    // Note: we report in memory values which can be lost.
    let status = TimelineStatus {
        tenant_id: ttid.tenant_id,
@@ -156,7 +153,6 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
        backup_lsn: inmem.backup_lsn,
        peer_horizon_lsn: inmem.peer_horizon_lsn,
        remote_consistent_lsn: tli.get_walsenders().get_remote_consistent_lsn(),
-        peers: tli.get_peers(conf).await,
        walsenders: tli.get_walsenders().get_all(),
        walreceivers: tli.get_walreceivers().get_all(),
    };
@@ -286,14 +282,12 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
            tenant_id: ttid.tenant_id.as_ref().to_owned(),
            timeline_id: ttid.timeline_id.as_ref().to_owned(),
        }),
-        term: sk_info.term.unwrap_or(0),
        last_log_term: sk_info.last_log_term.unwrap_or(0),
        flush_lsn: sk_info.flush_lsn.0,
        commit_lsn: sk_info.commit_lsn.0,
        remote_consistent_lsn: sk_info.remote_consistent_lsn.0,
        peer_horizon_lsn: sk_info.peer_horizon_lsn.0,
        safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
-        http_connstr: sk_info.http_connstr.unwrap_or_else(|| "".to_owned()),
        backup_lsn: sk_info.backup_lsn.0,
        local_start_lsn: sk_info.local_start_lsn.0,
        availability_zone: None,
--- a/safekeeper/src/json_ctrl.rs
+++ b/safekeeper/src/json_ctrl.rs
@@ -21,7 +21,7 @@ use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
 use crate::safekeeper::{
    AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected,
 };
-use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermLsn};
+use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
 use crate::timeline::Timeline;
 use crate::GlobalTimelines;
 use postgres_backend::PostgresBackend;
@@ -119,7 +119,7 @@ async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> any
    let history = tli.get_state().await.1.acceptor_state.term_history;
    let history = history.up_to(lsn.checked_sub(1u64).unwrap());
    let mut history_entries = history.0;
-    history_entries.push(TermLsn { term, lsn });
+    history_entries.push(TermSwitchEntry { term, lsn });
    let history = TermHistory(history_entries);

    let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -19,7 +19,6 @@ pub mod json_ctrl;
 pub mod metrics;
 pub mod pull_timeline;
 pub mod receive_wal;
-pub mod recovery;
 pub mod remove_wal;
 pub mod safekeeper;
 pub mod send_wal;
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -227,9 +227,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
    tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
    tokio::fs::rename(tli_dir_path, &timeline_path).await?;

-    let tli = GlobalTimelines::load_timeline(ttid)
-        .await
-        .context("Failed to load timeline after copy")?;
+    let tli = GlobalTimelines::load_timeline(ttid).context("Failed to load timeline after copy")?;

    info!(
        "Loaded timeline {}, flush_lsn={}",
--- a/safekeeper/src/recovery.rs
+++ b/safekeeper/src/recovery.rs
@@ -1,40 +0,0 @@
-//! This module implements pulling WAL from peer safekeepers if compute can't
-//! provide it, i.e. safekeeper lags too much.
-
-use std::sync::Arc;
-
-use tokio::{select, time::sleep, time::Duration};
-use tracing::{info, instrument};
-
-use crate::{timeline::Timeline, SafeKeeperConf};
-
-/// Entrypoint for per timeline task which always runs, checking whether
-/// recovery for this safekeeper is needed and starting it if so.
-#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
-pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
-    info!("started");
-    let mut cancellation_rx = match tli.get_cancellation_rx() {
-        Ok(rx) => rx,
-        Err(_) => {
-            info!("timeline canceled during task start");
-            return;
-        }
-    };
-
-    select! {
-        _ = recovery_main_loop(tli) => { unreachable!() }
-        _ = cancellation_rx.changed() => {
-            info!("stopped");
-        }
-    }
-}
-
-const CHECK_INTERVAL_MS: u64 = 2000;
-
-/// Check regularly whether we need to start recovery.
-async fn recovery_main_loop(_tli: Arc<Timeline>) {
-    let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
-    loop {
-        sleep(check_duration).await;
-    }
-}
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -34,33 +34,22 @@ pub const UNKNOWN_SERVER_VERSION: u32 = 0;

 /// Consensus logical timestamp.
 pub type Term = u64;
-pub const INVALID_TERM: Term = 0;
+const INVALID_TERM: Term = 0;

-#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
-pub struct TermLsn {
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub struct TermSwitchEntry {
    pub term: Term,
    pub lsn: Lsn,
 }
-
-// Creation from tuple provides less typing (e.g. for unit tests).
-impl From<(Term, Lsn)> for TermLsn {
-    fn from(pair: (Term, Lsn)) -> TermLsn {
-        TermLsn {
-            term: pair.0,
-            lsn: pair.1,
-        }
-    }
-}
-
 #[derive(Clone, Serialize, Deserialize)]
-pub struct TermHistory(pub Vec<TermLsn>);
+pub struct TermHistory(pub Vec<TermSwitchEntry>);

 impl TermHistory {
    pub fn empty() -> TermHistory {
        TermHistory(Vec::new())
    }

-    // Parse TermHistory as n_entries followed by TermLsn pairs
+    // Parse TermHistory as n_entries followed by TermSwitchEntry pairs
    pub fn from_bytes(bytes: &mut Bytes) -> Result<TermHistory> {
        if bytes.remaining() < 4 {
            bail!("TermHistory misses len");
@@ -71,7 +60,7 @@ impl TermHistory {
            if bytes.remaining() < 16 {
                bail!("TermHistory is incomplete");
            }
-            res.push(TermLsn {
+            res.push(TermSwitchEntry {
                term: bytes.get_u64_le(),
                lsn: bytes.get_u64_le().into(),
            })
@@ -568,17 +557,12 @@ where
            .up_to(self.flush_lsn())
    }

-    /// Get current term.
-    pub fn get_term(&self) -> Term {
-        self.state.acceptor_state.term
-    }
-
    pub fn get_epoch(&self) -> Term {
        self.state.acceptor_state.get_epoch(self.flush_lsn())
    }

    /// wal_store wrapper avoiding commit_lsn <= flush_lsn violation when we don't have WAL yet.
-    pub fn flush_lsn(&self) -> Lsn {
+    fn flush_lsn(&self) -> Lsn {
        max(self.wal_store.flush_lsn(), self.state.timeline_start_lsn)
    }

@@ -1154,7 +1138,7 @@ mod tests {
        let pem = ProposerElected {
            term: 1,
            start_streaming_at: Lsn(1),
-            term_history: TermHistory(vec![TermLsn {
+            term_history: TermHistory(vec![TermSwitchEntry {
                term: 1,
                lsn: Lsn(3),
            }]),
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,12 +2,12 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.

 use crate::handler::SafekeeperPostgresHandler;
-use crate::safekeeper::{Term, TermLsn};
+use crate::safekeeper::Term;
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
 use crate::GlobalTimelines;
-use anyhow::{bail, Context as AnyhowContext};
+use anyhow::Context as AnyhowContext;
 use bytes::Bytes;
 use parking_lot::Mutex;
 use postgres_backend::PostgresBackend;
@@ -390,25 +390,26 @@ impl SafekeeperPostgresHandler {
            self.appname.clone(),
        ));

-        // Walsender can operate in one of two modes which we select by
-        // application_name: give only committed WAL (used by pageserver) or all
-        // existing WAL (up to flush_lsn, used by walproposer or peer recovery).
-        // The second case is always driven by a consensus leader which term
-        // must generally be also supplied. However we're sloppy to do this in
-        // walproposer recovery which will be removed soon. So TODO is to make
-        // it not Option'al then.
+        let commit_lsn_watch_rx = tli.get_commit_lsn_watch_rx();
+
+        // Walproposer gets special handling: safekeeper must give proposer all
+        // local WAL till the end, whether committed or not (walproposer will
+        // hang otherwise). That's because walproposer runs the consensus and
+        // synchronizes safekeepers on the most advanced one.
        //
-        // Fetching WAL without term in recovery creates a small risk of this
-        // WAL getting concurrently garbaged if another compute rises which
-        // collects majority and starts fixing log on this safekeeper itself.
-        // That's ok as (old) proposer will never be able to commit such WAL.
-        let end_watch = if self.is_walproposer_recovery() {
-            EndWatch::Flush(tli.get_term_flush_lsn_watch_rx())
+        // There is a small risk of this WAL getting concurrently garbaged if
+        // another compute rises which collects majority and starts fixing log
+        // on this safekeeper itself. That's ok as (old) proposer will never be
+        // able to commit such WAL.
+        let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
+            let wal_end = tli.get_flush_lsn().await;
+            Some(wal_end)
        } else {
-            EndWatch::Commit(tli.get_commit_lsn_watch_rx())
+            None
        };
-        // we don't check term here; it will be checked on first waiting/WAL reading anyway.
-        let end_pos = end_watch.get();
+
+        // take the latest commit_lsn if we don't have stop_pos
+        let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());

        if end_pos < start_pos {
            warn!(
@@ -418,10 +419,8 @@ impl SafekeeperPostgresHandler {
        }

        info!(
-            "starting streaming from {:?}, available WAL ends at {}, recovery={}",
-            start_pos,
-            end_pos,
-            matches!(end_watch, EndWatch::Flush(_))
+            "starting streaming from {:?} till {:?}, available WAL ends at {}",
+            start_pos, stop_pos, end_pos
        );

        // switch to copy
@@ -446,8 +445,9 @@ impl SafekeeperPostgresHandler {
            appname,
            start_pos,
            end_pos,
+            stop_pos,
            term,
-            end_watch,
+            commit_lsn_watch_rx,
            ws_guard: ws_guard.clone(),
            wal_reader,
            send_buf: [0; MAX_SEND_SIZE],
@@ -466,32 +466,6 @@ impl SafekeeperPostgresHandler {
    }
 }

-/// Walsender streams either up to commit_lsn (normally) or flush_lsn in the
-/// given term (recovery by walproposer or peer safekeeper).
-enum EndWatch {
-    Commit(Receiver<Lsn>),
-    Flush(Receiver<TermLsn>),
-}
-
-impl EndWatch {
-    /// Get current end of WAL.
-    fn get(&self) -> Lsn {
-        match self {
-            EndWatch::Commit(r) => *r.borrow(),
-            EndWatch::Flush(r) => r.borrow().lsn,
-        }
-    }
-
-    /// Wait for the update.
-    async fn changed(&mut self) -> anyhow::Result<()> {
-        match self {
-            EndWatch::Commit(r) => r.changed().await?,
-            EndWatch::Flush(r) => r.changed().await?,
-        }
-        Ok(())
-    }
-}
-
 /// A half driving sending WAL.
 struct WalSender<'a, IO> {
    pgb: &'a mut PostgresBackend<IO>,
@@ -506,12 +480,14 @@ struct WalSender<'a, IO> {
    // We send this LSN to the receiver as wal_end, so that it knows how much
    // WAL this safekeeper has. This LSN should be as fresh as possible.
    end_pos: Lsn,
+    // If present, terminate after reaching this position; used by walproposer
+    // in recovery.
+    stop_pos: Option<Lsn>,
    /// When streaming uncommitted part, the term the client acts as the leader
    /// in. Streaming is stopped if local term changes to a different (higher)
    /// value.
    term: Option<Term>,
-    /// Watch channel receiver to learn end of available WAL (and wait for its advancement).
-    end_watch: EndWatch,
+    commit_lsn_watch_rx: Receiver<Lsn>,
    ws_guard: Arc<WalSenderGuard>,
    wal_reader: WalReader,
    // buffer for readling WAL into to send it
@@ -521,20 +497,29 @@ struct WalSender<'a, IO> {
 impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
    /// Send WAL until
    /// - an error occurs
-    /// - receiver is caughtup and there is no computes (if streaming up to commit_lsn)
+    /// - if we are streaming to walproposer, we've streamed until stop_pos
+    ///   (recovery finished)
+    /// - receiver is caught up and there are no computes
    ///
    /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
    /// convenience.
    async fn run(&mut self) -> Result<(), CopyStreamHandlerEnd> {
        loop {
-            // Wait for the next portion if it is not there yet, or just
-            // update our end of WAL available for sending value, we
-            // communicate it to the receiver.
-            self.wait_wal().await?;
-            assert!(
-                self.end_pos > self.start_pos,
-                "nothing to send after waiting for WAL"
-            );
+            // If we are streaming to walproposer, check it is time to stop.
+            if let Some(stop_pos) = self.stop_pos {
+                if self.start_pos >= stop_pos {
+                    // recovery finished
+                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
+                        "ending streaming to walproposer at {}, recovery finished",
+                        self.start_pos
+                    )));
+                }
+            } else {
+                // Wait for the next portion if it is not there yet, or just
+                // update our end of WAL available for sending value, we
+                // communicate it to the receiver.
+                self.wait_wal().await?;
+            }

            // try to send as much as available, capped by MAX_SEND_SIZE
            let mut send_size = self
@@ -582,7 +567,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
    /// exit in the meanwhile
    async fn wait_wal(&mut self) -> Result<(), CopyStreamHandlerEnd> {
        loop {
-            self.end_pos = self.end_watch.get();
+            self.end_pos = *self.commit_lsn_watch_rx.borrow();
            if self.end_pos > self.start_pos {
                // We have something to send.
                trace!("got end_pos {:?}, streaming", self.end_pos);
@@ -590,31 +575,27 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            }

            // Wait for WAL to appear, now self.end_pos == self.start_pos.
-            if let Some(lsn) = wait_for_lsn(&mut self.end_watch, self.term, self.start_pos).await? {
+            if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
                self.end_pos = lsn;
                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

-            // Timed out waiting for WAL, check for termination and send KA.
-            // Check for termination only if we are streaming up to commit_lsn
-            // (to pageserver).
-            if let EndWatch::Commit(_) = self.end_watch {
-                if let Some(remote_consistent_lsn) = self
-                    .ws_guard
-                    .walsenders
-                    .get_ws_remote_consistent_lsn(self.ws_guard.id)
-                {
-                    if self.tli.should_walsender_stop(remote_consistent_lsn).await {
-                        // Terminate if there is nothing more to send.
-                        // Note that "ending streaming" part of the string is used by
-                        // pageserver to identify WalReceiverError::SuccessfulCompletion,
-                        // do not change this string without updating pageserver.
-                        return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
+            // Timed out waiting for WAL, check for termination and send KA
+            if let Some(remote_consistent_lsn) = self
+                .ws_guard
+                .walsenders
+                .get_ws_remote_consistent_lsn(self.ws_guard.id)
+            {
+                if self.tli.should_walsender_stop(remote_consistent_lsn).await {
+                    // Terminate if there is nothing more to send.
+                    // Note that "ending streaming" part of the string is used by
+                    // pageserver to identify WalReceiverError::SuccessfulCompletion,
+                    // do not change this string without updating pageserver.
+                    return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
                        "ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
                        self.appname, self.start_pos,
                    )));
-                    }
                }
            }

@@ -682,32 +663,22 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {

 const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1);

-/// Wait until we have available WAL > start_pos or timeout expires. Returns
-/// - Ok(Some(end_pos)) if needed lsn is successfully observed;
+/// Wait until we have commit_lsn > lsn or timeout expires. Returns
+/// - Ok(Some(commit_lsn)) if needed lsn is successfully observed;
 /// - Ok(None) if timeout expired;
-/// - Err in case of error -- only if 1) term changed while fetching in recovery
-///   mode 2) watch channel closed, which must never happen.
-async fn wait_for_lsn(
-    rx: &mut EndWatch,
-    client_term: Option<Term>,
-    start_pos: Lsn,
-) -> anyhow::Result<Option<Lsn>> {
+/// - Err in case of error (if watch channel is in trouble, shouldn't happen).
+async fn wait_for_lsn(rx: &mut Receiver<Lsn>, lsn: Lsn) -> anyhow::Result<Option<Lsn>> {
    let res = timeout(POLL_STATE_TIMEOUT, async move {
+        let mut commit_lsn;
        loop {
-            let end_pos = rx.get();
-            if end_pos > start_pos {
-                return Ok(end_pos);
-            }
-            if let EndWatch::Flush(rx) = rx {
-                let curr_term = rx.borrow().term;
-                if let Some(client_term) = client_term {
-                    if curr_term != client_term {
-                        bail!("term changed: requested {}, now {}", client_term, curr_term);
-                    }
-                }
-            }
            rx.changed().await?;
+            commit_lsn = *rx.borrow();
+            if commit_lsn > lsn {
+                break;
+            }
        }
+
+        Ok(commit_lsn)
    })
    .await;

--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -3,11 +3,8 @@

 use anyhow::{anyhow, bail, Result};
 use postgres_ffi::XLogSegNo;
-use serde::{Deserialize, Serialize};
-use serde_with::serde_as;
 use tokio::fs;

-use serde_with::DisplayFromStr;
 use std::cmp::max;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -27,10 +24,9 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;

 use crate::receive_wal::WalReceivers;
-use crate::recovery::recovery_main;
 use crate::safekeeper::{
    AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
-    SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
+    SafekeeperMemState, ServerInfo, Term,
 };
 use crate::send_wal::WalSenders;
 use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
@@ -41,25 +37,18 @@ use crate::SafeKeeperConf;
 use crate::{debug_dump, wal_storage};

 /// Things safekeeper should know about timeline state on peers.
-#[serde_as]
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone)]
 pub struct PeerInfo {
    pub sk_id: NodeId,
    /// Term of the last entry.
    _last_log_term: Term,
    /// LSN of the last record.
-    #[serde_as(as = "DisplayFromStr")]
    _flush_lsn: Lsn,
-    #[serde_as(as = "DisplayFromStr")]
    pub commit_lsn: Lsn,
    /// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
    /// sk since backup_lsn.
-    #[serde_as(as = "DisplayFromStr")]
    pub local_start_lsn: Lsn,
-    /// When info was received. Serde annotations are not very useful but make
-    /// the code compile -- we don't rely on this field externally.
-    #[serde(skip)]
-    #[serde(default = "Instant::now")]
+    /// When info was received.
    ts: Instant,
 }

@@ -248,9 +237,8 @@ impl SharedState {
                tenant_id: ttid.tenant_id.as_ref().to_owned(),
                timeline_id: ttid.timeline_id.as_ref().to_owned(),
            }),
-            term: self.sk.state.acceptor_state.term,
            last_log_term: self.sk.get_epoch(),
-            flush_lsn: self.sk.flush_lsn().0,
+            flush_lsn: self.sk.wal_store.flush_lsn().0,
            // note: this value is not flushed to control file yet and can be lost
            commit_lsn: self.sk.inmem.commit_lsn.0,
            remote_consistent_lsn: remote_consistent_lsn.0,
@@ -259,7 +247,6 @@ impl SharedState {
                .advertise_pg_addr
                .to_owned()
                .unwrap_or(conf.listen_pg_addr.clone()),
-            http_connstr: conf.listen_http_addr.to_owned(),
            backup_lsn: self.sk.inmem.backup_lsn.0,
            local_start_lsn: self.sk.state.local_start_lsn.0,
            availability_zone: conf.availability_zone.clone(),
@@ -309,13 +296,6 @@ pub struct Timeline {
    commit_lsn_watch_tx: watch::Sender<Lsn>,
    commit_lsn_watch_rx: watch::Receiver<Lsn>,

-    /// Broadcasts (current term, flush_lsn) updates, walsender is interested in
-    /// them when sending in recovery mode (to walproposer or peers). Note: this
-    /// is just a notification, WAL reading should always done with lock held as
-    /// term can change otherwise.
-    term_flush_lsn_watch_tx: watch::Sender<TermLsn>,
-    term_flush_lsn_watch_rx: watch::Receiver<TermLsn>,
-
    /// Safekeeper and other state, that should remain consistent and
    /// synchronized with the disk. This is tokio mutex as we write WAL to disk
    /// while holding it, ensuring that consensus checks are in order.
@@ -337,20 +317,16 @@ pub struct Timeline {
 impl Timeline {
    /// Load existing timeline from disk.
    pub fn load_timeline(
-        conf: &SafeKeeperConf,
+        conf: SafeKeeperConf,
        ttid: TenantTimelineId,
        wal_backup_launcher_tx: Sender<TenantTimelineId>,
    ) -> Result<Timeline> {
        let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();

-        let shared_state = SharedState::restore(conf, &ttid)?;
+        let shared_state = SharedState::restore(&conf, &ttid)?;
        let rcl = shared_state.sk.state.remote_consistent_lsn;
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
            watch::channel(shared_state.sk.state.commit_lsn);
-        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from((
-            shared_state.sk.get_term(),
-            shared_state.sk.flush_lsn(),
-        )));
        let (cancellation_tx, cancellation_rx) = watch::channel(false);

        Ok(Timeline {
@@ -358,8 +334,6 @@ impl Timeline {
            wal_backup_launcher_tx,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
-            term_flush_lsn_watch_tx,
-            term_flush_lsn_watch_rx,
            mutex: Mutex::new(shared_state),
            walsenders: WalSenders::new(rcl),
            walreceivers: WalReceivers::new(),
@@ -371,7 +345,7 @@ impl Timeline {

    /// Create a new timeline, which is not yet persisted to disk.
    pub fn create_empty(
-        conf: &SafeKeeperConf,
+        conf: SafeKeeperConf,
        ttid: TenantTimelineId,
        wal_backup_launcher_tx: Sender<TenantTimelineId>,
        server_info: ServerInfo,
@@ -379,8 +353,6 @@ impl Timeline {
        local_start_lsn: Lsn,
    ) -> Result<Timeline> {
        let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID);
-        let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) =
-            watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID)));
        let (cancellation_tx, cancellation_rx) = watch::channel(false);
        let state = SafeKeeperState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn);

@@ -389,9 +361,7 @@ impl Timeline {
            wal_backup_launcher_tx,
            commit_lsn_watch_tx,
            commit_lsn_watch_rx,
-            term_flush_lsn_watch_tx,
-            term_flush_lsn_watch_rx,
-            mutex: Mutex::new(SharedState::create_new(conf, &ttid, state)?),
+            mutex: Mutex::new(SharedState::create_new(&conf, &ttid, state)?),
            walsenders: WalSenders::new(Lsn(0)),
            walreceivers: WalReceivers::new(),
            cancellation_rx,
@@ -400,16 +370,12 @@ impl Timeline {
        })
    }

-    /// Initialize fresh timeline on disk and start background tasks. If init
+    /// Initialize fresh timeline on disk and start background tasks. If bootstrap
    /// fails, timeline is cancelled and cannot be used anymore.
    ///
-    /// Init is transactional, so if it fails, created files will be deleted,
+    /// Bootstrap is transactional, so if it fails, created files will be deleted,
    /// and state on disk should remain unchanged.
-    pub async fn init_new(
-        self: &Arc<Timeline>,
-        shared_state: &mut MutexGuard<'_, SharedState>,
-        conf: &SafeKeeperConf,
-    ) -> Result<()> {
+    pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
        match fs::metadata(&self.timeline_dir).await {
            Ok(_) => {
                // Timeline directory exists on disk, we should leave state unchanged
@@ -425,7 +391,7 @@ impl Timeline {
        // Create timeline directory.
        fs::create_dir_all(&self.timeline_dir).await?;

-        // Write timeline to disk and start background tasks.
+        // Write timeline to disk. TODO: start background tasks.
        if let Err(e) = shared_state.sk.persist().await {
            // Bootstrap failed, cancel timeline and remove timeline directory.
            self.cancel(shared_state);
@@ -439,14 +405,10 @@ impl Timeline {

            return Err(e);
        }
-        self.bootstrap(conf);
-        Ok(())
-    }

-    /// Bootstrap new or existing timeline starting background stasks.
-    pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
-        // Start recovery task which always runs on the timeline.
-        tokio::spawn(recovery_main(self.clone(), conf.clone()));
+        // TODO: add more initialization steps here
+        self.update_status(shared_state);
+        Ok(())
    }

    /// Delete timeline from disk completely, by removing timeline directory. Background
@@ -482,16 +444,6 @@ impl Timeline {
        *self.cancellation_rx.borrow()
    }

-    /// Returns watch channel which gets value when timeline is cancelled. It is
-    /// guaranteed to have not cancelled value observed (errors otherwise).
-    pub fn get_cancellation_rx(&self) -> Result<watch::Receiver<bool>> {
-        let rx = self.cancellation_rx.clone();
-        if *rx.borrow() {
-            bail!(TimelineError::Cancelled(self.ttid));
-        }
-        Ok(rx)
-    }
-
    /// Take a writing mutual exclusive lock on timeline shared_state.
    pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
        self.mutex.lock().await
@@ -568,11 +520,6 @@ impl Timeline {
        self.commit_lsn_watch_rx.clone()
    }

-    /// Returns term_flush_lsn watch channel.
-    pub fn get_term_flush_lsn_watch_rx(&self) -> watch::Receiver<TermLsn> {
-        self.term_flush_lsn_watch_rx.clone()
-    }
-
    /// Pass arrived message to the safekeeper.
    pub async fn process_msg(
        &self,
@@ -584,7 +531,6 @@ impl Timeline {

        let mut rmsg: Option<AcceptorProposerMessage>;
        let commit_lsn: Lsn;
-        let term_flush_lsn: TermLsn;
        {
            let mut shared_state = self.write_shared_state().await;
            rmsg = shared_state.sk.process_msg(msg).await?;
@@ -598,11 +544,8 @@ impl Timeline {
            }

            commit_lsn = shared_state.sk.inmem.commit_lsn;
-            term_flush_lsn =
-                TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn()));
        }
        self.commit_lsn_watch_tx.send(commit_lsn)?;
-        self.term_flush_lsn_watch_tx.send(term_flush_lsn)?;
        Ok(rmsg)
    }

--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -11,7 +11,7 @@ use serde::Serialize;
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::str::FromStr;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, Mutex, MutexGuard};
 use tokio::sync::mpsc::Sender;
 use tracing::*;
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -71,23 +71,19 @@ pub struct GlobalTimelines;

 impl GlobalTimelines {
    /// Inject dependencies needed for the timeline constructors and load all timelines to memory.
-    pub async fn init(
+    pub fn init(
        conf: SafeKeeperConf,
        wal_backup_launcher_tx: Sender<TenantTimelineId>,
    ) -> Result<()> {
-        // clippy isn't smart enough to understand that drop(state) releases the
-        // lock, so use explicit block
-        let tenants_dir = {
-            let mut state = TIMELINES_STATE.lock().unwrap();
-            assert!(state.wal_backup_launcher_tx.is_none());
-            state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
-            state.conf = Some(conf);
+        let mut state = TIMELINES_STATE.lock().unwrap();
+        assert!(state.wal_backup_launcher_tx.is_none());
+        state.wal_backup_launcher_tx = Some(wal_backup_launcher_tx);
+        state.conf = Some(conf);

-            // Iterate through all directories and load tenants for all directories
-            // named as a valid tenant_id.
-            state.get_conf().workdir.clone()
-        };
+        // Iterate through all directories and load tenants for all directories
+        // named as a valid tenant_id.
        let mut tenant_count = 0;
+        let tenants_dir = state.get_conf().workdir.clone();
        for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
            .with_context(|| format!("failed to list tenants dir {}", tenants_dir.display()))?
        {
@@ -97,7 +93,7 @@ impl GlobalTimelines {
                        TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        tenant_count += 1;
-                        GlobalTimelines::load_tenant_timelines(tenant_id).await?;
+                        GlobalTimelines::load_tenant_timelines(&mut state, tenant_id)?;
                    }
                }
                Err(e) => error!(
@@ -112,7 +108,7 @@ impl GlobalTimelines {
        info!(
            "found {} tenants directories, successfully loaded {} timelines",
            tenant_count,
-            TIMELINES_STATE.lock().unwrap().timelines.len()
+            state.timelines.len()
        );
        Ok(())
    }
@@ -120,21 +116,17 @@ impl GlobalTimelines {
    /// Loads all timelines for the given tenant to memory. Returns fs::read_dir
    /// errors if any.
    ///
-    /// It is async for update_status_notify sake. Since TIMELINES_STATE lock is
-    /// sync and there is no important reason to make it async (it is always
-    /// held for a short while) we just lock and unlock it for each timeline --
-    /// this function is called during init when nothing else is running, so
-    /// this is fine.
-    async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
-        let (conf, wal_backup_launcher_tx) = {
-            let state = TIMELINES_STATE.lock().unwrap();
-            (
-                state.get_conf().clone(),
-                state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
-            )
-        };
-
-        let timelines_dir = conf.tenant_dir(&tenant_id);
+    /// Note: This function (and all reading/loading below) is sync because
+    /// timelines are loaded while holding the GlobalTimelinesState lock. That
+    /// is fine as this is called only from the single-threaded main runtime on
+    /// boot, but clippy complains anyway, and suppressing that isn't trivial as
+    /// async is the keyword, ha. The only other user is pull_timeline.rs, for
+    /// which being blocked is not that bad, and we can do spawn_blocking.
+    fn load_tenant_timelines(
+        state: &mut MutexGuard<'_, GlobalTimelinesState>,
+        tenant_id: TenantId,
+    ) -> Result<()> {
+        let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
        for timelines_dir_entry in std::fs::read_dir(&timelines_dir)
            .with_context(|| format!("failed to list timelines dir {}", timelines_dir.display()))?
        {
@@ -144,16 +136,13 @@ impl GlobalTimelines {
                        TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
                    {
                        let ttid = TenantTimelineId::new(tenant_id, timeline_id);
-                        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx.clone()) {
+                        match Timeline::load_timeline(
+                            state.get_conf().clone(),
+                            ttid,
+                            state.wal_backup_launcher_tx.as_ref().unwrap().clone(),
+                        ) {
                            Ok(timeline) => {
-                                let tli = Arc::new(timeline);
-                                TIMELINES_STATE
-                                    .lock()
-                                    .unwrap()
-                                    .timelines
-                                    .insert(ttid, tli.clone());
-                                tli.bootstrap(&conf);
-                                tli.update_status_notify().await.unwrap();
+                                state.timelines.insert(ttid, Arc::new(timeline));
                            }
                            // If we can't load a timeline, it's most likely because of a corrupted
                            // directory. We will log an error and won't allow to delete/recreate
@@ -179,22 +168,18 @@ impl GlobalTimelines {
    }

    /// Load timeline from disk to the memory.
-    pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
+    pub fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
        let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();

-        match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {
+        match Timeline::load_timeline(conf, ttid, wal_backup_launcher_tx) {
            Ok(timeline) => {
                let tli = Arc::new(timeline);
-
                // TODO: prevent concurrent timeline creation/loading
                TIMELINES_STATE
                    .lock()
                    .unwrap()
                    .timelines
                    .insert(ttid, tli.clone());
-
-                tli.bootstrap(&conf);
-
                Ok(tli)
            }
            // If we can't load a timeline, it's bad. Caller will figure it out.
@@ -232,7 +217,7 @@ impl GlobalTimelines {
        info!("creating new timeline {}", ttid);

        let timeline = Arc::new(Timeline::create_empty(
-            &conf,
+            conf,
            ttid,
            wal_backup_launcher_tx,
            server_info,
@@ -255,24 +240,23 @@ impl GlobalTimelines {
            // Write the new timeline to the disk and start background workers.
            // Bootstrap is transactional, so if it fails, the timeline will be deleted,
            // and the state on disk should remain unchanged.
-            if let Err(e) = timeline.init_new(&mut shared_state, &conf).await {
-                // Note: the most likely reason for init failure is that the timeline
+            if let Err(e) = timeline.bootstrap(&mut shared_state).await {
+                // Note: the most likely reason for bootstrap failure is that the timeline
                // directory already exists on disk. This happens when timeline is corrupted
                // and wasn't loaded from disk on startup because of that. We want to preserve
                // the timeline directory in this case, for further inspection.

                // TODO: this is an unusual error, perhaps we should send it to sentry
                // TODO: compute will try to create timeline every second, we should add backoff
-                error!("failed to init new timeline {}: {}", ttid, e);
+                error!("failed to bootstrap timeline {}: {}", ttid, e);

-                // Timeline failed to init, it cannot be used. Remove it from the map.
+                // Timeline failed to bootstrap, it cannot be used. Remove it from the map.
                TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid);
                return Err(e);
            }
            // We are done with bootstrap, release the lock, return the timeline.
            // {} block forces release before .await
        }
-        timeline.update_status_notify().await?;
        timeline.wal_backup_launcher_tx.send(timeline.ttid).await?;
        Ok(timeline)
    }
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -12,26 +12,25 @@ import psycopg2.extras
 # We call the test "flaky" if it failed at least once on the main branch in the last N=10 days.
 FLAKY_TESTS_QUERY = """
    SELECT
-        DISTINCT parent_suite, suite, REGEXP_REPLACE(test, '(release|debug)-pg(\\d+)-?', '') as deparametrized_test
+        DISTINCT parent_suite, suite, test
    FROM
        (
            SELECT
-                reference,
-                jsonb_array_elements(data -> 'children') ->> 'name' as parent_suite,
-                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') ->> 'name' as suite,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'name' as test,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'status' as status,
-                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') ->> 'retriesStatusChange' as retries_status_change,
-                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' ->> 'start')::bigint / 1000)::date as timestamp
+                revision,
+                jsonb_array_elements(data -> 'children') -> 'name' as parent_suite,
+                jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
+                to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
            FROM
                regress_test_results
+            WHERE
+                reference = 'refs/heads/main'
        ) data
    WHERE
        timestamp > CURRENT_DATE - INTERVAL '%s' day
-        AND (
-            (status IN ('failed', 'broken') AND reference = 'refs/heads/main')
-            OR retries_status_change::boolean
-        )
+        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
    ;
 """

@@ -41,9 +40,6 @@ def main(args: argparse.Namespace):
    interval_days = args.days
    output = args.output

-    build_type = args.build_type
-    pg_version = args.pg_version
-
    res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]]
    res = defaultdict(lambda: defaultdict(dict))

@@ -59,21 +55,8 @@ def main(args: argparse.Namespace):
        rows = []

    for row in rows:
-        # We don't want to automatically rerun tests in a performance suite
-        if row["parent_suite"] != "test_runner.regress":
-            continue
-
-        deparametrized_test = row["deparametrized_test"]
-        dash_if_needed = "" if deparametrized_test.endswith("[]") else "-"
-        parametrized_test = deparametrized_test.replace(
-            "[",
-            f"[{build_type}-pg{pg_version}{dash_if_needed}",
-        )
-        res[row["parent_suite"]][row["suite"]][parametrized_test] = True
-
-        logging.info(
-            f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}"
-        )
+        logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}")
+        res[row["parent_suite"]][row["suite"]][row["test"]] = True

    logging.info(f"saving results to {output.name}")
    json.dump(res, output, indent=2)
@@ -94,18 +77,6 @@ if __name__ == "__main__":
        type=int,
        help="how many days to look back for flaky tests (default: 10)",
    )
-    parser.add_argument(
-        "--build-type",
-        required=True,
-        type=str,
-        help="for which build type to create list of flaky tests (debug or release)",
-    )
-    parser.add_argument(
-        "--pg-version",
-        required=True,
-        type=int,
-        help="for which Postgres version to create list of flaky tests (14, 15, etc.)",
-    )
    parser.add_argument(
        "connstr",
        help="connection string to the test results database",
--- a/storage_broker/benches/rps.rs
+++ b/storage_broker/benches/rps.rs
@@ -125,7 +125,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                    tenant_id: vec![0xFF; 16],
                    timeline_id: tli_from_u64(counter % n_keys),
                }),
-                term: 0,
                last_log_term: 0,
                flush_lsn: counter,
                commit_lsn: 2,
@@ -133,7 +132,6 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
                remote_consistent_lsn: 4,
                peer_horizon_lsn: 5,
                safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
-                http_connstr: "zenith-1-sk-1.local:7677".to_owned(),
                local_start_lsn: 0,
                availability_zone: None,
            };
--- a/storage_broker/proto/broker.proto
+++ b/storage_broker/proto/broker.proto
@@ -22,8 +22,6 @@ message SubscribeSafekeeperInfoRequest {
 message SafekeeperTimelineInfo {
    uint64 safekeeper_id = 1;
    TenantTimelineId tenant_timeline_id = 2;
-    // Safekeeper term
-    uint64 term = 12;
    // Term of the last entry.
    uint64 last_log_term = 3;
    // LSN of the last record.
@@ -38,8 +36,6 @@ message SafekeeperTimelineInfo {
    uint64 local_start_lsn = 9;
    // A connection string to use for WAL receiving.
    string safekeeper_connstr = 10;
-    // HTTP endpoint connection string
-    string http_connstr = 13;
    // Availability zone of a safekeeper.
    optional string availability_zone = 11;
 }
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -519,7 +519,6 @@ mod tests {
                tenant_id: vec![0x00; 16],
                timeline_id,
            }),
-            term: 0,
            last_log_term: 0,
            flush_lsn: 1,
            commit_lsn: 2,
@@ -527,7 +526,6 @@ mod tests {
            remote_consistent_lsn: 4,
            peer_horizon_lsn: 5,
            safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
-            http_connstr: "neon-1-sk-1.local:7677".to_owned(),
            local_start_lsn: 0,
            availability_zone: None,
        }
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -233,19 +233,10 @@ if TYPE_CHECKING:

 def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
    response = list_prefix(neon_env_builder, prefix)
-    keys = response["KeyCount"]
-    objects = response.get("Contents", [])
-
-    if keys != 0 and len(objects) == 0:
-        # this has been seen in one case with mock_s3:
-        # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
-        # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-        common_prefixes = response.get("CommonPrefixes", [])
-        log.warn(
-            f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
-        )
-
-    assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"


 def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
--- a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json
@@ -1,24 +0,0 @@
-{
-    "public_extensions": [
-        "anon",
-        "pg_buffercache"
-    ],
-    "library_index": {
-        "anon": "anon",
-        "pg_buffercache": "pg_buffercache"
-    },
-    "extension_data": {
-        "pg_buffercache": {
-            "control_data": {
-                "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true"
-            },
-            "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst"
-        },
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5670669815/v14/extensions/anon.tar.zst"
-        }
-    }
-}
--- a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst
--- a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst
--- a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json
+++ b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json
@@ -1,17 +0,0 @@
-{
-    "public_extensions": [
-        "anon"
-    ],
-    "library_index": {
-        "anon": "anon"
-    },
-    "extension_data": {
-        "anon": {
-            "control_data": {
-                "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n"
-            },
-            "archive_path": "5670669815/v15/extensions/anon.tar.zst"
-        }
-    }
-}
-
--- a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst
+++ b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -1,327 +0,0 @@
-import os
-import shutil
-import threading
-from contextlib import closing
-from pathlib import Path
-
-import pytest
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-)
-from fixtures.pg_version import PgVersion
-from fixtures.remote_storage import RemoteStorageKind, available_s3_storages
-
-
-# Cleaning up downloaded files is important for local tests
-# or else one test could reuse the files from another test or another test run
-def cleanup(pg_version):
-    PGDIR = Path(f"pg_install/v{pg_version}")
-
-    LIB_DIR = PGDIR / Path("lib/postgresql")
-    cleanup_lib_globs = ["anon*", "postgis*", "pg_buffercache*"]
-    cleanup_lib_glob_paths = [LIB_DIR.glob(x) for x in cleanup_lib_globs]
-
-    SHARE_DIR = PGDIR / Path("share/postgresql/extension")
-    cleanup_ext_globs = [
-        "anon*",
-        "address_standardizer*",
-        "postgis*",
-        "pageinspect*",
-        "pg_buffercache*",
-        "pgrouting*",
-    ]
-    cleanup_ext_glob_paths = [SHARE_DIR.glob(x) for x in cleanup_ext_globs]
-
-    all_glob_paths = cleanup_lib_glob_paths + cleanup_ext_glob_paths
-    all_cleanup_files = []
-    for file_glob in all_glob_paths:
-        for file in file_glob:
-            all_cleanup_files.append(file)
-
-    for file in all_cleanup_files:
-        try:
-            os.remove(file)
-            log.info(f"removed file {file}")
-        except Exception as err:
-            log.info(
-                f"skipping remove of file {file} because it doesn't exist.\
-                      this may be expected or unexpected depending on the test {err}"
-            )
-
-    cleanup_folders = [SHARE_DIR / Path("anon"), PGDIR / Path("download_extensions")]
-    for folder in cleanup_folders:
-        try:
-            shutil.rmtree(folder)
-            log.info(f"removed folder {folder}")
-        except Exception as err:
-            log.info(
-                f"skipping remove of folder {folder} because it doesn't exist.\
-                      this may be expected or unexpected depending on the test {err}"
-            )
-
-
-def upload_files(env):
-    log.info("Uploading test files to mock bucket")
-    os.chdir("test_runner/regress/data/extension_test")
-    for path in os.walk("."):
-        prefix, _, files = path
-        for file in files:
-            # the [2:] is to remove the leading "./"
-            full_path = os.path.join(prefix, file)[2:]
-
-            with open(full_path, "rb") as f:
-                log.info(f"UPLOAD {full_path} to ext/{full_path}")
-                env.remote_storage_client.upload_fileobj(
-                    f,
-                    env.ext_remote_storage.bucket_name,
-                    f"ext/{full_path}",
-                )
-    os.chdir("../../../..")
-
-
-# Test downloading remote extension.
-@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_remote_extensions(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    pg_version: PgVersion,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_remote_extensions",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id)
-
-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
-
-    # For MOCK_S3 we upload test files.
-    # For REAL_S3 we use the files already in the bucket
-    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
-        upload_files(env)
-
-    # Start a compute node and check that it can download the extensions
-    # and use them to CREATE EXTENSION and LOAD
-    endpoint = env.endpoints.create_start(
-        "test_remote_extensions",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        # config_lines=["log_min_messages=debug3"],
-    )
-    try:
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                # Check that appropriate control files were downloaded
-                cur.execute("SELECT * FROM pg_available_extensions")
-                all_extensions = [x[0] for x in cur.fetchall()]
-                log.info(all_extensions)
-                assert "anon" in all_extensions
-
-                # postgis is on real s3 but not mock s3.
-                # it's kind of a big file, would rather not upload to github
-                if remote_storage_kind == RemoteStorageKind.REAL_S3:
-                    assert "postgis" in all_extensions
-                    # this may fail locally if dependency is missing
-                    # we don't really care about the error,
-                    # we just want to make sure it downloaded
-                    try:
-                        cur.execute("CREATE EXTENSION postgis")
-                    except Exception as err:
-                        log.info(f"(expected) error creating postgis extension: {err}")
-                        # we do not check the error, so this is basically a NO-OP
-                        # however checking the log you can make sure that it worked
-                        # and also get valuable information about how long loading the extension took
-
-                # this is expected to fail on my computer because I don't have the pgcrypto extension
-                try:
-                    cur.execute("CREATE EXTENSION anon")
-                except Exception as err:
-                    log.info("error creating anon extension")
-                    assert "pgcrypto" in str(err), "unexpected error creating anon extension"
-    finally:
-        cleanup(pg_version)
-
-
-# Test downloading remote library.
-@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_remote_library(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    pg_version: PgVersion,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_remote_library",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
-
-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
-
-    # For MOCK_S3 we upload test files.
-    # For REAL_S3 we use the files already in the bucket
-    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
-        upload_files(env)
-
-    # and use them to run LOAD library
-    endpoint = env.endpoints.create_start(
-        "test_remote_library",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        # config_lines=["log_min_messages=debug3"],
-    )
-    try:
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                # try to load library
-                try:
-                    cur.execute("LOAD 'anon'")
-                except Exception as err:
-                    log.info(f"error loading anon library: {err}")
-                    raise AssertionError("unexpected error loading anon library") from err
-
-                # test library which name is different from extension name
-                # this may fail locally if dependency is missing
-                # however, it does successfully download the postgis archive
-                if remote_storage_kind == RemoteStorageKind.REAL_S3:
-                    try:
-                        cur.execute("LOAD 'postgis_topology-3'")
-                    except Exception as err:
-                        log.info("error loading postgis_topology-3")
-                        assert "No such file or directory" in str(
-                            err
-                        ), "unexpected error loading postgis_topology-3"
-    finally:
-        cleanup(pg_version)
-
-
-# Here we test a complex extension
-# which has multiple extensions in one archive
-# using postgis as an example
-# @pytest.mark.skipif(
-#    RemoteStorageKind.REAL_S3 not in available_s3_storages(),
-#    reason="skipping test because real s3 not enabled",
-# )
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_multiple_extensions_one_archive(
-    neon_env_builder: NeonEnvBuilder,
-    pg_version: PgVersion,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=RemoteStorageKind.REAL_S3,
-        test_name="test_multiple_extensions_one_archive",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
-
-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
-
-    endpoint = env.endpoints.create_start(
-        "test_multiple_extensions_one_archive",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-    )
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CREATE EXTENSION address_standardizer;")
-            cur.execute("CREATE EXTENSION address_standardizer_data_us;")
-            # execute query to ensure that it works
-            cur.execute(
-                "SELECT house_num, name, suftype, city, country, state, unit \
-                        FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
-                        'One Rust Place, Boston, MA 02109');"
-            )
-            res = cur.fetchall()
-            log.info(res)
-            assert len(res) > 0
-
-    cleanup(pg_version)
-
-
-# Test that extension is downloaded after endpoint restart,
-# when the library is used in the query.
-#
-# Run the test with mutliple simultaneous connections to an endpoint.
-# to ensure that the extension is downloaded only once.
-#
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_extension_download_after_restart(
-    neon_env_builder: NeonEnvBuilder,
-    pg_version: PgVersion,
-):
-    if "15" in pg_version:  # SKIP v15 for now because test set only has extension built for v14
-        return None
-
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=RemoteStorageKind.MOCK_S3,
-        test_name="test_extension_download_after_restart",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
-
-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
-
-    # For MOCK_S3 we upload test files.
-    upload_files(env)
-
-    endpoint = env.endpoints.create_start(
-        "test_extension_download_after_restart",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        config_lines=["log_min_messages=debug3"],
-    )
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CREATE extension pg_buffercache;")
-            cur.execute("SELECT * from pg_buffercache;")
-            res = cur.fetchall()
-            assert len(res) > 0
-            log.info(res)
-
-    # shutdown compute node
-    endpoint.stop()
-    # remove extension files locally
-    cleanup(pg_version)
-
-    # spin up compute node again (there are no extension files available, because compute is stateless)
-    endpoint = env.endpoints.create_start(
-        "test_extension_download_after_restart",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        config_lines=["log_min_messages=debug3"],
-    )
-
-    # connect to compute node and run the query
-    # that will trigger the download of the extension
-    def run_query(endpoint, thread_id: int):
-        log.info("thread_id {%d} starting", thread_id)
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                cur.execute("SELECT * from pg_buffercache;")
-                res = cur.fetchall()
-                assert len(res) > 0
-                log.info("thread_id {%d}, res = %s", thread_id, res)
-
-    threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
-
-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
-
-    cleanup(pg_version)
--- a/vm-cgconfig.conf
+++ b/vm-cgconfig.conf
@@ -0,0 +1,12 @@
+# Configuration for cgroups in VM compute nodes
+group neon-postgres {
+    perm {
+        admin {
+            uid = vm-informant;
+        }
+        task {
+            gid = users;
+        }
+    }
+    memory {}
+}
Author	SHA1	Message	Date
Alek Westover	8930a0c4b1	delete data	2023-08-25 09:07:50 -04:00
Alek Westover	b944c6ca23	delete remote extension regression test	2023-08-25 09:07:50 -04:00