hack: allow using async inside Tenant::activate

refactor: prepare to allow async code inside Tenant::state.send_modify()
compute-node-image: fix postgis download (#4280 )
2026-06-01 12:30:38 +00:00 · 2023-05-22 11:40:55 +02:00 · 2023-05-22 11:39:50 +02:00 · 2023-05-19 15:34:22 +01:00 · 2023-05-19 14:35:33 +03:00 · 2023-05-18 21:16:09 -04:00
19 changed files with 577 additions and 202 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -14,3 +14,4 @@ opt-level = 1

 [alias]
 build_testing = ["build", "--features", "testing"]
+neon = ["run", "--bin", "neon_local"]
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -71,12 +71,12 @@ runs:
        path: /tmp/neon-previous
        prefix: latest

-    - name: Download compatibility snapshot for Postgres 14
-      if: inputs.build_type != 'remote' && inputs.pg_version == 'v14'
+    - name: Download compatibility snapshot
+      if: inputs.build_type != 'remote'
      uses: ./.github/actions/download
      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14
-        path: /tmp/compatibility_snapshot_pg14
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
+        path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        prefix: latest

    - name: Checkout
@@ -106,7 +106,7 @@ runs:
        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
-        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
+        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
        RERUN_FLAKY: ${{ inputs.rerun_flaky }}
@@ -197,13 +197,13 @@ runs:
          scripts/generate_and_push_perf_report.sh
        fi

-    - name: Upload compatibility snapshot for Postgres 14
-      if: github.ref_name == 'release' && inputs.pg_version == 'v14'
+    - name: Upload compatibility snapshot
+      if: github.ref_name == 'release'
      uses: ./.github/actions/upload
      with:
-        name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
+        name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
-        path: /tmp/test_output/compatibility_snapshot_pg14/
+        path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
        prefix: latest

    - name: Upload test results
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -711,7 +711,11 @@ jobs:

  compute-node-image:
    runs-on: [ self-hosted, gen3, large ]
-    container: gcr.io/kaniko-project/executor:v1.9.2-debug
+    container:
+      image: gcr.io/kaniko-project/executor:v1.9.2-debug
+      # Workaround for "Resolving download.osgeo.org (download.osgeo.org)... failed: Temporary failure in name resolution."
+      # Should be prevented by https://github.com/neondatabase/neon/issues/4281
+      options: --add-host=download.osgeo.org:140.211.15.30
    needs: [ tag ]
    strategy:
      fail-fast: false
@@ -957,7 +961,7 @@ jobs:
  promote-compatibility-data:
    runs-on: [ self-hosted, gen3, small ]
    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
    needs: [ promote-images, tag, regress-tests ]
    if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch'
@@ -968,11 +972,13 @@ jobs:
          PREFIX: artifacts/latest
        run: |
          # Update compatibility snapshot for the release
-          for build_type in debug release; do
-            OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
-            NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
+          for pg_version in v14 v15; do
+            for build_type in debug release; do
+              OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst
+              NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst

-            time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+              time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
+            done
          done

          # Update Neon artifact for the release (reuse already uploaded artifact)
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -415,6 +415,23 @@ RUN apt-get update && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control

+#########################################################################################
+#
+# Layer "pg-cron-pg-build"
+# compile pg_cron extension
+#
+#########################################################################################
+FROM build-deps AS pg-cron-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O pg_cron.tar.gz && \
+    echo "6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 pg_cron.tar.gz" | sha256sum --check && \
+    mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
+
 #########################################################################################
 #
 # Layer "rust extensions"
@@ -529,6 +546,7 @@ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/README.md
+++ b/README.md
@@ -130,11 +130,11 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
 ```sh
 # Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
-> ./target/debug/neon_local init
+> cargo neon init
 Starting pageserver at '127.0.0.1:64000' in '.neon'.

 # start pageserver, safekeeper, and broker for their intercommunication
-> ./target/debug/neon_local start
+> cargo neon start
 Starting neon broker at 127.0.0.1:50051
 storage_broker started, pid: 2918372
 Starting pageserver at '127.0.0.1:64000' in '.neon'.
@@ -143,19 +143,19 @@ Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
 safekeeper 1 started, pid: 2918437

 # create initial tenant and use it as a default for every future neon_local invocation
-> ./target/debug/neon_local tenant create --set-default
+> cargo neon tenant create --set-default
 tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one

 # start postgres compute node
-> ./target/debug/neon_local endpoint start main
+> cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

 # check list of running postgres instances
-> ./target/debug/neon_local endpoint list
+> cargo neon endpoint list
 ENDPOINT  ADDRESS          TIMELINE                          BRANCH NAME  LSN        STATUS
 main      127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```
@@ -177,22 +177,22 @@ postgres=# select * from t;
 3. And create branches and run postgres on them:
 ```sh
 # create branch named migration_check
-> ./target/debug/neon_local timeline branch --branch-name migration_check
+> cargo neon timeline branch --branch-name migration_check
 Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'

 # check branches tree
-> ./target/debug/neon_local timeline list
+> cargo neon timeline list
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

 # start postgres on that branch
-> ./target/debug/neon_local endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check --branch-name migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
 Starting postgres at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'

 # check the new list of running postgres instances
-> ./target/debug/neon_local endpoint list
+> cargo neon endpoint list
 ENDPOINT         ADDRESS          TIMELINE                          BRANCH NAME      LSN        STATUS
 main             127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main             0/16F9A38  running
 migration_check  127.0.0.1:55433  b3b863fa45fa9e57e615f9f2d944e601  migration_check  0/16F9A70  running
@@ -221,7 +221,7 @@ postgres=# select * from t;
 4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
   you have just started. You can terminate them all with one command:
 ```sh
-> ./target/debug/neon_local stop
+> cargo neon stop
 ```

 ## Running tests
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -134,6 +134,7 @@ pub struct Endpoint {

    // port and address of the Postgres server
    pub address: SocketAddr,
+    // postgres major version in the format: 14, 15, etc.
    pg_version: u32,

    // These are not part of the endpoint as such, but the environment
@@ -381,6 +382,11 @@ impl Endpoint {
                conf.append("primary_conninfo", connstr.as_str());
                conf.append("primary_slot_name", slot_name.as_str());
                conf.append("hot_standby", "on");
+                // prefetching of blocks referenced in WAL doesn't make sense for us
+                // Neon hot standby ignores pages that are not in the shared_buffers
+                if self.pg_version >= 15 {
+                    conf.append("recovery_prefetch", "off");
+                }
            }
        }

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -4,6 +4,11 @@ The pageserver uses Tokio for handling concurrency. Everything runs in
 Tokio tasks, although some parts are written in blocking style and use
 spawn_blocking().

+We currently use std blocking functions for disk I/O, however.  The
+current model is that we consider disk I/Os to be short enough that we
+perform them while running in a Tokio task. Changing all the disk I/O
+calls to async is a TODO.
+
 Each Tokio task is tracked by the `task_mgr` module. It maintains a
 registry of tasks, and which tenant or timeline they are operating
 on.
@@ -21,19 +26,86 @@ also a `shudown_watcher()` Future that can be used with `tokio::select!`
 or similar, to wake up on shutdown.


-### Sync vs async
+### Async cancellation safety

-We use async to wait for incoming data on network connections, and to
-perform other long-running operations. For example, each WAL receiver
-connection is handled by a tokio Task. Once a piece of WAL has been
-received from the network, the task calls the blocking functions in
-the Repository to process the WAL.
+In async Rust, futures can be "cancelled" at any await point, by
+dropping the Future. For example, `tokio::select!` returns as soon as
+one of the Futures returns, and drops the others. `tokio::time::timeout` is
+another example. In the Rust ecosystem, some functions are
+cancellation-safe, meaning they can be safely dropped without
+side-effects, while others are not. See documentation of
+`tokio::select!` for examples.

-The core storage code in `layered_repository/` is synchronous, with
-blocking locks and I/O calls. The current model is that we consider
-disk I/Os to be short enough that we perform them while running in a
-Tokio task. If that becomes a problem, we should use `spawn_blocking`
-before entering the synchronous parts of the code, or switch to using
-tokio I/O functions.
+In the pageserver and safekeeper, async code is *not*
+cancellation-safe by default. Unless otherwise marked, any async
+function that you call cannot be assumed to be async
+cancellation-safe, and must be polled to completion.

-Be very careful when mixing sync and async code!
+The downside of non-cancellation-safe code is that you have to be very
+careful when using `tokio::select!`, `tokio::time::timeout`, and other such
+functions that can cause a Future to be dropped. They can only be used
+with functions that are explicitly documented to be cancellation-safe,
+or you need to spawn a separate task to shield from the cancellation.
+
+At the entry points to the code, we also take care to poll futures to
+completion, or shield the rest of the code from surprise cancellations
+by spawning a separate task. The code that handles incoming HTTP
+requests, for example, spawns a separate task for each request,
+because Hyper will drop the request-handling Future if the HTTP
+connection is lost.  (FIXME: our HTTP handlers do not do that
+currently, but we should fix that. See [issue
+3478](https://github.com/neondatabase/neon/issues/3478)).
+
+
+#### How to cancel, then?
+
+If our code is not cancellation-safe, how do you cancel long-running
+tasks? Use CancellationTokens.
+
+TODO: More details on that. And we have an ongoing discussion on what
+to do if cancellations might come from multiple sources.
+
+#### Exceptions
+Some library functions are cancellation-safe, and are explicitly marked
+as such. For example, `utils::seqwait`.
+
+#### Rationale
+
+The alternative would be to make all async code cancellation-safe,
+unless otherwise marked. That way, you could use `tokio::select!` more
+liberally. The reasons we didn't choose that are explained in this
+section.
+
+Writing code in a cancellation-safe manner is tedious, as you need to
+scrutinize every `.await` and ensure that if the `.await` call never
+returns, the system is in a safe, consistent state. In some ways, you
+need to do that with `?` and early `returns`, too, but `.await`s are
+easier to miss. It is also easier to perform cleanup tasks when a
+function returns an `Err` than when an `.await` simply never
+returns. You can use `scopeguard` and Drop guards to perform cleanup
+tasks, but it is more tedious. An `.await` that never returns is more
+similar to a panic.
+
+Note that even if you only use building blocks that themselves are
+cancellation-safe, it doesn't mean that the code as a whole is
+cancellation-safe. For example, consider the following code:
+
+```
+while let Some(i) = work_inbox.recv().await {
+	// if cancelled at the send below, `i` is lost
+	if let Err(_) = results_outbox.send(i).await {
+		println!("receiver dropped");
+		return;
+	}
+}
+```
+
+It reads messages from one channel, sends them to another channel. If
+this code is cancelled at the `results_outbox.send(i).await`, the
+message read from the receiver is lost. That may or may not be OK,
+depending on the context.
+
+Another reason to not require cancellation-safety is historical: we
+already had a lot of async code that was not scrutinized for
+cancellation-safety when this issue was raised. Scrutinizing all
+existing code is no fun.
--- a/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
+++ b/docs/rfcs/023-the-state-of-pageserver-tenant-relocation.md
@@ -0,0 +1,232 @@
+# The state of pageserver tenant relocation
+
+Created on 17.03.23
+
+## Motivation
+
+There were previous write-ups on the subject. The design of tenant relocation was planned at a time when we had a quite different landscape, i.e. there was no on-demand download/eviction. They were on the horizon, but we still planned for cases when they were not available. Some other things have changed as well: safekeepers now offload WAL to S3, so we're not risking overflowing their disks. Given all of the above, it makes sense to recap and take a look at the options we have now, which adjustments we'd like to make to the original process, etc.
+
+Related (in chronological order):
+
+- Tracking issue with initial discussion: [#886](https://github.com/neondatabase/neon/issues/886)
+- [015. Storage Messaging](015-storage-messaging.md)
+- [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md)
+
+## Summary
+
+The RFC consists of a walkthrough of prior art on tenant relocation and corresponding problems. It describes 3 approaches.
+
+1. Simplistic approach that uses ignore and is the fastest to implement. The main downside is a requirement of short downtime.
+2. More complicated approach that avoids even short downtime.
+3. Even more complicated approach that will allow multiple pageservers to operate concurrently on the same tenant, possibly allowing for HA cluster topologies and horizontal scaling of reads (i.e. compute talks to multiple pageservers).
+
+The order in which solutions are described is a bit different. We start from 2, then move to possible compromises (aka simplistic approach) and then move to discussing directions for solving HA/Pageserver replica case with 3.
+
+## Components
+
+pageserver, control-plane, safekeepers (a bit)
+
+## Requirements
+
+Relocation procedure should move tenant from one pageserver to another without downtime introduced by storage side. For now restarting compute for applying new configuration is fine.
+
+- component restarts
+- component outage
+- pageserver loss
+
+## The original proposed implementation
+
+The starting point is this sequence:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS2: Attach tenant X
+    PS2->>S3: Fetch timelines, indexes for them
+    PS2->>CP: Accepted
+    CP->>CP: Change pageserver id in project
+    CP->>PS1: Detach
+```
+
+Which problems do we have with naive approach?
+
+### Concurrent GC and Compaction
+
+The problem is that they can run on both PS1 and PS2. Consider this example from [Pageserver S3 Coordination RFC](020-pageserver-s3-coordination.md)
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant PS1
+    participant S3
+    participant PS2
+
+    PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
+    PS2->>S3: Attach called, sees L1, L2
+    PS1->>S3: Compaction comes <br/> Removes L1, adds L3
+    note over S3: Index now L2, L3
+    PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
+    note over S3: Index now L1, L2, L4
+```
+
+At this point it is not possible to restore the state from the index: it contains L1, which
+is no longer available in S3, and doesn't contain L3 added by compaction on the
+first pageserver. So if any of the pageservers restarts, initial sync will fail
+(or in the on-demand world it will fail a bit later, during a page request from
+the missing layer).
+
+The problem lies in shared index_part.json. Having intersecting layers from append only edits is expected to work, though this is an uncharted territory without tests.
+
+#### Options
+
+There are several options on how to restrict concurrent access to index file.
+
+First and the simplest one is external orchestration. Control plane which runs migration can use special api call on pageserver to stop background processes (gc, compaction), and even possibly all uploads.
+
+So the sequence becomes:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X.
+    PS2->>S3: Fetch timelines, index, start background operations
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>PS1: Detach
+```
+
+The downside of this sequence is the potential rollback process. What if something goes wrong on new pageserver? Can we safely roll back to source pageserver?
+
+There are two questions:
+
+#### How can we detect that something went wrong?
+
+We can run usual availability check (consists of compute startup and an update of one row).
+Note that we can't run a separate compute for that before touching the compute that the client runs its actual workload on, because we can't have two simultaneous computes running in read-write mode on the same timeline (enforced by the safekeepers' consensus algorithm). So we can run some read-only check first (basebackup), then change the pageserver id and run the availability check. If it fails, we can roll back to the old pageserver.
+
+#### What can go wrong? And how we can safely roll-back?
+
+In the sequence above during attach we start background processes/uploads. They change state in remote storage so it is possible that after rollback remote state will be different from one that was observed by source pageserver. So if target pageserver goes wild then source pageserver may fail to start with changed remote state.
+
+Proposed option would be to implement a barrier (read-only) mode when pageserver does not update remote state.
+
+So the sequence for happy path becomes this one:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X in remote readonly mode.
+    PS2->>S3: Fetch timelines, index
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Run successful availability check
+    CP->>PS2: Start uploads, background tasks
+    CP->>PS1: Detach
+```
+
+With this sequence we restrict any changes to remote storage to one pageserver. So there is no concurrent access at all, not only for index_part.json, but for everything else too. This approach makes it possible to roll back after failure on new pageserver.
+
+The sequence with roll back process:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant S3
+
+    CP->>PS1: Pause background jobs, pause uploading new layers.
+    CP->>PS2: Attach tenant X in remote readonly mode.
+    PS2->>S3: Fetch timelines, index
+    PS2->>CP: Accepted
+    CP->>CP: Monitor PS2 last record lsn, ensure OK lag
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Availability check Failed
+    CP->>CP: Change pageserver id back
+    CP->>PS1: Resume remote operations
+    CP->>PS2: Ignore (instead of detach for investigation purposes)
+```
+
+## Concurrent branch creation
+
+Another problem is a possibility of concurrent branch creation calls.
+
+I.e. during migration create_branch can be called on the old pageserver, and the newly created branch won't be seen on the new pageserver. Prior art includes prototyping an approach of trying to mirror such branches, but it has since lost its importance: attach is now fast because we don't need to download all data, and additionally, to the best of my knowledge of control plane internals (cc @ololobus to confirm), operations on one project are executed sequentially, so it is not possible to have such a case. So a branch create operation will be executed only when relocation is completed. As a safety measure we can forbid branch creation for tenants that are in readonly remote state.
+
+## Simplistic approach
+
+The difference of simplistic approach from one described above is that it calls ignore on source tenant first and then calls attach on target pageserver. Approach above does it in opposite order thus opening a possibility for race conditions we strive to avoid.
+
+The approach largely follows this guide: <https://github.com/neondatabase/cloud/wiki/Cloud:-Ad-hoc-tenant-relocation>
+
+The happy path sequence:
+
+```mermaid
+sequenceDiagram
+    autonumber
+    participant CP as Control Plane
+    participant PS1 as Pageserver 1
+    participant PS2 as Pageserver 2
+    participant SK as Safekeeper
+    participant S3
+
+    CP->>CP: Enable maintenance mode
+    CP->>PS1: Ignore
+    CP->>PS2: Attach
+    PS2->>CP: Accepted
+    loop Wait for catch-up on each timeline
+        CP->>PS2: Get last record lsn
+        CP->>SK: Get commit lsn
+        CP->>CP: OK? Timed out?
+    end
+    CP->>CP: Change pageserver id in project
+    CP->>CP: Run successful availability check
+    CP->>CP: Disable maintenance mode
+    CP->>PS1: Detach ignored
+```
+
+The sequence contains exactly the same rollback problems as in previous approach described above. They can be resolved the same way.
+
+Most probably we'd like to move forward without this safety measure and implement it on top of this approach to make progress towards the downtime-less one.
+
+## Lease based approach
+
+In order to allow for concurrent operation on the same data on remote storage for multiple pageservers we need to go further than external orchestration.
+
+NOTE: [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) discusses one more approach that relies on duplication of index_part.json for each pageserver operating on the timeline. This approach still requires external coordination which makes certain things easier but requires additional bookkeeping to account for multiple index_part.json files. Discussion/comparison with proposed lease based approach
+
+The problems are outlined in [020. Pageserver S3 Coordination](020-pageserver-s3-coordination.md) and suggested solution includes [Coordination based approach](020-pageserver-s3-coordination.md#coordination-based-approach). This way it will allow to do basic leader election for pageservers so they can decide which node will be responsible for running GC and compaction. The process is based on extensive communication via storage broker and consists of a lease that is taken by one of the pageservers that extends it to continue serving a leader role.
+
+There are two options for ingesting new data into a pageserver in the follower role. One option is to avoid WAL ingestion altogether and rely on notifications from the leader to discover new layers on S3. The main downside of this approach is that the follower will always lag behind the primary node, because it won't have the last layer until it is uploaded to remote storage. In case of a primary failure the follower will be required to reingest the last segment (up to 256Mb of WAL currently), which slows down recovery. Additionally, if compute is connected to the follower pageserver it will observe the latest data with a delay. Queries from compute will likely experience bigger delays when a recent lsn is required.
+
+The second option is to consume WAL stream on both pageservers. In this case the only problem is non deterministic layer generation. Additional bookkeeping will be required to deduplicate layers from primary with local ones. Some process needs to somehow merge them to remove duplicated data. Additionally we need to have good testing coverage to ensure that our implementation of `get_page@lsn` properly handles intersecting layers.
+
+There is another tradeoff. Approaches may be different in amount of traffic between system components. With first approach there can be increased traffic between follower and remote storage. But only in case follower has some activity that actually requests pages (!). With other approach traffic increase will be permanent and will be caused by two WAL streams instead of one.
+
+## Summary
+
+Proposed implementation strategy:
+
+Go with the simplest approach for now. Then work on tech debt, increase test coverage. Then gradually move forward to second approach by implementing safety measures first, finishing with switch of order between ignore and attach operation.
+
+And only then go to lease based approach to solve HA/Pageserver replica use cases.
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1,9 +1,7 @@
 use std::{
    collections::HashMap,
-    marker::PhantomData,
    num::{NonZeroU64, NonZeroUsize},
    time::SystemTime,
-    unreachable,
 };

 use byteorder::{BigEndian, ReadBytesExt};
@@ -133,7 +131,7 @@ pub struct TimelineCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize, Default)]
+#[derive(Serialize, Deserialize, Debug, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
    #[serde(default)]
@@ -151,7 +149,7 @@ impl std::ops::Deref for TenantCreateRequest {
    }
 }

-#[derive(Serialize, Deserialize, Default)]
+#[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
    pub checkpoint_timeout: Option<String>,
@@ -195,13 +193,13 @@ impl TenantCreateRequest {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantConfigRequest {
    #[serde_as(as = "DisplayFromStr")]
    pub tenant_id: TenantId,
    #[serde(flatten)]
-    pub config: TenantConfig,
+    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }

 impl std::ops::Deref for TenantConfigRequest {
@@ -772,4 +770,31 @@ mod tests {
        assert!(format!("{:?}", &original_broken.state).contains("reason"));
        assert!(format!("{:?}", &original_broken.state).contains("backtrace info"));
    }
+
+    #[test]
+    fn test_reject_unknown_field() {
+        let id = TenantId::generate();
+        let create_request = json!({
+            "new_tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantCreateRequest>(create_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+
+        let id = TenantId::generate();
+        let config_request = json!({
+            "tenant_id": id.to_string(),
+            "unknown_field": "unknown_value".to_string(),
+        });
+        let err = serde_json::from_value::<TenantConfigRequest>(config_request).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field `unknown_field`"),
+            "expect unknown field `unknown_field` error, got: {}",
+            err
+        );
+    }
 }
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -144,6 +144,8 @@ where
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
+    ///
+    /// This function is async cancellation-safe.
    pub async fn wait_for(&self, num: V) -> Result<(), SeqWaitError> {
        match self.queue_for_wait(num) {
            Ok(None) => Ok(()),
@@ -159,6 +161,8 @@ where
    ///
    /// If that hasn't happened after the specified timeout duration,
    /// [`SeqWaitError::Timeout`] will be returned.
+    ///
+    /// This function is async cancellation-safe.
    pub async fn wait_for_timeout(
        &self,
        num: V,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,10 +12,12 @@
 //!

 use anyhow::{bail, Context};
+use either::Either;
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
+use std::future::Future;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
 use tracing::*;
@@ -1592,78 +1594,88 @@ impl Tenant {
        debug_assert_current_span_has_tenant_id();

        let mut result = Ok(());
-        self.state.send_modify(|current_state| {
+        Self::state_send_modify_async(&self.state, |current_state| {
            match &*current_state {
                TenantState::Active => {
                    // activate() was called on an already Active tenant. Shouldn't happen.
                    result = Err(anyhow::anyhow!("Tenant is already active"));
+                    Either::Left(None)
                }
                TenantState::Broken { reason, .. } => {
                    // This shouldn't happen either
                    result = Err(anyhow::anyhow!(
                        "Could not activate tenant because it is in broken state due to: {reason}",
                    ));
+                    Either::Left(None)
                }
                TenantState::Stopping => {
                    // The tenant was detached, or system shutdown was requested, while we were
                    // loading or attaching the tenant.
                    info!("Tenant is already in Stopping state, skipping activation");
+                    Either::Left(None)
                }
                TenantState::Loading | TenantState::Attaching => {
                    *current_state = TenantState::Active;
+                    let mut post_state = TenantState::Active;
+                    Either::Right(
+                        async move {
+                            debug!(tenant_id = %self.tenant_id, "Activating tenant");
+                            let timelines_accessor = self.timelines.lock().unwrap();
+                            let not_broken_timelines =
+                                timelines_accessor.values().filter(|timeline| {
+                                    timeline.current_state() != TimelineState::Broken
+                                });

-                    debug!(tenant_id = %self.tenant_id, "Activating tenant");
+                            // Spawn gc and compaction loops. The loops will shut themselves
+                            // down when they notice that the tenant is inactive.
+                            tasks::start_background_loops(self.tenant_id);

-                    let timelines_accessor = self.timelines.lock().unwrap();
-                    let not_broken_timelines = timelines_accessor
-                        .values()
-                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+                            let mut activated_timelines = 0;
+                            let mut timelines_broken_during_activation = 0;

-                    // Spawn gc and compaction loops. The loops will shut themselves
-                    // down when they notice that the tenant is inactive.
-                    tasks::start_background_loops(self.tenant_id);
+                            for timeline in not_broken_timelines {
+                                match timeline
+                                    .activate(ctx)
+                                    .context("timeline activation for activating tenant")
+                                {
+                                    Ok(()) => {
+                                        activated_timelines += 1;
+                                    }
+                                    Err(e) => {
+                                        error!(
+                                            "Failed to activate timeline {}: {:#}",
+                                            timeline.timeline_id, e
+                                        );
+                                        timeline.set_state(TimelineState::Broken);
+                                        post_state = TenantState::broken_from_reason(format!(
+                                            "failed to activate timeline {}: {}",
+                                            timeline.timeline_id, e
+                                        ));

-                    let mut activated_timelines = 0;
-                    let mut timelines_broken_during_activation = 0;
-
-                    for timeline in not_broken_timelines {
-                        match timeline
-                            .activate(ctx)
-                            .context("timeline activation for activating tenant")
-                        {
-                            Ok(()) => {
-                                activated_timelines += 1;
+                                        timelines_broken_during_activation += 1;
+                                    }
+                                }
                            }
-                            Err(e) => {
-                                error!(
-                                    "Failed to activate timeline {}: {:#}",
-                                    timeline.timeline_id, e
-                                );
-                                timeline.set_state(TimelineState::Broken);
-                                *current_state = TenantState::broken_from_reason(format!(
-                                    "failed to activate timeline {}: {}",
-                                    timeline.timeline_id, e
-                                ));

-                                timelines_broken_during_activation += 1;
-                            }
+                            let elapsed = self.loading_started_at.elapsed();
+                            let total_timelines = timelines_accessor.len();
+
+                            // log a lot of stuff, because some tenants sometimes suffer from user-visible
+                            // times to activate. see https://github.com/neondatabase/neon/issues/4025
+                            info!(
+                                since_creation_millis = elapsed.as_millis(),
+                                tenant_id = %self.tenant_id,
+                                activated_timelines,
+                                timelines_broken_during_activation,
+                                total_timelines,
+                                post_state = <&'static str>::from(&post_state),
+                                "activation attempt finished"
+                            );
+
+                            Some(post_state)
                        }
-                    }
-
-                    let elapsed = self.loading_started_at.elapsed();
-                    let total_timelines = timelines_accessor.len();
-
-                    // log a lot of stuff, because some tenants sometimes suffer from user-visible
-                    // times to activate. see https://github.com/neondatabase/neon/issues/4025
-                    info!(
-                        since_creation_millis = elapsed.as_millis(),
-                        tenant_id = %self.tenant_id,
-                        activated_timelines,
-                        timelines_broken_during_activation,
-                        total_timelines,
-                        post_state = <&'static str>::from(&*current_state),
-                        "activation attempt finished"
-                    );
+                        .in_current_span(),
+                    )
                }
            }
        });
@@ -1672,30 +1684,34 @@ impl Tenant {

    /// Change tenant status to Stopping, to mark that it is being shut down
    pub fn set_stopping(&self) {
-        self.state.send_modify(|current_state| {
+        Self::state_send_modify_async(&self.state, |current_state| {
            match current_state {
                TenantState::Active | TenantState::Loading | TenantState::Attaching => {
                    *current_state = TenantState::Stopping;
-
-                    // FIXME: If the tenant is still Loading or Attaching, new timelines
-                    // might be created after this. That's harmless, as the Timelines
-                    // won't be accessible to anyone, when the Tenant is in Stopping
-                    // state.
-                    let timelines_accessor = self.timelines.lock().unwrap();
-                    let not_broken_timelines = timelines_accessor
-                        .values()
-                        .filter(|timeline| timeline.current_state() != TimelineState::Broken);
-                    for timeline in not_broken_timelines {
-                        timeline.set_state(TimelineState::Stopping);
-                    }
+                    Either::Right(async move {
+                        // FIXME: If the tenant is still Loading or Attaching, new timelines
+                        // might be created after this. That's harmless, as the Timelines
+                        // won't be accessible to anyone, when the Tenant is in Stopping
+                        // state.
+                        let timelines_accessor = self.timelines.lock().unwrap();
+                        let not_broken_timelines = timelines_accessor
+                            .values()
+                            .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+                        for timeline in not_broken_timelines {
+                            timeline.set_state(TimelineState::Stopping);
+                        }
+                        None
+                    })
                }
                TenantState::Broken { reason, .. } => {
                    info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}");
+                    Either::Left(None)
                }
                TenantState::Stopping => {
                    // The tenant was detached, or system shutdown was requested, while we were
                    // loading or attaching the tenant.
                    info!("Tenant is already in Stopping state");
+                    Either::Left(None)
                }
            }
        });
@@ -1732,6 +1748,40 @@ impl Tenant {
        });
    }

+    /// Variant of `watch::Sender::send_modify` whose closure may return a future.
+    ///
+    /// `async_clos` runs inside `send_modify`. `Either::Left` is an immediate result;
+    /// `Either::Right` is a future that is driven to completion with `block_on` on a
+    /// scoped helper thread, which is joined before `send_modify` returns. In both
+    /// cases a `Some` result overwrites the watched value; `None` changes nothing more.
+    fn state_send_modify_async<MakeFut, Fut, T>(
+        watch_sender: &tokio::sync::watch::Sender<T>,
+        async_clos: MakeFut,
+    ) where
+        MakeFut: FnOnce(&mut T) -> Either<Option<T>, Fut> + Send,
+        Fut: Future<Output = Option<T>> + Send,
+        T: Send,
+    {
+        // Both are read on the calling thread so the helper thread can use them.
+        let handle = tokio::runtime::Handle::current();
+        let caller_span = tracing::Span::current();
+        watch_sender.send_modify(|state| {
+            let replacement = match async_clos(state) {
+                Either::Left(immediate) => immediate,
+                Either::Right(fut) => std::thread::scope(|s| {
+                    s.spawn(|| handle.block_on(fut.instrument(caller_span)))
+                        .join()
+                        .expect(
+                            "the thread that executes the closure panicked, likely self.state is poisoned now",
+                        )
+                }),
+            };
+            if let Some(new_state) = replacement {
+                *state = new_state;
+            }
+        });
+    }
+
    pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TenantState> {
        self.state.subscribe()
    }
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -379,17 +379,6 @@ impl XlXactParsedRecord {
                });
            }
        }
-        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
-            let nmsgs = buf.get_i32_le();
-            for _i in 0..nmsgs {
-                let sizeof_shared_invalidation_message = 0;
-                buf.advance(sizeof_shared_invalidation_message);
-            }
-        }
-        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
-            xid = buf.get_u32_le();
-            trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
-        }

        if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 {
            let nitems = buf.get_i32_le();
@@ -397,7 +386,23 @@ impl XlXactParsedRecord {
                "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}",
                nitems
            );
-            //FIXME: do we need to handle dropped stats here?
+            let sizeof_xl_xact_stats_item = 12;
+            buf.advance((nitems * sizeof_xl_xact_stats_item).try_into().unwrap());
+        }
+
+        if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
+            let nmsgs = buf.get_i32_le();
+            let sizeof_shared_invalidation_message = 16;
+            buf.advance(
+                (nmsgs * sizeof_shared_invalidation_message)
+                    .try_into()
+                    .unwrap(),
+            );
+        }
+
+        if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
+            xid = buf.get_u32_le();
+            debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid);
        }

        XlXactParsedRecord {
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -149,7 +149,7 @@ def top_output_dir(base_dir: Path) -> Iterator[Path]:

@pytest.fixture(scope="session")
 def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Iterator[Path]:
-    versioned_dir = pg_distrib_dir / f"v{pg_version}"
+    versioned_dir = pg_distrib_dir / pg_version.v_prefixed

    psql_bin_path = versioned_dir / "bin/psql"
    postgres_bin_path = versioned_dir / "bin/postgres"
@@ -1745,8 +1745,8 @@ class PgBin:
    def __init__(self, log_dir: Path, pg_distrib_dir: Path, pg_version: PgVersion):
        self.log_dir = log_dir
        self.pg_version = pg_version
-        self.pg_bin_path = pg_distrib_dir / f"v{pg_version}" / "bin"
-        self.pg_lib_dir = pg_distrib_dir / f"v{pg_version}" / "lib"
+        self.pg_bin_path = pg_distrib_dir / pg_version.v_prefixed / "bin"
+        self.pg_lib_dir = pg_distrib_dir / pg_version.v_prefixed / "lib"
        self.env = os.environ.copy()
        self.env["LD_LIBRARY_PATH"] = str(self.pg_lib_dir)

--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -27,6 +27,12 @@ class PgVersion(str, enum.Enum):
    def __repr__(self) -> str:
        return f"'{self.value}'"

+    # GitHub workflows spell the Postgres version with a v-prefix (e.g. v14 instead of just 14);
+    # sometimes tests need that same spelling.
+    @property
+    def v_prefixed(self) -> str:
+        return "v" + self.value
+
    @classmethod
    def _missing_(cls, value) -> Optional["PgVersion"]:
        known_values = {v.value for _, v in cls.__members__.items()}
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -16,7 +16,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.pg_version import PgVersion, skip_on_postgres
+from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn
 from pytest import FixtureRequest

@@ -41,7 +41,6 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
 )


-@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(before="test_forward_compatibility")
 def test_create_snapshot(
@@ -49,12 +48,13 @@ def test_create_snapshot(
    pg_bin: PgBin,
    top_output_dir: Path,
    test_output_dir: Path,
+    pg_version: PgVersion,
 ):
    # The test doesn't really test anything
    # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
    #
    # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it.
-    neon_env_builder.pg_version = PgVersion.V14
+    neon_env_builder.pg_version = pg_version
    neon_env_builder.num_safekeepers = 3
    neon_env_builder.enable_local_fs_remote_storage()
    neon_env_builder.preserve_database_files = True
@@ -90,13 +90,14 @@ def test_create_snapshot(
    env.pageserver.stop()

    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
-    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    compatibility_snapshot_dir = (
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
+    )
    if compatibility_snapshot_dir.exists():
        shutil.rmtree(compatibility_snapshot_dir)
    shutil.copytree(test_output_dir, compatibility_snapshot_dir)


-@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
@@ -115,7 +116,7 @@ def test_backward_compatibility(
    compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR")
    assert (
        compatibility_snapshot_dir_env is not None
-    ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
+    ), f"COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg{pg_version.v_prefixed}` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)"
    compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve()

    breaking_changes_allowed = (
@@ -155,7 +156,6 @@ def test_backward_compatibility(
    ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"


-@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
@@ -183,7 +183,9 @@ def test_forward_compatibility(
    ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)"
    compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()

-    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    compatibility_snapshot_dir = (
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
+    )

    breaking_changes_allowed = (
        os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,9 +1,7 @@
 import pytest
 from fixtures.neon_fixtures import NeonEnv
-from fixtures.pg_version import PgVersion, xfail_on_postgres


-@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182")
@pytest.mark.timeout(1800)
 def test_hot_standby(neon_simple_env: NeonEnv):
    env = neon_simple_env
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -64,12 +64,15 @@ def test_ondemand_download_large_rel(
    tenant, _ = env.neon_cli.create_tenant(
        conf={
            # disable background GC
-            "gc_period": "10 m",
+            "gc_period": "0s",
            "gc_horizon": f"{10 * 1024 ** 3}",  # 10 GB
            # small checkpoint distance to create more delta layer files
            "checkpoint_distance": f"{10 * 1024 ** 2}",  # 10 MB
+            # allow compaction with the checkpoint
            "compaction_threshold": "3",
            "compaction_target_size": f"{10 * 1024 ** 2}",  # 10 MB
+            # but don't run compaction in background or on restart
+            "compaction_period": "0s",
        }
    )
    env.initial_tenant = tenant
@@ -96,9 +99,17 @@ def test_ondemand_download_large_rel(

        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))

-    # wait until pageserver receives that data
    wait_for_last_record_lsn(client, tenant_id, timeline_id, current_lsn)

+    # stop endpoint before checkpoint to stop wal generation
+    endpoint.stop()
+
+    # stopping the safekeepers now means logical size is not calculated
+    # after startup, so page requests should be the only thing on-demand
+    # downloading the layers
+    for sk in env.safekeepers:
+        sk.stop()
+
    # run checkpoint manually to be sure that data landed in remote storage
    client.timeline_checkpoint(tenant_id, timeline_id)

@@ -107,7 +118,6 @@ def test_ondemand_download_large_rel(
    log.info("uploads have finished")

    ##### Stop the first pageserver instance, erase all its data
-    endpoint.stop()
    env.pageserver.stop()

    # remove all the layer files
@@ -118,8 +128,13 @@ def test_ondemand_download_large_rel(
    ##### Second start, restore the data and ensure it's the same
    env.pageserver.start()

-    endpoint.start()
+    # start a readonly endpoint which we'll use to check the database.
+    # readonly (with lsn=) is required so that we don't try to connect to
+    # safekeepers, that have now been shut down.
+    endpoint = env.endpoints.create_start("main", lsn=current_lsn)
+
    before_downloads = get_num_downloaded_layers(client, tenant_id, timeline_id)
+    assert before_downloads != 0, "basebackup should on-demand non-zero layers"

    # Probe in the middle of the table. There's a high chance that the beginning
    # and end of the table was stored together in the same layer files with data
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -5,7 +5,6 @@ from pathlib import Path

 import pytest
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
-from fixtures.pg_version import PgVersion, xfail_on_postgres


 # Run the main PostgreSQL regression tests, in src/test/regress.
@@ -33,8 +32,8 @@ def test_pg_regress(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
-    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
-    src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/regress"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
+    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
    pg_regress = build_path / "pg_regress"
@@ -72,7 +71,6 @@ def test_pg_regress(
 #
 # This runs for a long time, especially in debug mode, so use a larger-than-default
 # timeout.
-@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213")
@pytest.mark.timeout(1800)
 def test_isolation(
    neon_simple_env: NeonEnv,
@@ -97,8 +95,8 @@ def test_isolation(
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_isolation_regress will need.
-    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/isolation"
-    src_path = base_dir / f"vendor/postgres-v{env.pg_version}/src/test/isolation"
+    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
+    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "isolation_schedule"
    pg_isolation_regress = build_path / "pg_isolation_regress"
--- a/test_runner/regress/test_tenant_conf.py
+++ b/test_runner/regress/test_tenant_conf.py
@@ -1,17 +1,13 @@
 import json
 from contextlib import closing
-from typing import Generator

 import psycopg2.extras
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    LocalFsStorage,
-    NeonEnv,
    NeonEnvBuilder,
    RemoteStorageKind,
 )
-from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload
 from fixtures.types import Lsn
 from fixtures.utils import wait_until
@@ -407,62 +403,3 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
    metric = get_metric()
    assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default"
    assert int(metric.value) == 0, "value resets to default"
-
-
-@pytest.fixture
-def unknown_fields_env(neon_env_builder: NeonEnvBuilder) -> Generator[NeonEnv, None, None]:
-    env = neon_env_builder.init_start()
-    yield env
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*/v1/tenant .*Error processing HTTP request: Bad request.*",
-            ".*/v1/tenant/config .*Error processing HTTP request: Bad request.*",
-        ]
-    )
-
-
-def test_unknown_fields_cli_create(unknown_fields_env: NeonEnv):
-    """
-    When specifying an invalid config field during tenant creation on the CLI, the CLI should fail with an error.
-    """
-
-    with pytest.raises(Exception, match="Unrecognized tenant settings"):
-        unknown_fields_env.neon_cli.create_tenant(conf={"unknown_field": "unknown_value"})
-
-
-def test_unknown_fields_http_create(unknown_fields_env: NeonEnv):
-    """
-    When specifying an invalid config field during tenant creation on the HTTP API, the API should fail with an error.
-    """
-
-    ps_http = unknown_fields_env.pageserver.http_client()
-
-    with pytest.raises(PageserverApiException) as excinfo:
-        ps_http.tenant_create(conf={"unknown_field": "unknown_value"})
-    assert excinfo.value.status_code == 400
-
-
-def test_unknown_fields_cli_config(unknown_fields_env: NeonEnv):
-    """
-    When specifying an invalid config field during tenant configuration on the CLI, the CLI should fail with an error.
-    """
-
-    (tenant_id, _) = unknown_fields_env.neon_cli.create_tenant()
-
-    with pytest.raises(Exception, match="Unrecognized tenant settings"):
-        unknown_fields_env.neon_cli.config_tenant(
-            tenant_id, conf={"unknown_field": "unknown_value"}
-        )
-
-
-def test_unknown_fields_http_config(unknown_fields_env: NeonEnv):
-    """
-    When specifying an invalid config field during tenant configuration on the HTTP API, the API should fail with an error.
-    """
-
-    (tenant_id, _) = unknown_fields_env.neon_cli.create_tenant()
-    ps_http = unknown_fields_env.pageserver.http_client()
-
-    with pytest.raises(PageserverApiException) as excinfo:
-        ps_http.set_tenant_config(tenant_id, {"unknown_field": "unknown_value"})
-    assert excinfo.value.status_code == 400
Author	SHA1	Message	Date
Christian Schwarz	f670caa4d8	hack: allow using async inside Tenant::activate	2023-05-22 11:40:55 +02:00
Christian Schwarz	ef7d20f582	refactor: prepare to allow async code inside Tenant::state.send_modify()	2023-05-22 11:39:50 +02:00
Alexander Bayandin	3837fca7a2	compute-node-image: fix postgis download (#4280 ) ## Problem `osgeo.org` is experiencing some problems with DNS resolving which breaks `compute-node-image` (because it can't download postgis) ## Summary of changes - Add `140.211.15.30 download.osgeo.org` to /etc/hosts by passing it via the container option	2023-05-19 15:34:22 +01:00
Dmitry Rodionov	7529ee2ec7	rfc: the state of pageserver tenant relocation (#3868 ) Summarize current state of tenant relocation related activities and implementation ideas	2023-05-19 14:35:33 +03:00
Christian Schwarz	b391c94440	tenant create / update-config: reject unknown fields (#4267 ) This PR enforces that the tenant create / update-config APIs reject requests with unknown fields. This is a desirable property because some tenant config settings control the lifetime of user data (e.g., GC horizon or PITR interval). Suppose we inadvertently rename the `pitr_interval` field in the Rust code. Then, right now, a client that still uses the old name will send a tenant config request to configure a new PITR interval. Before this PR, we would accept such a request, ignore the old name field, and use the pageserver.toml default value for what the new PITR interval is. With this PR, we will instead reject such a request. One might argue that the client could simply check whether the config it sent has been applied, using the `/v1/tenant/.../config` endpoint. That is correct for tenant create and update-config. But, attach will soon [^1] grow the ability to have attach-time config as well. If we ignore unknown fields and fall back to global defaults in that case, we risk data loss. Example: 1. Default PITR in pageservers is 7 days. 2. Create a tenant and set its PITR to 30 days. 3. For 30 days, fill the tenant continuously with data. 4. Detach the tenant. 5. Attach tenant. Attach must use the 30-day PITR setting in this scenario. If it were to fall back to the 7-day default value, we would lose 23 days of PITR capability for the tenant. So, the PR that adds attach-time tenant config will build on the (clunky) infrastructure added in this PR [^1]: https://github.com/neondatabase/neon/pull/4255 Implementation Notes ==================== This could have been a simple `#[serde(deny_unknown_fields)]` but sadly, that is documented- but silent-at-compile-time-incompatible with `#[serde(flatten)]`. But we are still using this by adding an outer struct and using unit tests to ensure it is correct. 
`neon_local tenant config` now uses the `.remove()` pattern + bail if there are leftover config args. That's in line with what `neon_local tenant create` does. We should dedupe that logic in a future PR. --------- Signed-off-by: Alex Chi <iskyzh@gmail.com> Co-authored-by: Alex Chi <iskyzh@gmail.com>	2023-05-18 21:16:09 -04:00
Alexander Bayandin	5abc4514b7	Un-xfail fixed tests on Postgres 15 (#4275 ) - https://github.com/neondatabase/neon/pull/4182 - https://github.com/neondatabase/neon/pull/4213	2023-05-18 22:38:33 +01:00
Alexander Bayandin	1b2ece3715	Re-enable compatibility tests on Postgres 15 (#4274 ) - Enable compatibility tests for Postgres 15 - Also add `PgVersion::v_prefixed` property to return the version number with, _guess what,_ v-prefix!	2023-05-18 19:56:09 +01:00
Anastasia Lubennikova	8ebae74c6f	Fix handling of XLOG_XACT_COMMIT/ABORT: Previously we didn't handle XACT_XINFO_HAS_INVALS and XACT_XINFO_HAS_DROPPED_STAT correctly, which led to getting incorrect value of twophase_xid for records with XACT_XINFO_HAS_TWOPHASE. This caused 'twophase file for xid {} does not exist' errors in test_isolation	2023-05-18 14:36:45 +01:00
Vadim Kharitonov	fc886dc8c0	Compile pg_cron extension	2023-05-17 17:43:50 +02:00
Heikki Linnakangas	72346e102d	Document that our code is mostly not async cancellation-safe. We had a hot debate on whether we should try to make our code cancellation-safe, or just accept that it's not, and make sure that our Futures are driven to completion. The decision is that we drive Futures to completion. This documents the decision, and summarizes the reasoning for that. Discussion that sparked this: https://github.com/neondatabase/neon/pull/4198#discussion_r1190209316	2023-05-17 17:29:54 +03:00
Joonas Koivunen	918cd25453	ondemand_download_large_rel: solve flakiness (#3697 ) Disable background tasks to not get compaction downloading all layers but also stop safekeepers before checkpointing, use a readonly endpoint. Fixes: #3666 Co-authored-by: Christian Schwarz <christian@neon.tech>	2023-05-17 16:19:02 +02:00
Alex Chi Z	9767432cff	add `cargo neon` shortcut for neon_local (#4240 ) Add `cargo neon` as a shortcut for compiling and running `neon_local`. --------- Signed-off-by: Alex Chi <iskyzh@gmail.com>	2023-05-17 16:48:00 +03:00
Anastasia Lubennikova	0c4dc55a39	Disable recovery_prefetch for Neon hot standby. Prefetching of blocks referenced in WAL doesn't make sense for us, because Neon hot standby anyway ignores pages that are not in the shared_buffers.	2023-05-17 13:35:56 +01:00