Disable XLP_FIRST_IS_CONTRECORD check

Set up a workflow to run pgbench against captest (#2077 )
Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274 )
2026-05-20 06:30:43 +00:00 · 2022-08-15 19:38:12 +00:00 · 2022-08-15 18:54:31 +01:00 · 2022-08-15 18:24:24 +03:00 · 2022-08-15 18:02:27 +03:00 · 2022-08-15 13:31:26 +03:00
57 changed files with 1684 additions and 1312 deletions
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -83,6 +83,7 @@ runs:
        # this variable will be embedded in perf test report
        # and is needed to distinguish different environments
        PLATFORM: github-actions-selfhosted
+        BUILD_TYPE: ${{ inputs.build_type }}
        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
      shell: bash -euxo pipefail {0}
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -29,8 +29,12 @@ runs:
          time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd .
        elif [ -f ${SOURCE} ]; then
          time tar -cf ${ARCHIVE} --zstd ${SOURCE}
+        elif ! ls ${SOURCE} > /dev/null 2>&1; then
+          echo 2>&1 "${SOURCE} does not exist"
+          exit 2
        else
-          echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it"
+          echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it"
+          exit 3
        fi

    - name: Upload artifact
--- a/.github/ansible/get_binaries.sh
+++ b/.github/ansible/get_binaries.sh
@@ -2,30 +2,14 @@

 set -e

-RELEASE=${RELEASE:-false}
-
-# look at docker hub for latest tag for neon docker image
-if [ "${RELEASE}" = "true" ]; then
-    echo "search latest release tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
-    if [ -z "${VERSION}" ]; then
-        echo "no any docker tags found, exiting..."
-        exit 1
-    else
-        TAG="release-${VERSION}"
-    fi
+if [ -n "${DOCKER_TAG}" ]; then
+  # Version is DOCKER_TAG without its prefix
+  VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g')
 else
-    echo "search latest dev tag"
-    VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
-    if [ -z "${VERSION}" ]; then
-        echo "no any docker tags found, exiting..."
-        exit 1
-    else
-        TAG="${VERSION}"
-    fi
+  echo "Please set DOCKER_TAG environment variable"
+  exit 1
 fi

-echo "found ${VERSION}"

 # do initial cleanup
 rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
@@ -33,8 +17,8 @@ mkdir neon_install

 # retrieve binaries from docker image
 echo "getting binaries from docker image"
-docker pull --quiet neondatabase/neon:${TAG}
-ID=$(docker create neondatabase/neon:${TAG})
+docker pull --quiet neondatabase/neon:${DOCKER_TAG}
+ID=$(docker create neondatabase/neon:${DOCKER_TAG})
 docker cp ${ID}:/data/postgres_install.tar.gz .
 tar -xzf postgres_install.tar.gz -C neon_install
 docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -1,4 +1,4 @@
-name: benchmarking
+name: Benchmarking

 on:
  # uncomment to run on push for debugging your PR
@@ -15,6 +15,15 @@ on:

  workflow_dispatch: # adds ability to run this manually

+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
 jobs:
  bench:
    # this workflow runs on self hosteed runner
@@ -60,7 +69,6 @@ jobs:
    - name: Setup cluster
      env:
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
-      shell: bash -euxo pipefail {0}
      run: |
        set -e

@@ -96,7 +104,9 @@ jobs:
        # since it might generate duplicates when calling ingest_perf_test_result.py
        rm -rf perf-report-staging
        mkdir -p perf-report-staging
-        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
+        # Set the --sparse-ordering option of the pytest-order plugin to ensure tests run in the order they appear in the file;
+        # this is important for the test_perf_pgbench.py::test_pgbench_remote_* tests
+        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600

    - name: Submit result
      env:
@@ -113,3 +123,106 @@ jobs:
        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      env:
        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  pgbench-compare:
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
+      TEST_PG_BENCH_SCALES_MATRIX: "10gb"
+      REMOTE_ENV: "1"
+      POSTGRES_DISTRIB_DIR: /usr
+      TEST_OUTPUT: /tmp/test_output
+
+    strategy:
+      fail-fast: false
+      matrix:
+        connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ]
+
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636
+
+    timeout-minutes: 360 # 6h
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Cache poetry deps
+      id: cache_poetry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      run: ./scripts/pysync
+
+    - name: Calculate platform  # maps the matrix entry (a secret name) to a PLATFORM label
+      id: calculate-platform  # referenced as steps.calculate-platform.outputs.PLATFORM
+      env:
+        CONNSTR: ${{ matrix.connstr }}
+      run: |
+        if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then
+          PLATFORM=neon-captest
+        elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then
+          PLATFORM=rds-aurora
+        else
+          echo >&2 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only"  # report on stderr
+          exit 1
+        fi
+
+        echo "::set-output name=PLATFORM::${PLATFORM}"  # publish PLATFORM as a step output
+
+    - name: Install Deps
+      run: |
+        echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
+        wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
+        sudo apt -y update
+        sudo apt install -y postgresql-14 postgresql-client-14
+
+    - name: Benchmark init
+      env:
+        PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
+        BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
+      run: |
+        mkdir -p perf-report-captest
+
+        psql $BENCHMARK_CONNSTR -c "SELECT 1;"
+        ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
+
+    - name: Benchmark simple-update
+      env:
+        PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
+        BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
+      run: |
+        psql $BENCHMARK_CONNSTR -c "SELECT 1;"
+        ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
+
+    - name: Benchmark select-only
+      env:
+        PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
+        BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
+      run: |
+        psql $BENCHMARK_CONNSTR -c "SELECT 1;"
+        ./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
+
+    - name: Submit result
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+      run: |
+        REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
+
+    - name: Upload logs
+      if: always()
+      uses: ./.github/actions/upload
+      with:
+        name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }}
+        path: /tmp/test_output/
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -562,6 +562,7 @@ jobs:

      - name: Redeploy
        run: |
+          export DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
          cd "$(pwd)/.github/ansible"

          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -19,8 +19,12 @@ concurrency:

 jobs:
  test-postgres-client-libs:
+    # TODO: switch to gen2 runner, requires docker
    runs-on: [ ubuntu-latest ]

+    env:
+      TEST_OUTPUT: /tmp/test_output
+
    steps:
    - name: Checkout
      uses: actions/checkout@v3
@@ -47,7 +51,7 @@ jobs:
      env:
        REMOTE_ENV: 1
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
-        TEST_OUTPUT: /tmp/test_output
+
        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
      shell: bash -euxo pipefail {0}
      run: |
@@ -61,9 +65,18 @@ jobs:
          -m "remote_cluster" \
          -rA "test_runner/pg_clients"

+    # We use GitHub's upload-artifact action because `ubuntu-latest` doesn't have the AWS CLI configured.
+    # This will be fixed after switching to a gen2 runner
+    - name: Upload python test logs
+      if: always()
+      uses: actions/upload-artifact@v3
+      with:
+        retention-days: 7
+        name: python-test-pg_clients-${{ runner.os }}-stage-logs
+        path: ${{ env.TEST_OUTPUT }}
+
    - name: Post to a Slack channel
-      if: failure()
-      id: slack
+      if: ${{ github.event.schedule && failure() }}
      uses: slackapi/slack-github-action@v1
      with:
        channel-id: "C033QLM5P7D" # dev-staging-stream
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2269,6 +2269,7 @@ dependencies = [
 "anyhow",
 "async-trait",
 "base64",
+ "bstr",
 "bytes",
 "clap 3.2.12",
 "futures",
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -401,6 +401,7 @@ impl PageServerNode {
                    .get("checkpoint_distance")
                    .map(|x| x.parse::<u64>())
                    .transpose()?,
+                checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
                compaction_target_size: settings
                    .get("compaction_target_size")
                    .map(|x| x.parse::<u64>())
@@ -455,6 +456,7 @@ impl PageServerNode {
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'checkpoint_distance' as an integer")?,
+                checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
                compaction_target_size: settings
                    .get("compaction_target_size")
                    .map(|x| x.parse::<u64>())
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -79,5 +79,4 @@
 - [014-storage-lsm](rfcs/014-storage-lsm.md)
 - [015-storage-messaging](rfcs/015-storage-messaging.md)
 - [016-connection-routing](rfcs/016-connection-routing.md)
- [017-pageserver-op-atomicity](rfcs/017-pageserver-op-atomicity.md)
 - [cluster-size-limits](rfcs/cluster-size-limits.md)
--- a/docs/rfcs/017-pageserver-op-atomicity.md
+++ b/docs/rfcs/017-pageserver-op-atomicity.md
@@ -1,153 +0,0 @@
-# Durability and atomicity of tenant/timeline operations
-
-The pageserver has 8 tenant/timeline operations, listed below.  In
-addition to that, data can be appended to a timeline by WAL receiver,
-pages can be requested by the compute node, and tenant/timeline status
-can be queries through the mgmt API. But these are the operations that
-modify state in pageserver or in S3, and need to worry about crash
-safety.
-
-To make these operations atomic and recoverable, let's introduce a new
-"tenant index file", called `tenant.json`. For each tenant, there is
-one tenant index file, and it contains a list of all timelines for
-that tenant:
-
-{
-  tenant_id: a93a94724945e95e1a0c448004ece2ec
-
-  timelines: [
-    { timeline_id: "9979cd302340a058606473912651f27f",
-	  ancestor_id: ""
-	  ancestor_lsn: "0/0"
-    },
-    { timeline_id: "f0a6f3372d273dd9ca3480d19e6b565c",
-	  ancestor_id: "9979cd302340a058606473912651f27f"
-	  ancestor_lsn: "1/1698C48"
-	},
-  ]
-}
-
-The file only contains the immutable metadata of each timeline, like
-the point it was branched from. The changing parts, like
-disk_consistent_lsn, are still stored in the per-timeline metadata
-file.
-
-This file allows us to resolve some ambiguous situations, like
-remembering that a tenant exists when it doesn't have any timelines.
-It also allows us to quickly fetch the list of all timelines of a
-tenant, without having to perform S3 LIST operations.
-
-Below is a brief description of all the pageserver tenant/timeline
-operations, and how the steps of creating/deleting local files or
-directories and uploads to S3 are performed. The steps are listed in
-such an order that each operation can be sanely recovered or aborted,
-if the pageserver crashes while the operation is being perfromed.
-
-## Create tenant
-
-Create an empty tenant. It doesn't have any timelines initially.
-
-1. Create local tenant-directory with .temp extension
-2. Create tenant.json file in the directory, with a special flag
-   indicating that the tenant-creation is in progress
-3. Rename the local tenant directory in place
-4. Upload the tenant.json file to S3, without the flag
-5. Update the local file, removing the flag
-
-At pageserver startup, if we see a tenant.json file with the special
-flag, check if the tenant exists in S3. If not, remove the local directory.
-Otherwise remove the flag from local file.
-
-## Create timeline
-
-Create a timeline for a tenant, as result of running initdb.
-
-1. create timeline directory locally, with .temp extension
-2. run initdb, creating the initial set of layers
-3. upload all layer files to S3
-4. upload metadata file to S3
-5. update tenant.json file in S3
-6. Rename local directory in place
-
-If we crash before step 5, S3 may have a timeline metadata file and some
-layer files, without corresponding entry in tenant.json file. That's OK.
-Whenever we see that, we can delete the leftover timeline files.
-
-If we want to make that less scary, we could update a tenant.json file in S3
-twice. First, add the new timeline ID to the file with a flag indicating
-that it's being created. Do that before uploading anything else to S3. And
-then in step 5, update tenant.json to indicate that the creation is complete.
-
-## Branch timeline
-
-Create a new timeline with an existing timeline as parent
-
-1. create timeline directory locally, with .temp extension
-2. create metadata file in the local directory
-3. upload metadata file to S3
-4. update tenant.json file in S3
-5. Rename local directory in place
-
-Like with Create timeline, if we crash between steps 3 and 4, we will
-leave behind a timeline metadata file with no corresponding entry in
-tenant.json.  That's harmless.
-
-## Delete timeline
-
-1. rename local timeline directory to have .temp extension
-2. Update tenant.json file in S3
-3. delete index file from S3
-4. delete layer files from S3
-5. delete local directory
-
-Like with creation, if this is interrupted, we will leave behind
-timeline files in S3 with no corresponding entry in tenant.json. If we
-want to make that less scary, we can update tenant.json in step 2 with
-a tombstone flag for the timeline we're removing, instead of removing
-the entry for it outright.
-
-## Delete tenant
-
-1. rename local tenant directory to have .temp extension
-2. delete tenant.json file in S3
-3. delete all timeline index files from S3
-4. delete all layer files from S3
-5. delete local directory
-
-Like with timeline creation, this can leave behind files with no corresponding
-tenant.json file. We can make it less scary by adding tombstones.
-
-## Attach tenant
-
-1. create local tenant directory with .temp extension
-2. Download tenant.json file
-3. download index files for every timeline
-4. download all layer files (in the future, skip this and download them on demand)
-5. rename local tenant directory in place
-
-## Detach tenant
-
-1. rename local tenant directory to have .temp extension
-2. delete local directory
-
-
-## Load tenant
-
-This happens automatically at pageserver startup, for every tenant that is found
-in the tenants-directory. I.e. for every tenant that was attached to the pageserver
-before the crash or shutdown.
-
-1. download tenant.json file
-2. for every timeline that's in remote tenant.json:
-   1. download remote index file
-   2. download all layer files that are missing locally (skip in future, and download on-demand)
-   3. schedule upload of all files present locally, but missing remotely
-   4. schedule index file upload
-3. delete all locally present timeline directories that's not in tenant.json
-
-
-On startup, delete everything with the .temp extension
-
-
- we could skip some of the downloads if we stored the S3 etag of the object in the local file,
-  and compared that
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -15,7 +15,7 @@ listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'

 checkpoint_distance = '268435456' # in bytes
-checkpoint_period = '1 s'
+checkpoint_timeout = '10m'

 gc_period = '100 s'
 gc_horizon = '67108864'
@@ -46,7 +46,7 @@ Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#ta

 All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form.

-Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"`
+Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"`

 Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.

@@ -82,6 +82,14 @@ S3.

 The unit is # of bytes.

+#### checkpoint_timeout
+
+Apart from `checkpoint_distance`, flushing of the open layer is also triggered
+once `checkpoint_timeout` has elapsed since the last flush. This ensures WAL is
+eventually uploaded to S3 even after activity has stopped.
+
+The default is 10m.
+
 #### compaction_period

 Every `compaction_period` seconds, the page server checks if
--- a/libs/postgres_ffi/src/waldecoder.rs
+++ b/libs/postgres_ffi/src/waldecoder.rs
@@ -13,24 +13,30 @@ use super::xlog_utils::*;
 use super::XLogLongPageHeaderData;
 use super::XLogPageHeaderData;
 use super::XLogRecord;
+use super::XLOG_PAGE_MAGIC;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use crc32c::*;
 use log::*;
 use std::cmp::min;
+use std::num::NonZeroU32;
 use thiserror::Error;
 use utils::lsn::Lsn;

+enum State { // decoding state of WalStreamDecoder, advanced by poll_decode
+    WaitingForRecord, // next bytes are expected to begin a new record (xl_tot_len is peeked)
+    ReassemblingRecord { // collecting a record piece by piece, possibly across pages
+        recordbuf: BytesMut, // record bytes gathered so far
+        contlen: NonZeroU32, // bytes of the record still to be read; never zero
+    },
+    SkippingEverything { // discarding input after a completed record
+        skip_until_lsn: Lsn, // LSN at which decoding resumes in WaitingForRecord
+    },
+}
+
 pub struct WalStreamDecoder {
    lsn: Lsn,
-
-    startlsn: Lsn, // LSN where this record starts
-    contlen: u32,
-    padlen: u32,
-
    inputbuf: BytesMut,
-
-    /// buffer used to reassemble records that cross page boundaries.
-    recordbuf: BytesMut,
+    state: State,
 }

 #[derive(Error, Debug, Clone)]
@@ -48,13 +54,8 @@ impl WalStreamDecoder {
    pub fn new(lsn: Lsn) -> WalStreamDecoder {
        WalStreamDecoder {
            lsn,
-
-            startlsn: Lsn(0),
-            contlen: 0,
-            padlen: 0,
-
            inputbuf: BytesMut::new(),
-            recordbuf: BytesMut::new(),
+            state: State::WaitingForRecord,
        }
    }

@@ -67,6 +68,58 @@ impl WalStreamDecoder {
        self.inputbuf.extend_from_slice(buf);
    }

+    fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> { // checks hdr against self.lsn and the current state
+        let validate_impl = || { // yields Err(String); wrapped into WalDecodeError at the end
+            if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 {
+                return Err(format!(
+                    "invalid xlog page header: xlp_magic={}, expected {}",
+                    hdr.xlp_magic, XLOG_PAGE_MAGIC
+                ));
+            }
+            if hdr.xlp_pageaddr != self.lsn.0 { // header's page address must equal the decoder's current LSN
+                return Err(format!(
+                    "invalid xlog page header: xlp_pageaddr={}, expected {}",
+                    hdr.xlp_pageaddr, self.lsn
+                ));
+            }
+            match self.state {
+                State::WaitingForRecord => { // no record in progress, so no continuation bytes may be announced
+                    // if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 {
+                    //     return Err(
+                    //         "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(),
+                    //     );
+                    // }
+                    if hdr.xlp_rem_len != 0 {
+                        return Err(format!(
+                            "invalid xlog page header: xlp_rem_len={}, but it's not a contrecord",
+                            hdr.xlp_rem_len
+                        ));
+                    }
+                }
+                State::ReassemblingRecord { contlen, .. } => { // mid-record: header must flag a continuation of exactly contlen bytes
+                    if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 {
+                        return Err(
+                            "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found"
+                                .into(),
+                        );
+                    }
+                    if hdr.xlp_rem_len != contlen.get() {
+                        return Err(format!(
+                            "invalid xlog page header: xlp_rem_len={}, expected {}",
+                            hdr.xlp_rem_len,
+                            contlen.get()
+                        ));
+                    }
+                }
+                State::SkippingEverything { .. } => { // poll_decode only parses page headers in the other two states
+                    panic!("Should not be validating page header in the SkippingEverything state");
+                }
+            };
+            Ok(())
+        };
+        validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn }) // attach the current LSN to the message
+    }
+
    /// Attempt to decode another WAL record from the input that has been fed to the
    /// decoder so far.
    ///
@@ -76,128 +129,121 @@ impl WalStreamDecoder {
    ///     Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
    ///
    pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-        let recordbuf;
-
        // Run state machine that validates page headers, and reassembles records
        // that cross page boundaries.
        loop {
            // parse and verify page boundaries as we go
-            if self.padlen > 0 {
-                // We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record.
-                if self.inputbuf.remaining() < self.padlen as usize {
-                    return Ok(None);
-                }
+            // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason.
+            match self.state {
+                State::WaitingForRecord | State::ReassemblingRecord { .. } => {
+                    if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
+                        // parse long header

-                // skip padding
-                self.inputbuf.advance(self.padlen as usize);
-                self.lsn += self.padlen as u64;
-                self.padlen = 0;
-            } else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
-                // parse long header
+                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
+                            return Ok(None);
+                        }

-                if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
-                    return Ok(None);
-                }
+                        let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(
+                            |e| WalDecodeError {
+                                msg: format!("long header deserialization failed {}", e),
+                                lsn: self.lsn,
+                            },
+                        )?;

-                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
-                    WalDecodeError {
-                        msg: format!("long header deserialization failed {}", e),
-                        lsn: self.lsn,
+                        self.validate_page_header(&hdr.std)?;
+
+                        self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+                    } else if self.lsn.block_offset() == 0 {
+                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
+                            return Ok(None);
+                        }
+
+                        let hdr =
+                            XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
+                                WalDecodeError {
+                                    msg: format!("header deserialization failed {}", e),
+                                    lsn: self.lsn,
+                                }
+                            })?;
+
+                        self.validate_page_header(&hdr)?;
+
+                        self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
                    }
-                })?;
-
-                if hdr.std.xlp_pageaddr != self.lsn.0 {
-                    return Err(WalDecodeError {
-                        msg: "invalid xlog segment header".into(),
-                        lsn: self.lsn,
-                    });
                }
-                // TODO: verify the remaining fields in the header
-
-                self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
-                continue;
-            } else if self.lsn.block_offset() == 0 {
-                if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
-                    return Ok(None);
-                }
-
-                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
-                    WalDecodeError {
-                        msg: format!("header deserialization failed {}", e),
-                        lsn: self.lsn,
+                State::SkippingEverything { .. } => {}
+            }
+            match &mut self.state {
+                State::WaitingForRecord => {
+                    // need to have at least the xl_tot_len field
+                    if self.inputbuf.remaining() < 4 {
+                        return Ok(None);
                    }
-                })?;

-                if hdr.xlp_pageaddr != self.lsn.0 {
-                    return Err(WalDecodeError {
-                        msg: "invalid xlog page header".into(),
-                        lsn: self.lsn,
-                    });
+                    // peek xl_tot_len at the beginning of the record.
+                    // FIXME: assumes little-endian
+                    let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
+                    if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
+                        return Err(WalDecodeError {
+                            msg: format!("invalid xl_tot_len {}", xl_tot_len),
+                            lsn: self.lsn,
+                        });
+                    }
+                    // Fast path for the common case that the whole record fits on the page.
+                    let pageleft = self.lsn.remaining_in_block() as u32;
+                    if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
+                        self.lsn += xl_tot_len as u64;
+                        let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
+                        return Ok(Some(self.complete_record(recordbuf)?));
+                    } else {
+                        // Need to assemble the record from pieces. Remember the size of the
+                        // record, and loop back. On next iteration, we will reach the 'else'
+                        // branch below, and copy the part of the record that was on this page
+                        // to 'recordbuf'.  Subsequent iterations will skip page headers, and
+                        // append the continuations from the next pages to 'recordbuf'.
+                        self.state = State::ReassemblingRecord {
+                            recordbuf: BytesMut::with_capacity(xl_tot_len as usize),
+                            contlen: NonZeroU32::new(xl_tot_len).unwrap(),
+                        }
+                    }
                }
-                // TODO: verify the remaining fields in the header
+                State::ReassemblingRecord { recordbuf, contlen } => {
+                    // we're continuing a record, possibly from previous page.
+                    let pageleft = self.lsn.remaining_in_block() as u32;

-                self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
-                continue;
-            } else if self.contlen == 0 {
-                assert!(self.recordbuf.is_empty());
+                    // read the rest of the record, or as much as fits on this page.
+                    let n = min(contlen.get(), pageleft) as usize;

-                // need to have at least the xl_tot_len field
-                if self.inputbuf.remaining() < 4 {
-                    return Ok(None);
+                    if self.inputbuf.remaining() < n {
+                        return Ok(None);
+                    }
+
+                    recordbuf.put(self.inputbuf.split_to(n));
+                    self.lsn += n as u64;
+                    *contlen = match NonZeroU32::new(contlen.get() - n as u32) {
+                        Some(x) => x,
+                        None => {
+                            // The record is now complete.
+                            let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze();
+                            return Ok(Some(self.complete_record(recordbuf)?));
+                        }
+                    }
                }
-
-                // peek xl_tot_len at the beginning of the record.
-                // FIXME: assumes little-endian
-                self.startlsn = self.lsn;
-                let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
-                if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
-                    return Err(WalDecodeError {
-                        msg: format!("invalid xl_tot_len {}", xl_tot_len),
-                        lsn: self.lsn,
-                    });
+                State::SkippingEverything { skip_until_lsn } => {
+                    assert!(*skip_until_lsn >= self.lsn);
+                    let n = skip_until_lsn.0 - self.lsn.0;
+                    if self.inputbuf.remaining() < n as usize {
+                        return Ok(None);
+                    }
+                    self.inputbuf.advance(n as usize);
+                    self.lsn += n;
+                    self.state = State::WaitingForRecord;
                }
-
-                // Fast path for the common case that the whole record fits on the page.
-                let pageleft = self.lsn.remaining_in_block() as u32;
-                if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
-                    // Take the record from the 'inputbuf', and validate it.
-                    recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
-                    self.lsn += xl_tot_len as u64;
-                    break;
-                } else {
-                    // Need to assemble the record from pieces. Remember the size of the
-                    // record, and loop back. On next iteration, we will reach the 'else'
-                    // branch below, and copy the part of the record that was on this page
-                    // to 'recordbuf'.  Subsequent iterations will skip page headers, and
-                    // append the continuations from the next pages to 'recordbuf'.
-                    self.recordbuf.reserve(xl_tot_len as usize);
-                    self.contlen = xl_tot_len;
-                    continue;
-                }
-            } else {
-                // we're continuing a record, possibly from previous page.
-                let pageleft = self.lsn.remaining_in_block() as u32;
-
-                // read the rest of the record, or as much as fits on this page.
-                let n = min(self.contlen, pageleft) as usize;
-
-                if self.inputbuf.remaining() < n {
-                    return Ok(None);
-                }
-
-                self.recordbuf.put(self.inputbuf.split_to(n));
-                self.lsn += n as u64;
-                self.contlen -= n as u32;
-
-                if self.contlen == 0 {
-                    // The record is now complete.
-                    recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
-                    break;
-                }
-                continue;
            }
        }
+    }

+    fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> {
        // We now have a record in the 'recordbuf' local variable.
        let xlogrec =
            XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
@@ -219,18 +265,20 @@ impl WalStreamDecoder {

        // XLOG_SWITCH records are special. If we see one, we need to skip
        // to the next WAL segment.
-        if xlogrec.is_xlog_switch_record() {
+        let next_lsn = if xlogrec.is_xlog_switch_record() {
            trace!("saw xlog switch record at {}", self.lsn);
-            self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
+            self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64)
        } else {
            // Pad to an 8-byte boundary
-            self.padlen = self.lsn.calc_padding(8u32) as u32;
-        }
+            self.lsn.align()
+        };
+        self.state = State::SkippingEverything {
+            skip_until_lsn: next_lsn,
+        };

        // We should return LSN of the next record, not the last byte of this record or
        // the byte immediately after. Note that this handles both XLOG_SWITCH and usual
        // records, the former "spans" until the next WAL segment (see test_xlog_switch).
-        let result = (self.lsn + self.padlen as u64, recordbuf);
-        Ok(Some(result))
+        Ok((next_lsn, recordbuf))
    }
 }
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -16,22 +16,22 @@ use crate::XLogRecord;
 use crate::XLOG_PAGE_MAGIC;

 use crate::pg_constants::WAL_SEGMENT_SIZE;
-use anyhow::{anyhow, bail, ensure};
-use byteorder::{ByteOrder, LittleEndian};
+use crate::waldecoder::WalStreamDecoder;
+
 use bytes::BytesMut;
 use bytes::{Buf, Bytes};
-use crc32c::*;
+
 use log::*;
-use std::cmp::max;
-use std::cmp::min;
-use std::fs::{self, File};
+
+use std::fs::File;
 use std::io::prelude::*;
+use std::io::ErrorKind;
 use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
 use std::time::SystemTime;
 use utils::bin_ser::DeserializeError;
 use utils::bin_ser::SerializeError;
-use utils::const_assert;
+
 use utils::lsn::Lsn;

 pub const XLOG_FNAME_LEN: usize = 24;
@@ -140,338 +140,93 @@ pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
    }
 }

-/// Return offset of the last valid record in the segment segno, starting
-/// looking at start_offset. Returns start_offset if no records found.
-fn find_end_of_wal_segment(
-    data_dir: &Path,
-    segno: XLogSegNo,
-    tli: TimeLineID,
-    wal_seg_size: usize,
-    start_offset: usize, // start reading at this point
-) -> anyhow::Result<u32> {
-    // step back to the beginning of the page to read it in...
-    let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
-    let mut skipping_first_contrecord: bool = false;
-    let mut contlen: usize = 0;
-    let mut xl_crc: u32 = 0;
-    let mut crc: u32 = 0;
-    let mut rec_offs: usize = 0;
-    let mut buf = [0u8; XLOG_BLCKSZ];
-    let file_name = XLogFileName(tli, segno, wal_seg_size);
-    let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
-    let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
-    file.seek(SeekFrom::Start(offs as u64))?;
-    // xl_crc is the last field in XLogRecord, will not be read into rec_hdr
-    const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
-    let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
-
-    trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset);
-    while offs < wal_seg_size {
-        // we are at the beginning of the page; read it in
-        if offs % XLOG_BLCKSZ == 0 {
-            trace!("offs=0x{:x}: new page", offs);
-            let bytes_read = file.read(&mut buf)?;
-            if bytes_read != buf.len() {
-                bail!(
-                    "failed to read {} bytes from {} at {}",
-                    XLOG_BLCKSZ,
-                    file_name,
-                    offs
-                );
-            }
-
-            let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
-            let xlp_info = LittleEndian::read_u16(&buf[2..4]);
-            let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
-            trace!(
-                "  xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}",
-                xlp_magic,
-                xlp_info,
-                xlp_rem_len
-            );
-            // this is expected in current usage when valid WAL starts after page header
-            if xlp_magic != XLOG_PAGE_MAGIC as u16 {
-                trace!(
-                    "  invalid WAL file {}.partial magic {} at {:?}",
-                    file_name,
-                    xlp_magic,
-                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
-                );
-            }
-            if offs == 0 {
-                offs += XLOG_SIZE_OF_XLOG_LONG_PHD;
-                if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 {
-                    trace!("  first record is contrecord");
-                    skipping_first_contrecord = true;
-                    contlen = xlp_rem_len as usize;
-                    if offs < start_offset {
-                        // Pre-condition failed: the beginning of the segment is unexpectedly corrupted.
-                        ensure!(start_offset - offs >= contlen,
-                            "start_offset is in the middle of the first record (which happens to be a contrecord), \
-                             expected to be on a record boundary. Is beginning of the segment corrupted?");
-                        contlen = 0;
-                        // keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it.
-                    }
-                } else {
-                    trace!("  first record is not contrecord");
-                }
-            } else {
-                offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
-            }
-            // ... and step forward again if asked
-            trace!("  skipped header to 0x{:x}", offs);
-            offs = max(offs, start_offset);
-        // beginning of the next record
-        } else if contlen == 0 {
-            let page_offs = offs % XLOG_BLCKSZ;
-            let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
-            trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len);
-            if xl_tot_len == 0 {
-                info!(
-                    "find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
-                    Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
-                    Lsn(XLogSegNoOffsetToRecPtr(
-                        segno,
-                        last_valid_rec_pos as u32,
-                        wal_seg_size
-                    ))
-                );
-                break; // zeros, reached the end
-            }
-            if skipping_first_contrecord {
-                skipping_first_contrecord = false;
-                trace!("  first contrecord has been just completed");
-            } else {
-                trace!(
-                    "  updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
-                    last_valid_rec_pos,
-                    offs
-                );
-                last_valid_rec_pos = offs;
-            }
-            offs += 4;
-            rec_offs = 4;
-            contlen = xl_tot_len - 4;
-            trace!(
-                "  reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})",
-                page_offs,
-                page_offs + 4
-            );
-            rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]);
-        } else {
-            // we're continuing a record, possibly from previous page.
-            let page_offs = offs % XLOG_BLCKSZ;
-            let pageleft = XLOG_BLCKSZ - page_offs;
-
-            // read the rest of the record, or as much as fits on this page.
-            let n = min(contlen, pageleft);
-            trace!(
-                "offs=0x{:x}, record continuation, pageleft={}, contlen={}",
-                offs,
-                pageleft,
-                contlen
-            );
-            // fill rec_hdr header up to (but not including) xl_crc field
-            trace!(
-                "  rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}",
-                rec_offs,
-                XLOG_RECORD_CRC_OFFS,
-                XLOG_SIZE_OF_XLOG_RECORD
-            );
-            if rec_offs < XLOG_RECORD_CRC_OFFS {
-                let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n);
-                trace!(
-                    "  reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})",
-                    rec_offs,
-                    rec_offs + len,
-                    page_offs,
-                    page_offs + len
-                );
-                rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]);
-            }
-            if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD {
-                let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
-                // All records are aligned on 8-byte boundary, so their 8-byte frames
-                // cannot be split between pages. As xl_crc is the last field,
-                // its content is always on the same page.
-                const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);
-                // We should always start reading aligned records even in incorrect WALs so if
-                // the condition is false it is likely a bug. However, it is localized somewhere
-                // in this function, hence we do not crash and just report failure instead.
-                ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)");
-                xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
-                trace!(
-                    "  reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}",
-                    crc_offs,
-                    crc_offs + 4,
-                    xl_crc
-                );
-                crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
-                trace!(
-                    "  initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}",
-                    crc_offs + 4,
-                    page_offs + n,
-                    crc
-                );
-            } else if rec_offs > XLOG_RECORD_CRC_OFFS {
-                // As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above.
-                ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD);
-                let old_crc = crc;
-                crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
-                trace!(
-                    "  appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}",
-                    page_offs,
-                    page_offs + n,
-                    old_crc,
-                    crc
-                );
-            } else {
-                // Correct because of the way conditions are written above.
-                assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD);
-                // If `skipping_first_contrecord == true`, we may be reading from a middle of a record
-                // which started in the previous segment. Hence there is no point in validating the header.
-                if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS {
-                    info!(
-                        "Curiously corrupted WAL: a record stops inside the header; \
-                             offs=0x{:x}, record continuation, pageleft={}, contlen={}",
-                        offs, pageleft, contlen
-                    );
-                    break;
-                }
-                // Do nothing: we are still reading the header. It's accounted in CRC in the end of the record.
-            }
-            rec_offs += n;
-            offs += n;
-            contlen -= n;
-
-            if contlen == 0 {
-                trace!("  record completed at 0x{:x}", offs);
-                crc = crc32c_append(crc, &rec_hdr);
-                offs = (offs + 7) & !7; // pad on 8 bytes boundary */
-                trace!(
-                    "  padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}",
-                    offs,
-                    crc,
-                    xl_crc
-                );
-                if skipping_first_contrecord {
-                    // do nothing, the flag will go down on next iteration when we're reading new record
-                    trace!("  first conrecord has been just completed");
-                } else if crc == xl_crc {
-                    // record is valid, advance the result to its end (with
-                    // alignment to the next record taken into account)
-                    trace!(
-                        "  updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
-                        last_valid_rec_pos,
-                        offs
-                    );
-                    last_valid_rec_pos = offs;
-                } else {
-                    info!(
-                        "CRC mismatch {} vs {} at {}",
-                        crc, xl_crc, last_valid_rec_pos
-                    );
-                    break;
-                }
-            }
-        }
-    }
-    trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos);
-    Ok(last_valid_rec_pos as u32)
-}
-
-///
-/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
-/// If precise, returns end LSN (next insertion point, basically);
-/// otherwise, start of the last segment.
-/// Returns (0, 0) if there is no WAL.
-///
+// Returns the (aligned) end LSN of the last valid record found in the WAL
+// segments in data_dir. start_lsn must point to a previously known record
+// boundary (beginning of the next record). If no valid record is found after
+// it, start_lsn is returned unchanged.
 pub fn find_end_of_wal(
    data_dir: &Path,
    wal_seg_size: usize,
-    precise: bool,
-    start_lsn: Lsn, // start reading WAL at this point or later
-) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
-    let mut high_segno: XLogSegNo = 0;
-    let mut high_tli: TimeLineID = 0;
-    let mut high_ispartial = false;
+    start_lsn: Lsn, // start reading WAL at this point; must point at the start of a record.
+) -> anyhow::Result<Lsn> {
+    let mut result = start_lsn;
+    let mut curr_lsn = start_lsn;
+    let mut buf = [0u8; XLOG_BLCKSZ];
+    let mut decoder = WalStreamDecoder::new(start_lsn);

-    for entry in fs::read_dir(data_dir)?.flatten() {
-        let ispartial: bool;
-        let entry_name = entry.file_name();
-        let fname = entry_name
-            .to_str()
-            .ok_or_else(|| anyhow!("Invalid file name"))?;
-
-        /*
-         * Check if the filename looks like an xlog file, or a .partial file.
-         */
-        if IsXLogFileName(fname) {
-            ispartial = false;
-        } else if IsPartialXLogFileName(fname) {
-            ispartial = true;
-        } else {
-            continue;
-        }
-        let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
-        if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
-            continue;
-        }
-        if segno > high_segno
-            || (segno == high_segno && tli > high_tli)
-            || (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
-        {
-            high_segno = segno;
-            high_tli = tli;
-            high_ispartial = ispartial;
-        }
-    }
-    if high_segno > 0 {
-        let mut high_offs = 0;
-        /*
-         * Move the starting pointer to the start of the next segment, if the
-         * highest one we saw was completed.
-         */
-        if !high_ispartial {
-            high_segno += 1;
-        } else if precise {
-            /* otherwise locate last record in last partial segment */
-            if start_lsn.segment_number(wal_seg_size) > high_segno {
-                bail!(
-                    "provided start_lsn {:?} is beyond highest segno {:?} available",
-                    start_lsn,
-                    high_segno,
+    // loop over segments
+    loop {
+        let segno = curr_lsn.segment_number(wal_seg_size);
+        let seg_file_name = XLogFileName(PG_TLI, segno, wal_seg_size);
+        let seg_file_path = data_dir.join(seg_file_name);
+        match open_wal_segment(&seg_file_path)? {
+            None => {
+                // no more segments
+                info!(
+                    "find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
+                    result, seg_file_path
                );
+                return Ok(result);
+            }
+            Some(mut segment) => {
+                let seg_offs = curr_lsn.segment_offset(wal_seg_size);
+                segment.seek(SeekFrom::Start(seg_offs as u64))?;
+                // loop inside segment
+                loop {
+                    let bytes_read = segment.read(&mut buf)?;
+                    if bytes_read == 0 {
+                        break; // EOF
+                    }
+                    curr_lsn += bytes_read as u64;
+                    decoder.feed_bytes(&buf[0..bytes_read]);
+
+                    // advance result past all completely read records
+                    loop {
+                        match decoder.poll_decode() {
+                            Ok(Some(record)) => result = record.0,
+                            Err(e) => {
+                                info!(
+                                    "find_end_of_wal reached end at {:?}, decode error: {:?}",
+                                    result, e
+                                );
+                                return Ok(result);
+                            }
+                            Ok(None) => break, // need more data
+                        }
+                    }
+                }
            }
-            let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
-                start_lsn.segment_offset(wal_seg_size)
-            } else {
-                0
-            };
-            high_offs = find_end_of_wal_segment(
-                data_dir,
-                high_segno,
-                high_tli,
-                wal_seg_size,
-                start_offset,
-            )?;
        }
-        let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
-        return Ok((high_ptr, high_tli));
    }
-    Ok((0, 0))
+}
+
+// Open .partial or full WAL segment file, if present.
+fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
+    let mut partial_path = seg_file_path.to_owned();
+    partial_path.set_extension("partial");
+    match File::open(partial_path) {
+        Ok(file) => Ok(Some(file)),
+        Err(e) => match e.kind() {
+            ErrorKind::NotFound => {
+                // .partial not found, try full
+                match File::open(seg_file_path) {
+                    Ok(file) => Ok(Some(file)),
+                    Err(e) => match e.kind() {
+                        ErrorKind::NotFound => Ok(None),
+                        _ => Err(e.into()),
+                    },
+                }
+            }
+            _ => Err(e.into()),
+        },
+    }
 }

 pub fn main() {
    let mut data_dir = PathBuf::new();
    data_dir.push(".");
-    let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap();
-    println!(
-        "wal_end={:>08X}{:>08X}, tli={}",
-        (wal_end >> 32) as u32,
-        wal_end as u32,
-        tli
-    );
+    let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
+    println!("wal_end={:?}", wal_end);
 }

 impl XLogRecord {
@@ -595,7 +350,10 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
 mod tests {
    use super::*;
    use regex::Regex;
+    use std::cmp::min;
+    use std::fs;
    use std::{env, str::FromStr};
+    use utils::const_assert;

    fn init_logging() {
        let _ = env_logger::Builder::from_env(
@@ -606,10 +364,7 @@ mod tests {
        .try_init();
    }

-    fn test_end_of_wal<C: wal_craft::Crafter>(
-        test_name: &str,
-        expected_end_of_wal_non_partial: Lsn,
-    ) {
+    fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
        use wal_craft::*;
        // Craft some WAL
        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
@@ -630,7 +385,7 @@ mod tests {
            .iter()
            .map(|&lsn| u64::from(lsn).into())
            .collect();
-        let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into();
+        let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
        srv.kill();

        // Check find_end_of_wal on the initial WAL
@@ -642,10 +397,10 @@ mod tests {
            .filter(|fname| IsXLogFileName(fname))
            .max()
            .unwrap();
-        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial);
-        for start_lsn in std::iter::once(Lsn(0))
-            .chain(intermediate_lsns)
-            .chain(std::iter::once(expected_end_of_wal_partial))
+        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+        for start_lsn in intermediate_lsns
+            .iter()
+            .chain(std::iter::once(&expected_end_of_wal))
        {
            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
            // We assume that `start_lsn` is non-decreasing.
@@ -660,7 +415,7 @@ mod tests {
                }
                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
-                if seg_start_lsn > u64::from(start_lsn) {
+                if seg_start_lsn > u64::from(*start_lsn) {
                    continue;
                }
                let mut f = File::options().write(true).open(file.path()).unwrap();
@@ -668,18 +423,12 @@ mod tests {
                f.write_all(
                    &ZEROS[0..min(
                        WAL_SEGMENT_SIZE,
-                        (u64::from(start_lsn) - seg_start_lsn) as usize,
+                        (u64::from(*start_lsn) - seg_start_lsn) as usize,
                    )],
                )
                .unwrap();
            }
-            check_end_of_wal(
-                &cfg,
-                &last_segment,
-                start_lsn,
-                expected_end_of_wal_non_partial,
-                expected_end_of_wal_partial,
-            );
+            check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
        }
    }

@@ -716,18 +465,15 @@ mod tests {
        cfg: &wal_craft::Conf,
        last_segment: &str,
        start_lsn: Lsn,
-        expected_end_of_wal_non_partial: Lsn,
-        expected_end_of_wal_partial: Lsn,
+        expected_end_of_wal: Lsn,
    ) {
        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-        let (wal_end, tli) =
-            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
-        let wal_end = Lsn(wal_end);
-        info!(
-            "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment",
-            wal_end, tli
-        );
-        assert_eq!(wal_end, expected_end_of_wal_non_partial);
+        // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+        // info!(
+        //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
+        //     wal_end
+        // );
+        // assert_eq!(wal_end, expected_end_of_wal_non_partial);

        // Rename file to partial to actually find last valid lsn, then rename it back.
        fs::rename(
@@ -735,14 +481,12 @@ mod tests {
            cfg.wal_dir().join(format!("{}.partial", last_segment)),
        )
        .unwrap();
-        let (wal_end, tli) =
-            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
-        let wal_end = Lsn(wal_end);
+        let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
        info!(
-            "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment",
-            wal_end, tli
+            "find_end_of_wal returned wal_end={} with partial WAL segment",
+            wal_end
        );
-        assert_eq!(wal_end, expected_end_of_wal_partial);
+        assert_eq!(wal_end, expected_end_of_wal);
        fs::rename(
            cfg.wal_dir().join(format!("{}.partial", last_segment)),
            cfg.wal_dir().join(last_segment),
@@ -755,10 +499,7 @@ mod tests {
    #[test]
    pub fn test_find_end_of_wal_simple() {
        init_logging();
-        test_end_of_wal::<wal_craft::Simple>(
-            "test_find_end_of_wal_simple",
-            "0/2000000".parse::<Lsn>().unwrap(),
-        );
+        test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
    }

    #[test]
@@ -766,17 +507,14 @@ mod tests {
        init_logging();
        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-            "0/3000000".parse::<Lsn>().unwrap(),
        );
    }

    #[test]
-    #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
    pub fn test_find_end_of_wal_last_crossing_segment() {
        init_logging();
        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
            "test_find_end_of_wal_last_crossing_segment",
-            "0/3000000".parse::<Lsn>().unwrap(),
        );
    }

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -59,6 +59,7 @@ pub mod defaults {

 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
+#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
 #compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
 #compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
 #compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
@@ -452,6 +453,13 @@ impl PageServerConf {
                Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
        }

+        if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
+            t_conf.checkpoint_timeout = Some(parse_toml_duration(
+                "checkpoint_timeout",
+                checkpoint_timeout,
+            )?);
+        }
+
        if let Some(compaction_target_size) = item.get("compaction_target_size") {
            t_conf.compaction_target_size = Some(parse_toml_u64(
                "compaction_target_size",
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -32,6 +32,7 @@ pub struct TenantCreateRequest {
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub new_tenant_id: Option<ZTenantId>,
    pub checkpoint_distance: Option<u64>,
+    pub checkpoint_timeout: Option<String>,
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
@@ -70,6 +71,7 @@ pub struct TenantConfigRequest {
    #[serde(default)]
    #[serde_as(as = "Option<DisplayFromStr>")]
    pub checkpoint_distance: Option<u64>,
+    pub checkpoint_timeout: Option<String>,
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
@@ -87,6 +89,7 @@ impl TenantConfigRequest {
        TenantConfigRequest {
            tenant_id,
            checkpoint_distance: None,
+            checkpoint_timeout: None,
            compaction_target_size: None,
            compaction_period: None,
            compaction_threshold: None,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -560,6 +560,8 @@ components:
          type: string
        checkpoint_distance:
          type: integer
+        checkpoint_timeout:
+          type: string
        compaction_period:
          type: string
        compaction_threshold:
@@ -578,6 +580,8 @@ components:
          type: string
        checkpoint_distance:
          type: integer
+        checkpoint_timeout:
+          type: string
        compaction_period:
          type: string
        compaction_threshold:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -623,6 +623,11 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
+    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
+        tenant_conf.checkpoint_timeout =
+            Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
+    }
+
    tenant_conf.compaction_target_size = request_data.compaction_target_size;
    tenant_conf.compaction_threshold = request_data.compaction_threshold;

@@ -683,6 +688,10 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
    }

    tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
+    if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
+        tenant_conf.checkpoint_timeout =
+            Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
+    }
    tenant_conf.compaction_target_size = request_data.compaction_target_size;
    tenant_conf.compaction_threshold = request_data.compaction_threshold;

--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -433,6 +433,13 @@ impl LayeredRepository {
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

+    pub fn get_checkpoint_timeout(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .checkpoint_timeout
+            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
+    }
+
    pub fn get_compaction_target_size(&self) -> u64 {
        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
--- a/pageserver/src/layered_repository/timeline.rs
+++ b/pageserver/src/layered_repository/timeline.rs
@@ -16,7 +16,7 @@ use std::ops::{Deref, Range};
 use std::path::PathBuf;
 use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError};
-use std::time::{Duration, SystemTime};
+use std::time::{Duration, Instant, SystemTime};

 use metrics::{
    register_histogram_vec, register_int_counter, register_int_counter_vec, register_int_gauge_vec,
@@ -233,6 +233,8 @@ pub struct LayeredTimeline {
    pub layers: RwLock<LayerMap>,

    last_freeze_at: AtomicLsn,
+    // Atomic would be more appropriate here.
+    last_freeze_ts: RwLock<Instant>,

    // WAL redo manager
    walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
@@ -560,6 +562,13 @@ impl LayeredTimeline {
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

+    fn get_checkpoint_timeout(&self) -> Duration {
+        let tenant_conf = self.tenant_conf.read().unwrap();
+        tenant_conf
+            .checkpoint_timeout
+            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
+    }
+
    fn get_compaction_target_size(&self) -> u64 {
        let tenant_conf = self.tenant_conf.read().unwrap();
        tenant_conf
@@ -649,6 +658,7 @@ impl LayeredTimeline {
            disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),

            last_freeze_at: AtomicLsn::new(metadata.disk_consistent_lsn().0),
+            last_freeze_ts: RwLock::new(Instant::now()),

            ancestor_timeline: ancestor,
            ancestor_lsn: metadata.ancestor_lsn(),
@@ -1094,8 +1104,11 @@ impl LayeredTimeline {
    }

    ///
-    /// Check if more than 'checkpoint_distance' of WAL has been accumulated
-    /// in the in-memory layer, and initiate flushing it if so.
+    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
+    /// the in-memory layer, and initiate flushing it if so.
+    ///
+    /// Also flush after a period of time without new data -- it helps
+    /// safekeepers to regard pageserver as caught up and suspend activity.
    ///
    pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
        let last_lsn = self.get_last_record_lsn();
@@ -1103,21 +1116,27 @@ impl LayeredTimeline {
        if let Some(open_layer) = &layers.open_layer {
            let open_layer_size = open_layer.size()?;
            drop(layers);
-            let distance = last_lsn.widening_sub(self.last_freeze_at.load());
+            let last_freeze_at = self.last_freeze_at.load();
+            let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
+            let distance = last_lsn.widening_sub(last_freeze_at);
            // Checkpointing the open layer can be triggered by layer size or LSN range.
            // S3 has a 5 GB limit on the size of one upload (without multi-part upload), and
            // we want to stay below that with a big margin.  The LSN distance determines how
            // much WAL the safekeepers need to store.
            if distance >= self.get_checkpoint_distance().into()
                || open_layer_size > self.get_checkpoint_distance()
+                || (distance > 0 && last_freeze_ts.elapsed() >= self.get_checkpoint_timeout())
            {
                info!(
-                    "check_checkpoint_distance {}, layer size {}",
-                    distance, open_layer_size
+                    "check_checkpoint_distance {}, layer size {}, elapsed since last flush {:?}",
+                    distance,
+                    open_layer_size,
+                    last_freeze_ts.elapsed()
                );

                self.freeze_inmem_layer(true);
                self.last_freeze_at.store(last_lsn);
+                *(self.last_freeze_ts.write().unwrap()) = Instant::now();

                // Launch a thread to flush the frozen layer to disk, unless
                // a thread was already running. (If the thread was running
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -93,3 +93,69 @@ pub fn shutdown_pageserver(exit_code: i32) {
    info!("Shut down successfully completed");
    std::process::exit(exit_code);
 }
+
+/// Default `base_increment` for [`exponential_backoff`]. Despite the name, it is not a
+/// duration: the delay grows as `(1.0 + base_increment).powf(n)` seconds.
+const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
+/// Default `max_seconds` for [`exponential_backoff`]: the cap on a single delay.
+const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
+
+/// Sleeps for `exponential_backoff_duration_seconds(n, base_increment, max_seconds)` seconds.
+///
+/// Returns right away, without logging, when that delay is not positive, which is
+/// always the case for `n == 0`.
+async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
+    let backoff_duration_seconds =
+        exponential_backoff_duration_seconds(n, base_increment, max_seconds);
+    if backoff_duration_seconds > 0.0 {
+        info!(
+            "Backoff: waiting {backoff_duration_seconds} seconds before proceeding with the task",
+        );
+        tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
+    }
+}
+
+/// Computes the backoff delay, in seconds, for retry counter `n`.
+///
+/// Returns `0.0` for `n == 0`; otherwise `(1.0 + base_increment).powf(n)`, limited
+/// from above by `max_seconds`.
+fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
+    if n == 0 {
+        0.0
+    } else {
+        (1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
+    }
+}
+
+#[cfg(test)]
+mod backoff_defaults_tests {
+    use super::*;
+
+    /// With the default parameters the delay must never shrink as `n` grows and
+    /// must end up exactly at `DEFAULT_MAX_BACKOFF_SECONDS`.
+    #[test]
+    fn backoff_defaults_produce_growing_backoff_sequence() {
+        let mut current_backoff_value = None;
+
+        for i in 0..10_000 {
+            let new_backoff_value = exponential_backoff_duration_seconds(
+                i,
+                DEFAULT_BASE_BACKOFF_SECONDS,
+                DEFAULT_MAX_BACKOFF_SECONDS,
+            );
+
+            if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
+                assert!(
+                    old_backoff_value <= new_backoff_value,
+                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
+                )
+            }
+        }
+
+        assert_eq!(
+            current_backoff_value.expect("Should have produced backoff values to compare"),
+            DEFAULT_MAX_BACKOFF_SECONDS,
+            "Given big enough of retries, backoff should reach its allowed max value"
+        );
+    }
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -1044,6 +1044,7 @@ impl postgres_backend::Handler for PageServerHandler {
            let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
+                RowDescriptor::int8_col(b"checkpoint_timeout"),
                RowDescriptor::int8_col(b"compaction_target_size"),
                RowDescriptor::int8_col(b"compaction_period"),
                RowDescriptor::int8_col(b"compaction_threshold"),
@@ -1054,6 +1055,12 @@ impl postgres_backend::Handler for PageServerHandler {
            ]))?
            .write_message_noflush(&BeMessage::DataRow(&[
                Some(repo.get_checkpoint_distance().to_string().as_bytes()),
+                Some(
+                    repo.get_checkpoint_timeout()
+                        .as_secs()
+                        .to_string()
+                        .as_bytes(),
+                ),
                Some(repo.get_compaction_target_size().to_string().as_bytes()),
                Some(
                    repo.get_compaction_period()
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -708,20 +708,25 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
    /// Truncate relation
+    ///
+    /// Records `nblocks` as the new size of `rel`. A relation that does not exist at
+    /// the timeline's last record LSN is left alone and `Ok(())` is still returned.
    pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
        ensure!(rel.relnode != 0, "invalid relnode");
-        let size_key = rel_size_to_key(rel);
+        let last_lsn = self.tline.get_last_record_lsn();
+        if self.tline.get_rel_exists(rel, last_lsn)? {
+            let size_key = rel_size_to_key(rel);
+            // Fetch the old size first
+            let old_size = self.get(size_key)?.get_u32_le();

-        // Fetch the old size first
-        let old_size = self.get(size_key)?.get_u32_le();
+            // Update the entry with the new size.
+            let buf = nblocks.to_le_bytes();
+            self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));

-        // Update the entry with the new size.
-        let buf = nblocks.to_le_bytes();
-        self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
+            // Update relation size cache
+            self.tline.set_cached_rel_size(rel, self.lsn, nblocks);

-        // Update relation size cache
-        self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
-
-        // Update logical database size.
-        self.pending_nblocks -= old_size as isize - nblocks as isize;
+            // Update logical database size.
+            self.pending_nblocks -= old_size as isize - nblocks as isize;
+        }
        Ok(())
    }

@@ -961,8 +966,8 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
                bail!("unexpected pending WAL record");
            }
        } else {
-            let last_lsn = self.tline.get_last_record_lsn();
-            self.tline.get(key, last_lsn)
+            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+            self.tline.get(key, lsn)
        }
    }

--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -445,6 +445,7 @@ pub mod repo_harness {
        fn from(tenant_conf: TenantConf) -> Self {
            Self {
                checkpoint_distance: Some(tenant_conf.checkpoint_distance),
+                checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
                compaction_target_size: Some(tenant_conf.compaction_target_size),
                compaction_period: Some(tenant_conf.compaction_period),
                compaction_threshold: Some(tenant_conf.compaction_threshold),
--- a/pageserver/src/storage_sync.rs
+++ b/pageserver/src/storage_sync.rs
@@ -172,6 +172,7 @@ use self::{
 };
 use crate::{
    config::PageServerConf,
+    exponential_backoff,
    layered_repository::{
        ephemeral_file::is_ephemeral_file,
        metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
@@ -969,14 +970,19 @@ fn storage_sync_loop<P, S>(
    }
 }

-// needed to check whether the download happened
-// more informative than just a bool
 #[derive(Debug)]
-enum DownloadMarker {
+enum DownloadStatus {
    Downloaded,
    Nothing,
 }

+#[derive(Debug)]
+enum UploadStatus { // outcome of the upload part of one sync batch
+    Uploaded, // layers uploaded and the remote timeline update succeeded
+    Failed(anyhow::Error), // upload, remote update or retries validation failed
+    Nothing, // the batch had no upload task
+}
+
 async fn process_batches<P, S>(
    conf: &'static PageServerConf,
    max_sync_errors: NonZeroU32,
@@ -1016,7 +1022,7 @@ where
            "Finished storage sync task for sync id {sync_id} download marker {:?}",
            download_marker
        );
-        if matches!(download_marker, DownloadMarker::Downloaded) {
+        if matches!(download_marker, DownloadStatus::Downloaded) {
            downloaded_timelines.insert(sync_id.tenant_id);
        }
    }
@@ -1030,7 +1036,7 @@ async fn process_sync_task_batch<P, S>(
    max_sync_errors: NonZeroU32,
    sync_id: ZTenantTimelineId,
    batch: SyncTaskBatch,
-) -> DownloadMarker
+) -> DownloadStatus
 where
    P: Debug + Send + Sync + 'static,
    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1047,66 +1053,71 @@ where
    // When operating in a system without tasks failing over the error threshold,
    // current batching and task processing systems aim to update the layer set and metadata files (remote and local),
    // without "losing" such layer files.
-    let (upload_result, status_update) = tokio::join!(
+    let (upload_status, download_status) = tokio::join!(
        async {
            if let Some(upload_data) = upload_data {
-                match validate_task_retries(upload_data, max_sync_errors)
+                let upload_retries = upload_data.retries;
+                match validate_task_retries(upload_retries, max_sync_errors)
                    .instrument(info_span!("retries_validation"))
                    .await
                {
-                    ControlFlow::Continue(new_upload_data) => {
+                    ControlFlow::Continue(()) => {
                        upload_timeline_data(
                            conf,
                            (storage.as_ref(), &index, sync_queue),
                            current_remote_timeline.as_ref(),
                            sync_id,
-                            new_upload_data,
+                            upload_data,
                            sync_start,
                            "upload",
                        )
-                        .await;
-                        return Some(());
-                    }
-                    ControlFlow::Break(failed_upload_data) => {
-                        if let Err(e) = update_remote_data(
-                            conf,
-                            storage.as_ref(),
-                            &index,
-                            sync_id,
-                            RemoteDataUpdate::Upload {
-                                uploaded_data: failed_upload_data.data,
-                                upload_failed: true,
-                            },
-                        )
                        .await
-                        {
-                            error!("Failed to update remote timeline {sync_id}: {e:?}");
-                        }
                    }
+                    ControlFlow::Break(()) => match update_remote_data(
+                        conf,
+                        storage.as_ref(),
+                        &index,
+                        sync_id,
+                        RemoteDataUpdate::Upload {
+                            uploaded_data: upload_data.data,
+                            upload_failed: true,
+                        },
+                    )
+                    .await
+                    {
+                        Ok(()) => UploadStatus::Failed(anyhow::anyhow!(
+                            "Aborted after retries validation, current retries: {upload_retries}, max retries allowed: {max_sync_errors}"
+                        )),
+                        Err(e) => {
+                            error!("Failed to update remote timeline {sync_id}: {e:?}");
+                            UploadStatus::Failed(e)
+                        }
+                    },
                }
+            } else {
+                UploadStatus::Nothing
            }
-            None
        }
        .instrument(info_span!("upload_timeline_data")),
        async {
            if let Some(download_data) = download_data {
-                match validate_task_retries(download_data, max_sync_errors)
+                match validate_task_retries(download_data.retries, max_sync_errors)
                    .instrument(info_span!("retries_validation"))
                    .await
                {
-                    ControlFlow::Continue(new_download_data) => {
+                    ControlFlow::Continue(()) => {
                        return download_timeline_data(
                            conf,
                            (storage.as_ref(), &index, sync_queue),
                            current_remote_timeline.as_ref(),
                            sync_id,
-                            new_download_data,
+                            download_data,
                            sync_start,
                            "download",
                        )
                        .await;
                    }
-                    ControlFlow::Break(_) => {
+                    ControlFlow::Break(()) => {
                        index
                            .write()
                            .await
@@ -1115,51 +1126,53 @@ where
                    }
                }
            }
-            DownloadMarker::Nothing
+            DownloadStatus::Nothing
        }
        .instrument(info_span!("download_timeline_data")),
    );

-    if let Some(mut delete_data) = batch.delete {
-        if upload_result.is_some() {
-            match validate_task_retries(delete_data, max_sync_errors)
-                .instrument(info_span!("retries_validation"))
-                .await
-            {
-                ControlFlow::Continue(new_delete_data) => {
-                    delete_timeline_data(
-                        conf,
-                        (storage.as_ref(), &index, sync_queue),
-                        sync_id,
-                        new_delete_data,
-                        sync_start,
-                        "delete",
-                    )
-                    .instrument(info_span!("delete_timeline_data"))
-                    .await;
-                }
-                ControlFlow::Break(failed_delete_data) => {
-                    if let Err(e) = update_remote_data(
-                        conf,
-                        storage.as_ref(),
-                        &index,
-                        sync_id,
-                        RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
-                    )
+    if let Some(delete_data) = batch.delete {
+        match upload_status {
+            UploadStatus::Uploaded | UploadStatus::Nothing => {
+                match validate_task_retries(delete_data.retries, max_sync_errors)
+                    .instrument(info_span!("retries_validation"))
                    .await
-                    {
-                        error!("Failed to update remote timeline {sync_id}: {e:?}");
+                {
+                    ControlFlow::Continue(()) => {
+                        delete_timeline_data(
+                            conf,
+                            (storage.as_ref(), &index, sync_queue),
+                            sync_id,
+                            delete_data,
+                            sync_start,
+                            "delete",
+                        )
+                        .instrument(info_span!("delete_timeline_data"))
+                        .await;
+                    }
+                    ControlFlow::Break(()) => {
+                        if let Err(e) = update_remote_data(
+                            conf,
+                            storage.as_ref(),
+                            &index,
+                            sync_id,
+                            RemoteDataUpdate::Delete(&delete_data.data.deleted_layers),
+                        )
+                        .await
+                        {
+                            error!("Failed to update remote timeline {sync_id}: {e:?}");
+                        }
                    }
                }
            }
-        } else {
-            delete_data.retries += 1;
-            sync_queue.push(sync_id, SyncTask::Delete(delete_data));
-            warn!("Skipping delete task due to failed upload tasks, reenqueuing");
+            UploadStatus::Failed(e) => {
+                warn!("Skipping delete task due to failed upload tasks, reenqueuing. Upload data: {:?}, delete data: {delete_data:?}. Upload failure: {e:#}", batch.upload);
+                sync_queue.push(sync_id, SyncTask::Delete(delete_data));
+            }
        }
    }

-    status_update
+    download_status
 }

 async fn download_timeline_data<P, S>(
@@ -1170,7 +1183,7 @@ async fn download_timeline_data<P, S>(
    new_download_data: SyncData<LayersDownload>,
    sync_start: Instant,
    task_name: &str,
-) -> DownloadMarker
+) -> DownloadStatus
 where
    P: Debug + Send + Sync + 'static,
    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1199,7 +1212,7 @@ where
                Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
                    Ok(()) => {
                        register_sync_status(sync_id, sync_start, task_name, Some(true));
-                        return DownloadMarker::Downloaded;
+                        return DownloadStatus::Downloaded;
                    }
                    Err(e) => {
                        error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
@@ -1215,7 +1228,7 @@ where
        }
    }

-    DownloadMarker::Nothing
+    DownloadStatus::Nothing
 }

 async fn update_local_metadata(
@@ -1338,7 +1351,8 @@ async fn upload_timeline_data<P, S>(
    new_upload_data: SyncData<LayersUpload>,
    sync_start: Instant,
    task_name: &str,
-) where
+) -> UploadStatus
+where
    P: Debug + Send + Sync + 'static,
    S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
 {
@@ -1351,9 +1365,9 @@ async fn upload_timeline_data<P, S>(
    )
    .await
    {
-        UploadedTimeline::FailedAndRescheduled => {
+        UploadedTimeline::FailedAndRescheduled(e) => {
            register_sync_status(sync_id, sync_start, task_name, Some(false));
-            return;
+            return UploadStatus::Failed(e);
        }
        UploadedTimeline::Successful(upload_data) => upload_data,
    };
@@ -1372,12 +1386,14 @@ async fn upload_timeline_data<P, S>(
    {
        Ok(()) => {
            register_sync_status(sync_id, sync_start, task_name, Some(true));
+            UploadStatus::Uploaded
        }
        Err(e) => {
            error!("Failed to update remote timeline {sync_id}: {e:?}");
            uploaded_data.retries += 1;
            sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
            register_sync_status(sync_id, sync_start, task_name, Some(false));
+            UploadStatus::Failed(e)
        }
    }
 }
@@ -1480,25 +1496,17 @@ where
        .context("Failed to upload new index part")
 }

-async fn validate_task_retries<T>(
-    sync_data: SyncData<T>,
+async fn validate_task_retries(
+    current_attempt: u32,
    max_sync_errors: NonZeroU32,
-) -> ControlFlow<SyncData<T>, SyncData<T>> {
-    let current_attempt = sync_data.retries;
+) -> ControlFlow<(), ()> { // Break: retry limit reached; Continue: the task may run
    let max_sync_errors = max_sync_errors.get();
    if current_attempt >= max_sync_errors {
-        error!(
-            "Aborting task that failed {current_attempt} times, exceeding retries threshold of {max_sync_errors}",
-        );
-        return ControlFlow::Break(sync_data);
+        return ControlFlow::Break(()); // no sleep and no log here; the caller handles the abort
    }

-    if current_attempt > 0 {
-        let seconds_to_wait = 2.0_f64.powf(current_attempt as f64 - 1.0).min(30.0);
-        info!("Waiting {seconds_to_wait} seconds before starting the task");
-        tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
-    }
-    ControlFlow::Continue(sync_data)
+    exponential_backoff(current_attempt, 1.0, 30.0).await; // min(2^current_attempt, 30) s; none at 0
+    ControlFlow::Continue(())
 }

 fn schedule_first_sync_tasks(
--- a/pageserver/src/storage_sync/delete.rs
+++ b/pageserver/src/storage_sync/delete.rs
@@ -95,6 +95,8 @@ where
        debug!("Reenqueuing failed delete task for timeline {sync_id}");
        delete_data.retries += 1;
        sync_queue.push(sync_id, SyncTask::Delete(delete_data));
+    } else {
+        info!("Successfully deleted all layers");
    }
    errored
 }
--- a/pageserver/src/storage_sync/upload.rs
+++ b/pageserver/src/storage_sync/upload.rs
@@ -75,7 +75,7 @@ where
 #[derive(Debug)]
 pub(super) enum UploadedTimeline {
    /// Upload failed due to some error, the upload task is rescheduled for another retry.
-    FailedAndRescheduled,
+    FailedAndRescheduled(anyhow::Error),
    /// No issues happened during the upload, all task files were put into the remote storage.
    Successful(SyncData<LayersUpload>),
 }
@@ -179,7 +179,7 @@ where
        })
        .collect::<FuturesUnordered<_>>();

-    let mut errors_happened = false;
+    let mut errors = Vec::new();
    while let Some(upload_result) = upload_tasks.next().await {
        match upload_result {
            Ok(uploaded_path) => {
@@ -188,13 +188,13 @@ where
            }
            Err(e) => match e {
                UploadError::Other(e) => {
-                    errors_happened = true;
                    error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
+                    errors.push(format!("{e:#}"));
                }
                UploadError::MissingLocalFile(source_path, e) => {
                    if source_path.exists() {
-                        errors_happened = true;
                        error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
+                        errors.push(format!("{e:#}"));
                    } else {
                        // We have run the upload sync task, but the file we wanted to upload is gone.
                        // This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to
@@ -217,14 +217,17 @@ where
        }
    }

-    if errors_happened {
+    if errors.is_empty() {
+        info!("Successfully uploaded all layers");
+        UploadedTimeline::Successful(upload_data)
+    } else {
        debug!("Reenqueuing failed upload task for timeline {sync_id}");
        upload_data.retries += 1;
        sync_queue.push(sync_id, SyncTask::Upload(upload_data));
-        UploadedTimeline::FailedAndRescheduled
-    } else {
-        info!("Successfully uploaded all layers");
-        UploadedTimeline::Successful(upload_data)
+        UploadedTimeline::FailedAndRescheduled(anyhow::anyhow!(
+            "Errors appeared during layer uploads: {:?}",
+            errors
+        ))
    }
 }

--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -23,6 +23,7 @@ pub mod defaults {
    // which is good for now to trigger bugs.
    // This parameter actually determines L0 layer file size.
    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
+    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";

    // Target file size, when creating image and delta layers.
    // This parameter determines L1 layer file size.
@@ -36,7 +37,7 @@ pub mod defaults {
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
-    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
+    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
 }

@@ -48,6 +49,9 @@ pub struct TenantConf {
    // page server crashes.
    // This parameter actually determines L0 layer file size.
    pub checkpoint_distance: u64,
+    // Inmemory layer is also flushed at least once in checkpoint_timeout to
+    // eventually upload WAL after activity is stopped.
+    pub checkpoint_timeout: Duration,
    // Target file size, when creating image and delta layers.
    // This parameter determines L1 layer file size.
    pub compaction_target_size: u64,
@@ -90,6 +94,7 @@ pub struct TenantConf {
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
 pub struct TenantConfOpt {
    pub checkpoint_distance: Option<u64>,
+    pub checkpoint_timeout: Option<Duration>,
    pub compaction_target_size: Option<u64>,
    #[serde(with = "humantime_serde")]
    pub compaction_period: Option<Duration>,
@@ -113,6 +118,9 @@ impl TenantConfOpt {
            checkpoint_distance: self
                .checkpoint_distance
                .unwrap_or(global_conf.checkpoint_distance),
+            checkpoint_timeout: self
+                .checkpoint_timeout
+                .unwrap_or(global_conf.checkpoint_timeout),
            compaction_target_size: self
                .compaction_target_size
                .unwrap_or(global_conf.compaction_target_size),
@@ -142,6 +150,9 @@ impl TenantConfOpt {
        if let Some(checkpoint_distance) = other.checkpoint_distance {
            self.checkpoint_distance = Some(checkpoint_distance);
        }
+        if let Some(checkpoint_timeout) = other.checkpoint_timeout {
+            self.checkpoint_timeout = Some(checkpoint_timeout);
+        }
        if let Some(compaction_target_size) = other.compaction_target_size {
            self.compaction_target_size = Some(compaction_target_size);
        }
@@ -181,6 +192,8 @@ impl TenantConf {

        TenantConf {
            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
+            checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT)
+                .expect("cannot parse default checkpoint timeout"),
            compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                .expect("cannot parse default compaction period"),
@@ -212,6 +225,7 @@ impl TenantConf {
    pub fn dummy_conf() -> Self {
        TenantConf {
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
+            checkpoint_timeout: Duration::from_secs(600),
            compaction_target_size: 4 * 1024 * 1024,
            compaction_period: Duration::from_secs(10),
            compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
--- a/pageserver/src/timelines.rs
+++ b/pageserver/src/timelines.rs
@@ -232,7 +232,7 @@ pub(crate) fn create_timeline(
        return Ok(None);
    }

-    let _new_timeline = match ancestor_timeline_id {
+    match ancestor_timeline_id {
        Some(ancestor_timeline_id) => {
            let ancestor_timeline = repo
                .get_timeline_load(ancestor_timeline_id)
--- a/pageserver/src/walreceiver/connection_manager.rs
+++ b/pageserver/src/walreceiver/connection_manager.rs
@@ -17,7 +17,7 @@ use std::{
 };

 use anyhow::Context;
-use chrono::{DateTime, Local, NaiveDateTime, Utc};
+use chrono::{NaiveDateTime, Utc};
 use etcd_broker::{
    subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
    BrokerUpdate, Client,
@@ -25,15 +25,18 @@ use etcd_broker::{
 use tokio::select;
 use tracing::*;

-use crate::repository::{Repository, Timeline};
+use crate::{
+    exponential_backoff,
+    repository::{Repository, Timeline},
+    DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
+};
 use crate::{RepositoryImpl, TimelineImpl};
 use utils::{
    lsn::Lsn,
-    pq_proto::ReplicationFeedback,
    zid::{NodeId, ZTenantTimelineId},
 };

-use super::{TaskEvent, TaskHandle};
+use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};

 /// Spawns the loop to take care of the timeline's WAL streaming connection.
 pub(super) fn spawn_connection_manager_task(
@@ -110,21 +113,26 @@ async fn connection_manager_loop_step(
                }
            } => {
                let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard");
-                match &wal_connection_update {
+                match wal_connection_update {
                    TaskEvent::Started => {
-                        wal_connection.latest_connection_update = Utc::now().naive_utc();
                        *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1;
                    },
-                    TaskEvent::NewEvent(replication_feedback) => {
-                        wal_connection.latest_connection_update = DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc();
-                        // reset connection attempts here only, the only place where both nodes
-                        // explicitly confirmn with replication feedback that they are connected to each other
-                        walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
+                    TaskEvent::NewEvent(status) => {
+                        if status.has_received_wal {
+                            // Reset connection attempts here only, we know that safekeeper is healthy
+                            // because it can send us a WAL update.
+                            walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
+                        }
+                        wal_connection.status = status;
                    },
                    TaskEvent::End(end_result) => {
                        match end_result {
                            Ok(()) => debug!("WAL receiving task finished"),
-                            Err(e) => warn!("WAL receiving task failed: {e}"),
+                            Err(e) => {
+                                warn!("WAL receiving task failed: {e}");
+                                // If the task failed, set the connection attempts to at least 1, to try other safekeepers.
+                                let _ = *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(1);
+                            }
                        };
                        walreceiver_state.wal_connection = None;
                    },
@@ -230,18 +238,6 @@ async fn subscribe_for_timeline_updates(
    }
 }

-const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
-const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
-
-async fn exponential_backoff(n: u32, base: f64, max_seconds: f64) {
-    if n == 0 {
-        return;
-    }
-    let seconds_to_wait = base.powf(f64::from(n) - 1.0).min(max_seconds);
-    info!("Backoff: waiting {seconds_to_wait} seconds before proceeding with the task");
-    tokio::time::sleep(Duration::from_secs_f64(seconds_to_wait)).await;
-}
-
 /// All data that's needed to run endless broker loop and keep the WAL streaming connection alive, if possible.
 struct WalreceiverState {
    id: ZTenantTimelineId,
@@ -265,10 +261,21 @@ struct WalreceiverState {
 struct WalConnection {
    /// Current safekeeper pageserver is connected to for WAL streaming.
    sk_id: NodeId,
-    /// Connection task start time or the timestamp of a latest connection message received.
-    latest_connection_update: NaiveDateTime,
+    /// Status of the connection.
+    status: WalConnectionStatus,
    /// WAL streaming task handle.
-    connection_task: TaskHandle<ReplicationFeedback>,
+    connection_task: TaskHandle<WalConnectionStatus>,
+    /// Have we discovered that other safekeeper has more recent WAL than we do?
+    discovered_new_wal: Option<NewCommittedWAL>,
+}
+
+/// Newer committed WAL that was observed on another safekeeper but not yet on the connected one.
+#[derive(Debug, Clone, Copy)]
+struct NewCommittedWAL {
+    /// The other safekeeper's `commit_lsn` at the moment of discovery.
+    lsn: Lsn,
+    /// Timestamp (naive UTC) at which we first noticed that this WAL exists on another safekeeper.
+    discovered_at: NaiveDateTime,
 }

 /// Data about the timeline to connect to, received from etcd.
@@ -335,10 +342,19 @@ impl WalreceiverState {
            .instrument(info_span!("walreceiver_connection", id = %id))
        });

+        let now = Utc::now().naive_utc();
        self.wal_connection = Some(WalConnection {
            sk_id: new_sk_id,
-            latest_connection_update: Utc::now().naive_utc(),
+            status: WalConnectionStatus {
+                is_connected: false,
+                has_received_wal: false,
+                latest_connection_update: now,
+                latest_wal_update: now,
+                streaming_lsn: None,
+                commit_lsn: None,
+            },
            connection_task: connection_handle,
+            discovered_new_wal: None,
        });
    }

@@ -369,14 +385,16 @@ impl WalreceiverState {
    /// Cleans up stale etcd records and checks the rest for the new connection candidate.
    /// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
    /// The current rules for approving new candidates:
-    /// * pick from the input data from etcd for currently connected safekeeper (if any)
-    /// * out of the rest input entries, pick one with biggest `commit_lsn` that's after than pageserver's latest Lsn for the timeline
+    /// * pick a candidate different from the connected safekeeper with biggest `commit_lsn` and lowest failed connection attempts
    /// * if there's no such entry, no new candidate found, abort
-    /// * check the current connection time data for staleness, reconnect if stale
-    /// * otherwise, check if etcd updates contain currently connected safekeeper
-    ///     * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection)
-    ///       Reconnect if the time exceeds the threshold.
-    ///     * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold
+    /// * otherwise check if the candidate is much better than the current one
+    ///
+    /// To understand exact rules for determining if the candidate is better than the current one, refer to this function's implementation.
+    /// The general rules are as follows:
+    /// * if connected safekeeper is not present, pick the candidate
+    /// * if we haven't received any updates for some time, pick the candidate
+    /// * if the candidate commit_lsn is much higher than the current one, pick the candidate
+    /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
    ///
    /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
    /// Both thresholds are configured per tenant.
@@ -392,53 +410,128 @@ impl WalreceiverState {

                let now = Utc::now().naive_utc();
                if let Ok(latest_interaciton) =
-                    (now - existing_wal_connection.latest_connection_update).to_std()
+                    (now - existing_wal_connection.status.latest_connection_update).to_std()
                {
-                    if latest_interaciton > self.lagging_wal_timeout {
+                    // Drop connection if we haven't received keepalive message for a while.
+                    if latest_interaciton > self.wal_connect_timeout {
                        return Some(NewWalConnectionCandidate {
                            safekeeper_id: new_sk_id,
                            wal_source_connstr: new_wal_source_connstr,
-                            reason: ReconnectReason::NoWalTimeout {
-                                last_wal_interaction: Some(
-                                    existing_wal_connection.latest_connection_update,
+                            reason: ReconnectReason::NoKeepAlives {
+                                last_keep_alive: Some(
+                                    existing_wal_connection.status.latest_connection_update,
                                ),
                                check_time: now,
-                                threshold: self.lagging_wal_timeout,
+                                threshold: self.wal_connect_timeout,
                            },
                        });
                    }
                }

-                match self.wal_stream_candidates.get(&connected_sk_node) {
-                    Some(current_connection_etcd_data) => {
-                        let new_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
-                        let current_lsn = current_connection_etcd_data
-                            .timeline
-                            .commit_lsn
-                            .unwrap_or(Lsn(0));
-                        match new_lsn.0.checked_sub(current_lsn.0)
-                            {
-                                Some(new_sk_lsn_advantage) => {
-                                    if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() {
-                                        return Some(
-                                            NewWalConnectionCandidate {
-                                                safekeeper_id: new_sk_id,
-                                                wal_source_connstr: new_wal_source_connstr,
-                                                reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag },
-                                            });
-                                    }
-                                }
-                                None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"),
+                if !existing_wal_connection.status.is_connected {
+                    // We haven't connected yet and we shouldn't switch until connection timeout (condition above).
+                    return None;
+                }
+
+                if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn {
+                    let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
+                    // Check if the new candidate has much more WAL than the current one.
+                    match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
+                        Some(new_sk_lsn_advantage) => {
+                            if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() {
+                                return Some(NewWalConnectionCandidate {
+                                    safekeeper_id: new_sk_id,
+                                    wal_source_connstr: new_wal_source_connstr,
+                                    reason: ReconnectReason::LaggingWal {
+                                        current_commit_lsn,
+                                        new_commit_lsn,
+                                        threshold: self.max_lsn_wal_lag,
+                                    },
+                                });
                            }
-                    }
-                    None => {
-                        return Some(NewWalConnectionCandidate {
-                            safekeeper_id: new_sk_id,
-                            wal_source_connstr: new_wal_source_connstr,
-                            reason: ReconnectReason::NoEtcdDataForExistingConnection,
-                        })
+                        }
+                        None => debug!(
+                            "Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
+                        ),
                    }
                }
+
+                let current_lsn = match existing_wal_connection.status.streaming_lsn {
+                    Some(lsn) => lsn,
+                    None => self.local_timeline.get_last_record_lsn(),
+                };
+                let current_commit_lsn = existing_wal_connection
+                    .status
+                    .commit_lsn
+                    .unwrap_or(current_lsn);
+                let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
+
+                // Keep discovered_new_wal only if connected safekeeper has not caught up yet.
+                let mut discovered_new_wal = existing_wal_connection
+                    .discovered_new_wal
+                    .filter(|new_wal| new_wal.lsn > current_commit_lsn);
+
+                if discovered_new_wal.is_none() {
+                    // Check if the new candidate has more WAL than the current one.
+                    // If the new candidate has more WAL than the current one, we consider switching to the new candidate.
+                    discovered_new_wal = if candidate_commit_lsn > current_commit_lsn {
+                        trace!(
+                            "New candidate has commit_lsn {}, higher than current_commit_lsn {}",
+                            candidate_commit_lsn,
+                            current_commit_lsn
+                        );
+                        Some(NewCommittedWAL {
+                            lsn: candidate_commit_lsn,
+                            discovered_at: Utc::now().naive_utc(),
+                        })
+                    } else {
+                        None
+                    };
+                }
+
+                let waiting_for_new_lsn_since = if current_lsn < current_commit_lsn {
+                    // Connected safekeeper has more WAL, but we haven't received updates for some time.
+                    trace!(
+                        "Connected safekeeper has more WAL, but we haven't received updates for {:?}. current_lsn: {}, current_commit_lsn: {}",
+                        (now - existing_wal_connection.status.latest_wal_update).to_std(),
+                        current_lsn,
+                        current_commit_lsn
+                    );
+                    Some(existing_wal_connection.status.latest_wal_update)
+                } else {
+                    discovered_new_wal.as_ref().map(|new_wal| {
+                        // We know that new WAL is available on another safekeeper, but the connected safekeeper doesn't have it.
+                        new_wal
+                            .discovered_at
+                            .max(existing_wal_connection.status.latest_wal_update)
+                    })
+                };
+
+                // If we haven't received any WAL updates for a while and candidate has more WAL, switch to it.
+                if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since {
+                    if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() {
+                        if candidate_commit_lsn > current_commit_lsn
+                            && waiting_for_new_wal > self.lagging_wal_timeout
+                        {
+                            return Some(NewWalConnectionCandidate {
+                                safekeeper_id: new_sk_id,
+                                wal_source_connstr: new_wal_source_connstr,
+                                reason: ReconnectReason::NoWalTimeout {
+                                    current_lsn,
+                                    current_commit_lsn,
+                                    candidate_commit_lsn,
+                                    last_wal_interaction: Some(
+                                        existing_wal_connection.status.latest_wal_update,
+                                    ),
+                                    check_time: now,
+                                    threshold: self.lagging_wal_timeout,
+                                },
+                            });
+                        }
+                    }
+                }
+
+                self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
            }
            None => {
                let (new_sk_id, _, new_wal_source_connstr) =
@@ -458,7 +551,7 @@ impl WalreceiverState {
    /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
    ///
    /// The candidate that is chosen:
-    /// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent)
+    /// * has fewest connection attempts from pageserver to safekeeper node (reset every time we receive a WAL message from the node)
    /// * has greatest data Lsn among the ones that are left
    ///
    /// NOTE:
@@ -497,14 +590,13 @@ impl WalreceiverState {
            .max_by_key(|(_, info, _)| info.commit_lsn)
    }

+    /// Returns an iterator over safekeepers that have valid info and are ready for connection.
    fn applicable_connection_candidates(
        &self,
    ) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, String)> {
        self.wal_stream_candidates
            .iter()
-            .filter(|(_, etcd_info)| {
-                etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())
-            })
+            .filter(|(_, info)| info.timeline.commit_lsn.is_some())
            .filter_map(|(sk_id, etcd_info)| {
                let info = &etcd_info.timeline;
                match wal_stream_connection_string(
@@ -520,6 +612,7 @@ impl WalreceiverState {
            })
    }

+    /// Remove candidates which haven't sent etcd updates for a while.
    fn cleanup_old_candidates(&mut self) {
        let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());

@@ -554,17 +647,24 @@ struct NewWalConnectionCandidate {
 #[derive(Debug, PartialEq, Eq)]
 enum ReconnectReason {
    NoExistingConnection,
-    NoEtcdDataForExistingConnection,
    LaggingWal {
-        current_lsn: Lsn,
-        new_lsn: Lsn,
+        current_commit_lsn: Lsn,
+        new_commit_lsn: Lsn,
        threshold: NonZeroU64,
    },
    NoWalTimeout {
+        current_lsn: Lsn,
+        current_commit_lsn: Lsn,
+        candidate_commit_lsn: Lsn,
        last_wal_interaction: Option<NaiveDateTime>,
        check_time: NaiveDateTime,
        threshold: Duration,
    },
+    NoKeepAlives {
+        last_keep_alive: Option<NaiveDateTime>,
+        check_time: NaiveDateTime,
+        threshold: Duration,
+    },
 }

 fn wal_stream_connection_string(
@@ -588,7 +688,6 @@ fn wal_stream_connection_string(

 #[cfg(test)]
 mod tests {
-    use std::time::SystemTime;

    use crate::repository::{
        repo_harness::{RepoHarness, TIMELINE_ID},
@@ -666,7 +765,7 @@ mod tests {
                        backup_lsn: None,
                        remote_consistent_lsn: None,
                        peer_horizon_lsn: None,
-                        safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
+                        safekeeper_connstr: None,
                    },
                    etcd_version: 0,
                    latest_update: delay_over_threshold,
@@ -692,22 +791,26 @@ mod tests {
        let connected_sk_id = NodeId(0);
        let current_lsn = 100_000;

+        let connection_status = WalConnectionStatus {
+            is_connected: true,
+            has_received_wal: true,
+            latest_connection_update: now,
+            latest_wal_update: now,
+            commit_lsn: Some(Lsn(current_lsn)),
+            streaming_lsn: Some(Lsn(current_lsn)),
+        };
+
        state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
        state.wal_connection = Some(WalConnection {
            sk_id: connected_sk_id,
-            latest_connection_update: now,
+            status: connection_status.clone(),
            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
-                    .send(TaskEvent::NewEvent(ReplicationFeedback {
-                        current_timeline_size: 1,
-                        ps_writelsn: 1,
-                        ps_applylsn: current_lsn,
-                        ps_flushlsn: 1,
-                        ps_replytime: SystemTime::now(),
-                    }))
+                    .send(TaskEvent::NewEvent(connection_status.clone()))
                    .ok();
                Ok(())
            }),
+            discovered_new_wal: None,
        });
        state.wal_stream_candidates = HashMap::from([
            (
@@ -932,65 +1035,6 @@ mod tests {
        Ok(())
    }

-    #[tokio::test]
-    async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
-        let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;
-        let mut state = dummy_state(&harness);
-
-        let now = Utc::now().naive_utc();
-        let current_lsn = Lsn(100_000).align();
-        let connected_sk_id = NodeId(0);
-        let other_sk_id = NodeId(connected_sk_id.0 + 1);
-
-        state.wal_connection = Some(WalConnection {
-            sk_id: connected_sk_id,
-            latest_connection_update: now,
-            connection_task: TaskHandle::spawn(move |sender, _| async move {
-                sender
-                    .send(TaskEvent::NewEvent(ReplicationFeedback {
-                        current_timeline_size: 1,
-                        ps_writelsn: current_lsn.0,
-                        ps_applylsn: 1,
-                        ps_flushlsn: 1,
-                        ps_replytime: SystemTime::now(),
-                    }))
-                    .ok();
-                Ok(())
-            }),
-        });
-        state.wal_stream_candidates = HashMap::from([(
-            other_sk_id,
-            EtcdSkTimeline {
-                timeline: SkTimelineInfo {
-                    last_log_term: None,
-                    flush_lsn: None,
-                    commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
-                    backup_lsn: None,
-                    remote_consistent_lsn: None,
-                    peer_horizon_lsn: None,
-                    safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
-                },
-                etcd_version: 0,
-                latest_update: now,
-            },
-        )]);
-
-        let only_candidate = state
-            .next_connection_candidate()
-            .expect("Expected one candidate selected out of the only data option, but got none");
-        assert_eq!(only_candidate.safekeeper_id, other_sk_id);
-        assert_eq!(
-            only_candidate.reason,
-            ReconnectReason::NoEtcdDataForExistingConnection,
-            "Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper"
-        );
-        assert!(only_candidate
-            .wal_source_connstr
-            .contains(DUMMY_SAFEKEEPER_CONNSTR));
-
-        Ok(())
-    }
-
    #[tokio::test]
    async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
        let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?;
@@ -1001,21 +1045,25 @@ mod tests {
        let connected_sk_id = NodeId(0);
        let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1);

+        let connection_status = WalConnectionStatus {
+            is_connected: true,
+            has_received_wal: true,
+            latest_connection_update: now,
+            latest_wal_update: now,
+            commit_lsn: Some(current_lsn),
+            streaming_lsn: Some(current_lsn),
+        };
+
        state.wal_connection = Some(WalConnection {
            sk_id: connected_sk_id,
-            latest_connection_update: now,
+            status: connection_status.clone(),
            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
-                    .send(TaskEvent::NewEvent(ReplicationFeedback {
-                        current_timeline_size: 1,
-                        ps_writelsn: current_lsn.0,
-                        ps_applylsn: 1,
-                        ps_flushlsn: 1,
-                        ps_replytime: SystemTime::now(),
-                    }))
+                    .send(TaskEvent::NewEvent(connection_status.clone()))
                    .ok();
                Ok(())
            }),
+            discovered_new_wal: None,
        });
        state.wal_stream_candidates = HashMap::from([
            (
@@ -1060,8 +1108,8 @@ mod tests {
        assert_eq!(
            over_threshcurrent_candidate.reason,
            ReconnectReason::LaggingWal {
-                current_lsn,
-                new_lsn,
+                current_commit_lsn: current_lsn,
+                new_commit_lsn: new_lsn,
                threshold: state.max_lsn_wal_lag
            },
            "Should select bigger WAL safekeeper if it starts to lag enough"
@@ -1074,31 +1122,35 @@ mod tests {
    }

    #[tokio::test]
-    async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?;
+    async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("timeout_connection_threshhold_current_candidate")?;
        let mut state = dummy_state(&harness);
        let current_lsn = Lsn(100_000).align();
        let now = Utc::now().naive_utc();

-        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
+        let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?;
        let time_over_threshold =
-            Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;
+            Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout;
+
+        let connection_status = WalConnectionStatus {
+            is_connected: true,
+            has_received_wal: true,
+            latest_connection_update: time_over_threshold,
+            latest_wal_update: time_over_threshold,
+            commit_lsn: Some(current_lsn),
+            streaming_lsn: Some(current_lsn),
+        };

        state.wal_connection = Some(WalConnection {
            sk_id: NodeId(1),
-            latest_connection_update: time_over_threshold,
+            status: connection_status.clone(),
            connection_task: TaskHandle::spawn(move |sender, _| async move {
                sender
-                    .send(TaskEvent::NewEvent(ReplicationFeedback {
-                        current_timeline_size: 1,
-                        ps_writelsn: current_lsn.0,
-                        ps_applylsn: 1,
-                        ps_flushlsn: 1,
-                        ps_replytime: SystemTime::now(),
-                    }))
+                    .send(TaskEvent::NewEvent(connection_status.clone()))
                    .ok();
                Ok(())
            }),
+            discovered_new_wal: None,
        });
        state.wal_stream_candidates = HashMap::from([(
            NodeId(0),
@@ -1123,12 +1175,12 @@ mod tests {

        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
        match over_threshcurrent_candidate.reason {
-            ReconnectReason::NoWalTimeout {
-                last_wal_interaction,
+            ReconnectReason::NoKeepAlives {
+                last_keep_alive,
                threshold,
                ..
            } => {
-                assert_eq!(last_wal_interaction, Some(time_over_threshold));
-                assert_eq!(threshold, state.lagging_wal_timeout);
+                assert_eq!(last_keep_alive, Some(time_over_threshold));
+                assert_eq!(threshold, state.wal_connect_timeout);
            }
            unexpected => panic!("Unexpected reason: {unexpected:?}"),
@@ -1141,20 +1193,34 @@ mod tests {
    }

    #[tokio::test]
-    async fn timeout_connection_over_threshhold_current_candidate() -> anyhow::Result<()> {
-        let harness = RepoHarness::create("timeout_connection_over_threshhold_current_candidate")?;
+    async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
+        let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?;
        let mut state = dummy_state(&harness);
        let current_lsn = Lsn(100_000).align();
+        let new_lsn = Lsn(100_100).align();
        let now = Utc::now().naive_utc();

        let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
        let time_over_threshold =
            Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;

+        let connection_status = WalConnectionStatus {
+            is_connected: true,
+            has_received_wal: true,
+            latest_connection_update: now,
+            latest_wal_update: time_over_threshold,
+            commit_lsn: Some(current_lsn),
+            streaming_lsn: Some(current_lsn),
+        };
+
        state.wal_connection = Some(WalConnection {
            sk_id: NodeId(1),
-            latest_connection_update: time_over_threshold,
+            status: connection_status,
            connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
+            discovered_new_wal: Some(NewCommittedWAL {
+                discovered_at: time_over_threshold,
+                lsn: new_lsn,
+            }),
        });
        state.wal_stream_candidates = HashMap::from([(
            NodeId(0),
@@ -1162,7 +1228,7 @@ mod tests {
                timeline: SkTimelineInfo {
                    last_log_term: None,
                    flush_lsn: None,
-                    commit_lsn: Some(current_lsn),
+                    commit_lsn: Some(new_lsn),
                    backup_lsn: None,
                    remote_consistent_lsn: None,
                    peer_horizon_lsn: None,
@@ -1180,10 +1246,16 @@ mod tests {
        assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
        match over_threshcurrent_candidate.reason {
            ReconnectReason::NoWalTimeout {
+                current_lsn: reported_lsn,
+                current_commit_lsn: reported_commit_lsn,
+                candidate_commit_lsn,
                last_wal_interaction,
                threshold,
                ..
            } => {
+                assert_eq!(reported_lsn, current_lsn);
+                assert_eq!(reported_commit_lsn, current_lsn);
+                assert_eq!(candidate_commit_lsn, new_lsn);
                assert_eq!(last_wal_interaction, Some(time_over_threshold));
                assert_eq!(threshold, state.lagging_wal_timeout);
            }
@@ -1210,7 +1282,7 @@ mod tests {
                .expect("Failed to create an empty timeline for dummy wal connection manager"),
            wal_connect_timeout: Duration::from_secs(1),
            lagging_wal_timeout: Duration::from_secs(1),
-            max_lsn_wal_lag: NonZeroU64::new(1).unwrap(),
+            max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
            wal_connection_attempts: HashMap::new(),
--- a/pageserver/src/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/walreceiver/walreceiver_connection.rs
@@ -8,6 +8,7 @@ use std::{

 use anyhow::{bail, ensure, Context};
 use bytes::BytesMut;
+use chrono::{NaiveDateTime, Utc};
 use fail::fail_point;
 use futures::StreamExt;
 use postgres::{SimpleQueryMessage, SimpleQueryRow};
@@ -29,12 +30,29 @@ use crate::{
 use postgres_ffi::waldecoder::WalStreamDecoder;
 use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId};

+/// Status of a WAL streaming connection to a safekeeper, as reported by the connection task.
+#[derive(Debug, Clone)]
+pub struct WalConnectionStatus {
+    /// True once the postgres connection has been initiated, which means that the safekeeper process is at least running.
+    pub is_connected: bool,
+    /// True once at least some WAL bytes have been received; this is what defines a healthy connection.
+    pub has_received_wal: bool,
+    /// Connection establishment time, or the time the latest message was received on the connection.
+    pub latest_connection_update: NaiveDateTime,
+    /// Time the latest non-empty WAL message was received (connection establishment time until then).
+    pub latest_wal_update: NaiveDateTime,
+    /// Latest WAL update contained WAL up to this LSN. Next WAL message will start from that LSN.
+    pub streaming_lsn: Option<Lsn>,
+    /// Latest commit_lsn received from the safekeeper. `None` if no message has been received yet.
+    pub commit_lsn: Option<Lsn>,
+}
+
 /// Open a connection to the given safekeeper and receive WAL, sending back progress
 /// messages as we go.
 pub async fn handle_walreceiver_connection(
    id: ZTenantTimelineId,
    wal_source_connstr: &str,
-    events_sender: &watch::Sender<TaskEvent<ReplicationFeedback>>,
+    events_sender: &watch::Sender<TaskEvent<WalConnectionStatus>>,
    mut cancellation: watch::Receiver<()>,
    connect_timeout: Duration,
 ) -> anyhow::Result<()> {
@@ -49,12 +67,26 @@ pub async fn handle_walreceiver_connection(
    .await
    .context("Timed out while waiting for walreceiver connection to open")?
    .context("Failed to open walreceiver conection")?;
+
+    info!("connected!");
+    let mut connection_status = WalConnectionStatus {
+        is_connected: true,
+        has_received_wal: false,
+        latest_connection_update: Utc::now().naive_utc(),
+        latest_wal_update: Utc::now().naive_utc(),
+        streaming_lsn: None,
+        commit_lsn: None,
+    };
+    if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
+        warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
+        return Ok(());
+    }
+
    // The connection object performs the actual communication with the database,
    // so spawn it off to run on its own.
    let mut connection_cancellation = cancellation.clone();
    tokio::spawn(
        async move {
-            info!("connected!");
            select! {
                    connection_result = connection => match connection_result{
                            Ok(()) => info!("Walreceiver db connection closed"),
@@ -84,6 +116,14 @@ pub async fn handle_walreceiver_connection(

    let identify = identify_system(&mut replication_client).await?;
    info!("{identify:?}");
+
+    connection_status.latest_connection_update = Utc::now().naive_utc();
+    if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
+        warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
+        return Ok(());
+    }
+
+    // NB: this is a flush_lsn, not a commit_lsn.
    let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
    let mut caught_up = false;
    let ZTenantTimelineId {
@@ -118,7 +158,7 @@ pub async fn handle_walreceiver_connection(
    // There might be some padding after the last full record, skip it.
    startpoint += startpoint.calc_padding(8u32);

-    info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}...");
+    info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}...");

    let query = format!("START_REPLICATION PHYSICAL {startpoint}");

@@ -140,6 +180,33 @@ pub async fn handle_walreceiver_connection(
        }
    } {
        let replication_message = replication_message?;
+        let now = Utc::now().naive_utc();
+
+        // Update the connection status before processing the message. If the message processing
+        // fails (e.g. in walingest), we still want to know the latest LSNs from the safekeeper.
+        match &replication_message {
+            ReplicationMessage::XLogData(xlog_data) => {
+                connection_status.latest_connection_update = now;
+                connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
+                connection_status.streaming_lsn = Some(Lsn::from(
+                    xlog_data.wal_start() + xlog_data.data().len() as u64,
+                ));
+                if !xlog_data.data().is_empty() {
+                    connection_status.latest_wal_update = now;
+                    connection_status.has_received_wal = true;
+                }
+            }
+            ReplicationMessage::PrimaryKeepAlive(keepalive) => {
+                connection_status.latest_connection_update = now;
+                connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
+            }
+            &_ => {}
+        };
+        if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
+            warn!("Wal connection event listener dropped, aborting the connection: {e}");
+            return Ok(());
+        }
+
        let status_update = match replication_message {
            ReplicationMessage::XLogData(xlog_data) => {
                // Pass the WAL data to the decoder, and see if we can decode
@@ -178,16 +245,6 @@ pub async fn handle_walreceiver_connection(
                    caught_up = true;
                }

-                let timeline_to_check = Arc::clone(&timeline);
-                tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
-                    .await
-                    .with_context(|| {
-                        format!("Spawned checkpoint check task panicked for timeline {id}")
-                    })?
-                    .with_context(|| {
-                        format!("Failed to check checkpoint distance for timeline {id}")
-                    })?;
-
                Some(endlsn)
            }

@@ -208,6 +265,12 @@ pub async fn handle_walreceiver_connection(
            _ => None,
        };

+        let timeline_to_check = Arc::clone(&timeline);
+        tokio::task::spawn_blocking(move || timeline_to_check.check_checkpoint_distance())
+            .await
+            .with_context(|| format!("Spawned checkpoint check task panicked for timeline {id}"))?
+            .with_context(|| format!("Failed to check checkpoint distance for timeline {id}"))?;
+
        if let Some(last_lsn) = status_update {
            let remote_index = repo.get_remote_index();
            let timeline_remote_consistent_lsn = remote_index
@@ -261,10 +324,6 @@ pub async fn handle_walreceiver_connection(
                .as_mut()
                .zenith_status_update(data.len() as u64, &data)
                .await?;
-            if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) {
-                warn!("Wal connection event listener dropped, aborting the connection: {e}");
-                return Ok(());
-            }
        }
    }

--- a/poetry.lock
+++ b/poetry.lock
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"
 anyhow = "1.0"
 async-trait = "0.1"
 base64 = "0.13.0"
+bstr = "0.2.17"
 bytes = { version = "1.0.1", features = ['serde'] }
 clap = "3.0"
 futures = "0.3.13"
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -12,7 +12,7 @@ use password_hack::PasswordHackPayload;
 mod flow;
 pub use flow::*;

-use crate::{error::UserFacingError, waiters};
+use crate::error::UserFacingError;
 use std::io;
 use thiserror::Error;

@@ -22,51 +22,54 @@ pub type Result<T> = std::result::Result<T, AuthError>;
 /// Common authentication error.
 #[derive(Debug, Error)]
 pub enum AuthErrorImpl {
-    /// Authentication error reported by the console.
+    // This will be dropped in the future.
    #[error(transparent)]
-    Console(#[from] backend::AuthError),
+    Legacy(#[from] backend::LegacyAuthError),

    #[error(transparent)]
-    GetAuthInfo(#[from] backend::console::ConsoleAuthError),
+    Link(#[from] backend::LinkAuthError),

+    #[error(transparent)]
+    GetAuthInfo(#[from] backend::GetAuthInfoError),
+
+    #[error(transparent)]
+    WakeCompute(#[from] backend::WakeComputeError),
+
+    /// SASL protocol errors (includes [SCRAM](crate::scram)).
    #[error(transparent)]
    Sasl(#[from] crate::sasl::Error),

+    #[error("Unsupported authentication method: {0}")]
+    BadAuthMethod(Box<str>),
+
    #[error("Malformed password message: {0}")]
    MalformedPassword(&'static str),

-    /// Errors produced by [`crate::stream::PqStream`].
+    #[error(
+        "Project name is not specified. \
+        Either please upgrade the postgres client library (libpq) for SNI support \
+        or pass the project name as a parameter: '&options=project%3D<project-name>'. \
+        See more at https://neon.tech/sni"
+    )]
+    MissingProjectName,
+
+    /// Errors produced by e.g. [`crate::stream::PqStream`].
    #[error(transparent)]
    Io(#[from] io::Error),
 }

-impl AuthErrorImpl {
-    pub fn auth_failed(msg: impl Into<String>) -> Self {
-        Self::Console(backend::AuthError::auth_failed(msg))
-    }
-}
-
-impl From<waiters::RegisterError> for AuthErrorImpl {
-    fn from(e: waiters::RegisterError) -> Self {
-        Self::Console(backend::AuthError::from(e))
-    }
-}
-
-impl From<waiters::WaitError> for AuthErrorImpl {
-    fn from(e: waiters::WaitError) -> Self {
-        Self::Console(backend::AuthError::from(e))
-    }
-}
-
 #[derive(Debug, Error)]
 #[error(transparent)]
 pub struct AuthError(Box<AuthErrorImpl>);

-impl<T> From<T> for AuthError
-where
-    AuthErrorImpl: From<T>,
-{
-    fn from(e: T) -> Self {
+impl AuthError {
+    pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
+        AuthErrorImpl::BadAuthMethod(name.into()).into()
+    }
+}
+
+impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
+    fn from(e: E) -> Self {
        Self(Box::new(e.into()))
    }
 }
@@ -75,10 +78,14 @@ impl UserFacingError for AuthError {
    fn to_string_client(&self) -> String {
        use AuthErrorImpl::*;
        match self.0.as_ref() {
-            Console(e) => e.to_string_client(),
+            Legacy(e) => e.to_string_client(),
+            Link(e) => e.to_string_client(),
            GetAuthInfo(e) => e.to_string_client(),
+            WakeCompute(e) => e.to_string_client(),
            Sasl(e) => e.to_string_client(),
+            BadAuthMethod(_) => self.to_string(),
            MalformedPassword(_) => self.to_string(),
+            MissingProjectName => self.to_string(),
            _ => "Internal error".to_string(),
        }
    }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,10 +1,13 @@
-mod link;
 mod postgres;

-pub mod console;
+mod link;
+pub use link::LinkAuthError;
+
+mod console;
+pub use console::{GetAuthInfoError, WakeComputeError};

 mod legacy_console;
-pub use legacy_console::{AuthError, AuthErrorImpl};
+pub use legacy_console::LegacyAuthError;

 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
--- a/proxy/src/auth/backend/console.rs
+++ b/proxy/src/auth/backend/console.rs
@@ -13,21 +13,11 @@ use std::future::Future;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};

-pub type Result<T> = std::result::Result<T, ConsoleAuthError>;
+const REQUEST_FAILED: &str = "Console request failed";

 #[derive(Debug, Error)]
-pub enum ConsoleAuthError {
-    #[error(transparent)]
-    BadProjectName(#[from] auth::credentials::ClientCredsParseError),
-
-    // We shouldn't include the actual secret here.
-    #[error("Bad authentication secret")]
-    BadSecret,
-
-    #[error("Console responded with a malformed compute address: '{0}'")]
-    BadComputeAddress(String),
-
-    #[error("Console responded with a malformed JSON: '{0}'")]
+pub enum TransportError {
+    #[error("Console responded with a malformed JSON: {0}")]
    BadResponse(#[from] serde_json::Error),

    /// HTTP status (other than 200) returned by the console.
@@ -38,19 +28,72 @@ pub enum ConsoleAuthError {
    Io(#[from] std::io::Error),
 }

-impl UserFacingError for ConsoleAuthError {
+impl UserFacingError for TransportError {
    fn to_string_client(&self) -> String {
-        use ConsoleAuthError::*;
+        use TransportError::*;
        match self {
-            BadProjectName(e) => e.to_string_client(),
-            _ => "Internal error".to_string(),
+            HttpStatus(_) => self.to_string(),
+            _ => REQUEST_FAILED.to_owned(),
        }
    }
 }

-impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
-    fn from(e: &auth::credentials::ClientCredsParseError) -> Self {
-        ConsoleAuthError::BadProjectName(e.clone())
+// Helps eliminate graceless `.map_err` calls without introducing another ctor.
+impl From<reqwest::Error> for TransportError {
+    fn from(e: reqwest::Error) -> Self {
+        io_error(e).into()
+    }
+}
+
+#[derive(Debug, Error)]
+pub enum GetAuthInfoError {
+    // We shouldn't include the actual secret here.
+    #[error("Console responded with a malformed auth secret")]
+    BadSecret,
+
+    #[error(transparent)]
+    Transport(TransportError),
+}
+
+impl UserFacingError for GetAuthInfoError {
+    fn to_string_client(&self) -> String {
+        use GetAuthInfoError::*;
+        match self {
+            BadSecret => REQUEST_FAILED.to_owned(),
+            Transport(e) => e.to_string_client(),
+        }
+    }
+}
+
+impl<E: Into<TransportError>> From<E> for GetAuthInfoError {
+    fn from(e: E) -> Self {
+        Self::Transport(e.into())
+    }
+}
+
+#[derive(Debug, Error)]
+pub enum WakeComputeError {
+    // We shouldn't show users the address even if it's broken.
+    #[error("Console responded with a malformed compute address: {0}")]
+    BadComputeAddress(String),
+
+    #[error(transparent)]
+    Transport(TransportError),
+}
+
+impl UserFacingError for WakeComputeError {
+    fn to_string_client(&self) -> String {
+        use WakeComputeError::*;
+        match self {
+            BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
+            Transport(e) => e.to_string_client(),
+        }
+    }
+}
+
+impl<E: Into<TransportError>> From<E> for WakeComputeError {
+    fn from(e: E) -> Self {
+        Self::Transport(e.into())
    }
 }

@@ -95,7 +138,7 @@ impl<'a> Api<'a> {
        handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
    }

-    async fn get_auth_info(&self) -> Result<AuthInfo> {
+    async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
        let mut url = self.endpoint.clone();
        url.path_segments_mut().push("proxy_get_role_secret");
        url.query_pairs_mut()
@@ -105,21 +148,20 @@ impl<'a> Api<'a> {
        // TODO: use a proper logger
        println!("cplane request: {url}");

-        let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?;
+        let resp = reqwest::get(url.into_inner()).await?;
        if !resp.status().is_success() {
-            return Err(ConsoleAuthError::HttpStatus(resp.status()));
+            return Err(TransportError::HttpStatus(resp.status()).into());
        }

-        let response: GetRoleSecretResponse =
-            serde_json::from_str(&resp.text().await.map_err(io_error)?)?;
+        let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?;

-        scram::ServerSecret::parse(response.role_secret.as_str())
+        scram::ServerSecret::parse(&response.role_secret)
            .map(AuthInfo::Scram)
-            .ok_or(ConsoleAuthError::BadSecret)
+            .ok_or(GetAuthInfoError::BadSecret)
    }

    /// Wake up the compute node and return the corresponding connection info.
-    pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg> {
+    pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
        let mut url = self.endpoint.clone();
        url.path_segments_mut().push("proxy_wake_compute");
        url.query_pairs_mut()
@@ -128,17 +170,16 @@ impl<'a> Api<'a> {
        // TODO: use a proper logger
        println!("cplane request: {url}");

-        let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?;
+        let resp = reqwest::get(url.into_inner()).await?;
        if !resp.status().is_success() {
-            return Err(ConsoleAuthError::HttpStatus(resp.status()));
+            return Err(TransportError::HttpStatus(resp.status()).into());
        }

-        let response: GetWakeComputeResponse =
-            serde_json::from_str(&resp.text().await.map_err(io_error)?)?;
+        let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?;

        // Unfortunately, ownership won't let us use `Option::ok_or` here.
        let (host, port) = match parse_host_port(&response.address) {
-            None => return Err(ConsoleAuthError::BadComputeAddress(response.address)),
+            None => return Err(WakeComputeError::BadComputeAddress(response.address)),
            Some(x) => x,
        };

@@ -162,8 +203,8 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
    wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
 ) -> auth::Result<compute::NodeInfo>
 where
-    GetAuthInfo: Future<Output = Result<AuthInfo>>,
-    WakeCompute: Future<Output = Result<ComputeConnCfg>>,
+    GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
+    WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
 {
    let auth_info = get_auth_info(endpoint).await?;

@@ -171,7 +212,7 @@ where
    let scram_keys = match auth_info {
        AuthInfo::Md5(_) => {
            // TODO: decide if we should support MD5 in api v2
-            return Err(auth::AuthErrorImpl::auth_failed("MD5 is not supported").into());
+            return Err(auth::AuthError::bad_auth_method("MD5"));
        }
        AuthInfo::Scram(secret) => {
            let scram = auth::Scram(&secret);
--- a/proxy/src/auth/backend/legacy_console.rs
+++ b/proxy/src/auth/backend/legacy_console.rs
@@ -14,7 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
 use utils::pq_proto::BeMessage as Be;

 #[derive(Debug, Error)]
-pub enum AuthErrorImpl {
+pub enum LegacyAuthError {
    /// Authentication error reported by the console.
    #[error("Authentication failed: {0}")]
    AuthFailed(String),
@@ -24,7 +24,7 @@ pub enum AuthErrorImpl {
    HttpStatus(reqwest::StatusCode),

    #[error("Console responded with a malformed JSON: {0}")]
-    MalformedResponse(#[from] serde_json::Error),
+    BadResponse(#[from] serde_json::Error),

    #[error(transparent)]
    Transport(#[from] reqwest::Error),
@@ -36,30 +36,10 @@ pub enum AuthErrorImpl {
    WaiterWait(#[from] waiters::WaitError),
 }

-#[derive(Debug, Error)]
-#[error(transparent)]
-pub struct AuthError(Box<AuthErrorImpl>);
-
-impl AuthError {
-    /// Smart constructor for authentication error reported by `mgmt`.
-    pub fn auth_failed(msg: impl Into<String>) -> Self {
-        Self(Box::new(AuthErrorImpl::AuthFailed(msg.into())))
-    }
-}
-
-impl<T> From<T> for AuthError
-where
-    AuthErrorImpl: From<T>,
-{
-    fn from(e: T) -> Self {
-        Self(Box::new(e.into()))
-    }
-}
-
-impl UserFacingError for AuthError {
+impl UserFacingError for LegacyAuthError {
    fn to_string_client(&self) -> String {
-        use AuthErrorImpl::*;
-        match self.0.as_ref() {
+        use LegacyAuthError::*;
+        match self {
            AuthFailed(_) | HttpStatus(_) => self.to_string(),
            _ => "Internal error".to_string(),
        }
@@ -88,7 +68,7 @@ async fn authenticate_proxy_client(
    md5_response: &str,
    salt: &[u8; 4],
    psql_session_id: &str,
-) -> Result<DatabaseInfo, AuthError> {
+) -> Result<DatabaseInfo, LegacyAuthError> {
    let mut url = auth_endpoint.clone();
    url.query_pairs_mut()
        .append_pair("login", &creds.user)
@@ -102,17 +82,17 @@ async fn authenticate_proxy_client(
        // TODO: leverage `reqwest::Client` to reuse connections
        let resp = reqwest::get(url).await?;
        if !resp.status().is_success() {
-            return Err(AuthErrorImpl::HttpStatus(resp.status()).into());
+            return Err(LegacyAuthError::HttpStatus(resp.status()));
        }

-        let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?;
+        let auth_info = serde_json::from_str(resp.text().await?.as_str())?;
        println!("got auth info: {:?}", auth_info);

        use ProxyAuthResponse::*;
        let db_info = match auth_info {
            Ready { conn_info } => conn_info,
-            Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()),
-            NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?,
+            Error { error } => return Err(LegacyAuthError::AuthFailed(error)),
+            NotReady { .. } => waiter.await?.map_err(LegacyAuthError::AuthFailed)?,
        };

        Ok(db_info)
@@ -124,7 +104,7 @@ async fn handle_existing_user(
    auth_endpoint: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
    creds: &ClientCredentials,
-) -> Result<compute::NodeInfo, auth::AuthError> {
+) -> auth::Result<compute::NodeInfo> {
    let psql_session_id = super::link::new_psql_session_id();
    let md5_salt = rand::random();

--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -1,7 +1,34 @@
-use crate::{auth, compute, stream::PqStream};
+use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
+use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
 use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};

+#[derive(Debug, Error)]
+pub enum LinkAuthError {
+    /// Authentication error reported by the console.
+    #[error("Authentication failed: {0}")]
+    AuthFailed(String),
+
+    #[error(transparent)]
+    WaiterRegister(#[from] waiters::RegisterError),
+
+    #[error(transparent)]
+    WaiterWait(#[from] waiters::WaitError),
+
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+}
+
+impl UserFacingError for LinkAuthError {
+    fn to_string_client(&self) -> String {
+        use LinkAuthError::*;
+        match self {
+            AuthFailed(_) => self.to_string(),
+            _ => "Internal error".to_string(),
+        }
+    }
+}
+
 fn hello_message(redirect_uri: &str, session_id: &str) -> String {
    format!(
        concat![
@@ -34,7 +61,7 @@ pub async fn handle_user(
            .await?;

        // Wait for web console response (see `mgmt`)
-        waiter.await?.map_err(auth::AuthErrorImpl::auth_failed)
+        waiter.await?.map_err(LinkAuthError::AuthFailed)
    })
    .await?;

--- a/proxy/src/auth/backend/postgres.rs
+++ b/proxy/src/auth/backend/postgres.rs
@@ -3,7 +3,7 @@
 use crate::{
    auth::{
        self,
-        backend::console::{self, AuthInfo, Result},
+        backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
        ClientCredentials,
    },
    compute::{self, ComputeConnCfg},
@@ -20,6 +20,13 @@ pub(super) struct Api<'a> {
    creds: &'a ClientCredentials,
 }

+// Helps eliminate graceless `.map_err` calls without introducing another ctor.
+impl From<tokio_postgres::Error> for TransportError {
+    fn from(e: tokio_postgres::Error) -> Self {
+        io_error(e).into()
+    }
+}
+
 impl<'a> Api<'a> {
    /// Construct an API object containing the auth parameters.
    pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self {
@@ -36,21 +43,16 @@ impl<'a> Api<'a> {
    }

    /// This implementation fetches the auth info from a local postgres instance.
-    async fn get_auth_info(&self) -> Result<AuthInfo> {
+    async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
        // Perhaps we could persist this connection, but then we'd have to
        // write more code for reopening it if it got closed, which doesn't
        // seem worth it.
        let (client, connection) =
-            tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls)
-                .await
-                .map_err(io_error)?;
+            tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;

        tokio::spawn(connection);
        let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1";
-        let rows = client
-            .query(query, &[&self.creds.user])
-            .await
-            .map_err(io_error)?;
+        let rows = client.query(query, &[&self.creds.user]).await?;

        match &rows[..] {
            // We can't get a secret if there's no such user.
@@ -74,13 +76,13 @@ impl<'a> Api<'a> {
                        }))
                    })
                    // Putting the secret into this message is a security hazard!
-                    .ok_or(console::ConsoleAuthError::BadSecret)
+                    .ok_or(GetAuthInfoError::BadSecret)
            }
        }
    }

    /// We don't need to wake anything locally, so we just return the connection info.
-    pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg> {
+    pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
        let mut config = ComputeConnCfg::new();
        config
            .host(self.endpoint.host_str().unwrap_or("localhost"))
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -75,13 +75,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
            .strip_suffix(&[0])
            .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;

-        // The so-called "password" should contain a base64-encoded json.
-        // We will use it later to route the client to their project.
-        let bytes = base64::decode(password)
-            .map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?;
-
-        let payload = serde_json::from_slice(&bytes)
-            .map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?;
+        let payload = PasswordHackPayload::parse(password)
+            // If we ended up here and the payload is malformed, it means that
+            // the user neither enabled SNI nor resorted to any other method
+            // for passing the project name we rely on. We should show them
+            // the most helpful error message and point to the documentation.
+            .ok_or(AuthErrorImpl::MissingProjectName)?;

        Ok(payload)
    }
@@ -98,7 +97,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {

        // Currently, the only supported SASL method is SCRAM.
        if !scram::METHODS.contains(&sasl.method) {
-            return Err(AuthErrorImpl::auth_failed("method not supported").into());
+            return Err(super::AuthError::bad_auth_method(sasl.method));
        }

        let secret = self.state.0;
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -1,102 +1,46 @@
 //! Payload for ad hoc authentication method for clients that don't support SNI.
 //! See the `impl` for [`super::backend::BackendType<ClientCredentials>`].
 //! Read more: <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
+//! UPDATE (Mon Aug  8 13:20:34 UTC 2022): the payload format has been simplified.

-use serde::{de, Deserialize, Deserializer};
-use std::fmt;
+use bstr::ByteSlice;

-#[derive(Deserialize)]
-#[serde(untagged)]
-pub enum Password {
-    /// A regular string for utf-8 encoded passwords.
-    Simple { password: String },
-
-    /// Password is base64-encoded because it may contain arbitrary byte sequences.
-    Encoded {
-        #[serde(rename = "password_", deserialize_with = "deserialize_base64")]
-        password: Vec<u8>,
-    },
-}
-
-impl AsRef<[u8]> for Password {
-    fn as_ref(&self) -> &[u8] {
-        match self {
-            Password::Simple { password } => password.as_ref(),
-            Password::Encoded { password } => password.as_ref(),
-        }
-    }
-}
-
-#[derive(Deserialize)]
 pub struct PasswordHackPayload {
    pub project: String,
-
-    #[serde(flatten)]
-    pub password: Password,
+    pub password: Vec<u8>,
 }

-fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result<Vec<u8>, D::Error> {
-    // It's very tempting to replace this with
-    //
-    // ```
-    // let base64: &str = Deserialize::deserialize(des)?;
-    // base64::decode(base64).map_err(serde::de::Error::custom)
-    // ```
-    //
-    // Unfortunately, we can't always deserialize into `&str`, so we'd
-    // have to use an allocating `String` instead. Thus, visitor is better.
-    struct Visitor;
+impl PasswordHackPayload {
+    pub fn parse(bytes: &[u8]) -> Option<Self> {
+        // The format is `project=<utf-8>;<password-bytes>`.
+        let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";");
+        let project = iter.next()?.to_str().ok()?.to_owned();
+        let password = iter.next()?.to_owned();

-    impl<'de> de::Visitor<'de> for Visitor {
-        type Value = Vec<u8>;
-
-        fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-            formatter.write_str("a string")
-        }
-
-        fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> {
-            base64::decode(v).map_err(de::Error::custom)
-        }
+        Some(Self { project, password })
    }
-
-    des.deserialize_str(Visitor)
 }

 #[cfg(test)]
 mod tests {
    use super::*;
-    use rstest::rstest;
-    use serde_json::json;

    #[test]
-    fn parse_password() -> anyhow::Result<()> {
-        let password: Password = serde_json::from_value(json!({
-            "password": "foo",
-        }))?;
-        assert_eq!(password.as_ref(), "foo".as_bytes());
+    fn parse_password_hack_payload() {
+        let bytes = b"";
+        assert!(PasswordHackPayload::parse(bytes).is_none());

-        let password: Password = serde_json::from_value(json!({
-            "password_": base64::encode("foo"),
-        }))?;
-        assert_eq!(password.as_ref(), "foo".as_bytes());
+        let bytes = b"project=";
+        assert!(PasswordHackPayload::parse(bytes).is_none());

-        Ok(())
-    }
+        let bytes = b"project=;";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.project, "");
+        assert_eq!(payload.password, b"");

-    #[rstest]
-    #[case("password", str::to_owned)]
-    #[case("password_", base64::encode)]
-    fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> {
-        let (password, project) = ("password", "pie-in-the-sky");
-        let payload = json!({
-            "project": project,
-            key: encode(password),
-        });
-
-        let payload: PasswordHackPayload = serde_json::from_value(payload)?;
-        assert_eq!(payload.password.as_ref(), password.as_bytes());
-        assert_eq!(payload.project, project);
-
-        Ok(())
+        let bytes = b"project=foobar;pass;word";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.project, "foobar");
+        assert_eq!(payload.password, b"pass;word");
    }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
 Werkzeug = "2.1.2"
+pytest-order = "^1.0.1"

 [tool.poetry.dev-dependencies]
 yapf = "==0.31.0"
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -727,7 +727,7 @@ where
                info!("setting local_start_lsn to {:?}", state.local_start_lsn);
            }
            // Initializing commit_lsn before acking first flushed record is
-            // important to let find_end_of_wal skip the whole in the beginning
+            // important to let find_end_of_wal skip the hole in the beginning
            // of the first segment.
            //
            // NB: on new clusters, this happens at the same time as
@@ -738,6 +738,10 @@ where

            // Initializing backup_lsn is useful to avoid making backup think it should upload 0 segment.
            self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
+            // Initializing remote_consistent_lsn indicates that we have nothing
+            // to stream to pageserver(s) immediately after creation.
+            self.inmem.remote_consistent_lsn =
+                max(self.inmem.remote_consistent_lsn, state.timeline_start_lsn);

            state.acceptor_state.term_history = msg.term_history.clone();
            self.persist_control_file(state)?;
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -137,7 +137,7 @@ impl SharedState {
        self.is_wal_backup_required()
            // FIXME: add tracking of relevant pageservers and check them here individually,
            // otherwise migration won't work (we suspend too early).
-            || self.sk.inmem.remote_consistent_lsn <= self.sk.inmem.commit_lsn
+            || self.sk.inmem.remote_consistent_lsn < self.sk.inmem.commit_lsn
    }

    /// Mark timeline active/inactive and return whether s3 offloading requires
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -332,7 +332,7 @@ impl Storage for PhysicalStorage {
        self.write_lsn = if state.commit_lsn == Lsn(0) {
            Lsn(0)
        } else {
-            Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0)
+            find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)?
        };

        self.write_record_lsn = self.write_lsn;
--- a/test_runner/batch_others/test_fsm_truncate.py
+++ b/test_runner/batch_others/test_fsm_truncate.py
@@ -0,0 +1,11 @@
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverHttpClient
+import pytest
+
+
+def test_fsm_truncate(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    env.neon_cli.create_branch("test_fsm_truncate")
+    pg = env.postgres.create_start('test_fsm_truncate')
+    pg.safe_psql(
+        'CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;')
--- a/test_runner/batch_others/test_import.py
+++ b/test_runner/batch_others/test_import.py
@@ -1,9 +1,10 @@
+import re
 import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_upload, wait_for_last_record_lsn
-from fixtures.utils import lsn_from_hex, lsn_to_hex
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, Postgres, wait_for_upload, wait_for_last_record_lsn
+from fixtures.utils import lsn_from_hex
 from uuid import UUID, uuid4
-import tarfile
 import os
+import tarfile
 import shutil
 from pathlib import Path
 import json
@@ -105,20 +106,63 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build


@pytest.mark.timeout(600)
-def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_builder):
-
-    num_rows = 3000
+def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
    neon_env_builder.num_safekeepers = 1
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()

-    env.neon_cli.create_branch('test_import_from_pageserver')
-    pgmain = env.postgres.create_start('test_import_from_pageserver')
-    log.info("postgres is running on 'test_import_from_pageserver' branch")
+    timeline = env.neon_cli.create_branch('test_import_from_pageserver_small')
+    pg = env.postgres.create_start('test_import_from_pageserver_small')

-    timeline = pgmain.safe_psql("SHOW neon.timeline_id")[0][0]
+    num_rows = 3000
+    lsn = _generate_data(num_rows, pg)
+    _import(num_rows, lsn, env, pg_bin, timeline)

-    with closing(pgmain.connect()) as conn:
+
+@pytest.mark.timeout(1800)
+# TODO: temporarily disable `test_import_from_pageserver_multisegment` test, enable
+# the test back after finding the failure cause.
+# @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build")
+@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255")
+def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
+    """Export and re-import a timeline whose logical size exceeds 1GB.
+
+    Currently skipped unconditionally (see the `skip` marker above).
+
+    Steps: generate 30000000 rows, assert that the pageserver reports a logical
+    size above 1GB, run the shared `_import` round trip, then scan the produced
+    backup archive for member names that look like segment files.
+    """
+    neon_env_builder.num_safekeepers = 1
+    neon_env_builder.enable_local_fs_remote_storage()
+    env = neon_env_builder.init_start()
+
+    timeline = env.neon_cli.create_branch('test_import_from_pageserver_multisegment')
+    pg = env.postgres.create_start('test_import_from_pageserver_multisegment')
+
+    # For `test_import_from_pageserver_multisegment`, we want to make sure that the data
+    # is large enough to create multi-segment files. Typically, a segment file's size is
+    # at most 1GB. A large number of inserted rows (`30000000`) is used to increase the
+    # DB size to above 1GB. Related: https://github.com/neondatabase/neon/issues/2097.
+    num_rows = 30000000
+    lsn = _generate_data(num_rows, pg)
+
+    logical_size = env.pageserver.http_client().timeline_detail(
+        env.initial_tenant, timeline)['local']['current_logical_size']
+    log.info(f"timeline logical size = {logical_size / (1024 ** 2)}MB")
+    assert logical_size > 1024**3  # = 1GB
+
+    tar_output_file = _import(num_rows, lsn, env, pg_bin, timeline)
+
+    # Check if the backup data contains multiple segment files
+    # NOTE(review): the pattern is applied with `search`, so any archive member whose
+    # name contains "<digits>.<digits>" anywhere counts as a segment file, and the
+    # final assertion only requires one such member, not several.
+    cnt_seg_files = 0
+    segfile_re = re.compile('[0-9]+\\.[0-9]+')
+    with tarfile.open(tar_output_file, "r") as tar_f:
+        for f in tar_f.getnames():
+            if segfile_re.search(f) is not None:
+                cnt_seg_files += 1
+                log.info(f"Found a segment file: {f} in the backup archive file")
+    assert cnt_seg_files > 0
+
+
+def _generate_data(num_rows: int, pg: Postgres) -> str:
+    """Generate a table with `num_rows` rows.
+
+    Returns:
+    the latest insert WAL's LSN"""
+    with closing(pg.connect()) as conn:
        with conn.cursor() as cur:
            # data loading may take a while, so increase statement timeout
            cur.execute("SET statement_timeout='300s'")
@@ -127,15 +171,28 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
            cur.execute("CHECKPOINT")

            cur.execute('SELECT pg_current_wal_insert_lsn()')
-            lsn = cur.fetchone()[0]
-            log.info(f"start_backup_lsn = {lsn}")
+            res = cur.fetchone()
+            assert res is not None and isinstance(res[0], str)
+            return res[0]
+
+
+def _import(expected_num_rows: int, lsn: str, env: NeonEnv, pg_bin: PgBin, timeline: UUID) -> str:
+    """Test importing backup data to the pageserver.
+
+    Args:
+    expected_num_rows: the expected number of rows of the test table in the backup data
+    lsn: the backup's base LSN
+
+    Returns:
+    path to the backup archive file"""
+    log.info(f"start_backup_lsn = {lsn}")

    # Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
    # PgBin sets it automatically, but here we need to pipe psql output to the tar command.
    psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}

    # Get a fullbackup from pageserver
-    query = f"fullbackup { env.initial_tenant.hex} {timeline} {lsn}"
+    query = f"fullbackup { env.initial_tenant.hex} {timeline.hex} {lsn}"
    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
    tar_output_file = result_basepath + ".stdout"
@@ -152,7 +209,7 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
    env.pageserver.start()

    # Import using another tenantid, because we use the same pageserver.
-    # TODO Create another pageserver to maeke test more realistic.
+    # TODO Create another pageserver to make test more realistic.
    tenant = uuid4()

    # Import to pageserver
@@ -165,7 +222,7 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
        "--tenant-id",
        tenant.hex,
        "--timeline-id",
-        timeline,
+        timeline.hex,
        "--node-name",
        node_name,
        "--base-lsn",
@@ -175,15 +232,15 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
    ])

    # Wait for data to land in s3
-    wait_for_last_record_lsn(client, tenant, UUID(timeline), lsn_from_hex(lsn))
-    wait_for_upload(client, tenant, UUID(timeline), lsn_from_hex(lsn))
+    wait_for_last_record_lsn(client, tenant, timeline, lsn_from_hex(lsn))
+    wait_for_upload(client, tenant, timeline, lsn_from_hex(lsn))

    # Check it worked
    pg = env.postgres.create_start(node_name, tenant_id=tenant)
-    assert pg.safe_psql('select count(*) from tbl') == [(num_rows, )]
+    assert pg.safe_psql('select count(*) from tbl') == [(expected_num_rows, )]

    # Take another fullbackup
-    query = f"fullbackup { tenant.hex} {timeline} {lsn}"
+    query = f"fullbackup { tenant.hex} {timeline.hex} {lsn}"
    cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
    result_basepath = pg_bin.run_capture(cmd, env=psql_env)
    new_tar_output_file = result_basepath + ".stdout"
@@ -195,4 +252,6 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
    # Check that gc works
    psconn = env.pageserver.connect()
    pscur = psconn.cursor()
-    pscur.execute(f"do_gc {tenant.hex} {timeline} 0")
+    pscur.execute(f"do_gc {tenant.hex} {timeline.hex} 0")
+
+    return tar_output_file
--- a/test_runner/batch_others/test_proxy.py
+++ b/test_runner/batch_others/test_proxy.py
@@ -1,6 +1,5 @@
 import pytest
-import json
-import base64
+import psycopg2


 def test_proxy_select_1(static_proxy):
@@ -13,22 +12,14 @@ def test_password_hack(static_proxy):
    static_proxy.safe_psql(f"create role {user} with login password '{password}'",
                           options='project=irrelevant')

-    def encode(s: str) -> str:
-        return base64.b64encode(s.encode('utf-8')).decode('utf-8')
-
-    magic = encode(json.dumps({
-        'project': 'irrelevant',
-        'password': password,
-    }))
-
+    # Note the format of `magic`!
+    magic = f"project=irrelevant;{password}"
    static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic)

-    magic = encode(json.dumps({
-        'project': 'irrelevant',
-        'password_': encode(password),
-    }))
-
-    static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic)
+    # Must also check that invalid magic won't be accepted.
+    with pytest.raises(psycopg2.errors.OperationalError):
+        magic = "broken"
+        static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic)


 # Pass extra options to the server.
--- a/test_runner/batch_others/test_timeline_size.py
+++ b/test_runner/batch_others/test_timeline_size.py
@@ -4,7 +4,7 @@ from uuid import UUID
 import re
 import psycopg2.extras
 import psycopg2.errors
-from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local
+from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, Postgres, assert_timeline_local, wait_for_last_flush_lsn
 from fixtures.log_helper import log
 import time

@@ -192,6 +192,8 @@ def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
           FROM generate_series(1, 1000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
+
    # restart the pageserer to force calculating timeline's initial physical size
    env.pageserver.stop()
    env.pageserver.start()
@@ -211,7 +213,9 @@ def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
           FROM generate_series(1, 1000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


@@ -232,8 +236,10 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder
           FROM generate_series(1, 100000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
    env.pageserver.safe_psql(f"compact {env.initial_tenant.hex} {new_timeline_id.hex}")
+
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


@@ -254,15 +260,21 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
           SELECT 'long string to consume some space' || g
           FROM generate_series(1, 100000) g""",
    ])
+
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")
+
    pg.safe_psql("""
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
            FROM generate_series(1, 100000) g
    """)
+
+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")

    env.pageserver.safe_psql(f"do_gc {env.initial_tenant.hex} {new_timeline_id.hex} 0")
+
    assert_physical_size(env, env.initial_tenant, new_timeline_id)


@@ -279,6 +291,7 @@ def test_timeline_physical_size_metric(neon_simple_env: NeonEnv):
           FROM generate_series(1, 100000) g""",
    ])

+    wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)
    env.pageserver.safe_psql(f"checkpoint {env.initial_tenant.hex} {new_timeline_id.hex}")

    # get the metrics and parse the metric for the current timeline's physical size
@@ -319,6 +332,7 @@ def test_tenant_physical_size(neon_simple_env: NeonEnv):
            f"INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, {n_rows}) g",
        ])

+        wait_for_last_flush_lsn(env, pg, tenant, timeline)
        env.pageserver.safe_psql(f"checkpoint {tenant.hex} {timeline.hex}")

        timeline_total_size += get_timeline_physical_size(timeline)
--- a/test_runner/batch_others/test_wal_acceptor.py
+++ b/test_runner/batch_others/test_wal_acceptor.py
@@ -284,9 +284,12 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    env.neon_cli.create_branch('test_safekeepers_wal_removal')
    pg = env.postgres.create_start('test_safekeepers_wal_removal')

+    # Note: it is important to insert at least two segments, as currently
+    # control file is synced roughly once in segment range and WAL is not
+    # removed until all horizons are persisted.
    pg.safe_psql_many([
        'CREATE TABLE t(key int primary key, value text)',
-        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
+        "INSERT INTO t SELECT generate_series(1,200000), 'payload'",
    ])

    tenant_id = pg.safe_psql("show neon.tenant_id")[0][0]
@@ -1087,11 +1090,9 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):

    # Remove initial tenant fully (two branches are active)
    response = sk_http.tenant_delete_force(tenant_id)
-    assert response == {
-        timeline_id_3: {
-            "dir_existed": True,
-            "was_active": True,
-        }
+    assert response[timeline_id_3] == {
+        "dir_existed": True,
+        "was_active": True,
    }
    assert not (sk_data_dir / tenant_id).exists()
    assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()
--- a/test_runner/batch_others/test_wal_acceptor_async.py
+++ b/test_runner/batch_others/test_wal_acceptor_async.py
@@ -520,3 +520,68 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder):
    pg = env.postgres.create_start('test_safekeepers_race_conditions')

    asyncio.run(run_race_conditions(env, pg))
+
+
+# Check that pageserver can select safekeeper with largest commit_lsn
+# and switch if LSN is not updated for some time (NoWalTimeout).
+async def run_wal_lagging(env: NeonEnv, pg: Postgres):
+    """Insert rows while postgres is restarted with varying sets of reachable safekeepers.
+
+    Each iteration picks a random subset of safekeepers that still forms a quorum,
+    points postgres at it (unreachable safekeepers are replaced by dead ports),
+    inserts `n_txes` rows one statement at a time and stops postgres again.
+    At the end postgres is started with all safekeepers reachable and the sum of
+    the inserted keys is compared with the locally tracked expected sum.
+
+    Args:
+    env: environment whose `safekeepers` are toggled on and off
+    pg: the postgres node to restart and write through
+    """
+    def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str:
+        # use ports 10, 11 and 12 to simulate unavailable safekeepers
+        return ','.join([
+            f'localhost:{sk.port.pg if active else 10 + i}'
+            for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk))
+        ])
+
+    conn = await pg.connect_async()
+    await conn.execute('CREATE TABLE t(key int primary key, value text)')
+    await conn.close()
+    pg.stop()
+
+    n_iterations = 20
+    n_txes = 10000
+    expected_sum = 0
+    i = 1
+    quorum = len(env.safekeepers) // 2 + 1
+
+    for it in range(n_iterations):
+        # Redraw the set of reachable safekeepers until it contains a quorum.
+        # Decrementing the loop variable of a `for ... in range()` loop does not
+        # repeat the iteration, so the retry has to be an explicit inner loop;
+        # otherwise draws without a quorum silently shrink the test.
+        while True:
+            active_sk = [random.random() >= 0.5 for _ in env.safekeepers]
+            if sum(active_sk) >= quorum:
+                break
+
+        pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk))
+        log.info(f'Iteration {it}: {active_sk}')
+
+        pg.start()
+        conn = await pg.connect_async()
+
+        for _ in range(n_txes):
+            await conn.execute(f"INSERT INTO t values ({i}, 'payload')")
+            expected_sum += i
+            i += 1
+
+        await conn.close()
+        pg.stop()
+
+    # Final verification runs with every safekeeper reachable again.
+    pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers)))
+    pg.start()
+    conn = await pg.connect_async()
+
+    log.info(f'Executed {i-1} queries')
+
+    res = await conn.fetchval('SELECT sum(key) FROM t')
+    # Close before asserting so the connection is released even when the check fails.
+    await conn.close()
+    assert res == expected_sum
+
+
+# do inserts while restarting postgres and messing with safekeeper addresses
+def test_wal_lagging(neon_env_builder: NeonEnvBuilder):
+    """Start a 3-safekeeper environment and drive `run_wal_lagging` against a new branch."""
+
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch('test_wal_lagging')
+    pg = env.postgres.create_start('test_wal_lagging')
+
+    # The scenario itself is a coroutine; run it to completion on a fresh event loop.
+    asyncio.run(run_wal_lagging(env, pg))
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -1,23 +1,21 @@
+import calendar
 import dataclasses
+import enum
 import json
 import os
-from pathlib import Path
 import re
-import subprocess
 import timeit
-import calendar
-import enum
-from datetime import datetime
 import uuid
+import warnings
+from contextlib import contextmanager
+from datetime import datetime
+from pathlib import Path
+# Type-related stuff
+from typing import Iterator, Optional
+
 import pytest
 from _pytest.config import Config
 from _pytest.terminal import TerminalReporter
-import warnings
-
-from contextlib import contextmanager
-
-# Type-related stuff
-from typing import Iterator, Optional
 """
 This file contains fixtures for micro-benchmarks.

@@ -77,7 +75,7 @@ class PgBenchRunResult:

        # we know significant parts of these values from test input
        # but to be precise take them from output
-        for line in stdout.splitlines():
+        for line in stdout_lines:
            # scaling factor: 5
            if line.startswith("scaling factor:"):
                scale = int(line.split()[-1])
@@ -131,6 +129,58 @@ class PgBenchRunResult:
        )


+@dataclasses.dataclass
+class PgBenchInitResult:
+    """Timings of one `pgbench -i` run.
+
+    `total` and the per-step fields hold the seconds parsed from the final
+    "done in ..." line that pgbench prints; a per-step field is None when that
+    step does not appear in the line. `duration`, `start_timestamp` and
+    `end_timestamp` are supplied by the caller and stored unchanged.
+    """
+    total: float
+    drop_tables: Optional[float]
+    create_tables: Optional[float]
+    client_side_generate: Optional[float]
+    vacuum: Optional[float]
+    primary_keys: Optional[float]
+    duration: float
+    start_timestamp: int
+    end_timestamp: int
+
+    @classmethod
+    def parse_from_stderr(
+        cls,
+        stderr: str,
+        duration: float,
+        start_timestamp: int,
+        end_timestamp: int,
+    ):
+        """Build a result from the stderr text of `pgbench -i`.
+
+        Args:
+        stderr: full stderr output; only its last line is parsed
+        duration: externally measured duration, stored as is
+        start_timestamp: externally measured start timestamp, stored as is
+        end_timestamp: externally measured end timestamp, stored as is
+
+        Raises:
+        RuntimeError if the last line (or empty output) does not match the
+        expected "done in ..." summary format
+        """
+        # Parses pgbench initialize output for default initialization steps (dtgvp)
+        # Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s).
+
+        # Empty output has no last line; treat it as an unparsable line so that the
+        # caller gets the same RuntimeError instead of an IndexError.
+        lines = stderr.splitlines()
+        last_line = lines[-1] if lines else ""
+
+        regex = re.compile(r"done in (\d+\.\d+) s "
+                           r"\("
+                           r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
+                           r"(?:create tables (\d+\.\d+) s)?(?:, )?"
+                           r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
+                           r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
+                           r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
+                           r"\)\.")
+
+        if (m := regex.match(last_line)) is None:
+            raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`")
+
+        # Every step group is optional in the regex. Keep None for the steps that
+        # were not reported so that the six-way unpacking always lines up with the
+        # groups; dropping the None entries would shift or break the unpacking.
+        total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [
+            None if v is None else float(v) for v in m.groups()
+        ]
+
+        return cls(
+            total=total,
+            drop_tables=drop_tables,
+            create_tables=create_tables,
+            client_side_generate=client_side_generate,
+            vacuum=vacuum,
+            primary_keys=primary_keys,
+            duration=duration,
+            start_timestamp=start_timestamp,
+            end_timestamp=end_timestamp,
+        )
+
+
@enum.unique
 class MetricReport(str, enum.Enum):  # str is a hack to make it json serializable
    # this means that this is a constant test parameter
@@ -232,6 +282,32 @@ class NeonBenchmarker:
                    '',
                    MetricReport.TEST_PARAM)

+    def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult):
+        """Record the fields of a pgbench initialization result as benchmark metrics.
+
+        Every metric is named `{prefix}.{field}`. Timestamps are recorded as test
+        parameters with an empty unit; durations are recorded in seconds as
+        lower-is-better metrics. Fields whose value is None are skipped.
+        """
+        test_params = [
+            "start_timestamp",
+            "end_timestamp",
+        ]
+        for test_param in test_params:
+            self.record(f"{prefix}.{test_param}",
+                        getattr(result, test_param),
+                        '',
+                        MetricReport.TEST_PARAM)
+
+        # NOTE(review): `result.total` (pgbench's own "done in" figure) is not in
+        # this list, so only the externally measured `duration` is reported.
+        metrics = [
+            "duration",
+            "drop_tables",
+            "create_tables",
+            "client_side_generate",
+            "vacuum",
+            "primary_keys",
+        ]
+        for metric in metrics:
+            # Per-step timings are Optional; only report the ones that are present.
+            if (value := getattr(result, metric)) is not None:
+                self.record(f"{prefix}.{metric}",
+                            value,
+                            unit="s",
+                            report=MetricReport.LOWER_IS_BETTER)
+
    def get_io_writes(self, pageserver) -> int:
        """
        Fetch the "cumulative # of bytes written" metric from the pageserver
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2440,7 +2440,7 @@ def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient,
                    timeline: uuid.UUID,
                    lsn: int):
    """waits for local timeline upload up to specified lsn"""
-    for i in range(10):
+    for i in range(20):
        current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
        if current_lsn >= lsn:
            return
@@ -2475,3 +2475,9 @@ def wait_for_last_record_lsn(pageserver_http_client: NeonPageserverHttpClient,
        time.sleep(1)
    raise Exception("timed out while waiting for last_record_lsn to reach {}, was {}".format(
        lsn_to_hex(lsn), lsn_to_hex(current_lsn)))
+
+
+def wait_for_last_flush_lsn(env: NeonEnv, pg: Postgres, tenant: uuid.UUID, timeline: uuid.UUID):
+    """Wait for the pageserver to catch up with the compute node's latest flush LSN.
+
+    Reads `pg_current_wal_flush_lsn()` from `pg` and then delegates to
+    `wait_for_last_record_lsn`, which polls the pageserver of `env` for the given
+    tenant and timeline and raises an exception if the LSN is not reached in time.
+    """
+    # The flush LSN is sampled once, before waiting; WAL written afterwards is not covered.
+    last_flush_lsn = lsn_from_hex(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -32,10 +32,16 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
    stdout_filename = basepath + '.stdout'
    stderr_filename = basepath + '.stderr'

-    with open(stdout_filename, 'w') as stdout_f:
-        with open(stderr_filename, 'w') as stderr_f:
-            log.info('(capturing output to "{}.stdout")'.format(base))
-            subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
+    try:
+        with open(stdout_filename, 'w') as stdout_f:
+            with open(stderr_filename, 'w') as stderr_f:
+                log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"')
+                subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
+    finally:
+        # Remove empty files if there is no output
+        for filename in (stdout_filename, stderr_filename):
+            if os.stat(filename).st_size == 0:
+                os.remove(filename)

    return basepath

@@ -140,3 +146,12 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
    key_parts = parts[0].split("-")
    lsn_parts = parts[1].split("-")
    return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16)
+
+
+def get_scale_for_db(size_mb: int) -> int:
+    """Returns pgbench scale factor for given target db size in MB.
+
+    The linear formula yields 0 (or a negative number) for targets smaller than
+    roughly 8 MB; since pgbench only accepts a positive scale factor, the result
+    is clamped to a minimum of 1.
+
+    Ref https://www.cybertec-postgresql.com/en/a-formula-to-calculate-pgbench-scaling-factor-for-target-db-size/
+    """
+
+    return max(1, round(0.06689 * size_mb - 0.5))
--- a/test_runner/performance/README.md
+++ b/test_runner/performance/README.md
@@ -10,7 +10,7 @@ In the CI, the performance tests are run in the same environment as the other in

 ## Remote tests

-There are a few tests that marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment, and instead require a libpq connection string to connect to. So they can be run on any Postgres compatible database. Currently, the CI runs these tests our staging environment daily. Staging is not an isolated environment, so there can be noise in the results due to activity of other clusters.
+There are a few tests that are marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment, and instead require a libpq connection string to connect to. So they can be run on any Postgres compatible database. Currently, the CI runs these tests on our staging and captest environments daily. Those are not isolated environments, so there can be noise in the results due to activity of other clusters.

 ## Noise

--- a/test_runner/performance/test_perf_pgbench.py
+++ b/test_runner/performance/test_perf_pgbench.py
@@ -1,17 +1,23 @@
-from contextlib import closing
-from fixtures.neon_fixtures import PgBin, VanillaPostgres, NeonEnv, profiling_supported
-from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare
-
-from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, NeonBenchmarker
-from fixtures.log_helper import log
-
-from pathlib import Path
-
-import pytest
-from datetime import datetime
 import calendar
+import enum
 import os
 import timeit
+from datetime import datetime
+from pathlib import Path
+from typing import List
+
+import pytest
+from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
+from fixtures.compare_fixtures import NeonCompare, PgCompare
+from fixtures.neon_fixtures import profiling_supported
+from fixtures.utils import get_scale_for_db
+
+
+@enum.unique
+class PgBenchLoadType(enum.Enum):
+    """Selects which pgbench phase `run_test_pgbench` executes."""
+    # Values use the same hyphenated spelling as the metric prefixes
+    # ("simple-update", "select-only") passed to run_pgbench.
+    INIT = "init"
+    SIMPLE_UPDATE = "simple-update"
+    SELECT_ONLY = "select-only"


 def utc_now_timestamp() -> int:
@@ -22,23 +28,24 @@ def init_pgbench(env: PgCompare, cmdline):
    # calculate timestamps and durations separately
    # timestamp is intended to be used for linking to grafana and logs
    # duration is actually a metric and uses float instead of int for timestamp
-    init_start_timestamp = utc_now_timestamp()
+    start_timestamp = utc_now_timestamp()
    t0 = timeit.default_timer()
    with env.record_pageserver_writes('init.pageserver_writes'):
-        env.pg_bin.run_capture(cmdline)
+        out = env.pg_bin.run_capture(cmdline)
        env.flush()
-    init_duration = timeit.default_timer() - t0
-    init_end_timestamp = utc_now_timestamp()

-    env.zenbenchmark.record("init.duration",
-                            init_duration,
-                            unit="s",
-                            report=MetricReport.LOWER_IS_BETTER)
-    env.zenbenchmark.record("init.start_timestamp",
-                            init_start_timestamp,
-                            '',
-                            MetricReport.TEST_PARAM)
-    env.zenbenchmark.record("init.end_timestamp", init_end_timestamp, '', MetricReport.TEST_PARAM)
+    duration = timeit.default_timer() - t0
+    end_timestamp = utc_now_timestamp()
+
+    stderr = Path(f"{out}.stderr").read_text()
+
+    res = PgBenchInitResult.parse_from_stderr(
+        stderr=stderr,
+        duration=duration,
+        start_timestamp=start_timestamp,
+        end_timestamp=end_timestamp,
+    )
+    env.zenbenchmark.record_pg_bench_init_result("init", res)


 def run_pgbench(env: PgCompare, prefix: str, cmdline):
@@ -70,38 +77,84 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline):
 # the test database.
 #
 # Currently, the # of connections is hardcoded at 4
-def run_test_pgbench(env: PgCompare, scale: int, duration: int):
-
-    # Record the scale and initialize
+def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType):
    env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM)
-    init_pgbench(env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr()])

-    # Run simple-update workload
-    run_pgbench(env,
-                "simple-update", ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', env.pg.connstr()])
+    if workload_type == PgBenchLoadType.INIT:
+        # Run initialize
+        init_pgbench(
+            env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr(options='-cstatement_timeout=1h')])

-    # Run SELECT workload
-    run_pgbench(env,
-                "select-only", ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', env.pg.connstr()])
+    if workload_type == PgBenchLoadType.SIMPLE_UPDATE:
+        # Run simple-update workload
+        run_pgbench(env,
+                    "simple-update",
+                    [
+                        'pgbench',
+                        '-N',
+                        '-c4',
+                        f'-T{duration}',
+                        '-P2',
+                        '--progress-timestamp',
+                        env.pg.connstr(),
+                    ])
+
+    if workload_type == PgBenchLoadType.SELECT_ONLY:
+        # Run SELECT workload
+        run_pgbench(env,
+                    "select-only",
+                    [
+                        'pgbench',
+                        '-S',
+                        '-c4',
+                        f'-T{duration}',
+                        '-P2',
+                        '--progress-timestamp',
+                        env.pg.connstr(),
+                    ])

    env.report_size()


-def get_durations_matrix(default: int = 45):
+def get_durations_matrix(default: int = 45) -> List[int]:
    durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default=str(default))
-    return list(map(int, durations.split(",")))
+    rv = []
+    for d in durations.split(","):
+        d = d.strip().lower()
+        if d.endswith('h'):
+            duration = int(d.removesuffix('h')) * 60 * 60
+        elif d.endswith('m'):
+            duration = int(d.removesuffix('m')) * 60
+        else:
+            duration = int(d.removesuffix('s'))
+        rv.append(duration)
+
+    return rv


-def get_scales_matrix(default: int = 10):
+def get_scales_matrix(default: int = 10) -> List[int]:
    scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default=str(default))
-    return list(map(int, scales.split(",")))
+    rv = []
+    for s in scales.split(","):
+        s = s.strip().lower()
+        if s.endswith('mb'):
+            scale = get_scale_for_db(int(s.removesuffix('mb')))
+        elif s.endswith('gb'):
+            scale = get_scale_for_db(int(s.removesuffix('gb')) * 1024)
+        else:
+            scale = int(s)
+        rv.append(scale)
+
+    return rv


 # Run the pgbench tests against vanilla Postgres and neon
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
 def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int):
-    run_test_pgbench(neon_with_baseline, scale, duration)
+    run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.INIT)
+    run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
+    run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY)


 # Run the pgbench tests, and generate a flamegraph from it
@@ -123,12 +176,34 @@ profiling="page_requests"
    env = neon_env_builder.init_start()
    env.neon_cli.create_branch("empty", "main")

-    run_test_pgbench(NeonCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration)
+    neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
+    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT)
+    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
+    run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)


+# The following 3 tests run on an existing database as it was set up by previous tests,
+# and leave the database in a state that will be used by the next tests.
+# Modifying the definition order of these functions or adding other remote tests in between will alter results.
+# See the usage of the --sparse-ordering flag in the pytest invocation in the CI workflow.
+#
 # Run the pgbench tests against an existing Postgres cluster
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
-def test_pgbench_remote(remote_compare: PgCompare, scale: int, duration: int):
-    run_test_pgbench(remote_compare, scale, duration)
+def test_pgbench_remote_init(remote_compare: PgCompare, scale: int, duration: int):
+    run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.INIT)
+
+
+@pytest.mark.parametrize("scale", get_scales_matrix())
+@pytest.mark.parametrize("duration", get_durations_matrix())
+@pytest.mark.remote_cluster
+def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, duration: int):
+    run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
+
+
+@pytest.mark.parametrize("scale", get_scales_matrix())
+@pytest.mark.parametrize("duration", get_durations_matrix())
+@pytest.mark.remote_cluster
+def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int):
+    run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
--- a/test_runner/pg_clients/test_pg_clients.py
+++ b/test_runner/pg_clients/test_pg_clients.py
@@ -3,10 +3,10 @@ import shutil
 import subprocess
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from urllib.parse import urlparse

 import pytest
 from fixtures.neon_fixtures import RemotePostgres
+from fixtures.utils import subprocess_capture


@pytest.mark.remote_cluster
@@ -25,7 +25,7 @@ from fixtures.neon_fixtures import RemotePostgres
        "typescript/postgresql-client",
    ],
 )
-def test_pg_clients(remote_pg: RemotePostgres, client: str):
+def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str):
    conn_options = remote_pg.conn_options()

    env_file = None
@@ -43,12 +43,10 @@ def test_pg_clients(remote_pg: RemotePostgres, client: str):
    if docker_bin is None:
        raise RuntimeError("docker is required for running this test")

-    build_cmd = [
-        docker_bin, "build", "--quiet", "--tag", image_tag, f"{Path(__file__).parent / client}"
-    ]
+    build_cmd = [docker_bin, "build", "--tag", image_tag, f"{Path(__file__).parent / client}"]
+    subprocess_capture(str(test_output_dir), build_cmd, check=True)
+
    run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
+    basepath = subprocess_capture(str(test_output_dir), run_cmd, check=True)

-    subprocess.run(build_cmd, check=True)
-    result = subprocess.run(run_cmd, check=True, capture_output=True, text=True)
-
-    assert result.stdout.strip() == "1"
+    assert Path(f"{basepath}.stdout").read_text().strip() == "1"
--- a/vendor/postgres
+++ b/vendor/postgres
Author	SHA1	Message	Date
Arthur Petukhovsky	84f7d0ac92	Disable XLP_FIRST_IS_CONTRECORD check	2022-08-15 19:38:12 +00:00
Alexander Bayandin	4cddb0f1a4	Set up a workflow to run pgbench against captest (#2077 )	2022-08-15 18:54:31 +01:00
Arseny Sher	7b12deead7	Bump vendor/postgres to include XLP_FIRST_IS_CONTRECORD fix. (#2274 )	2022-08-15 18:24:24 +03:00
Dmitry Rodionov	63a72d99bb	increase timeout in wait_for_upload to avoid spurious failures when testing with real s3	2022-08-15 18:02:27 +03:00
Arthur Petukhovsky	116ecdf87a	Improve walreceiver logic (#2253 ) This patch makes walreceiver logic more complicated, but it should work better in most cases. Added `test_wal_lagging` to test scenarios where alive safekeepers can lag behind other alive safekeepers. - There was a bug which looks like `etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())` filtered all safekeepers in some strange cases. I removed this filter, it should probably help with #2237 - Now walreceiver_connection reports status, including commit_lsn. This allows keeping safekeeper connection even when etcd is down. - Safekeeper connection now fails if pageserver doesn't receive safekeeper messages for some time. Usually safekeeper sends messages at least once per second. - `LaggingWal` check now uses `commit_lsn` directly from safekeeper. This fixes the issue with often reconnects, when compute generates WAL really fast. - `NoWalTimeout` is rewritten to trigger only when we know about the new WAL and the connected safekeeper doesn't stream any WAL. This allows setting a small `lagging_wal_timeout` because it will trigger only when we observe that the connected safekeeper has stuck.	2022-08-15 13:31:26 +03:00
Arseny Sher	431393e361	Find end of WAL on safekeepers using WalStreamDecoder. We could make it inside wal_storage.rs, but taking into account that - wal_storage.rs reading is async - we don't need s3 here - error handling is different; error during decoding is normal I decided to put it separately. Test cargo test test_find_end_of_wal_last_crossing_segment prepared earlier by @yeputons passes now. Fixes https://github.com/neondatabase/neon/issues/544 https://github.com/neondatabase/cloud/issues/2004 Supersedes https://github.com/neondatabase/neon/pull/2066	2022-08-14 14:47:14 +03:00
Kirill Bulatov	f38f45b01d	Better storage sync logs (#2268 )	2022-08-13 10:58:14 +03:00
Andrey Taranik	a5154dce3e	get_binaries script fix (#2263 ) * get_binaries uses DOCKER_TAG taken from docker image build step * remove docker tag discovery at all and fix get_binaries for version variable	2022-08-12 20:35:26 +03:00
Alexander Bayandin	da5f8486ce	test_runner/pg_clients: collect docker logs (#2259 )	2022-08-12 17:03:09 +01:00
Dmitry Ivanov	ad08c273d3	[proxy] Rework wire format of the password hack and some errors (#2236 ) The new format has a few benefits: it's shorter, simpler and human-readable as well. We don't use base64 anymore, since url encoding got us covered. We also show a better error in case we couldn't parse the payload; the users should know it's all about passing the correct project name.	2022-08-12 17:38:43 +03:00
Andrey Taranik	7f97269277	get_binaries uses DOCKER_TAG taken from docker image build step (#2260 )	2022-08-12 16:01:22 +03:00
Thang Pham	6d99b4f1d8	disable `test_import_from_pageserver_multisegment` (#2258 ) This test failed consistently on `main` now. It's better to temporarily disable it to avoid blocking others' PRs while investigating the root cause for the test failure. See: #2255, #2256	2022-08-12 19:13:42 +07:00
Egor Suvorov	a7bf60631f	postgres_ffi/waldecoder: introduce explicit `enum State` Previously it was emulated with a combination of nullable fields. This change should make the logic more readable.	2022-08-12 11:40:46 +03:00
Egor Suvorov	07bb7a2afe	postgres_ffi/waldecoder: remove unused startlsn	2022-08-12 11:40:46 +03:00
Egor Suvorov	142e247e85	postgres_ffi/waldecoder: validate more header fields	2022-08-12 11:40:46 +03:00
Thang Pham	7da47d8a0a	Fix timeline physical size flaky tests (#2244 ) Resolves #2212. - use `wait_for_last_flush_lsn` in `test_timeline_physical_size_` tests ## Context Need to wait for the pageserver to catch up with the compute's last flush LSN because during the timeline physical size API call, it's possible that there are running `LayerFlushThread` threads. These threads flush new layers into disk and hence update the physical size. This results in a mismatch between the physical size reported by the API and the actual physical size on disk. ### Note The `LayerFlushThread` threads are processed concurrently*, so it's possible that the above error still persists even with this patch. However, making the tests wait to finish processing all the WALs (not flushing) before calculating the physical size should help reduce the "flakiness" significantly	2022-08-12 14:28:50 +07:00
Thang Pham	dc52436a8f	Fix bug when import large (>1GB) relations (#2172 ) Resolves #2097 - use timeline modification's `lsn` and timeline's `last_record_lsn` to determine the corresponding LSN to query data in `DatadirModification::get` - update `test_import_from_pageserver`. Split the test into 2 variants: `small` and `multisegment`. + `small` is the old test + `multisegment` is to simulate #2097 by using a larger number of inserted rows to create multiple segment files of a relation. `multisegment` is configured to only run with a `release` build	2022-08-12 09:24:20 +07:00
Kirill Bulatov	995a2de21e	Share exponential backoff code and fix logic for delete task failure (#2252 )	2022-08-11 23:21:06 +03:00
Arseny Sher	e593cbaaba	Add pageserver checkpoint_timeout option. To flush inmemory layer eventually when no new data arrives, which helps safekeepers to suspend activity (stop pushing to the broker). Default 10m should be ok.	2022-08-11 22:54:09 +03:00
Heikki Linnakangas	4b9e02be45	Update back `vendor/postgres` back; it was changed accidentally. (#2251 ) Commit `4227cfc96e` accidentally reverted vendor/postgres to an older version. Update it back.	2022-08-11 19:25:08 +03:00
Kirill Bulatov	7a36d06cc2	Fix exponential backoff values	2022-08-11 08:34:57 +03:00
Konstantin Knizhnik	4227cfc96e	Safe truncate (#2218 ) * Move relation size cache to layered timeline * Fix obtaining current LSN for relation size cache * Resolve merge conflicts * Resolve merge conflicts * Restore 'lsn' field in DatadirModification * adjust DatadirModification lsn in ingest_record * Fix formatting * Pass lsn to get_relsize * Fix merge conflict * Update pageserver/src/pgdatadir_mapping.rs Co-authored-by: Heikki Linnakangas <heikki@zenith.tech> * Update pageserver/src/pgdatadir_mapping.rs Co-authored-by: Heikki Linnakangas <heikki@zenith.tech> * Check if relation exists before trying to truncate it refer #1932 * Add test reproducing FSM truncate problem Co-authored-by: Heikki Linnakangas <heikki@zenith.tech>	2022-08-09 22:45:33 +03:00