WIP

test_forward_compatibility: fix path to pg_distrib_dir (#2826 )
Set correct `pg_distrib_dir` in `pageserver.toml` and in neon_local `config`. `test_forward_compatibility` shows flakiness during `neon_local pg start`, so hopefully, the patch will help. ``` 2022-11-15 16:07:34.091 GMT [13338] LOG: starting with zenith basebackup at LSN 0/A6A9310, prev 0/0 2022-11-15 16:07:34.091 GMT [13338] FATAL: cannot start in read-write mode from this base backup 2022-11-15 16:07:34.091 GMT [13337] LOG: startup process (PID 13338) exited with exit code 1 ```
2026-07-01 11:10:37 +00:00 · 2022-11-16 11:40:39 -05:00 · 2022-11-16 15:14:36 +00:00 · 2022-11-16 15:10:36 +00:00 · 2022-11-16 15:50:49 +02:00 · 2022-11-16 16:54:55 +04:00
38 changed files with 1021 additions and 119 deletions
--- a/.github/ansible/production.hosts.yaml
+++ b/.github/ansible/production.hosts.yaml
@@ -22,6 +22,10 @@ storage:
          console_region_id: aws-us-west-2
        zenith-1-ps-3:
          console_region_id: aws-us-west-2
+        zenith-1-ps-4:
+          console_region_id: aws-us-west-2
+        zenith-1-ps-5:
+          console_region_id: aws-us-west-2

    safekeepers:
      hosts:
--- a/.github/ansible/staging.hosts.yaml
+++ b/.github/ansible/staging.hosts.yaml
@@ -3,7 +3,7 @@ storage:
    bucket_name: zenith-staging-storage-us-east-1
    bucket_region: us-east-1
    console_mgmt_base_url: http://console-staging.local
-    etcd_endpoints: zenith-us-stage-etcd.local:2379
+    etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
    pageserver_config_stub:
      pg_distrib_dir: /usr/local
      remote_storage:
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -164,7 +164,7 @@ jobs:
      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
      PLATFORM: ${{ matrix.platform }}

-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
      options: --init
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -18,8 +18,8 @@ env:

 jobs:
  tag:
-    runs-on: dev
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
+    runs-on: [ self-hosted, dev, x64 ]
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

@@ -46,7 +46,7 @@ jobs:
        id: build-tag

  build-neon:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -236,7 +236,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -269,7 +269,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  benchmarks:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -300,7 +300,7 @@ jobs:
      # while coverage is currently collected for the debug ones

  merge-allure-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -338,7 +338,7 @@ jobs:
          DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

  coverage-report:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
@@ -415,7 +415,7 @@ jobs:
        shell: bash -euxo pipefail {0}

  trigger-e2e-tests:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
@@ -460,7 +460,7 @@ jobs:
            }"

  neon-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -478,7 +478,7 @@ jobs:
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}

  compute-tools-image:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug

@@ -493,7 +493,7 @@ jobs:
        run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}

  compute-node-image-v14:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -510,7 +510,7 @@ jobs:
        run: /kaniko/executor --skip-unused-stages  --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache  --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}

  compute-node-image-v15:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: gcr.io/kaniko-project/executor:v1.9.0-debug
    needs: [ tag ]
    steps:
@@ -528,7 +528,7 @@ jobs:

  test-images:
    needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]

    steps:
      - name: Checkout
@@ -570,7 +570,7 @@ jobs:
          docker compose -f ./docker-compose/docker-compose.yml down

  promote-images:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ tag, test-images ]
    if: github.event_name != 'workflow_dispatch'
    container: amazon/aws-cli
@@ -586,7 +586,7 @@ jobs:
          aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"

  push-docker-hub:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    needs: [ promote-images, tag ]
    container: golang:1.19-bullseye

@@ -736,7 +736,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
    # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -816,7 +816,7 @@ jobs:
          rm -f neon_install.tar.gz .neon_current_version

  deploy-proxy:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -858,7 +858,7 @@ jobs:
          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  deploy-proxy-new:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
    # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
    needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -928,7 +928,7 @@ jobs:
          helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

  promote-compatibility-data:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -115,7 +115,7 @@ jobs:
        run: cargo build --locked --all --all-targets

  check-rust-dependencies:
-    runs-on: dev
+    runs-on: [ self-hosted, dev, x64 ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
      options: --init
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -37,7 +37,7 @@

 - [Source view](./sourcetree.md)
  - [docker.md](./docker.md) — Docker images and building pipeline.
-  - [Error handling and logging]()
+  - [Error handling and logging](./error-handling.md)
  - [Testing]()
    - [Unit testing]()
    - [Integration testing]()
--- a/docs/error-handling.md
+++ b/docs/error-handling.md
@@ -0,0 +1,198 @@
+# Error handling and logging
+
+## Logging errors
+
+The principle is that errors are logged when they are handled. If you
+just propagate an error to the caller in a function, you don't need to
+log it; the caller will. But if you consume an error in a function,
+you *must* log it (if it needs to be logged at all).
+
+For example:
+
+```rust
+fn read_motd_file() -> std::io::Result<String> {
+    let mut f = File::open("/etc/motd")?;
+    let mut result = String::new();
+    f.read_to_string(&mut result)?;
+    result
+}
+```
+
+Opening or reading the file could fail, but there is no need to log
+the error here. The function merely propagates the error to the
+caller, and it is up to the caller to log the error or propagate it
+further, if the failure is not expected. But if, for example, it is
+normal that the "/etc/motd" file doesn't exist, the caller can choose
+to silently ignore the error, or log it as an INFO or DEBUG level
+message:
+
+```rust
+fn get_message_of_the_day() -> String {
+    // Get the motd from /etc/motd, or return the default proverb
+    match read_motd_file() {
+        Ok(motd) => motd,
+        Err(err)  => {
+            // It's normal that /etc/motd doesn't exist, but if we fail to
+            // read it for some other reason, that's unexpected. The message
+            // of the day isn't very important though, so we just WARN and
+            // continue with the default in any case.
+            if err.kind() != std::io::ErrorKind::NotFound {
+                 tracing::warn!("could not read \"/etc/motd\": {err:?}");
+            }
+            "An old error is always more popular than a new truth. - German proverb"
+        }
+    }
+}
+```
+
+## Error types
+
+We use the `anyhow` crate widely. It contains many convenient macros
+like `bail!` and `ensure!` to construct and return errors, and to
+propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
+
+A downside of `anyhow::Error` is that the caller cannot distinguish
+between different error cases. Most errors are propagated all the way
+to the mgmt API handler function, or the main loop that handles a
+connection with the compute node, and they are all handled the same
+way: the error is logged and returned to the client as an HTTP or
+libpq error.
+
+But in some cases, we need to distinguish between errors and handle
+them differently. For example, attaching a tenant to the pageserver
+could fail either because the tenant has already been attached, or
+because we could not load its metadata from cloud storage. The first
+case is more or less expected. The console sends the Attach request to
+the pageserver, and the pageserver completes the operation, but the
+network connection might be lost before the console receives the
+response. The console will retry the operation in that case, but the
+tenant has already been attached. It is important that the pagserver
+responds with the HTTP 403 Already Exists error in that case, rather
+than a generic HTTP 500 Internal Server Error.
+
+If you need to distinguish between different kinds of errors, create a
+new `Error` type. The `thiserror` crate is useful for that. But in
+most cases `anyhow::Error` is good enough.
+
+## Panics
+
+Depending on where a panic happens, it can cause the whole pageserver
+or safekeeper to restart, or just a single tenant. In either case,
+that is pretty bad and causes an outage. Avoid panics. Never use
+`unwrap()` or other calls that might panic, to verify inputs from the
+network or from disk.
+
+It is acceptable to use functions that might panic, like `unwrap()`, if
+it is obvious that it cannot panic. For example, if you have just
+checked that a variable is not None, it is OK to call `unwrap()` on it,
+but it is still preferable to use `expect("reason")` instead to explain
+why the function cannot fail.
+
+`assert!` and `panic!` are reserved for checking clear invariants and
+very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
+or `bail!` instead.
+
+## Error levels
+
+`tracing::Level` doesn't provide very clear guidelines on what the
+different levels mean, or when to use which level. Here is how we use
+them:
+
+### Error
+
+Examples:
+- could not open file "foobar"
+- invalid tenant id
+
+Errors are not expected to happen during normal operation. Incorrect
+inputs from client can cause ERRORs. For example, if a client tries to
+call a mgmt API that doesn't exist, or if a compute node sends passes
+an LSN that has already been garbage collected away.
+
+These should *not* happen during normal operations. "Normal
+operations" is not a very precise concept. But for example, disk
+errors are not expected to happen when the system is working, so those
+count as Errors. However, if a TCP connection to a compute node is
+lost, that is not considered an Error, because it doesn't affect the
+pageserver's or safekeeper's operation in any way, and happens fairly
+frequently when compute nodes are shut down, or are killed abruptly
+because of errors in the compute.
+
+**Errors are monitored, and always need human investigation to determine
+the cause.**
+
+Whether something should be logged at ERROR, WARNING or INFO level can
+depend on the callers and clients. For example, it might be unexpected
+and a sign of a serious issue if the console calls the
+"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
+would be appropriate in that case. But if the console routinely calls
+the API after deleting a timeline, to check if the deletion has
+completed, then it would be totally normal and an INFO or DEBUG level
+message would be more appropriate. If a message is logged as an ERROR,
+but it in fact happens frequently in production and never requires any
+action, it should probably be demoted to an INFO level message.
+
+### Warn
+
+Examples:
+- could not remove temporary file "foobar.temp"
+- unrecognized file "foobar" in timeline directory
+
+Warnings are similar to Errors, in that they should not happen
+when the system is operating normally. The difference between Error and
+Warning is that an Error means that the operation failed, whereas Warning
+means that something unexpected happened, but the operation continued anyway.
+For example, if deleting a file fails because the file already didn't exist,
+it should be logged as Warning.
+
+> **Note:** The python regression tests, under `test_regress`, check the
+> pageserver log after each test for any ERROR and WARN lines. If there are
+> any ERRORs or WARNs that have not been explicitly listed in the test as
+> allowed, the test is marked a failed. This is to catch unexpected errors
+> e.g. in background operations, that don't cause immediate misbehaviour in
+> the tested functionality.
+
+### Info
+
+Info level is used to log useful information when the system is
+operating normally. Info level is appropriate e.g. for logging state
+changes, background operations, and network connections.
+
+Examples:
+- "system is shutting down"
+- "tenant was created"
+- "retrying S3 upload"
+
+### Debug & Trace
+
+Debug and Trace level messages are not printed to the log in our normal
+production configuration, but could be enabled for a specific server or
+tenant, to aid debugging. (Although we don't actually have that
+capability as of this writing).
+
+## Context
+
+We use logging "spans" to hold context information about the current
+operation. Almost every operation happens on a particular tenant and
+timeline, so we enter a span with the "tenant_id" and "timeline_id"
+very early when processing an incoming API request, for example. All
+background operations should also run in a span containing at least
+those two fields, and any other parameters or information that might
+be useful when debugging an error that might happen when performing
+the operation.
+
+TODO: Spans are not captured in the Error when it is created, but when
+the error is logged. It would be more useful to capture them at Error
+creation. We should consider using `tracing_error::SpanTrace` to do
+that.
+
+## Error message style
+
+PostgreSQL has a style guide for writing error messages:
+
+https://www.postgresql.org/docs/current/error-style-guide.html
+
+Follow that guide when writing error messages in the PostgreSQL
+extension. We don't follow it strictly in the pageserver and
+safekeeper, but the advice in the PostgreSQL style guide is generally
+good, and you can't go wrong by following it.
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
    ///
    /// If the input string is missing the '/' character, then use `Lsn::from_hex`
    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut splitter = s.split('/');
+        let mut splitter = s.trim().split('/');
        if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
        {
            let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,6 +270,11 @@ mod tests {
        );
        assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
        assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
+
+        let expected_lsn = Lsn(0x3C490F8);
+        assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
+        assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
+        assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
    }

    #[test]
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -76,3 +76,7 @@ tempfile = "3.2"
 [[bench]]
 name = "bench_layer_map"
 harness = false
+
+[[bench]]
+name = "bench_walredo"
+harness = false
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -614,8 +614,9 @@ impl PageServerConf {
        PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    #[cfg(test)]
    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
+        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+
        PageServerConf {
            id: NodeId(0),
            wait_lsn_timeout: Duration::from_secs(60),
@@ -626,7 +627,7 @@ impl PageServerConf {
            listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
            superuser: "cloud_admin".to_string(),
            workdir: repo_dir,
-            pg_distrib_dir: PathBuf::new(),
+            pg_distrib_dir,
            auth_type: AuthType::Trust,
            auth_validation_public_key_path: None,
            remote_storage_config: None,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -76,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                        FeMessage::CopyData(bytes) => bytes,
                        FeMessage::CopyDone => { break },
                        FeMessage::Sync => continue,
+                        FeMessage::Terminate => {
+                            let msg = format!("client terminated connection with Terminate message during COPY");
+                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
+                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                            break;
+                        }
                        m => {
                            let msg = format!("unexpected message {:?}", m);
                            pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -87,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
                    yield copy_data_bytes;
                }
                Ok(None) => {
-                    let msg = "client closed connection";
+                    let msg = "client closed connection during COPY";
                    pgb.write_message(&BeMessage::ErrorResponse(msg))?;
                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                }
                Err(e) => {
                    Err(io::Error::new(io::ErrorKind::Other, e))?;
--- a/pageserver/src/tenant_config.rs
+++ b/pageserver/src/tenant_config.rs
@@ -216,7 +216,6 @@ impl TenantConf {
        }
    }

-    #[cfg(test)]
    pub fn dummy_conf() -> Self {
        TenantConf {
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
--- a/pageserver/src/tenant_tasks.rs
+++ b/pageserver/src/tenant_tasks.rs
@@ -72,8 +72,6 @@ async fn compaction_loop(tenant_id: TenantId) {
            if let Err(e) = tenant.compaction_iteration() {
                sleep_duration = wait_duration;
                error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
-                #[cfg(feature = "testing")]
-                std::process::abort();
            }

            // Sleep
@@ -123,8 +121,6 @@ async fn gc_loop(tenant_id: TenantId) {
                {
                    sleep_duration = wait_duration;
                    error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
-                    #[cfg(feature = "testing")]
-                    std::process::abort();
                }
            }

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -210,6 +210,16 @@ impl PostgresRedoManager {
        }
    }

+    /// Launch process pre-emptively. Should not be needed except for benchmarking.
+    pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
+        let inner = self.process.get_mut().unwrap();
+        if inner.is_none() {
+            let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
+            *inner = Some(p);
+        }
+        Ok(())
+    }
+
    ///
    /// Process one request for WAL redo using wal-redo postgres
    ///
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -440,68 +440,35 @@ def pytest_terminal_summary(

    terminalreporter.section("Benchmark results", "-")

-    # TODO group by test report
-    reports = {
-        report.head_line: report
-        for report in terminalreporter.stats.get("passed", [])
-    }
+    result = []
+    for test_report in terminalreporter.stats.get("passed", []):
+        result_entry = []

-    results = []
-    for name, report in reports.items():
-        # terminalreporter.write(f"{name}", green=True)
-        # terminalreporter.line("")
-        if "[neon" in name:
-            vanilla_report = reports.get(name.replace("[neon", "[vanilla"))
-            if vanilla_report:
-                for key, prop in report.user_properties:
-                    if prop["unit"] == "s":
-                        neon_value = prop["value"]
-                        vanilla_value = dict(vanilla_report.user_properties)[key]["value"]
-                        try:
-                            ratio = float(neon_value) / vanilla_value
-                        except ZeroDivisionError:
-                            ratio = 99999
+        for _, recorded_property in test_report.user_properties:
+            terminalreporter.write(
+                "{}.{}: ".format(test_report.head_line, recorded_property["name"])
+            )
+            unit = recorded_property["unit"]
+            value = recorded_property["value"]
+            if unit == "MB":
+                terminalreporter.write("{0:,.0f}".format(value), green=True)
+            elif unit in ("s", "ms") and isinstance(value, float):
+                terminalreporter.write("{0:,.3f}".format(value), green=True)
+            elif isinstance(value, float):
+                terminalreporter.write("{0:,.4f}".format(value), green=True)
+            else:
+                terminalreporter.write(str(value), green=True)
+            terminalreporter.line(" {}".format(unit))

-                        results.append((ratio, name.replace("[neon", "[neon/vanilla"), prop["name"]))
+            result_entry.append(recorded_property)

-    results.sort(reverse=True)
-    for ratio, test, prop in results:
-        terminalreporter.write("{}.{}: ".format(test, prop))
-        terminalreporter.write("{0:,.3f}".format(ratio), green=True)
-        terminalreporter.line("")
-
-    # result = []
-    # for test_report in terminalreporter.stats.get("passed", []):
-    #     result_entry = []
-
-    #     durations = [
-    #         prop
-    #         for _, prop in test_report.user_properties
-    #         if prop["unit"] == "s"
-    #     ]
-
-    #     for _, recorded_property in test_report.user_properties:
-    #         terminalreporter.write("{}.{}: ".format(test_report.head_line,
-    #                                                 recorded_property["name"]))
-    #         unit = recorded_property["unit"]
-    #         value = recorded_property["value"]
-    #         if unit == "MB":
-    #             terminalreporter.write("{0:,.0f}".format(value), green=True)
-    #         elif unit in ("s", "ms") and isinstance(value, float):
-    #             terminalreporter.write("{0:,.3f}".format(value), green=True)
-    #         elif isinstance(value, float):
-    #             terminalreporter.write("{0:,.4f}".format(value), green=True)
-    #         else:
-    #             terminalreporter.write(str(value), green=True)
-    #         terminalreporter.line(" {}".format(unit))
-
-    #         result_entry.append(recorded_property)
-
-    #     result.append({
-    #         "suit": test_report.nodeid,
-    #         "total_duration": test_report.duration,
-    #         "data": result_entry,
-    #     })
+        result.append(
+            {
+                "suit": test_report.nodeid,
+                "total_duration": test_report.duration,
+                "data": result_entry,
+            }
+        )

    out_dir = config.getoption("out_dir")
    if out_dir is None:
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -784,6 +784,8 @@ class NeonEnvBuilder:

            self.cleanup_remote_storage()

+            self.env.pageserver.assert_no_errors()
+

 class NeonEnv:
    """
@@ -1723,6 +1725,43 @@ class NeonPageserver(PgProtocol):
        self.config_override = config_override
        self.version = env.get_pageserver_version()

+        # After a test finishes, we will scrape the log to see if there are any
+        # unexpected error messages. If your test expects an error, add it to
+        # 'allowed_errors' in the test with something like:
+        #
+        # env.pageserver.allowed_errors.append(".*could not open garage door.*")
+        #
+        # The entries in the list are regular experessions.
+        self.allowed_errors = [
+            # All tests print these, when starting up or shutting down
+            ".*wal receiver task finished with an error: walreceiver connection handling failure.*",
+            ".*Shutdown task error: walreceiver connection handling failure.*",
+            ".*Etcd client error: grpc request error: status: Unavailable.*",
+            ".*query handler for .* failed: Connection reset by peer.*",
+            ".*serving compute connection task.*exited with error: Broken pipe.*",
+            ".*Connection aborted: error communicating with the server: Broken pipe.*",
+            ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
+            ".*Connection aborted: error communicating with the server: Connection reset by peer.*",
+            ".*kill_and_wait_impl.*: wait successful.*",
+            ".*end streaming to Some.*",
+            # safekeeper connection can fail with this, in the window between timeline creation
+            # and streaming start
+            ".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
+            # Tests related to authentication and authorization print these
+            ".*Error processing HTTP request: Forbidden",
+            # intentional failpoints
+            ".*failpoint ",
+            # FIXME: there is a race condition between GC and detach, see
+            # https://github.com/neondatabase/neon/issues/2442
+            ".*could not remove ephemeral file.*No such file or directory.*",
+            # FIXME: These need investigation
+            ".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
+            ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
+            ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
+            ".*Removing intermediate uninit mark file.*",
+        ]
+
    def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver":
        """
        Start the page server.
@@ -1771,6 +1810,26 @@ class NeonPageserver(PgProtocol):
            is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
        )

+    def assert_no_errors(self):
+        logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
+
+        error_or_warn = re.compile("ERROR|WARN")
+        errors = []
+        while True:
+            line = logfile.readline()
+            if not line:
+                break
+
+            if error_or_warn.search(line):
+                # It's an ERROR or WARN. Is it in the allow-list?
+                for a in self.allowed_errors:
+                    if re.match(a, line):
+                        break
+                else:
+                    errors.append(line)
+
+        assert not errors
+

 def append_pageserver_param_overrides(
    params_to_update: List[str],
@@ -2661,8 +2720,6 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P

    yield test_dir

-    shutil.rmtree(test_dir)
-
    allure_attach_from_dir(test_dir)


--- a/test_runner/performance/test_seqscans.py
+++ b/test_runner/performance/test_seqscans.py
@@ -25,6 +25,11 @@ def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers:

    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
+
+            if True:
+                cur.execute("set enable_seqscan_prefetch = on;")
+                cur.execute("set seqscan_prefetch_buffers = 10;")
+
            cur.execute("create table t (i integer);")
            cur.execute(f"insert into t values (generate_series(1,{rows}));")

--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -116,6 +116,13 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*invalid branch start lsn: less than latest GC cutoff.*",
+            ".*invalid branch start lsn: less than planned GC cutoff.*",
+        ]
+    )
+
    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(
        conf={
--- a/test_runner/regress/test_branch_behind.py
+++ b/test_runner/regress/test_branch_behind.py
@@ -13,6 +13,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
    env = neon_env_builder.init_start()

+    env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
+    env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
+
    # Branch at the point where only 100 rows were inserted
    env.neon_cli.create_branch("test_branch_behind")
    pgmain = env.postgres.create_start("test_branch_behind")
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -11,10 +11,17 @@ from fixtures.types import TenantId, TimelineId
 # Test restarting page server, while safekeeper and compute node keep
 # running.
 def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
-    # One safekeeper is enough for this test.
-    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*No timelines to attach received.*",
+            ".*Failed to process timeline dir contents.*",
+            ".*Failed to load delta layer.*",
+            ".*Timeline .* was not found.*",
+        ]
+    )
+
    tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []

    for n in range(4):
@@ -72,23 +79,24 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
    # First timeline would not get loaded into pageserver due to corrupt metadata file
    with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err:
        pg1.start()
-    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
+    log.info(
+        f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
+    )

    # Second timeline has no ancestors, only the metadata file and no layer files
    # We don't have the remote storage enabled, which means timeline is in an incorrect state,
    # it's not loaded at all
    with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err:
        pg2.start()
-    log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
+    log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")

-    # Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline
-    for n in range(3, 4):
-        (bad_tenant, bad_timeline, pg) = tenant_timelines[n]
-        with pytest.raises(Exception, match="extracting base backup failed") as err:
-            pg.start()
-        log.info(
-            f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}"
-        )
+    # Third timeline will also fail during basebackup, because the layer file is corrupt.
+    # (We don't check layer file contents on startup, when loading the timeline)
+    with pytest.raises(Exception, match="Failed to load delta layer") as err:
+        pg3.start()
+    log.info(
+        f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
+    )


 def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
@@ -111,6 +119,13 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http = env.pageserver.http_client()

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
+            ".*Timeline got dropped without initializing, cleaning its files.*",
+        ]
+    )
+
    tenant_id, _ = env.neon_cli.create_tenant()

    timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -2,7 +2,7 @@ import os
 import shutil
 import subprocess
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional

 import pytest
 import toml  # TODO: replace with tomllib for Python >= 3.11
@@ -50,6 +50,12 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o

    env = neon_env_builder.init_start()
    pg = env.postgres.create_start("main")
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
    pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
    pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
@@ -154,6 +160,7 @@ def test_forward_compatibility(
        from_dir=compatibility_snapshot_dir,
        to_dir=test_output_dir / "compatibility_snapshot",
        port_distributor=port_distributor,
+        pg_distrib_dir=compatibility_postgres_distrib_dir,
    )

    breaking_changes_allowed = (
@@ -183,7 +190,12 @@ def test_forward_compatibility(
    ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"


-def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistributor):
+def prepare_snapshot(
+    from_dir: Path,
+    to_dir: Path,
+    port_distributor: PortDistributor,
+    pg_distrib_dir: Optional[Path] = None,
+):
    assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
    assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory"
    assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql"
@@ -208,7 +220,7 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
    # Update paths and ports in config files
    pageserver_toml = repo_dir / "pageserver.toml"
    pageserver_config = toml.load(pageserver_toml)
-    pageserver_config["remote_storage"]["local_path"] = repo_dir / "local_fs_remote_storage"
+    pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
    pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
        pageserver_config["listen_http_addr"]
    )
@@ -219,6 +231,9 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
        port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"]
    ]

+    if pg_distrib_dir:
+        pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
+
    with pageserver_toml.open("w") as f:
        toml.dump(pageserver_config, f)

@@ -238,7 +253,10 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
        sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
        sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])

-    with (snapshot_config_toml).open("w") as f:
+    if pg_distrib_dir:
+        snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
+
+    with snapshot_config_toml.open("w") as f:
        toml.dump(snapshot_config, f)

    # Ensure that snapshot doesn't contain references to the original path
--- a/test_runner/regress/test_compute_ctl.py
+++ b/test_runner/regress/test_compute_ctl.py
@@ -179,7 +179,16 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    # run compute_ctl and wait for 10s
    try:
        ctl.raw_cli(
-            ["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path],
+            [
+                "--connstr",
+                "postgres://invalid/",
+                "--pgdata",
+                pgdata,
+                "--spec",
+                spec,
+                "--pgbin",
+                pg_bin_path,
+            ],
            timeout=10,
        )
    except TimeoutExpired as exc:
--- a/test_runner/regress/test_gc_cutoff.py
+++ b/test_runner/regress/test_gc_cutoff.py
@@ -1,3 +1,4 @@
+import pytest
 from fixtures.neon_fixtures import NeonEnvBuilder, PgBin


@@ -9,6 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 # test anyway, so it doesn't need any special attention here.
 def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
    env = neon_env_builder.init_start()
+
+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+
    pageserver_http = env.pageserver.http_client()

    # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
@@ -30,10 +36,9 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

    pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))

-    for i in range(5):
-        try:
+    for _ in range(5):
+        with pytest.raises(Exception):
            pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
-        except Exception:
-            env.pageserver.stop()
-            env.pageserver.start()
-            pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
+        env.pageserver.stop()
+        env.pageserver.start()
+        pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -76,6 +76,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
    env = neon_env_builder.init_start()
    env.pageserver.http_client().tenant_create(tenant)

+    env.pageserver.allowed_errors.extend(
+        [
+            ".*error importing base backup .*",
+            ".*Timeline got dropped without initializing, cleaning its files.*",
+            ".*Removing intermediate uninit mark file.*",
+            ".*InternalServerError.*timeline not found.*",
+            ".*InternalServerError.*Tenant .* not found.*",
+            ".*InternalServerError.*Timeline .* not found.*",
+            ".*InternalServerError.*Cannot delete timeline which has child timelines.*",
+        ]
+    )
+
+    # FIXME: we should clean up pageserver to not print this
+    env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*")
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    def import_tar(base, wal):
        env.neon_cli.raw_cli(
            [
@@ -122,6 +142,11 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
    neon_env_builder.enable_local_fs_remote_storage()
    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    timeline = env.neon_cli.create_branch("test_import_from_pageserver_small")
    pg = env.postgres.create_start("test_import_from_pageserver_small")

--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -67,6 +67,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
 def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+
    # Use a tiny checkpoint distance, to create a lot of layers quickly.
    # That allows us to stress the compaction and layer flushing logic more.
    tenant, _ = env.neon_cli.create_tenant(
--- a/test_runner/regress/test_read_validation.py
+++ b/test_runner/regress/test_read_validation.py
@@ -143,6 +143,8 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
    env = neon_simple_env
    env.neon_cli.create_branch("test_read_validation_neg", "empty")

+    env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
+
    pg = env.postgres.create_start("test_read_validation_neg")
    log.info("postgres is running on 'test_read_validation_neg' branch")

--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -17,6 +17,8 @@ def test_readonly_node(neon_simple_env: NeonEnv):
    pgmain = env.postgres.create_start("test_readonly_node")
    log.info("postgres is running on 'test_readonly_node' branch")

+    env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
+
    main_pg_conn = pgmain.connect()
    main_cur = main_pg_conn.cursor()

--- a/test_runner/regress/test_recovery.py
+++ b/test_runner/regress/test_recovery.py
@@ -17,6 +17,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):

    neon_env_builder.start()

+    # These warnings are expected, when the pageserver is restarted abruptly
+    env.pageserver.allowed_errors.append(".*found future delta layer.*")
+    env.pageserver.allowed_errors.append(".*found future image layer.*")
+
    # Create a branch for us
    env.neon_cli.create_branch("test_pageserver_recovery", "main")

--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -56,6 +56,17 @@ def test_remote_storage_backup_and_restore(

    ##### First start, insert secret data and upload it to the remote storage
    env = neon_env_builder.init_start()
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+
+    env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
+    env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
+    env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -20,6 +20,11 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

+    env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state")
+    # FIXME: we have a race condition between GC and detach. GC might fail with this
+    # error. Similar to https://github.com/neondatabase/neon/issues/2671
+    env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*")
+
    # first check for non existing tenant
    tenant_id = TenantId.generate()
    with pytest.raises(
@@ -28,6 +33,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    ):
        pageserver_http.tenant_detach(tenant_id)

+    # the error will be printed to the log too
+    env.pageserver.allowed_errors.append(".*Tenant not found for id.*")
+
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

@@ -50,6 +58,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
        bogus_timeline_id = TimelineId.generate()
        pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)

+        # the error will be printed to the log too
+    env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
+
    # try to concurrently run gc and detach
    gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
    gc_thread.start()
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -259,6 +259,11 @@ def test_tenant_relocation(

    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    # create folder for remote storage mock
    remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage"

--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -25,6 +25,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    )
    initial_tenant_dirs = [d for d in tenants_dir.iterdir()]

+    neon_simple_env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to create directory structure for tenant .*, cleaning tmp data.*",
+            ".*Failed to fsync removed temporary tenant directory .*",
+        ]
+    )
+
    pageserver_http = neon_simple_env.pageserver.http_client()
    pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
    with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"):
@@ -206,6 +213,13 @@ def test_pageserver_with_empty_tenants(
    )

    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*Tenant .* has no timelines directory.*")
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+
    client = env.pageserver.http_client()

    tenant_without_timelines_dir = env.initial_tenant
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -66,6 +66,11 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem

    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    tenants_pgs: List[Tuple[TenantId, Postgres]] = []

    for _ in range(1, 5):
@@ -117,6 +122,13 @@ def test_tenants_attached_after_download(

    ##### First start, insert secret data and upload it to the remote storage
    env = neon_env_builder.init_start()
+
+    # FIXME: Are these expected?
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

@@ -209,6 +221,16 @@ def test_tenant_upgrades_index_json_from_v0(
    # launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
    # then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
    env = neon_env_builder.init_start()
+
+    # FIXME: Are these expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+    env.pageserver.allowed_errors.append(
+        ".*Failed to get local tenant state: Tenant .* not found in the local state.*"
+    )
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

@@ -315,6 +337,20 @@ def test_tenant_redownloads_truncated_file_on_startup(
    )

    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(
+        ".*Redownloading locally existing .* due to size mismatch.*"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*Downloaded layer exists already but layer file metadata mismatches.*"
+    )
+
+    # FIXME: Are these expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+    env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
+
    pageserver_http = env.pageserver.http_client()
    pg = env.postgres.create_start("main")

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -7,6 +7,11 @@ from fixtures.utils import wait_until
 def test_timeline_delete(neon_simple_env: NeonEnv):
    env = neon_simple_env

+    env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
+    env.pageserver.allowed_errors.append(".*timeline not found.*")
+    env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
+    env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
+
    ps_http = env.pageserver.http_client()

    # first try to delete non existing timeline
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -263,6 +263,12 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    env.neon_cli.create_branch("test_broker", "main")
+
+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    pg = env.postgres.create_start("test_broker")
    pg.safe_psql("CREATE TABLE t(key int primary key, value text)")

@@ -306,6 +312,11 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
    env = neon_env_builder.init_start()

+    # FIXME: Is this expected?
+    env.pageserver.allowed_errors.append(
+        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
+    )
+
    env.neon_cli.create_branch("test_safekeepers_wal_removal")
    pg = env.postgres.create_start("test_safekeepers_wal_removal")

@@ -538,6 +549,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
    )

    pg.stop_and_destroy()
+    ps_cli.timeline_delete(tenant_id, timeline_id)

    # Also delete and manually create timeline on safekeepers -- this tests
    # scenario of manual recovery on different set of safekeepers.
@@ -562,7 +574,6 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
        shutil.copy(f_partial_saved, f_partial_path)

    # recreate timeline on pageserver from scratch
-    ps_cli.timeline_delete(tenant_id, timeline_id)
    ps_cli.timeline_create(tenant_id, timeline_id)

    wait_lsn_timeout = 60 * 3
@@ -1081,6 +1092,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
    env = neon_env_builder.init_start()

+    # FIXME: are these expected?
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
+            ".*end streaming to Some.*",
+        ]
+    )
+
    # Create two tenants: one will be deleted, other should be preserved.
    tenant_id = env.initial_tenant
    timeline_id_1 = env.neon_cli.create_branch("br1")  # Active, delete explicitly
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -22,6 +22,8 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese
 # as a zombie process.
 def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
+    # We intentionally test for a non-existent tenant.
+    env.pageserver.allowed_errors.append(".*Tenant not found.*")
    pageserver_http = env.pageserver.http_client()

    pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15