Merge remote-tracking branch 'origin/main' into arpad/lsn_by_ts

2026-05-20 06:30:43 +00:00 · 2023-11-07 05:40:46 +01:00
parent 090246cad7 ad5b02e175
commit 52017eff68
103 changed files with 5783 additions and 3587 deletions
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -626,6 +626,8 @@ class NeonEnvBuilder:
                sk.stop(immediate=True)

            for pageserver in self.env.pageservers:
+                pageserver.assert_no_metric_errors()
+
                pageserver.stop(immediate=True)

            if self.env.attachment_service is not None:
@@ -1784,6 +1786,21 @@ class NeonPageserver(PgProtocol):

        assert not errors

+    def assert_no_metric_errors(self):
+        """
+        Certain metrics should _always_ be zero: they track conditions that indicate a bug.
+        """
+        if not self.running:
+            log.info(f"Skipping metrics check on pageserver {self.id}, it is not running")
+            return
+
+        for metric in [
+            "pageserver_tenant_manager_unexpected_errors_total",
+            "pageserver_deletion_queue_unexpected_errors_total",
+        ]:
+            value = self.http_client().get_metric_value(metric)
+            assert value == 0, f"Nonzero {metric} == {value}"
+
    def log_contains(self, pattern: str) -> Optional[str]:
        """Check that the pageserver log contains a line that matches the given regex"""
        logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
@@ -2868,7 +2885,7 @@ class SafekeeperHttpClient(requests.Session):
        params = params or {}
        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
        res.raise_for_status()
-        res_json = res.json()
+        res_json = json.loads(res.text)
        assert isinstance(res_json, dict)
        return res_json

@@ -2968,24 +2985,33 @@ class S3Scrubber:
        self.env = env
        self.log_dir = log_dir

-    def scrubber_cli(self, args, timeout):
+    def scrubber_cli(self, args: list[str], timeout) -> str:
        assert isinstance(self.env.pageserver_remote_storage, S3Storage)
        s3_storage = self.env.pageserver_remote_storage

        env = {
            "REGION": s3_storage.bucket_region,
            "BUCKET": s3_storage.bucket_name,
+            "BUCKET_PREFIX": s3_storage.prefix_in_bucket,
+            "RUST_LOG": "DEBUG",
        }
        env.update(s3_storage.access_env_vars())

        if s3_storage.endpoint is not None:
            env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})

-        base_args = [self.env.neon_binpath / "s3_scrubber"]
+        base_args = [str(self.env.neon_binpath / "s3_scrubber")]
        args = base_args + args

-        (output_path, _, status_code) = subprocess_capture(
-            self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
+        (output_path, stdout, status_code) = subprocess_capture(
+            self.log_dir,
+            args,
+            echo_stderr=True,
+            echo_stdout=True,
+            env=env,
+            check=False,
+            capture_stdout=True,
+            timeout=timeout,
        )
        if status_code:
            log.warning(f"Scrub command {args} failed")
@@ -2994,8 +3020,18 @@ class S3Scrubber:

            raise RuntimeError("Remote storage scrub failed")

-    def scan_metadata(self):
-        self.scrubber_cli(["scan-metadata"], timeout=30)
+        assert stdout is not None
+        return stdout
+
+    def scan_metadata(self) -> Any:
+        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
+
+        try:
+            return json.loads(stdout)
+        except:
+            log.error("Failed to decode JSON output from `scan-metadata`.  Dumping stdout:")
+            log.error(stdout)
+            raise


 def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in the wild by tests with the below contradicting logging
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
            # this seems like a mock_s3 issue
-            log.warn(
+            log.warning(
                f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
            )
            keys = 0
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
            # this has been seen in one case with mock_s3:
            # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
            # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-            log.warn(
+            log.warning(
                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
            )

--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -35,6 +35,7 @@ def subprocess_capture(
    echo_stderr=False,
    echo_stdout=False,
    capture_stdout=False,
+    timeout=None,
    **kwargs: Any,
 ) -> Tuple[str, Optional[str], int]:
    """Run a process and bifurcate its output to files and the `log` logger
@@ -104,7 +105,7 @@ def subprocess_capture(
                stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
                stderr_handler.start()

-                r = p.wait()
+                r = p.wait(timeout=timeout)

                stdout_handler.join()
                stderr_handler.join()
--- a/test_runner/performance/test_bulk_insert.py
+++ b/test_runner/performance/test_bulk_insert.py
@@ -1,8 +1,10 @@
 from contextlib import closing

+from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion
+from fixtures.types import Lsn


 #
@@ -18,6 +20,8 @@ from fixtures.pg_version import PgVersion
 def test_bulk_insert(neon_with_baseline: PgCompare):
    env = neon_with_baseline

+    start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
+
    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
            cur.execute("create table huge (i int, j int);")
@@ -31,6 +35,13 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
            env.report_peak_memory_use()
            env.report_size()

+    # Report amount of wal written. Useful for comparing vanilla wal format vs
+    # neon wal format, measuring neon write amplification, etc.
+    end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
+    wal_written_bytes = end_lsn - start_lsn
+    wal_written_mb = round(wal_written_bytes / (1024 * 1024))
+    env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
+
    # When testing neon, also check how long it takes the pageserver to reingest the
    # wal from safekeepers. If this number is close to total runtime, then the pageserver
    # is the bottleneck.
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -1,6 +1,3 @@
-from contextlib import closing
-
-import pytest
 import requests
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
@@ -81,49 +78,3 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc

        # Imitate optimizations that console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)
-
-
-# This test sometimes runs for longer than the global 5 minute timeout.
-@pytest.mark.timeout(900)
-def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
-    neon_env_builder.num_safekeepers = 3
-    env = neon_env_builder.init_start()
-
-    # Start
-    env.neon_cli.create_branch("test_startup")
-    with zenbenchmark.record_duration("startup_time"):
-        endpoint = env.endpoints.create_start("test_startup")
-        endpoint.safe_psql("select 1;")
-
-    # Restart
-    endpoint.stop_and_destroy()
-    with zenbenchmark.record_duration("restart_time"):
-        endpoint.create_start("test_startup")
-        endpoint.safe_psql("select 1;")
-
-    # Fill up
-    num_rows = 1000000  # 30 MB
-    num_tables = 100
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            for i in range(num_tables):
-                cur.execute(f"create table t_{i} (i integer);")
-                cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
-
-    # Read
-    with zenbenchmark.record_duration("read_time"):
-        endpoint.safe_psql("select * from t_0;")
-
-    # Read again
-    with zenbenchmark.record_duration("second_read_time"):
-        endpoint.safe_psql("select * from t_0;")
-
-    # Restart
-    endpoint.stop_and_destroy()
-    with zenbenchmark.record_duration("restart_with_data"):
-        endpoint.create_start("test_startup")
-        endpoint.safe_psql("select 1;")
-
-    # Read
-    with zenbenchmark.record_duration("read_after_restart"):
-        endpoint.safe_psql("select * from t_0;")
--- a/test_runner/regress/test_ddl_forwarding.py
+++ b/test_runner/regress/test_ddl_forwarding.py
@@ -72,7 +72,7 @@ class DdlForwardingContext:
        self.dbs: Dict[str, str] = {}
        self.roles: Dict[str, str] = {}
        self.fail = False
-        endpoint = "/management/api/v2/roles_and_databases"
+        endpoint = "/test/roles_and_databases"
        ddl_url = f"http://{host}:{port}{endpoint}"
        self.pg.configure(
            [
--- a/test_runner/regress/test_logical_replication.py
+++ b/test_runner/regress/test_logical_replication.py
@@ -1,11 +1,14 @@
 import time

+import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
    NeonEnv,
    logical_replication_sync,
    wait_for_last_flush_lsn,
 )
+from fixtures.types import Lsn
+from fixtures.utils import query_scalar


 def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
@@ -147,3 +150,89 @@ COMMIT;
    endpoint.start()
    # it must be gone (but walproposer slot still exists, hence 1)
    assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
+
+
+# Test compute start at an LSN whose WAL page begins with a contrecord
+# https://github.com/neondatabase/neon/issues/5749
+def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
+    env = neon_simple_env
+
+    env.neon_cli.create_branch("init")
+    endpoint = env.endpoints.create_start("init")
+    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
+    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
+
+    cur = endpoint.connect().cursor()
+    cur.execute("create table t(key int, value text)")
+    cur.execute("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
+    cur.execute("insert into replication_example values (1, 2)")
+    cur.execute("create publication pub1 for table replication_example")
+
+    # now start subscriber
+    vanilla_pg.start()
+    vanilla_pg.safe_psql("create table t(pk integer primary key, value text)")
+    vanilla_pg.safe_psql("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
+
+    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
+    connstr = endpoint.connstr().replace("'", "''")
+    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
+    logical_replication_sync(vanilla_pg, endpoint)
+    vanilla_pg.stop()
+
+    with endpoint.cursor() as cur:
+        # Measure how much space a logical message takes. Sometimes the first attempt
+        # creates a huge message and then it stabilizes; the reason is unknown.
+        for _ in range(3):
+            lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
+            log.info(f"current_lsn={lsn_before}")
+            # Non-transactional logical message doesn't write WAL, only XLogInsert's
+            # it, so use transactional. Which is a bit problematic as transactional
+            # necessitates commit record. Alternatively we can do smth like
+            #   select neon_xlogflush(pg_current_wal_insert_lsn());
+            # but isn't much better + that particular call complains on 'xlog flush
+            # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips
+            # page headers.
+            payload = "blahblah"
+            cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')")
+            lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
+            lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before
+            logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload)
+            log.info(
+                f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}"
+            )
+
+        # and write logical message spanning exactly as we want
+        lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
+        log.info(f"current_lsn={lsn_before}")
+        curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
+        offs = int(curr_lsn) % 8192
+        till_page = 8192 - offs
+        payload_len = (
+            till_page - logical_message_base - 8
+        )  # not sure why 8 is here, it is deduced from experiments
+        log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}")
+
+        # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer
+        payload_len += 8
+
+        cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')")
+        supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
+        log.info(f"supposedly_page_boundary={supposedly_contrecord_end}")
+        # The calculations to hit the page boundary are very fuzzy, so just
+        # skip the test if we fail to reach it.
+        if not (int(supposedly_contrecord_end) % 8192 == 32):
+            pytest.skip("missed page boundary, bad luck")
+
+        cur.execute("insert into replication_example values (2, 3)")
+
+    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
+    endpoint.stop().start()
+
+    cur = endpoint.connect().cursor()
+    # this should flush current wal page
+    cur.execute("insert into replication_example values (3, 4)")
+    vanilla_pg.start()
+    logical_replication_sync(vanilla_pg, endpoint)
+    assert vanilla_pg.safe_psql(
+        "select sum(somedata) from replication_example"
+    ) == endpoint.safe_psql("select sum(somedata) from replication_example")
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,6 +134,9 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.pageserver_stop(env.pageserver.id)
    env.neon_cli.safekeeper_stop()

+    # Keep NeonEnv state up to date, it usually owns starting/stopping services
+    env.pageserver.running = False
+
    # Default start
    res = env.neon_cli.raw_cli(["start"])
    res.check_returncode()
@@ -155,6 +158,10 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
    env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)

+    # Keep NeonEnv state up to date, it usually owns starting/stopping services
+    env.pageservers[0].running = False
+    env.pageservers[1].running = False
+
    # Addressing a nonexistent ID throws
    with pytest.raises(RuntimeError):
        env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -21,6 +21,7 @@ from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    PgBin,
+    S3Scrubber,
    last_flush_lsn_upload,
    wait_for_last_flush_lsn,
 )
@@ -234,8 +235,22 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
    assert len(suffixed_objects) > 0
    assert len(legacy_objects) > 0

+    # Flush through deletions to get a clean state for scrub: we are implicitly validating
+    # that our generations-enabled pageserver was able to do deletions of layers
+    # from earlier which don't have a generation.
+    env.pageserver.http_client().deletion_queue_flush(execute=True)
+
    assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0

+    # Having written a mixture of generation-aware and legacy index_part.json,
+    # ensure the scrubber handles the situation as expected.
+    metadata_summary = S3Scrubber(
+        neon_env_builder.test_output_dir, neon_env_builder
+    ).scan_metadata()
+    assert metadata_summary["count"] == 1  # Scrubber should have seen our timeline
+    assert not metadata_summary["with_errors"]
+    assert not metadata_summary["with_warnings"]
+

 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.enable_generations = True
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -20,6 +20,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    endpoint = env.endpoints.create_start("main")
    pageserver_http = env.pageserver.http_client()

+    assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+
    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

@@ -52,6 +54,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    env.pageserver.stop()
    env.pageserver.start()

+    # We reloaded our tenant
+    assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+
    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -17,6 +17,10 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
    n_restarts = 10
    scale = 10

+    # Pageserver currently logs requests on non-active tenants at error level
+    # https://github.com/neondatabase/neon/issues/5784
+    env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
+
    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -432,3 +432,47 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
    query(200, "BEGIN")
    pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
    assert pid1 != pid2
+
+
+@pytest.mark.timeout(60)
+def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
+
+    def query(status: int, query: str) -> Any:
+        return static_proxy.http_query(
+            query,
+            [],
+            user="http_auth",
+            password="http",
+            expected_code=status,
+        )
+
+    # query generates 25 million rows (5000 x 5000) - should hit the 10MB response limit quickly
+    response = query(
+        400,
+        "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
+    )
+    assert "response is too large (max is 10485760 bytes)" in response["message"]
+
+
+def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
+
+    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
+
+    def query(status: int, query: str) -> Any:
+        return static_proxy.http_query(
+            query,
+            [],
+            user="http_auth",
+            password="http",
+            expected_code=status,
+        )
+
+    response = query(
+        200,
+        "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
+    )
+    assert response["rows"][0]["data"] == ["foo", "bar", "baz"]
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -63,6 +63,9 @@ def test_tenant_delete_smoke(
        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
    )

+    # Default tenant and the one we created
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
+
    # create two timelines one being the parent of another
    parent = None
    for timeline in ["first", "second"]:
@@ -88,7 +91,9 @@ def test_tenant_delete_smoke(

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()
@@ -104,6 +109,9 @@ def test_tenant_delete_smoke(
            ),
        )

+    # Deletion updates the tenant count: the one default tenant remains
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+

 class Check(enum.Enum):
    RETRY_WITHOUT_RESTART = enum.auto()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -26,6 +26,16 @@ from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample

+# In tests that overlap endpoint activity with tenant attach/detach, there are
+# a variety of warnings that the page service may emit when it cannot acquire
+# an active tenant to serve a request
+PERMIT_PAGE_SERVICE_ERRORS = [
+    ".*page_service.*Tenant .* not found",
+    ".*page_service.*Tenant .* is not active",
+    ".*page_service.*cancelled",
+    ".*page_service.*will not become active.*",
+]
+

 def do_gc_target(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
@@ -60,12 +70,7 @@ def test_tenant_reattach(
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        with endpoint.cursor() as cur:
@@ -235,10 +240,7 @@ def test_tenant_reattach_while_busy(

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)

@@ -259,7 +261,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

-    env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # first check for non existing tenant
    tenant_id = TenantId.generate()
@@ -271,19 +273,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):

    assert excinfo.value.status_code == 404

-    # the error will be printed to the log too
-    env.pageserver.allowed_errors.append(".*NotFound: tenant *")
-
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
-
    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()

@@ -345,12 +337,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
    # create a new tenant
    tenant_id, _ = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -401,12 +388,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
    # create a new tenant
    tenant_id, _ = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -453,12 +435,7 @@ def test_detach_while_attaching(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
@@ -593,12 +570,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    data_id = 1
    data_secret = "very secret secret"
@@ -649,12 +621,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):

    tenant_id = env.initial_tenant

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
    with pytest.raises(
@@ -693,12 +660,7 @@ def test_ignore_while_attaching(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    data_id = 1
    data_secret = "very secret secret"