tests: use smaller layers in test_pg_regress (#8232)

## Problem

Debug-mode runs of test_pg_regress have been rather slow since https://github.com/neondatabase/neon/pull/8105, and occasionally exceed their 600s timeout.

## Summary of changes

- Use 8MiB layer files, avoiding large ephemeral layers.

On a Hetzner AX102, this takes the runtime from 230s to 190s, which will hopefully be enough to get the runtime on GitHub runners more reliably below the 600s timeout.

This has the side benefit of exercising more of the pageserver stack (including compaction) under a workload that exercises a more diverse set of Postgres functionality than most of our tests.
2026-01-14 00:42:54 +00:00 · 2024-07-08 20:05:35 +01:00
parent 84b039e615
commit daea26a22f
2 changed files with 118 additions and 67 deletions
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -728,6 +728,9 @@ impl From<CreateImageLayersError> for CompactionError {
    fn from(e: CreateImageLayersError) -> Self {
        match e {
            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Other(e) => {
+                CompactionError::Other(e.context("create image layers"))
+            }
            _ => CompactionError::Other(e.into()),
        }
    }
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -8,8 +8,11 @@ from typing import TYPE_CHECKING, cast

 import pytest
 from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
    NeonEnvBuilder,
    check_restored_datadir_content,
+    tenant_get_shards,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import s3_storage
@@ -21,6 +24,97 @@ if TYPE_CHECKING:
    from pytest import CaptureFixture


+TENANT_CONF = {
+    # Scaled down thresholds so that we are exercising the pageserver beyond just writing
+    # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files.
+    "pitr_interval": "60s",
+    "checkpoint_distance": f"{8 * 1024 * 1024}",
+    "compaction_target_size": f"{8 * 1024 * 1024}",
+}
+
+# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
+# # There should have been compactions mid-test as well; this final check is in addition to those.
+# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant):
+#     pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True)
+
+
+def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint):
+    """
+    After running some opaque tests that create interesting content in a timeline, run
+    some generic integrity checks that the storage stack is able to reproduce the written
+    data properly.
+    """
+
+    ignored_files: Optional[list[str]] = None
+
+    # Neon handles unlogged relations in a special manner. During a
+    # basebackup, we ship the init fork as the main fork. This presents a
+    # problem in that the endpoint's data directory and the basebackup will
+    # have differences and will fail the eventual file comparison.
+    #
+    # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
+    # support for setting the persistence of a table in 9.5. The reason that
+    # this doesn't affect versions < 15 (but probably would between 9.1 and
+    # 9.5) is that all the regression tests that deal with unlogged tables
+    # up until that point dropped the unlogged tables or set them to logged
+    # at some point during the test.
+    #
+    # In version 15, Postgres grew support for unlogged sequences, and with
+    # that came a few more regression tests. These tests did not all drop
+    # the unlogged tables/sequences prior to finishing.
+    #
+    # But unlogged sequences came with a bug in that, sequences didn't
+    # inherit the persistence of their "parent" tables if they had one. This
+    # was fixed and backported to 15, thus exacerbating our problem a bit.
+    #
+    # So what we can do is just ignore file differences between the data
+    # directory and basebackup for unlogged relations.
+    results = cast(
+        "list[tuple[str, str]]",
+        endpoint.safe_psql(
+            """
+        SELECT
+            relkind,
+            pg_relation_filepath(
+                pg_filenode_relation(reltablespace, relfilenode)
+            ) AS unlogged_relation_paths
+        FROM pg_class
+        WHERE relpersistence = 'u'
+        """,
+            dbname=db_name,
+        ),
+    )
+
+    unlogged_relation_files: list[str] = []
+    for r in results:
+        unlogged_relation_files.append(r[1])
+        # This is related to the following Postgres commit:
+        #
+        # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
+        # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
+        # Date:   2023-08-23 09:21:31 -0500
+        #
+        # Use the buffer cache when initializing an unlogged index.
+        #
+        # This patch was backpatched to 16. Without it, the LSN in the
+        # page header would be 0/0 in the data directory, which wouldn't
+        # match the LSN generated during the basebackup, thus creating
+        # a difference.
+        if env.pg_version <= PgVersion.V15 and r[0] == "i":
+            unlogged_relation_files.append(f"{r[1]}_init")
+
+    ignored_files = unlogged_relation_files
+
+    check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
+
+    # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
+    # There should have been compactions mid-test as well; this final check is in addition to those.
+    for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
+        pageserver.http_client().timeline_checkpoint(
+            shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
+        )
+
+
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
@pytest.mark.timeout(600)
@@ -45,7 +139,10 @@ def test_pg_regress(

    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start("main")
@@ -84,67 +181,7 @@ def test_pg_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

-        ignored_files: Optional[list[str]] = None
-
-        # Neon handles unlogged relations in a special manner. During a
-        # basebackup, we ship the init fork as the main fork. This presents a
-        # problem in that the endpoint's data directory and the basebackup will
-        # have differences and will fail the eventual file comparison.
-        #
-        # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
-        # support for setting the persistence of a table in 9.5. The reason that
-        # this doesn't affect versions < 15 (but probably would between 9.1 and
-        # 9.5) is that all the regression tests that deal with unlogged tables
-        # up until that point dropped the unlogged tables or set them to logged
-        # at some point during the test.
-        #
-        # In version 15, Postgres grew support for unlogged sequences, and with
-        # that came a few more regression tests. These tests did not all drop
-        # the unlogged tables/sequences prior to finishing.
-        #
-        # But unlogged sequences came with a bug in that, sequences didn't
-        # inherit the persistence of their "parent" tables if they had one. This
-        # was fixed and backported to 15, thus exacerbating our problem a bit.
-        #
-        # So what we can do is just ignore file differences between the data
-        # directory and basebackup for unlogged relations.
-        results = cast(
-            "list[tuple[str, str]]",
-            endpoint.safe_psql(
-                """
-            SELECT
-                relkind,
-                pg_relation_filepath(
-                    pg_filenode_relation(reltablespace, relfilenode)
-                ) AS unlogged_relation_paths
-            FROM pg_class
-            WHERE relpersistence = 'u'
-            """,
-                dbname=DBNAME,
-            ),
-        )
-
-        unlogged_relation_files: list[str] = []
-        for r in results:
-            unlogged_relation_files.append(r[1])
-            # This is related to the following Postgres commit:
-            #
-            # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
-            # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
-            # Date:   2023-08-23 09:21:31 -0500
-            #
-            # Use the buffer cache when initializing an unlogged index.
-            #
-            # This patch was backpatched to 16. Without it, the LSN in the
-            # page header would be 0/0 in the data directory, which wouldn't
-            # match the LSN generated during the basebackup, thus creating
-            # a difference.
-            if env.pg_version <= PgVersion.V15 and r[0] == "i":
-                unlogged_relation_files.append(f"{r[1]}_init")
-
-        ignored_files = unlogged_relation_files
-
-        check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
+    post_checks(env, test_output_dir, DBNAME, endpoint)


 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
@@ -159,16 +196,20 @@ def test_isolation(
    pg_distrib_dir: Path,
    shard_count: Optional[int],
 ):
+    DBNAME = "isolation_regression"
+
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+    )

    # Connect to postgres and create a database called "regression".
    # isolation tests use prepared transactions, so enable them
    endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
-    endpoint.safe_psql("CREATE DATABASE isolation_regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_isolation_regress to run in.
    runpath = test_output_dir / "regress"
@@ -202,6 +243,9 @@ def test_isolation(
    with capsys.disabled():
        pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)

+    # TODO: enable the check below; it currently fails with a mismatch on `pg_multixact/offsets/0000`
+    # post_checks(env, test_output_dir, DBNAME, endpoint)
+

 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
@@ -215,15 +259,19 @@ def test_sql_regress(
    pg_distrib_dir: Path,
    shard_count: Optional[int],
 ):
+    DBNAME = "regression"
+
    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("CREATE DATABASE regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
@@ -258,4 +306,4 @@ def test_sql_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

-        check_restored_datadir_content(test_output_dir, env, endpoint)
+    post_checks(env, test_output_dir, DBNAME, endpoint)