diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 92baf1073a..541704e8d6 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -728,6 +728,9 @@ impl From<CreateImageLayersError> for CompactionError {
     fn from(e: CreateImageLayersError) -> Self {
         match e {
             CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Other(e) => {
+                CompactionError::Other(e.context("create image layers"))
+            }
             _ => CompactionError::Other(e.into()),
         }
     }
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 756a2c17c9..54b493ec70 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -8,8 +8,11 @@ from typing import TYPE_CHECKING, cast
 
 import pytest
 from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnv,
     NeonEnvBuilder,
     check_restored_datadir_content,
+    tenant_get_shards,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.remote_storage import s3_storage
@@ -21,6 +24,97 @@ if TYPE_CHECKING:
     from pytest import CaptureFixture
 
 
+TENANT_CONF = {
+    # Scaled-down thresholds so that we exercise the pageserver beyond just writing
+    # ephemeral/L0 layers, and because debug-mode code is slow to read from full-sized ephemeral layer files.
+    "pitr_interval": "60s",
+    "checkpoint_distance": f"{8 * 1024 * 1024}",
+    "compaction_target_size": f"{8 * 1024 * 1024}",
+}
+
+# # Ensure that compaction works on a timeline containing all the diversity that postgres regression tests create.
+# # There should have been compactions mid-test as well; this final check is in addition to those.
+# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant):
+#     pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True)
+
+
+def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint):
+    """
+    After running some opaque tests that create interesting content in a timeline, run
+    some generic integrity checks verifying that the storage stack is able to
+    reproduce the written data properly.
+    """
+
+    ignored_files: Optional[list[str]] = None
+
+    # Neon handles unlogged relations in a special manner. During a
+    # basebackup, we ship the init fork as the main fork. This presents a
+    # problem in that the endpoint's data directory and the basebackup will
+    # have differences and will fail the eventual file comparison.
+    #
+    # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
+    # support for setting the persistence of a table in 9.5. The reason that
+    # this doesn't affect versions < 15 (but probably would between 9.1 and
+    # 9.5) is that all the regression tests that deal with unlogged tables
+    # up until that point dropped the unlogged tables or set them to logged
+    # at some point during the test.
+    #
+    # In version 15, Postgres grew support for unlogged sequences, and with
+    # that came a few more regression tests. These tests did not all drop
+    # the unlogged tables/sequences prior to finishing.
+    #
+    # But unlogged sequences came with a bug: sequences didn't inherit
+    # the persistence of their "parent" tables if they had one. This
+    # was fixed and backported to 15, thus exacerbating our problem a bit.
+    #
+    # So what we can do is just ignore file differences between the data
+    # directory and basebackup for unlogged relations.
+    results = cast(
+        "list[tuple[str, str]]",
+        endpoint.safe_psql(
+            """
+            SELECT
+                relkind,
+                pg_relation_filepath(
+                    pg_filenode_relation(reltablespace, relfilenode)
+                ) AS unlogged_relation_paths
+            FROM pg_class
+            WHERE relpersistence = 'u'
+            """,
+            dbname=db_name,
+        ),
+    )
+
+    unlogged_relation_files: list[str] = []
+    for r in results:
+        unlogged_relation_files.append(r[1])
+        # This is related to the following Postgres commit:
+        #
+        # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
+        # Author: Heikki Linnakangas
+        # Date: 2023-08-23 09:21:31 -0500
+        #
+        # Use the buffer cache when initializing an unlogged index.
+        #
+        # This patch was backpatched to 16. Without it, the LSN in the
+        # page header would be 0/0 in the data directory, which wouldn't
+        # match the LSN generated during the basebackup, thus creating
+        # a difference.
+        if env.pg_version <= PgVersion.V15 and r[0] == "i":
+            unlogged_relation_files.append(f"{r[1]}_init")
+
+    ignored_files = unlogged_relation_files
+
+    check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
+
+    # Ensure that compaction works on a timeline containing all the diversity that postgres regression tests create.
+    # There should have been compactions mid-test as well; this final check is in addition to those.
+    for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
+        pageserver.http_client().timeline_checkpoint(
+            shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
+        )
+
+
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
 @pytest.mark.timeout(600)
@@ -45,7 +139,10 @@ def test_pg_regress(
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
 
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_shard_count=shard_count,
+    )
 
     # Connect to postgres and create a database called "regression".
     endpoint = env.endpoints.create_start("main")
@@ -84,67 +181,7 @@
     with capsys.disabled():
         pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
 
-    ignored_files: Optional[list[str]] = None
-
-    # Neon handles unlogged relations in a special manner. During a
-    # basebackup, we ship the init fork as the main fork. This presents a
-    # problem in that the endpoint's data directory and the basebackup will
-    # have differences and will fail the eventual file comparison.
-    #
-    # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
-    # support for setting the persistence of a table in 9.5. The reason that
-    # this doesn't affect versions < 15 (but probably would between 9.1 and
-    # 9.5) is that all the regression tests that deal with unlogged tables
-    # up until that point dropped the unlogged tables or set them to logged
-    # at some point during the test.
-    #
-    # In version 15, Postgres grew support for unlogged sequences, and with
-    # that came a few more regression tests. These tests did not all drop
-    # the unlogged tables/sequences prior to finishing.
-    #
-    # But unlogged sequences came with a bug in that, sequences didn't
-    # inherit the persistence of their "parent" tables if they had one. This
-    # was fixed and backported to 15, thus exacerbating our problem a bit.
-    #
-    # So what we can do is just ignore file differences between the data
-    # directory and basebackup for unlogged relations.
-    results = cast(
-        "list[tuple[str, str]]",
-        endpoint.safe_psql(
-            """
-            SELECT
-                relkind,
-                pg_relation_filepath(
-                    pg_filenode_relation(reltablespace, relfilenode)
-                ) AS unlogged_relation_paths
-            FROM pg_class
-            WHERE relpersistence = 'u'
-            """,
-            dbname=DBNAME,
-        ),
-    )
-
-    unlogged_relation_files: list[str] = []
-    for r in results:
-        unlogged_relation_files.append(r[1])
-        # This is related to the following Postgres commit:
-        #
-        # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
-        # Author: Heikki Linnakangas
-        # Date: 2023-08-23 09:21:31 -0500
-        #
-        # Use the buffer cache when initializing an unlogged index.
-        #
-        # This patch was backpatched to 16. Without it, the LSN in the
-        # page header would be 0/0 in the data directory, which wouldn't
-        # match the LSN generated during the basebackup, thus creating
-        # a difference.
-        if env.pg_version <= PgVersion.V15 and r[0] == "i":
-            unlogged_relation_files.append(f"{r[1]}_init")
-
-    ignored_files = unlogged_relation_files
-
-    check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)
+    post_checks(env, test_output_dir, DBNAME, endpoint)
 
 
 # Run the PostgreSQL "isolation" tests, in src/test/isolation.
@@ -159,16 +196,20 @@ def test_isolation(
     pg_distrib_dir: Path,
     shard_count: Optional[int],
 ):
+    DBNAME = "isolation_regression"
+
     if shard_count is not None:
         neon_env_builder.num_pageservers = shard_count
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+    )
 
     # Connect to postgres and create a database called "regression".
     # isolation tests use prepared transactions, so enable them
     endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"])
-    endpoint.safe_psql("CREATE DATABASE isolation_regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_isolation_regress to run in.
     runpath = test_output_dir / "regress"
@@ -202,6 +243,9 @@
     with capsys.disabled():
         pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)
 
+    # This fails with a mismatch on `pg_multixact/offsets/0000`
+    # post_checks(env, test_output_dir, DBNAME, endpoint)
+
 
 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
@@ -215,15 +259,19 @@ def test_sql_regress(
     pg_distrib_dir: Path,
     shard_count: Optional[int],
 ):
+    DBNAME = "regression"
+
     if shard_count is not None:
         neon_env_builder.num_pageservers = shard_count
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     neon_env_builder.enable_scrub_on_exit()
-    env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+    )
 
     # Connect to postgres and create a database called "regression".
     endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("CREATE DATABASE regression")
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
 
     # Create some local directories for pg_regress to run in.
     runpath = test_output_dir / "regress"
@@ -258,4 +306,4 @@
     with capsys.disabled():
         pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)
 
-    check_restored_datadir_content(test_output_dir, env, endpoint)
+    post_checks(env, test_output_dir, DBNAME, endpoint)
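
Note for reviewers: below is a minimal sketch of how the new post_checks helper and
TENANT_CONF compose in a test, assuming the fixtures shown in this diff
(neon_env_builder, test_output_dir). The test name, shard count, and workload SQL are
illustrative only and not part of this change.

    def test_example_compaction_coverage(
        neon_env_builder: NeonEnvBuilder, test_output_dir: Path
    ):
        # Hypothetical workload: writing enough data to cross the scaled-down
        # checkpoint_distance in TENANT_CONF pushes the pageserver past
        # ephemeral/L0 layers, which post_checks() then verifies.
        env = neon_env_builder.init_start(
            initial_tenant_conf=TENANT_CONF,
            initial_tenant_shard_count=4,  # illustrative
        )
        endpoint = env.endpoints.create_start("main")
        endpoint.safe_psql("CREATE DATABASE example")
        endpoint.safe_psql(
            "CREATE TABLE t AS SELECT g AS id, repeat('x', 1000) AS pad"
            " FROM generate_series(1, 100000) g",
            dbname="example",
        )
        # Compares the endpoint's datadir against a fresh basebackup, then forces
        # repartition + image layer creation on every shard via timeline_checkpoint.
        post_checks(env, test_output_dir, "example", endpoint)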