## Problem

Running `pytest.skip(...)` in a test body instead of marking the test with `@pytest.mark.skipif(...)` causes all fixtures to be initialised, which is unnecessary if the test is going to be skipped anyway. Also, according to comments, some tests are unnecessarily skipped (e.g. `test_layer_bloating` on Postgres 17, or `test_idle_reconnections` entirely) or unnecessarily run (e.g. `test_parse_project_git_version_output_positive` on more than one configuration).

## Summary of changes

- Move the `skip_on_postgres` / `xfail_on_postgres` / `run_only_on_default_postgres` decorators to `fixtures.utils`
- Add new `skip_in_debug_build` and `skip_on_ci` decorators
- Replace `pytest.skip(...)` calls with decorators where possible
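The point of the decorator approach is that the skip condition is evaluated at collection time, before any fixtures are set up. As a rough sketch (not the actual `fixtures.utils` implementation; the `BUILD_TYPE` environment variable and the example test are assumptions), such a helper can be a thin wrapper around `pytest.mark.skipif`:

```python
import os

import pytest


def skip_in_debug_build(reason: str):
    # Hypothetical sketch: the real helper lives in fixtures.utils and may detect
    # the build type differently. The marker is evaluated at collection time,
    # so no fixtures are initialised for tests that end up skipped.
    return pytest.mark.skipif(os.getenv("BUILD_TYPE", "debug") == "debug", reason=reason)


@skip_in_debug_build("only run with release build")
def test_something_expensive():
    ...
```

Tests then opt in with a one-line decorator (as `test_tx_abort_with_many_relations` below does), instead of calling `pytest.skip(...)` after all fixtures have already been built.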
#
# This file runs pg_regress-based tests.
#
from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, cast

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    Endpoint,
    NeonEnv,
    NeonEnvBuilder,
    check_restored_datadir_content,
    tenant_get_shards,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import s3_storage
from fixtures.utils import skip_in_debug_build

if TYPE_CHECKING:
    from typing import Optional

    from fixtures.neon_fixtures import PgBin
    from pytest import CaptureFixture


TENANT_CONF = {
    # Scaled down thresholds so that we are exercising the pageserver beyond just writing
    # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files.
    "pitr_interval": "60s",
    "checkpoint_distance": f"{8 * 1024 * 1024}",
    "compaction_target_size": f"{8 * 1024 * 1024}",
}


def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint):
    """
    After running some opaque tests that create interesting content in a timeline, run
    some generic integrity checks that the storage stack is able to reproduce the written
    data properly.
    """

    ignored_files: Optional[list[str]] = None

    # Neon handles unlogged relations in a special manner. During a
    # basebackup, we ship the init fork as the main fork. This presents a
    # problem in that the endpoint's data directory and the basebackup will
    # have differences and will fail the eventual file comparison.
    #
    # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
    # support for setting the persistence of a table in 9.5. The reason that
    # this doesn't affect versions < 15 (but probably would between 9.1 and
    # 9.5) is that all the regression tests that deal with unlogged tables
    # up until that point dropped the unlogged tables or set them to logged
    # at some point during the test.
    #
    # In version 15, Postgres grew support for unlogged sequences, and with
    # that came a few more regression tests. These tests did not all drop
    # the unlogged tables/sequences prior to finishing.
    #
    # But unlogged sequences came with a bug: sequences didn't inherit the
    # persistence of their "parent" tables if they had one. This was fixed
    # and backported to 15, thus exacerbating our problem a bit.
    #
    # So what we can do is just ignore file differences between the data
    # directory and basebackup for unlogged relations.
    results = cast(
        "list[tuple[str, str]]",
        endpoint.safe_psql(
            """
            SELECT
                relkind,
                pg_relation_filepath(
                    pg_filenode_relation(reltablespace, relfilenode)
                ) AS unlogged_relation_paths
            FROM pg_class
            WHERE relpersistence = 'u'
            """,
            dbname=db_name,
        ),
    )

    unlogged_relation_files: list[str] = []
    for r in results:
        unlogged_relation_files.append(r[1])
        # This is related to the following Postgres commit:
        #
        # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
        # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
        # Date: 2023-08-23 09:21:31 -0500
        #
        #     Use the buffer cache when initializing an unlogged index.
        #
        # This patch was backpatched to 16. Without it, the LSN in the
        # page header would be 0/0 in the data directory, which wouldn't
        # match the LSN generated during the basebackup, thus creating
        # a difference.
        if env.pg_version <= PgVersion.V15 and r[0] == "i":
            unlogged_relation_files.append(f"{r[1]}_init")

    ignored_files = unlogged_relation_files

    check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)

    # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create.
    # There should have been compactions mid-test as well; this final check is in addition to those.
    for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
        pageserver.http_client().timeline_checkpoint(
            shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
        )


# Run the main PostgreSQL regression tests, in src/test/regress.
#
@pytest.mark.timeout(900)  # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pg_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
):
    """
    :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this
        many shards.
    """
    DBNAME = "regression"

    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count

    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF,
        initial_tenant_shard_count=shard_count,
    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            # Enable the test mode, so that we don't need to patch the test cases.
            "neon.regress_test_mode = true",
        ],
    )
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
    pg_regress = build_path / "pg_regress"

    pg_regress_command = [
        str(pg_regress),
        '--bindir=""',
        "--use-existing",
        f"--bindir={bindir}",
        f"--dlpath={build_path}",
        f"--schedule={schedule}",
        f"--inputdir={src_path}",
    ]

    env_vars = {
        "PGPORT": str(endpoint.default_options["port"]),
        "PGUSER": endpoint.default_options["user"],
        "PGHOST": endpoint.default_options["host"],
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

    post_checks(env, test_output_dir, DBNAME, endpoint)


# Run the PostgreSQL "isolation" tests, in src/test/isolation.
#
@pytest.mark.timeout(600)  # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
def test_isolation(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
):
    DBNAME = "isolation_regression"

    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
    )

    # Connect to postgres and create a database called "regression".
    # isolation tests use prepared transactions, so enable them
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            "max_prepared_transactions=100",
            # Enable the test mode, so that we don't need to patch the test cases.
            "neon.regress_test_mode = true",
        ],
    )
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_isolation_regress to run in.
    runpath = test_output_dir / "regress"
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_isolation_regress will need.
    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "isolation_schedule"
    pg_isolation_regress = build_path / "pg_isolation_regress"

    pg_isolation_regress_command = [
        str(pg_isolation_regress),
        "--use-existing",
        f"--bindir={bindir}",
        f"--dlpath={build_path}",
        f"--inputdir={src_path}",
        f"--schedule={schedule}",
    ]

    env_vars = {
        "PGPORT": str(endpoint.default_options["port"]),
        "PGUSER": endpoint.default_options["user"],
        "PGHOST": endpoint.default_options["host"],
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)

    # This fails with a mismatch on `pg_multixact/offsets/0000`
    # post_checks(env, test_output_dir, DBNAME, endpoint)


# Run extra Neon-specific pg_regress-based tests. The tests and their
# schedule file are in the sql_regress/ directory.
@pytest.mark.parametrize("shard_count", [None, 4])
def test_sql_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
):
    DBNAME = "regression"

    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            # Enable the test mode, so that we don't need to patch the test cases.
            "neon.regress_test_mode = true",
        ],
    )
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
    # This test runs Neon-specific tests.
    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
    src_path = base_dir / "test_runner/sql_regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
    pg_regress = build_path / "pg_regress"

    pg_regress_command = [
        str(pg_regress),
        "--use-existing",
        f"--bindir={bindir}",
        f"--dlpath={build_path}",
        f"--schedule={schedule}",
        f"--inputdir={src_path}",
    ]

    env_vars = {
        "PGPORT": str(endpoint.default_options["port"]),
        "PGUSER": endpoint.default_options["user"],
        "PGHOST": endpoint.default_options["host"],
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

    post_checks(env, test_output_dir, DBNAME, endpoint)


@skip_in_debug_build("only run with release build")
def test_tx_abort_with_many_relations(
    neon_env_builder: NeonEnvBuilder,
):
    """
    This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres
    behavior when aborting a transaction with lots of relations.

    Reproducer for https://github.com/neondatabase/neon/issues/9505
    """

    env = neon_env_builder.init_start()
    ep = env.endpoints.create_start(
        "main",
        tenant_id=env.initial_tenant,
        config_lines=[
            "shared_buffers=1000MB",
            "max_locks_per_transaction=16384",
        ],
    )

    # How many relations: this number is tuned so that a buggy rollback code path takes
    # tens of seconds, tripping the test's timeout.
    n = 4000

    def create():
        # Create many relations
        log.info(f"Creating {n} relations...")
        ep.safe_psql_many(
            [
                "BEGIN",
                f"""DO $$
                DECLARE
                    i INT;
                    table_name TEXT;
                BEGIN
                    FOR i IN 1..{n} LOOP
                        table_name := 'table_' || i;
                        EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)';
                    END LOOP;
                END $$;
                """,
                "COMMIT",
            ]
        )

    def truncate():
        # Truncate relations, then roll back the transaction containing the truncations
        log.info(f"Truncating {n} relations...")
        ep.safe_psql_many(
            [
                "BEGIN",
                f"""DO $$
                DECLARE
                    i INT;
                    table_name TEXT;
                BEGIN
                    FOR i IN 1..{n} LOOP
                        table_name := 'table_' || i;
                        EXECUTE 'TRUNCATE ' || table_name;
                    END LOOP;
                END $$;
                """,
            ]
        )

    def rollback_and_wait():
        log.info(f"Rolling back after truncating {n} relations...")
        ep.safe_psql("ROLLBACK")

        # Restart the endpoint: this ensures that we can read back what we just wrote, i.e. pageserver
        # ingest has caught up.
        ep.stop()
        log.info(f"Starting endpoint after truncating {n} relations...")
        ep.start()
        log.info(f"Started endpoint after truncating {n} relations...")

    # The actual create & truncate phases may be slow, as they involve lots of WAL records. We do
    # not apply a special timeout; they are expected to complete within the general test timeout.
    create()
    truncate()

    # Run in a thread because the failure mode is taking a pathologically long time, and we don't
    # want to block the test executor on that.
    with ThreadPoolExecutor(max_workers=1) as exec:
        try:
            # Rollback phase should be fast: this is one WAL record that we should process efficiently
            fut = exec.submit(rollback_and_wait)
            fut.result(timeout=5)
        except:
            exec.shutdown(wait=False, cancel_futures=True)
            raise