## Problem

We have some known O(N^2) behaviors when it comes to large relation counts, due to the monolithic encoding and full rewrites of RelDirectory each time a relation is added. Ordinarily our backpressure mechanisms give "slow but steady" performance when creating/dropping/truncating relations. However, in the case of a transaction abort, a single WAL record can drop an unbounded number of relations. This results in an unavailable compute: when it sends one of these records, it can stall the pageserver's ingest for many minutes, even though the compute only sent a small amount of WAL.

Closes https://github.com/neondatabase/neon/issues/9505

## Summary of changes

- Rewrite the relation-dropping code to do one read/modify/write cycle of RelDirectory, instead of doing it separately for each relation in a loop.
- Add a test for the bug scenario encountered: `test_tx_abort_with_many_relations`

The test has ~40s runtime on my workstation. About 1 second of that is the part where we wait for ingest to catch up after a rollback; the rest is the slowness of creating and truncating a large number of relations.

---------

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
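The core of the fix is batching the directory update. A minimal sketch of the idea, in Python pseudocode only: the real change is in the pageserver's Rust ingest path, and `read_rel_directory`/`write_rel_directory` are hypothetical stand-ins for its storage API.

```python
def drop_relations_naive(store, rels):
    # O(N^2) in total bytes written: every iteration deserializes and
    # fully rewrites the monolithic RelDirectory value.
    for rel in rels:
        directory = store.read_rel_directory()
        directory.discard(rel)
        store.write_rel_directory(directory)


def drop_relations_batched(store, rels):
    # O(N): a single read/modify/write cycle covers the whole batch,
    # which is what the rewritten relation-dropping code does.
    directory = store.read_rel_directory()
    for rel in rels:
        directory.discard(rel)
    store.write_rel_directory(directory)
```

With N relations to drop, the batched version writes the directory once instead of N times, so even a transaction abort that drops an unbounded number of relations stays cheap to ingest.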
#
# This file runs pg_regress-based tests.
#
from __future__ import annotations

import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, cast

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    Endpoint,
    NeonEnv,
    NeonEnvBuilder,
    check_restored_datadir_content,
    tenant_get_shards,
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import s3_storage

if TYPE_CHECKING:
    from typing import Optional

    from fixtures.neon_fixtures import PgBin
    from pytest import CaptureFixture


TENANT_CONF = {
    # Scaled down thresholds so that we are exercising the pageserver beyond just writing
    # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized
    # ephemeral layer files.
    "pitr_interval": "60s",
    "checkpoint_distance": f"{8 * 1024 * 1024}",
    "compaction_target_size": f"{8 * 1024 * 1024}",
}


def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint):
    """
    After running some opaque tests that create interesting content in a timeline, run
    some generic integrity checks that the storage stack is able to reproduce the written
    data properly.
    """

    ignored_files: Optional[list[str]] = None

    # Neon handles unlogged relations in a special manner. During a
    # basebackup, we ship the init fork as the main fork. This presents a
    # problem in that the endpoint's data directory and the basebackup will
    # have differences and will fail the eventual file comparison.
    #
    # Unlogged tables were introduced in version 9.1. ALTER TABLE grew
    # support for setting the persistence of a table in 9.5. The reason that
    # this doesn't affect versions < 15 (but probably would between 9.1 and
    # 9.5) is that all the regression tests that deal with unlogged tables
    # up until that point dropped the unlogged tables or set them to logged
    # at some point during the test.
    #
    # In version 15, Postgres grew support for unlogged sequences, and with
    # that came a few more regression tests. These tests did not all drop
    # the unlogged tables/sequences prior to finishing.
    #
    # Unlogged sequences also came with a bug: sequences didn't inherit the
    # persistence of their "parent" tables if they had one. This was fixed
    # and backported to 15, thus exacerbating our problem a bit.
    #
    # So what we can do is just ignore file differences between the data
    # directory and basebackup for unlogged relations.
    results = cast(
        "list[tuple[str, str]]",
        endpoint.safe_psql(
            """
            SELECT
                relkind,
                pg_relation_filepath(
                    pg_filenode_relation(reltablespace, relfilenode)
                ) AS unlogged_relation_paths
            FROM pg_class
            WHERE relpersistence = 'u'
            """,
            dbname=db_name,
        ),
    )
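
    # Each result row pairs a relkind with a filenode path relative to the data
    # directory; a hypothetical example row: ('r', 'base/16384/16385').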

    unlogged_relation_files: list[str] = []
    for r in results:
        unlogged_relation_files.append(r[1])
        # This is related to the following Postgres commit:
        #
        # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b
        # Author: Heikki Linnakangas <heikki.linnakangas@iki.fi>
        # Date:   2023-08-23 09:21:31 -0500
        #
        #     Use the buffer cache when initializing an unlogged index.
        #
        # This patch was backpatched to 16. Without it, the LSN in the
        # page header would be 0/0 in the data directory, which wouldn't
        # match the LSN generated during the basebackup, thus creating
        # a difference.
        if env.pg_version <= PgVersion.V15 and r[0] == "i":
            unlogged_relation_files.append(f"{r[1]}_init")

    ignored_files = unlogged_relation_files

    check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files)

    # Ensure that compaction works, on a timeline containing all the diversity that postgres
    # regression tests create. There should have been compactions mid-test as well; this final
    # check is in addition to those.
    for shard, pageserver in tenant_get_shards(env, env.initial_tenant):
        pageserver.http_client().timeline_checkpoint(
            shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True
        )


# Run the main PostgreSQL regression tests, in src/test/regress.
#
@pytest.mark.timeout(900)  # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pg_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
):
    """
    :param shard_count: if None, create an unsharded tenant. Otherwise create a tenant with this
        many shards.
    """
    DBNAME = "regression"

    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count

    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF,
        initial_tenant_shard_count=shard_count,
    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            # Enable the test mode, so that we don't need to patch the test cases.
            "neon.regress_test_mode = true",
        ],
    )
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/regress"
    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
    pg_regress = build_path / "pg_regress"

    pg_regress_command = [
        str(pg_regress),
        "--use-existing",
        f"--bindir={bindir}",
        f"--dlpath={build_path}",
        f"--schedule={schedule}",
        f"--inputdir={src_path}",
    ]

    env_vars = {
        "PGPORT": str(endpoint.default_options["port"]),
        "PGUSER": endpoint.default_options["user"],
        "PGHOST": endpoint.default_options["host"],
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

    post_checks(env, test_output_dir, DBNAME, endpoint)


# Run the PostgreSQL "isolation" tests, in src/test/isolation.
#
@pytest.mark.timeout(600)  # Contains many sub-tests, is slow in debug builds
@pytest.mark.parametrize("shard_count", [None, 4])
def test_isolation(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
):
    DBNAME = "isolation_regression"

    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
    )

    # Connect to postgres and create a database called "isolation_regression".
    # The isolation tests use prepared transactions, so enable them.
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            "max_prepared_transactions=100",
            # Enable the test mode, so that we don't need to patch the test cases.
            "neon.regress_test_mode = true",
        ],
    )
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_isolation_regress to run in.
    runpath = test_output_dir / "regress"
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_isolation_regress will need.
    build_path = pg_distrib_dir / f"build/{env.pg_version.v_prefixed}/src/test/isolation"
    src_path = base_dir / f"vendor/postgres-{env.pg_version.v_prefixed}/src/test/isolation"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "isolation_schedule"
    pg_isolation_regress = build_path / "pg_isolation_regress"

    pg_isolation_regress_command = [
        str(pg_isolation_regress),
        "--use-existing",
        f"--bindir={bindir}",
        f"--dlpath={build_path}",
        f"--inputdir={src_path}",
        f"--schedule={schedule}",
    ]

    env_vars = {
        "PGPORT": str(endpoint.default_options["port"]),
        "PGUSER": endpoint.default_options["user"],
        "PGHOST": endpoint.default_options["host"],
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath)

    # This fails with a mismatch on `pg_multixact/offsets/0000`:
    # post_checks(env, test_output_dir, DBNAME, endpoint)


# Run extra Neon-specific pg_regress-based tests. The tests and their
# schedule file are in the sql_regress/ directory.
@pytest.mark.parametrize("shard_count", [None, 4])
def test_sql_regress(
    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin: PgBin,
    capsys: CaptureFixture[str],
    base_dir: Path,
    pg_distrib_dir: Path,
    shard_count: Optional[int],
):
    DBNAME = "regression"

    if shard_count is not None:
        neon_env_builder.num_pageservers = shard_count
    neon_env_builder.enable_pageserver_remote_storage(s3_storage())
    env = neon_env_builder.init_start(
        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
    )

    # Connect to postgres and create a database called "regression".
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            # Enable the test mode, so that we don't need to patch the test cases.
            "neon.regress_test_mode = true",
        ],
    )
    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")

    # Create some local directories for pg_regress to run in.
    runpath = test_output_dir / "regress"
    (runpath / "testtablespace").mkdir(parents=True)

    # Compute all the file locations that pg_regress will need.
    # This test runs Neon-specific tests.
    build_path = pg_distrib_dir / f"build/v{env.pg_version}/src/test/regress"
    src_path = base_dir / "test_runner/sql_regress"
    bindir = pg_distrib_dir / f"v{env.pg_version}/bin"
    schedule = src_path / "parallel_schedule"
    pg_regress = build_path / "pg_regress"

    pg_regress_command = [
        str(pg_regress),
        "--use-existing",
        f"--bindir={bindir}",
        f"--dlpath={build_path}",
        f"--schedule={schedule}",
        f"--inputdir={src_path}",
    ]

    env_vars = {
        "PGPORT": str(endpoint.default_options["port"]),
        "PGUSER": endpoint.default_options["user"],
        "PGHOST": endpoint.default_options["host"],
    }

    # Run the command.
    # We don't capture the output. It's not too chatty, and it always
    # logs the exact same data to `regression.out` anyway.
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

    post_checks(env, test_output_dir, DBNAME, endpoint)


@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_tx_abort_with_many_relations(
    neon_env_builder: NeonEnvBuilder,
):
    """
    This is not a pg_regress test as such, but perhaps it should be: this test exercises Postgres
    behavior when aborting a transaction that touches a large number of relations.

    Reproducer for https://github.com/neondatabase/neon/issues/9505
    """

    env = neon_env_builder.init_start()
    ep = env.endpoints.create_start(
        "main",
        tenant_id=env.initial_tenant,
        config_lines=[
            "shared_buffers=1000MB",
            "max_locks_per_transaction=16384",
        ],
    )

    # How many relations: this number is tuned to be large enough to take tens of seconds
    # if the rollback code path is buggy, tripping the test's timeout.
    n = 4000

    def create():
        # Create many relations
        log.info(f"Creating {n} relations...")
        ep.safe_psql_many(
            [
                "BEGIN",
                f"""DO $$
                DECLARE
                    i INT;
                    table_name TEXT;
                BEGIN
                    FOR i IN 1..{n} LOOP
                        table_name := 'table_' || i;
                        EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)';
                    END LOOP;
                END $$;
                """,
                "COMMIT",
            ]
        )

    def truncate():
        # Truncate all the relations inside a transaction; the transaction is rolled
        # back in rollback_and_wait().
        log.info(f"Truncating {n} relations...")
        ep.safe_psql_many(
            [
                "BEGIN",
                f"""DO $$
                DECLARE
                    i INT;
                    table_name TEXT;
                BEGIN
                    FOR i IN 1..{n} LOOP
                        table_name := 'table_' || i;
                        EXECUTE 'TRUNCATE ' || table_name;
                    END LOOP;
                END $$;
                """,
            ]
        )

    def rollback_and_wait():
        log.info(f"Rolling back after truncating {n} relations...")
        ep.safe_psql("ROLLBACK")

        # Restart the endpoint: this ensures that we can read back what we just wrote,
        # i.e. pageserver ingest has caught up.
        ep.stop()
        log.info(f"Starting endpoint after truncating {n} relations...")
        ep.start()
        log.info(f"Started endpoint after truncating {n} relations")

    # The create & truncate phases may be slow: they involve lots of WAL records. We do
    # not apply a special timeout; they are expected to complete within the general test
    # timeout.
    create()
    truncate()

    # Run in a thread because the failure mode is taking a pathologically long time, and
    # we don't want to block the test executor on that.
    with ThreadPoolExecutor(max_workers=1) as executor:
        try:
            # The rollback phase should be fast: it is a single WAL record that we
            # should process efficiently.
            fut = executor.submit(rollback_and_wait)
            fut.result(timeout=5)
        except:
            executor.shutdown(wait=False, cancel_futures=True)
            raise