Merge branch 'main' into bojan-get-page-tests

This commit is contained in:
Bojan Serafimov
2022-04-14 13:59:59 -04:00
58 changed files with 2180 additions and 656 deletions

View File

@@ -28,4 +28,4 @@ def test_createuser(zenith_simple_env: ZenithEnv):
pg2 = env.postgres.create_start('test_createuser2')
# Test that you can connect to new branch as a new user
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
assert pg2.safe_psql('select current_user', user='testuser') == [('testuser', )]

View File

@@ -19,6 +19,11 @@ async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str)
copy_input = repeat_bytes(buf.read(), 5000)
pg_conn = await pg.connect_async()
# PgProtocol.connect_async sets statement_timeout to 2 minutes.
# That's not enough for this test, on a slow system in debug mode.
await pg_conn.execute("SET statement_timeout='300s'")
await pg_conn.copy_to_table(table_name, source=copy_input)

View File

@@ -1,14 +0,0 @@
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin):
env = zenith_simple_env
env.zenith_cli.create_branch("test_pgbench", "empty")
pg = env.postgres.create_start('test_pgbench')
log.info("postgres is running on 'test_pgbench' branch")
connstr = pg.connstr()
pg_bin.run_capture(['pgbench', '-i', connstr])
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])

View File

@@ -5,11 +5,14 @@ def test_proxy_select_1(static_proxy):
static_proxy.safe_psql("select 1;")
@pytest.mark.xfail # Proxy eats the extra connection options
# Pass extra options to the server.
#
# Currently, proxy eats the extra connection options, so this fails.
# See https://github.com/neondatabase/neon/issues/1287
@pytest.mark.xfail
def test_proxy_options(static_proxy):
schema_name = "tmp_schema_1"
with static_proxy.connect(schema=schema_name) as conn:
with static_proxy.connect(options="-cproxytest.option=value") as conn:
with conn.cursor() as cur:
cur.execute("SHOW search_path;")
search_path = cur.fetchall()[0][0]
assert schema_name == search_path
cur.execute("SHOW proxytest.option;")
value = cur.fetchall()[0][0]
assert value == 'value'

View File

@@ -379,7 +379,7 @@ class ProposerPostgres(PgProtocol):
tenant_id: uuid.UUID,
listen_addr: str,
port: int):
super().__init__(host=listen_addr, port=port, username='zenith_admin')
super().__init__(host=listen_addr, port=port, user='zenith_admin', dbname='postgres')
self.pgdata_dir: str = pgdata_dir
self.pg_bin: PgBin = pg_bin

View File

@@ -35,9 +35,9 @@ def test_isolation(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, capsys
]
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
'PGPORT': str(pg.default_options['port']),
'PGUSER': pg.default_options['user'],
'PGHOST': pg.default_options['host'],
}
# Run the command.

View File

@@ -35,9 +35,9 @@ def test_pg_regress(zenith_simple_env: ZenithEnv, test_output_dir: str, pg_bin,
]
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
'PGPORT': str(pg.default_options['port']),
'PGUSER': pg.default_options['user'],
'PGHOST': pg.default_options['host'],
}
# Run the command.

View File

@@ -40,9 +40,9 @@ def test_zenith_regress(zenith_simple_env: ZenithEnv, test_output_dir, pg_bin, c
log.info(pg_regress_command)
env_vars = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
'PGPORT': str(pg.default_options['port']),
'PGUSER': pg.default_options['user'],
'PGHOST': pg.default_options['host'],
}
# Run the command.

View File

@@ -17,7 +17,7 @@ import warnings
from contextlib import contextmanager
# Type-related stuff
from typing import Iterator
from typing import Iterator, Optional
"""
This file contains fixtures for micro-benchmarks.
@@ -51,17 +51,12 @@ in the test initialization, or measure disk usage after the test query.
@dataclasses.dataclass
class PgBenchRunResult:
scale: int
number_of_clients: int
number_of_threads: int
number_of_transactions_actually_processed: int
latency_average: float
latency_stddev: float
tps_including_connection_time: float
tps_excluding_connection_time: float
init_duration: float
init_start_timestamp: int
init_end_timestamp: int
latency_stddev: Optional[float]
tps: float
run_duration: float
run_start_timestamp: int
run_end_timestamp: int
@@ -69,56 +64,67 @@ class PgBenchRunResult:
# TODO progress
@classmethod
def parse_from_output(
def parse_from_stdout(
cls,
out: 'subprocess.CompletedProcess[str]',
init_duration: float,
init_start_timestamp: int,
init_end_timestamp: int,
stdout: str,
run_duration: float,
run_start_timestamp: int,
run_end_timestamp: int,
):
stdout_lines = out.stdout.splitlines()
stdout_lines = stdout.splitlines()
latency_stddev = None
# we know significant parts of these values from test input
# but to be precise take them from output
# scaling factor: 5
assert "scaling factor" in stdout_lines[1]
scale = int(stdout_lines[1].split()[-1])
# number of clients: 1
assert "number of clients" in stdout_lines[3]
number_of_clients = int(stdout_lines[3].split()[-1])
# number of threads: 1
assert "number of threads" in stdout_lines[4]
number_of_threads = int(stdout_lines[4].split()[-1])
# number of transactions actually processed: 1000/1000
assert "number of transactions actually processed" in stdout_lines[6]
number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1])
# latency average = 19.894 ms
assert "latency average" in stdout_lines[7]
latency_average = stdout_lines[7].split()[-2]
# latency stddev = 3.387 ms
assert "latency stddev" in stdout_lines[8]
latency_stddev = stdout_lines[8].split()[-2]
# tps = 50.219689 (including connections establishing)
assert "(including connections establishing)" in stdout_lines[9]
tps_including_connection_time = stdout_lines[9].split()[2]
# tps = 50.264435 (excluding connections establishing)
assert "(excluding connections establishing)" in stdout_lines[10]
tps_excluding_connection_time = stdout_lines[10].split()[2]
for line in stdout.splitlines():
# scaling factor: 5
if line.startswith("scaling factor:"):
scale = int(line.split()[-1])
# number of clients: 1
if line.startswith("number of clients: "):
number_of_clients = int(line.split()[-1])
# number of threads: 1
if line.startswith("number of threads: "):
number_of_threads = int(line.split()[-1])
# number of transactions actually processed: 1000/1000
# OR
# number of transactions actually processed: 1000
if line.startswith("number of transactions actually processed"):
if "/" in line:
number_of_transactions_actually_processed = int(line.split("/")[1])
else:
number_of_transactions_actually_processed = int(line.split()[-1])
# latency average = 19.894 ms
if line.startswith("latency average"):
latency_average = float(line.split()[-2])
# latency stddev = 3.387 ms
# (only printed with some options)
if line.startswith("latency stddev"):
latency_stddev = float(line.split()[-2])
# Get the TPS without initial connection time. The format
# of the tps lines changed in pgbench v14, but we accept
# either format:
#
# pgbench v13 and below:
# tps = 50.219689 (including connections establishing)
# tps = 50.264435 (excluding connections establishing)
#
# pgbench v14:
# initial connection time = 3.858 ms
# tps = 309.281539 (without initial connection time)
if (line.startswith("tps = ") and ("(excluding connections establishing)" in line
or "(without initial connection time)")):
tps = float(line.split()[2])
return cls(
scale=scale,
number_of_clients=number_of_clients,
number_of_threads=number_of_threads,
number_of_transactions_actually_processed=number_of_transactions_actually_processed,
latency_average=float(latency_average),
latency_stddev=float(latency_stddev),
tps_including_connection_time=float(tps_including_connection_time),
tps_excluding_connection_time=float(tps_excluding_connection_time),
init_duration=init_duration,
init_start_timestamp=init_start_timestamp,
init_end_timestamp=init_end_timestamp,
latency_average=latency_average,
latency_stddev=latency_stddev,
tps=tps,
run_duration=run_duration,
run_start_timestamp=run_start_timestamp,
run_end_timestamp=run_end_timestamp,
@@ -187,60 +193,41 @@ class ZenithBenchmarker:
report=MetricReport.LOWER_IS_BETTER,
)
def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult):
self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM)
self.record("number_of_clients",
def record_pg_bench_result(self, prefix: str, pg_bench_result: PgBenchRunResult):
self.record(f"{prefix}.number_of_clients",
pg_bench_result.number_of_clients,
'',
MetricReport.TEST_PARAM)
self.record("number_of_threads",
self.record(f"{prefix}.number_of_threads",
pg_bench_result.number_of_threads,
'',
MetricReport.TEST_PARAM)
self.record(
"number_of_transactions_actually_processed",
f"{prefix}.number_of_transactions_actually_processed",
pg_bench_result.number_of_transactions_actually_processed,
'',
# thats because this is predefined by test matrix and doesnt change across runs
report=MetricReport.TEST_PARAM,
)
self.record("latency_average",
self.record(f"{prefix}.latency_average",
pg_bench_result.latency_average,
unit="ms",
report=MetricReport.LOWER_IS_BETTER)
self.record("latency_stddev",
pg_bench_result.latency_stddev,
unit="ms",
report=MetricReport.LOWER_IS_BETTER)
self.record("tps_including_connection_time",
pg_bench_result.tps_including_connection_time,
'',
report=MetricReport.HIGHER_IS_BETTER)
self.record("tps_excluding_connection_time",
pg_bench_result.tps_excluding_connection_time,
'',
report=MetricReport.HIGHER_IS_BETTER)
self.record("init_duration",
pg_bench_result.init_duration,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
self.record("init_start_timestamp",
pg_bench_result.init_start_timestamp,
'',
MetricReport.TEST_PARAM)
self.record("init_end_timestamp",
pg_bench_result.init_end_timestamp,
'',
MetricReport.TEST_PARAM)
self.record("run_duration",
if pg_bench_result.latency_stddev is not None:
self.record(f"{prefix}.latency_stddev",
pg_bench_result.latency_stddev,
unit="ms",
report=MetricReport.LOWER_IS_BETTER)
self.record(f"{prefix}.tps", pg_bench_result.tps, '', report=MetricReport.HIGHER_IS_BETTER)
self.record(f"{prefix}.run_duration",
pg_bench_result.run_duration,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
self.record("run_start_timestamp",
self.record(f"{prefix}.run_start_timestamp",
pg_bench_result.run_start_timestamp,
'',
MetricReport.TEST_PARAM)
self.record("run_end_timestamp",
self.record(f"{prefix}.run_end_timestamp",
pg_bench_result.run_end_timestamp,
'',
MetricReport.TEST_PARAM)
@@ -259,10 +246,18 @@ class ZenithBenchmarker:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# Use a regular expression to extract the one we're interested in
#
metric_name = r'pageserver_disk_io_bytes{io_operation="write"}'
return self.get_int_counter_value(pageserver, metric_name)
def get_peak_mem(self, pageserver) -> int:
"""
Fetch the "maxrss" metric from the pageserver
"""
metric_name = r'pageserver_maxrss_kb'
return self.get_int_counter_value(pageserver, metric_name)
def get_int_counter_value(self, pageserver, metric_name) -> int:
"""Fetch the value of given int counter from pageserver metrics."""
# TODO: If we start to collect more of the prometheus metrics in the
# performance test suite like this, we should refactor this to load and
# parse all the metrics into a more convenient structure in one go.
@@ -270,20 +265,8 @@ class ZenithBenchmarker:
# The metric should be an integer, as it's a number of bytes. But in general
# all prometheus metrics are floats. So to be pedantic, read it as a float
# and round to integer.
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
all_metrics,
re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_peak_mem(self, pageserver) -> int:
"""
Fetch the "maxrss" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# See comment in get_io_writes()
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
matches = re.search(fr'^{metric_name} (\S+)$', all_metrics, re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))

View File

@@ -2,7 +2,7 @@ import pytest
from contextlib import contextmanager
from abc import ABC, abstractmethod
from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, ZenithEnv
from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, ZenithEnv
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
# Type-related stuff
@@ -87,6 +87,9 @@ class ZenithCompare(PgCompare):
def flush(self):
self.pscur.execute(f"do_gc {self.env.initial_tenant.hex} {self.timeline} 0")
def compact(self):
self.pscur.execute(f"compact {self.env.initial_tenant.hex} {self.timeline}")
def report_peak_memory_use(self) -> None:
self.zenbenchmark.record("peak_mem",
self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024,
@@ -102,6 +105,19 @@ class ZenithCompare(PgCompare):
'MB',
report=MetricReport.LOWER_IS_BETTER)
total_files = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_num_persistent_files_created")
total_bytes = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_persistent_bytes_written")
self.zenbenchmark.record("data_uploaded",
total_bytes / (1024 * 1024),
"MB",
report=MetricReport.LOWER_IS_BETTER)
self.zenbenchmark.record("num_files_uploaded",
total_files,
"",
report=MetricReport.LOWER_IS_BETTER)
def record_pageserver_writes(self, out_name):
return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name)
@@ -159,6 +175,48 @@ class VanillaCompare(PgCompare):
return self.zenbenchmark.record_duration(out_name)
class RemoteCompare(PgCompare):
"""PgCompare interface for a remote postgres instance."""
def __init__(self, zenbenchmark, remote_pg: RemotePostgres):
self._pg = remote_pg
self._zenbenchmark = zenbenchmark
# Long-lived cursor, useful for flushing
self.conn = self.pg.connect()
self.cur = self.conn.cursor()
@property
def pg(self):
return self._pg
@property
def zenbenchmark(self):
return self._zenbenchmark
@property
def pg_bin(self):
return self._pg.pg_bin
def flush(self):
# TODO: flush the remote pageserver
pass
def report_peak_memory_use(self) -> None:
# TODO: get memory usage from remote pageserver
pass
def report_size(self) -> None:
# TODO: get storage size from remote pageserver
pass
@contextmanager
def record_pageserver_writes(self, out_name):
yield # Do nothing
def record_duration(self, out_name):
return self.zenbenchmark.record_duration(out_name)
@pytest.fixture(scope='function')
def zenith_compare(request, zenbenchmark, pg_bin, zenith_simple_env) -> ZenithCompare:
branch_name = request.node.name
@@ -170,6 +228,11 @@ def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare:
return VanillaCompare(zenbenchmark, vanilla_pg)
@pytest.fixture(scope='function')
def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare:
return RemoteCompare(zenbenchmark, remote_pg)
@pytest.fixture(params=["vanilla_compare", "zenith_compare"], ids=["vanilla", "zenith"])
def zenith_with_baseline(request) -> PgCompare:
"""Parameterized fixture that helps compare zenith against vanilla postgres.

View File

@@ -27,6 +27,7 @@ from dataclasses import dataclass
# Type-related stuff
from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import make_dsn, parse_dsn
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple
from typing_extensions import Literal
@@ -122,6 +123,22 @@ def pytest_configure(config):
top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
mkdir_if_needed(top_output_dir)
# Find the postgres installation.
global pg_distrib_dir
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
if env_postgres_bin:
pg_distrib_dir = env_postgres_bin
else:
pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
log.info(f'pg_distrib_dir is {pg_distrib_dir}')
if os.getenv("REMOTE_ENV"):
# When testing against a remote server, we only need the client binary.
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/psql')):
raise Exception('psql not found at "{}"'.format(pg_distrib_dir))
else:
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')):
raise Exception('postgres not found at "{}"'.format(pg_distrib_dir))
if os.getenv("REMOTE_ENV"):
# we are in remote env and do not have zenith binaries locally
# this is the case for benchmarks run on self-hosted runner
@@ -137,17 +154,6 @@ def pytest_configure(config):
if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')):
raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath))
# Find the postgres installation.
global pg_distrib_dir
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
if env_postgres_bin:
pg_distrib_dir = env_postgres_bin
else:
pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
log.info(f'pg_distrib_dir is {pg_distrib_dir}')
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')):
raise Exception('postgres not found at "{}"'.format(pg_distrib_dir))
def zenfixture(func: Fn) -> Fn:
"""
@@ -238,98 +244,69 @@ def port_distributor(worker_base_port):
class PgProtocol:
""" Reusable connection logic """
def __init__(self,
host: str,
port: int,
username: Optional[str] = None,
password: Optional[str] = None,
dbname: Optional[str] = None,
schema: Optional[str] = None):
self.host = host
self.port = port
self.username = username
self.password = password
self.dbname = dbname
self.schema = schema
def __init__(self, **kwargs):
self.default_options = kwargs
def connstr(self,
*,
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None,
statement_timeout_ms: Optional[int] = None) -> str:
def connstr(self, **kwargs) -> str:
"""
Build a libpq connection string for the Postgres instance.
"""
return str(make_dsn(**self.conn_options(**kwargs)))
username = username or self.username
password = password or self.password
dbname = dbname or self.dbname or "postgres"
schema = schema or self.schema
res = f'host={self.host} port={self.port} dbname={dbname}'
def conn_options(self, **kwargs):
conn_options = self.default_options.copy()
if 'dsn' in kwargs:
conn_options.update(parse_dsn(kwargs['dsn']))
conn_options.update(kwargs)
if username:
res = f'{res} user={username}'
if password:
res = f'{res} password={password}'
if schema:
res = f"{res} options='-c search_path={schema}'"
if statement_timeout_ms:
res = f"{res} options='-c statement_timeout={statement_timeout_ms}'"
return res
# Individual statement timeout in seconds. 2 minutes should be
# enough for our tests, but if you need a longer, you can
# change it by calling "SET statement_timeout" after
# connecting.
if 'options' in conn_options:
conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options']
else:
conn_options['options'] = "-cstatement_timeout=120s"
return conn_options
# autocommit=True here by default because that's what we need most of the time
def connect(
self,
*,
autocommit=True,
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None,
# individual statement timeout in seconds, 2 minutes should be enough for our tests
statement_timeout: Optional[int] = 120
) -> PgConnection:
def connect(self, autocommit=True, **kwargs) -> PgConnection:
"""
Connect to the node.
Returns psycopg2's connection object.
This method passes all extra params to connstr.
"""
conn = psycopg2.connect(**self.conn_options(**kwargs))
conn = psycopg2.connect(
self.connstr(dbname=dbname,
schema=schema,
username=username,
password=password,
statement_timeout_ms=statement_timeout *
1000 if statement_timeout else None))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
async def connect_async(self,
*,
dbname: str = 'postgres',
username: Optional[str] = None,
password: Optional[str] = None) -> asyncpg.Connection:
async def connect_async(self, **kwargs) -> asyncpg.Connection:
"""
Connect to the node from async python.
Returns asyncpg's connection object.
"""
conn = await asyncpg.connect(
host=self.host,
port=self.port,
database=dbname,
user=username or self.username,
password=password,
)
return conn
# asyncpg takes slightly different options than psycopg2. Try
# to convert the defaults from the psycopg2 format.
# The psycopg2 option 'dbname' is called 'database' is asyncpg
conn_options = self.conn_options(**kwargs)
if 'dbname' in conn_options:
conn_options['database'] = conn_options.pop('dbname')
# Convert options='-c<key>=<val>' to server_settings
if 'options' in conn_options:
options = conn_options.pop('options')
for match in re.finditer('-c(\w*)=(\w*)', options):
key = match.group(1)
val = match.group(2)
if 'server_options' in conn_options:
conn_options['server_settings'].update({key: val})
else:
conn_options['server_settings'] = {key: val}
return await asyncpg.connect(**conn_options)
def safe_psql(self, query: str, **kwargs: Any) -> List[Any]:
"""
@@ -1149,10 +1126,10 @@ class ZenithPageserver(PgProtocol):
port: PageserverPort,
remote_storage: Optional[RemoteStorage] = None,
config_override: Optional[str] = None):
super().__init__(host='localhost', port=port.pg, username='zenith_admin')
super().__init__(host='localhost', port=port.pg, user='zenith_admin')
self.env = env
self.running = False
self.service_port = port # do not shadow PgProtocol.port which is just int
self.service_port = port
self.remote_storage = remote_storage
self.config_override = config_override
@@ -1314,7 +1291,7 @@ def psbench_bin(test_output_dir):
class VanillaPostgres(PgProtocol):
def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int):
super().__init__(host='localhost', port=port)
super().__init__(host='localhost', port=port, dbname='postgres')
self.pgdatadir = pgdatadir
self.pg_bin = pg_bin
self.running = False
@@ -1356,10 +1333,57 @@ def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]:
yield vanilla_pg
class RemotePostgres(PgProtocol):
def __init__(self, pg_bin: PgBin, remote_connstr: str):
super().__init__(**parse_dsn(remote_connstr))
self.pg_bin = pg_bin
# The remote server is assumed to be running already
self.running = True
def configure(self, options: List[str]):
raise Exception('cannot change configuration of remote Posgres instance')
def start(self):
raise Exception('cannot start a remote Postgres instance')
def stop(self):
raise Exception('cannot stop a remote Postgres instance')
def get_subdir_size(self, subdir) -> int:
# TODO: Could use the server's Generic File Acccess functions if superuser.
# See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE
raise Exception('cannot get size of a Postgres instance')
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
# do nothing
pass
@pytest.fixture(scope='function')
def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]:
pg_bin = PgBin(test_output_dir)
connstr = os.getenv("BENCHMARK_CONNSTR")
if connstr is None:
raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")
with RemotePostgres(pg_bin, connstr) as remote_pg:
yield remote_pg
class ZenithProxy(PgProtocol):
def __init__(self, port: int):
super().__init__(host="127.0.0.1", username="pytest", password="pytest", port=port)
super().__init__(host="127.0.0.1",
user="pytest",
password="pytest",
port=port,
dbname='postgres')
self.http_port = 7001
self.host = "127.0.0.1"
self.port = port
self._popen: Optional[subprocess.Popen[bytes]] = None
def start_static(self, addr="127.0.0.1:5432") -> None:
@@ -1403,13 +1427,13 @@ def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]:
class Postgres(PgProtocol):
""" An object representing a running postgres daemon. """
def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int):
super().__init__(host='localhost', port=port, username='zenith_admin')
super().__init__(host='localhost', port=port, user='zenith_admin', dbname='postgres')
self.env = env
self.running = False
self.node_name: Optional[str] = None # dubious, see asserts below
self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA
self.tenant_id = tenant_id
self.port = port
# path to conf is <repo_dir>/pgdatadirs/tenants/<tenant_id>/<node_name>/postgresql.conf
def create(

View File

@@ -2,29 +2,113 @@ from contextlib import closing
from fixtures.zenith_fixtures import PgBin, VanillaPostgres, ZenithEnv
from fixtures.compare_fixtures import PgCompare, VanillaCompare, ZenithCompare
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, ZenithBenchmarker
from fixtures.log_helper import log
from pathlib import Path
import pytest
from datetime import datetime
import calendar
import os
import timeit
def utc_now_timestamp() -> int:
return calendar.timegm(datetime.utcnow().utctimetuple())
def init_pgbench(env: PgCompare, cmdline):
# calculate timestamps and durations separately
# timestamp is intended to be used for linking to grafana and logs
# duration is actually a metric and uses float instead of int for timestamp
init_start_timestamp = utc_now_timestamp()
t0 = timeit.default_timer()
with env.record_pageserver_writes('init.pageserver_writes'):
env.pg_bin.run_capture(cmdline)
env.flush()
init_duration = timeit.default_timer() - t0
init_end_timestamp = utc_now_timestamp()
env.zenbenchmark.record("init.duration",
init_duration,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
env.zenbenchmark.record("init.start_timestamp",
init_start_timestamp,
'',
MetricReport.TEST_PARAM)
env.zenbenchmark.record("init.end_timestamp", init_end_timestamp, '', MetricReport.TEST_PARAM)
def run_pgbench(env: PgCompare, prefix: str, cmdline):
with env.record_pageserver_writes(f'{prefix}.pageserver_writes'):
run_start_timestamp = utc_now_timestamp()
t0 = timeit.default_timer()
out = env.pg_bin.run_capture(cmdline, )
run_duration = timeit.default_timer() - t0
run_end_timestamp = utc_now_timestamp()
env.flush()
stdout = Path(f"{out}.stdout").read_text()
res = PgBenchRunResult.parse_from_stdout(
stdout=stdout,
run_duration=run_duration,
run_start_timestamp=run_start_timestamp,
run_end_timestamp=run_end_timestamp,
)
env.zenbenchmark.record_pg_bench_result(prefix, res)
#
# Run a very short pgbench test.
# Initialize a pgbench database, and run pgbench against it.
#
# Collects three metrics:
# This makes runs two different pgbench workloads against the same
# initialized database, and 'duration' is the time of each run. So
# the total runtime is 2 * duration, plus time needed to initialize
# the test database.
#
# 1. Time to initialize the pgbench database (pgbench -s5 -i)
# 2. Time to run 5000 pgbench transactions
# 3. Disk space used
#
def test_pgbench(zenith_with_baseline: PgCompare):
env = zenith_with_baseline
# Currently, the # of connections is hardcoded at 4
def run_test_pgbench(env: PgCompare, scale: int, duration: int):
with env.record_pageserver_writes('pageserver_writes'):
with env.record_duration('init'):
env.pg_bin.run_capture(['pgbench', '-s5', '-i', env.pg.connstr()])
env.flush()
# Record the scale and initialize
env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM)
init_pgbench(env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr()])
with env.record_duration('5000_xacts'):
env.pg_bin.run_capture(['pgbench', '-c1', '-t5000', env.pg.connstr()])
env.flush()
# Run simple-update workload
run_pgbench(env,
"simple-update",
['pgbench', '-n', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()])
# Run SELECT workload
run_pgbench(env,
"select-only",
['pgbench', '-S', '-c4', f'-T{duration}', '-P2', '-Mprepared', env.pg.connstr()])
env.report_size()
def get_durations_matrix():
durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default="45")
return list(map(int, durations.split(",")))
def get_scales_matrix():
scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default="10")
return list(map(int, scales.split(",")))
# Run the pgbench tests against vanilla Postgres and zenith
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
def test_pgbench(zenith_with_baseline: PgCompare, scale: int, duration: int):
run_test_pgbench(zenith_with_baseline, scale, duration)
# Run the pgbench tests against an existing Postgres cluster
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote(remote_compare: PgCompare, scale: int, duration: int):
run_test_pgbench(remote_compare, scale, duration)

View File

@@ -1,124 +0,0 @@
import dataclasses
import os
import subprocess
from typing import List
from fixtures.benchmark_fixture import PgBenchRunResult, ZenithBenchmarker
import pytest
from datetime import datetime
import calendar
import timeit
import os
def utc_now_timestamp() -> int:
return calendar.timegm(datetime.utcnow().utctimetuple())
@dataclasses.dataclass
class PgBenchRunner:
connstr: str
scale: int
transactions: int
pgbench_bin_path: str = "pgbench"
def invoke(self, args: List[str]) -> 'subprocess.CompletedProcess[str]':
res = subprocess.run([self.pgbench_bin_path, *args], text=True, capture_output=True)
if res.returncode != 0:
raise RuntimeError(f"pgbench failed. stdout: {res.stdout} stderr: {res.stderr}")
return res
def init(self, vacuum: bool = True) -> 'subprocess.CompletedProcess[str]':
args = []
if not vacuum:
args.append("--no-vacuum")
args.extend([f"--scale={self.scale}", "--initialize", self.connstr])
return self.invoke(args)
def run(self, jobs: int = 1, clients: int = 1):
return self.invoke([
f"--transactions={self.transactions}",
f"--jobs={jobs}",
f"--client={clients}",
"--progress=2", # print progress every two seconds
self.connstr,
])
@pytest.fixture
def connstr():
res = os.getenv("BENCHMARK_CONNSTR")
if res is None:
raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")
return res
def get_transactions_matrix():
transactions = os.getenv("TEST_PG_BENCH_TRANSACTIONS_MATRIX")
if transactions is None:
return [10**4, 10**5]
return list(map(int, transactions.split(",")))
def get_scales_matrix():
scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX")
if scales is None:
return [10, 20]
return list(map(int, scales.split(",")))
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("transactions", get_transactions_matrix())
@pytest.mark.remote_cluster
def test_pg_bench_remote_cluster(zenbenchmark: ZenithBenchmarker,
connstr: str,
scale: int,
transactions: int):
"""
The best way is to run same pack of tests both, for local zenith
and against staging, but currently local tests heavily depend on
things available only locally e.g. zenith binaries, pageserver api, etc.
Also separate test allows to run pgbench workload against vanilla postgres
or other systems that support postgres protocol.
Also now this is more of a liveness test because it stresses pageserver internals,
so we clearly see what goes wrong in more "real" environment.
"""
pg_bin = os.getenv("PG_BIN")
if pg_bin is not None:
pgbench_bin_path = os.path.join(pg_bin, "pgbench")
else:
pgbench_bin_path = "pgbench"
runner = PgBenchRunner(
connstr=connstr,
scale=scale,
transactions=transactions,
pgbench_bin_path=pgbench_bin_path,
)
# calculate timestamps and durations separately
# timestamp is intended to be used for linking to grafana and logs
# duration is actually a metric and uses float instead of int for timestamp
init_start_timestamp = utc_now_timestamp()
t0 = timeit.default_timer()
runner.init()
init_duration = timeit.default_timer() - t0
init_end_timestamp = utc_now_timestamp()
run_start_timestamp = utc_now_timestamp()
t0 = timeit.default_timer()
out = runner.run() # TODO handle failures
run_duration = timeit.default_timer() - t0
run_end_timestamp = utc_now_timestamp()
res = PgBenchRunResult.parse_from_output(
out=out,
init_duration=init_duration,
init_start_timestamp=init_start_timestamp,
init_end_timestamp=init_end_timestamp,
run_duration=run_duration,
run_start_timestamp=run_start_timestamp,
run_end_timestamp=run_end_timestamp,
)
zenbenchmark.record_pg_bench_result(res)