Merge branch 'main' into bojan-get-page-tests

This commit is contained in:
Bojan Serafimov
2022-04-14 13:59:59 -04:00
58 changed files with 2180 additions and 656 deletions

View File

@@ -17,7 +17,7 @@ import warnings
from contextlib import contextmanager
# Type-related stuff
from typing import Iterator
from typing import Iterator, Optional
"""
This file contains fixtures for micro-benchmarks.
@@ -51,17 +51,12 @@ in the test initialization, or measure disk usage after the test query.
@dataclasses.dataclass
class PgBenchRunResult:
scale: int
number_of_clients: int
number_of_threads: int
number_of_transactions_actually_processed: int
latency_average: float
latency_stddev: float
tps_including_connection_time: float
tps_excluding_connection_time: float
init_duration: float
init_start_timestamp: int
init_end_timestamp: int
latency_stddev: Optional[float]
tps: float
run_duration: float
run_start_timestamp: int
run_end_timestamp: int
@@ -69,56 +64,67 @@ class PgBenchRunResult:
# TODO progress
@classmethod
def parse_from_output(
def parse_from_stdout(
cls,
out: 'subprocess.CompletedProcess[str]',
init_duration: float,
init_start_timestamp: int,
init_end_timestamp: int,
stdout: str,
run_duration: float,
run_start_timestamp: int,
run_end_timestamp: int,
):
stdout_lines = out.stdout.splitlines()
stdout_lines = stdout.splitlines()
latency_stddev = None
# we know significant parts of these values from test input
# but to be precise take them from output
# scaling factor: 5
assert "scaling factor" in stdout_lines[1]
scale = int(stdout_lines[1].split()[-1])
# number of clients: 1
assert "number of clients" in stdout_lines[3]
number_of_clients = int(stdout_lines[3].split()[-1])
# number of threads: 1
assert "number of threads" in stdout_lines[4]
number_of_threads = int(stdout_lines[4].split()[-1])
# number of transactions actually processed: 1000/1000
assert "number of transactions actually processed" in stdout_lines[6]
number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1])
# latency average = 19.894 ms
assert "latency average" in stdout_lines[7]
latency_average = stdout_lines[7].split()[-2]
# latency stddev = 3.387 ms
assert "latency stddev" in stdout_lines[8]
latency_stddev = stdout_lines[8].split()[-2]
# tps = 50.219689 (including connections establishing)
assert "(including connections establishing)" in stdout_lines[9]
tps_including_connection_time = stdout_lines[9].split()[2]
# tps = 50.264435 (excluding connections establishing)
assert "(excluding connections establishing)" in stdout_lines[10]
tps_excluding_connection_time = stdout_lines[10].split()[2]
for line in stdout.splitlines():
# scaling factor: 5
if line.startswith("scaling factor:"):
scale = int(line.split()[-1])
# number of clients: 1
if line.startswith("number of clients: "):
number_of_clients = int(line.split()[-1])
# number of threads: 1
if line.startswith("number of threads: "):
number_of_threads = int(line.split()[-1])
# number of transactions actually processed: 1000/1000
# OR
# number of transactions actually processed: 1000
if line.startswith("number of transactions actually processed"):
if "/" in line:
number_of_transactions_actually_processed = int(line.split("/")[1])
else:
number_of_transactions_actually_processed = int(line.split()[-1])
# latency average = 19.894 ms
if line.startswith("latency average"):
latency_average = float(line.split()[-2])
# latency stddev = 3.387 ms
# (only printed with some options)
if line.startswith("latency stddev"):
latency_stddev = float(line.split()[-2])
# Get the TPS without initial connection time. The format
# of the tps lines changed in pgbench v14, but we accept
# either format:
#
# pgbench v13 and below:
# tps = 50.219689 (including connections establishing)
# tps = 50.264435 (excluding connections establishing)
#
# pgbench v14:
# initial connection time = 3.858 ms
# tps = 309.281539 (without initial connection time)
#
# Both markers must be tested against the line explicitly: a bare
# string literal in a boolean expression is always truthy, which
# would also accept the "(including connections establishing)" line.
if line.startswith("tps = ") and ("(excluding connections establishing)" in line
                                  or "(without initial connection time)" in line):
    tps = float(line.split()[2])
return cls(
scale=scale,
number_of_clients=number_of_clients,
number_of_threads=number_of_threads,
number_of_transactions_actually_processed=number_of_transactions_actually_processed,
latency_average=float(latency_average),
latency_stddev=float(latency_stddev),
tps_including_connection_time=float(tps_including_connection_time),
tps_excluding_connection_time=float(tps_excluding_connection_time),
init_duration=init_duration,
init_start_timestamp=init_start_timestamp,
init_end_timestamp=init_end_timestamp,
latency_average=latency_average,
latency_stddev=latency_stddev,
tps=tps,
run_duration=run_duration,
run_start_timestamp=run_start_timestamp,
run_end_timestamp=run_end_timestamp,
@@ -187,60 +193,41 @@ class ZenithBenchmarker:
report=MetricReport.LOWER_IS_BETTER,
)
def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult):
self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM)
self.record("number_of_clients",
def record_pg_bench_result(self, prefix: str, pg_bench_result: PgBenchRunResult):
self.record(f"{prefix}.number_of_clients",
pg_bench_result.number_of_clients,
'',
MetricReport.TEST_PARAM)
self.record("number_of_threads",
self.record(f"{prefix}.number_of_threads",
pg_bench_result.number_of_threads,
'',
MetricReport.TEST_PARAM)
self.record(
"number_of_transactions_actually_processed",
f"{prefix}.number_of_transactions_actually_processed",
pg_bench_result.number_of_transactions_actually_processed,
'',
# that's because this is predefined by the test matrix and doesn't change across runs
report=MetricReport.TEST_PARAM,
)
self.record("latency_average",
self.record(f"{prefix}.latency_average",
pg_bench_result.latency_average,
unit="ms",
report=MetricReport.LOWER_IS_BETTER)
self.record("latency_stddev",
pg_bench_result.latency_stddev,
unit="ms",
report=MetricReport.LOWER_IS_BETTER)
self.record("tps_including_connection_time",
pg_bench_result.tps_including_connection_time,
'',
report=MetricReport.HIGHER_IS_BETTER)
self.record("tps_excluding_connection_time",
pg_bench_result.tps_excluding_connection_time,
'',
report=MetricReport.HIGHER_IS_BETTER)
self.record("init_duration",
pg_bench_result.init_duration,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
self.record("init_start_timestamp",
pg_bench_result.init_start_timestamp,
'',
MetricReport.TEST_PARAM)
self.record("init_end_timestamp",
pg_bench_result.init_end_timestamp,
'',
MetricReport.TEST_PARAM)
self.record("run_duration",
if pg_bench_result.latency_stddev is not None:
self.record(f"{prefix}.latency_stddev",
pg_bench_result.latency_stddev,
unit="ms",
report=MetricReport.LOWER_IS_BETTER)
self.record(f"{prefix}.tps", pg_bench_result.tps, '', report=MetricReport.HIGHER_IS_BETTER)
self.record(f"{prefix}.run_duration",
pg_bench_result.run_duration,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
self.record("run_start_timestamp",
self.record(f"{prefix}.run_start_timestamp",
pg_bench_result.run_start_timestamp,
'',
MetricReport.TEST_PARAM)
self.record("run_end_timestamp",
self.record(f"{prefix}.run_end_timestamp",
pg_bench_result.run_end_timestamp,
'',
MetricReport.TEST_PARAM)
@@ -259,10 +246,18 @@ class ZenithBenchmarker:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# Use a regular expression to extract the one we're interested in
#
metric_name = r'pageserver_disk_io_bytes{io_operation="write"}'
return self.get_int_counter_value(pageserver, metric_name)
def get_peak_mem(self, pageserver) -> int:
    """
    Return the "maxrss" metric reported by the pageserver.

    Delegates to get_int_counter_value() with the
    ``pageserver_maxrss_kb`` metric name.
    """
    return self.get_int_counter_value(pageserver, r'pageserver_maxrss_kb')
def get_int_counter_value(self, pageserver, metric_name) -> int:
"""Fetch the value of given int counter from pageserver metrics."""
# TODO: If we start to collect more of the prometheus metrics in the
# performance test suite like this, we should refactor this to load and
# parse all the metrics into a more convenient structure in one go.
@@ -270,20 +265,8 @@ class ZenithBenchmarker:
# The metric should be an integer, as it's a number of bytes. But in general
# all prometheus metrics are floats. So to be pedantic, read it as a float
# and round to integer.
matches = re.search(r'^pageserver_disk_io_bytes{io_operation="write"} (\S+)$',
all_metrics,
re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))
def get_peak_mem(self, pageserver) -> int:
"""
Fetch the "maxrss" metric from the pageserver
"""
# Fetch all the exposed prometheus metrics from page server
all_metrics = pageserver.http_client().get_metrics()
# See comment in get_io_writes()
matches = re.search(r'^pageserver_maxrss_kb (\S+)$', all_metrics, re.MULTILINE)
matches = re.search(fr'^{metric_name} (\S+)$', all_metrics, re.MULTILINE)
assert matches
return int(round(float(matches.group(1))))

View File

@@ -2,7 +2,7 @@ import pytest
from contextlib import contextmanager
from abc import ABC, abstractmethod
from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, ZenithEnv
from fixtures.zenith_fixtures import PgBin, PgProtocol, VanillaPostgres, RemotePostgres, ZenithEnv
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
# Type-related stuff
@@ -87,6 +87,9 @@ class ZenithCompare(PgCompare):
def flush(self):
self.pscur.execute(f"do_gc {self.env.initial_tenant.hex} {self.timeline} 0")
def compact(self):
    """Send a ``compact`` command for the initial tenant and this timeline via ``self.pscur``."""
    tenant_hex = self.env.initial_tenant.hex
    self.pscur.execute(f"compact {tenant_hex} {self.timeline}")
def report_peak_memory_use(self) -> None:
self.zenbenchmark.record("peak_mem",
self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024,
@@ -102,6 +105,19 @@ class ZenithCompare(PgCompare):
'MB',
report=MetricReport.LOWER_IS_BETTER)
total_files = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_num_persistent_files_created")
total_bytes = self.zenbenchmark.get_int_counter_value(
self.env.pageserver, "pageserver_persistent_bytes_written")
self.zenbenchmark.record("data_uploaded",
total_bytes / (1024 * 1024),
"MB",
report=MetricReport.LOWER_IS_BETTER)
self.zenbenchmark.record("num_files_uploaded",
total_files,
"",
report=MetricReport.LOWER_IS_BETTER)
def record_pageserver_writes(self, out_name):
return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name)
@@ -159,6 +175,48 @@ class VanillaCompare(PgCompare):
return self.zenbenchmark.record_duration(out_name)
class RemoteCompare(PgCompare):
    """PgCompare interface for a remote postgres instance."""
    def __init__(self, zenbenchmark, remote_pg: RemotePostgres):
        self._zenbenchmark = zenbenchmark
        self._pg = remote_pg

        # Long-lived cursor, useful for flushing
        connection = self.pg.connect()
        self.conn = connection
        self.cur = connection.cursor()

    @property
    def pg(self):
        """The remote postgres object passed to the constructor."""
        return self._pg

    @property
    def zenbenchmark(self):
        """The benchmarker object passed to the constructor."""
        return self._zenbenchmark

    @property
    def pg_bin(self):
        """The ``pg_bin`` attribute of the remote postgres object."""
        return self._pg.pg_bin

    def flush(self):
        """No-op for now."""
        # TODO: flush the remote pageserver

    def report_peak_memory_use(self) -> None:
        """No-op for now."""
        # TODO: get memory usage from remote pageserver

    def report_size(self) -> None:
        """No-op for now."""
        # TODO: get storage size from remote pageserver

    @contextmanager
    def record_pageserver_writes(self, out_name):
        """Context manager that records nothing; ``out_name`` is ignored."""
        yield  # Do nothing

    def record_duration(self, out_name):
        """Forward to ``zenbenchmark.record_duration(out_name)``."""
        benchmarker = self.zenbenchmark
        return benchmarker.record_duration(out_name)
@pytest.fixture(scope='function')
def zenith_compare(request, zenbenchmark, pg_bin, zenith_simple_env) -> ZenithCompare:
branch_name = request.node.name
@@ -170,6 +228,11 @@ def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare:
return VanillaCompare(zenbenchmark, vanilla_pg)
@pytest.fixture(scope='function')
def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare:
    """Build a RemoteCompare from the ``zenbenchmark`` and ``remote_pg`` fixtures."""
    compare = RemoteCompare(zenbenchmark, remote_pg)
    return compare
@pytest.fixture(params=["vanilla_compare", "zenith_compare"], ids=["vanilla", "zenith"])
def zenith_with_baseline(request) -> PgCompare:
"""Parameterized fixture that helps compare zenith against vanilla postgres.

View File

@@ -27,6 +27,7 @@ from dataclasses import dataclass
# Type-related stuff
from psycopg2.extensions import connection as PgConnection
from psycopg2.extensions import make_dsn, parse_dsn
from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple
from typing_extensions import Literal
@@ -122,6 +123,22 @@ def pytest_configure(config):
top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
mkdir_if_needed(top_output_dir)
# Find the postgres installation.
global pg_distrib_dir
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
if env_postgres_bin:
pg_distrib_dir = env_postgres_bin
else:
pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
log.info(f'pg_distrib_dir is {pg_distrib_dir}')
if os.getenv("REMOTE_ENV"):
# When testing against a remote server, we only need the client binary.
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/psql')):
raise Exception('psql not found at "{}"'.format(pg_distrib_dir))
else:
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')):
raise Exception('postgres not found at "{}"'.format(pg_distrib_dir))
if os.getenv("REMOTE_ENV"):
# we are in remote env and do not have zenith binaries locally
# this is the case for benchmarks run on self-hosted runner
@@ -137,17 +154,6 @@ def pytest_configure(config):
if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')):
raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath))
# Find the postgres installation.
global pg_distrib_dir
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
if env_postgres_bin:
pg_distrib_dir = env_postgres_bin
else:
pg_distrib_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
log.info(f'pg_distrib_dir is {pg_distrib_dir}')
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')):
raise Exception('postgres not found at "{}"'.format(pg_distrib_dir))
def zenfixture(func: Fn) -> Fn:
"""
@@ -238,98 +244,69 @@ def port_distributor(worker_base_port):
class PgProtocol:
""" Reusable connection logic """
def __init__(self,
host: str,
port: int,
username: Optional[str] = None,
password: Optional[str] = None,
dbname: Optional[str] = None,
schema: Optional[str] = None):
self.host = host
self.port = port
self.username = username
self.password = password
self.dbname = dbname
self.schema = schema
def __init__(self, **kwargs):
self.default_options = kwargs
def connstr(self,
*,
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None,
statement_timeout_ms: Optional[int] = None) -> str:
def connstr(self, **kwargs) -> str:
"""
Build a libpq connection string for the Postgres instance.
"""
return str(make_dsn(**self.conn_options(**kwargs)))
username = username or self.username
password = password or self.password
dbname = dbname or self.dbname or "postgres"
schema = schema or self.schema
res = f'host={self.host} port={self.port} dbname={dbname}'
def conn_options(self, **kwargs):
conn_options = self.default_options.copy()
if 'dsn' in kwargs:
conn_options.update(parse_dsn(kwargs['dsn']))
conn_options.update(kwargs)
if username:
res = f'{res} user={username}'
if password:
res = f'{res} password={password}'
if schema:
res = f"{res} options='-c search_path={schema}'"
if statement_timeout_ms:
res = f"{res} options='-c statement_timeout={statement_timeout_ms}'"
return res
# Individual statement timeout in seconds. 2 minutes should be
# enough for our tests, but if you need a longer, you can
# change it by calling "SET statement_timeout" after
# connecting.
if 'options' in conn_options:
conn_options['options'] = f"-cstatement_timeout=120s " + conn_options['options']
else:
conn_options['options'] = "-cstatement_timeout=120s"
return conn_options
# autocommit=True here by default because that's what we need most of the time
def connect(
self,
*,
autocommit=True,
dbname: Optional[str] = None,
schema: Optional[str] = None,
username: Optional[str] = None,
password: Optional[str] = None,
# individual statement timeout in seconds, 2 minutes should be enough for our tests
statement_timeout: Optional[int] = 120
) -> PgConnection:
def connect(self, autocommit=True, **kwargs) -> PgConnection:
"""
Connect to the node.
Returns psycopg2's connection object.
This method passes all extra params to connstr.
"""
conn = psycopg2.connect(**self.conn_options(**kwargs))
conn = psycopg2.connect(
self.connstr(dbname=dbname,
schema=schema,
username=username,
password=password,
statement_timeout_ms=statement_timeout *
1000 if statement_timeout else None))
# WARNING: this setting affects *all* tests!
conn.autocommit = autocommit
return conn
async def connect_async(self,
*,
dbname: str = 'postgres',
username: Optional[str] = None,
password: Optional[str] = None) -> asyncpg.Connection:
async def connect_async(self, **kwargs) -> asyncpg.Connection:
"""
Connect to the node from async python.
Returns asyncpg's connection object.
"""
conn = await asyncpg.connect(
host=self.host,
port=self.port,
database=dbname,
user=username or self.username,
password=password,
)
return conn
# asyncpg takes slightly different options than psycopg2. Try
# to convert the defaults from the psycopg2 format.
# The psycopg2 option 'dbname' is called 'database' is asyncpg
conn_options = self.conn_options(**kwargs)
if 'dbname' in conn_options:
conn_options['database'] = conn_options.pop('dbname')
# Convert options='-c<key>=<val>' to server_settings
if 'options' in conn_options:
options = conn_options.pop('options')
for match in re.finditer('-c(\w*)=(\w*)', options):
key = match.group(1)
val = match.group(2)
if 'server_options' in conn_options:
conn_options['server_settings'].update({key: val})
else:
conn_options['server_settings'] = {key: val}
return await asyncpg.connect(**conn_options)
def safe_psql(self, query: str, **kwargs: Any) -> List[Any]:
"""
@@ -1149,10 +1126,10 @@ class ZenithPageserver(PgProtocol):
port: PageserverPort,
remote_storage: Optional[RemoteStorage] = None,
config_override: Optional[str] = None):
super().__init__(host='localhost', port=port.pg, username='zenith_admin')
super().__init__(host='localhost', port=port.pg, user='zenith_admin')
self.env = env
self.running = False
self.service_port = port # do not shadow PgProtocol.port which is just int
self.service_port = port
self.remote_storage = remote_storage
self.config_override = config_override
@@ -1314,7 +1291,7 @@ def psbench_bin(test_output_dir):
class VanillaPostgres(PgProtocol):
def __init__(self, pgdatadir: str, pg_bin: PgBin, port: int):
super().__init__(host='localhost', port=port)
super().__init__(host='localhost', port=port, dbname='postgres')
self.pgdatadir = pgdatadir
self.pg_bin = pg_bin
self.running = False
@@ -1356,10 +1333,57 @@ def vanilla_pg(test_output_dir: str) -> Iterator[VanillaPostgres]:
yield vanilla_pg
class RemotePostgres(PgProtocol):
    """
    PgProtocol for a Postgres server that is already running elsewhere.

    The connection defaults are taken from the given libpq connection
    string. Operations that need control over the server (configure,
    start, stop, get_subdir_size) raise an Exception.
    """
    def __init__(self, pg_bin: PgBin, remote_connstr: str):
        # Split the connection string into keyword options for PgProtocol
        super().__init__(**parse_dsn(remote_connstr))
        self.pg_bin = pg_bin
        # The remote server is assumed to be running already
        self.running = True

    def configure(self, options: List[str]):
        """Always raises: the remote configuration cannot be changed."""
        raise Exception('cannot change configuration of remote Postgres instance')

    def start(self):
        """Always raises: the remote instance cannot be started from here."""
        raise Exception('cannot start a remote Postgres instance')

    def stop(self):
        """Always raises: the remote instance cannot be stopped from here."""
        raise Exception('cannot stop a remote Postgres instance')

    def get_subdir_size(self, subdir) -> int:
        """Always raises: directory sizes are not available."""
        # TODO: Could use the server's Generic File Access functions if superuser.
        # See https://www.postgresql.org/docs/14/functions-admin.html#FUNCTIONS-ADMIN-GENFILE
        raise Exception('cannot get size of a Postgres instance')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # do nothing
        pass
@pytest.fixture(scope='function')
def remote_pg(test_output_dir: str) -> Iterator[RemotePostgres]:
    """
    Yield a RemotePostgres built from the BENCHMARK_CONNSTR environment variable.

    Raises ValueError when the variable is not set.
    """
    pg_bin = PgBin(test_output_dir)

    connstr = os.environ.get("BENCHMARK_CONNSTR")
    if connstr is None:
        raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")

    with RemotePostgres(pg_bin, connstr) as remote:
        yield remote
class ZenithProxy(PgProtocol):
def __init__(self, port: int):
super().__init__(host="127.0.0.1", username="pytest", password="pytest", port=port)
super().__init__(host="127.0.0.1",
user="pytest",
password="pytest",
port=port,
dbname='postgres')
self.http_port = 7001
self.host = "127.0.0.1"
self.port = port
self._popen: Optional[subprocess.Popen[bytes]] = None
def start_static(self, addr="127.0.0.1:5432") -> None:
@@ -1403,13 +1427,13 @@ def static_proxy(vanilla_pg) -> Iterator[ZenithProxy]:
class Postgres(PgProtocol):
""" An object representing a running postgres daemon. """
def __init__(self, env: ZenithEnv, tenant_id: uuid.UUID, port: int):
super().__init__(host='localhost', port=port, username='zenith_admin')
super().__init__(host='localhost', port=port, user='zenith_admin', dbname='postgres')
self.env = env
self.running = False
self.node_name: Optional[str] = None # dubious, see asserts below
self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA
self.tenant_id = tenant_id
self.port = port
# path to conf is <repo_dir>/pgdatadirs/tenants/<tenant_id>/<node_name>/postgresql.conf
def create(