Files
neon/test_runner/performance/pageserver/test_page_service_batching.py
Christian Schwarz a7ce323949 benchmarking: extend test_page_service_batching.py to cover concurrent IO + batching under random reads (#10466)
This PR commits the benchmarks I ran to qualify concurrent IO before we
released it.

Changes:
- Add `l0stack` fixture; a reusable abstraction for creating a stack of
L0 deltas
  each of which has 1 Value::Delta per page.
- Such a stack of L0 deltas is a good and understandable demo for
concurrent IO
because to reconstruct any page, `$layer_stack_height` Values need to be
read.
  Before concurrent IO, the reads were sequential.
  With concurrent IO, they are executed concurrently.
- So, switch `test_latency` to use the l0stack.
- Teach `pagebench`, which is used by `test_latency`, to limit itself to
the blocks of the relation created by the l0stack abstraction.
- Additional parametrization of `test_latency` over dimensions
`ps_io_concurrency,l0_stack_height,queue_depth`
- Use better names for the tests to reflect what they do, leave
interpretation of the (now quite high-dimensional) results to the reader
  - `test_{throughput => postgres_seqscan}`
  - `test_{latency => random_reads}`
- Cut down on permutations to those we use in production. Runtime is
about 2min.

Refs
- concurrent IO epic https://github.com/neondatabase/neon/issues/9378 
- batching task: fixes https://github.com/neondatabase/neon/issues/9837

---------

Co-authored-by: Peter Bendel <peterbendel@neon.tech>
2025-05-15 17:48:13 +00:00

467 lines
16 KiB
Python

import concurrent.futures
import dataclasses
import json
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin
from fixtures.pageserver.makelayers import l0stack
from fixtures.utils import humantime_to_ms
# Requested duration of the seqscan workload, in seconds. It is passed to
# `test_postgres_seqscan` as `target_runtime` and compared against
# `time.time()` deltas there.
TARGET_RUNTIME = 30
@dataclass
class PageServicePipeliningConfig:
    """
    Common base type for the pipelining configurations used by the tests in this file.

    Instances are serialized with `dataclasses.asdict` and written to the pageserver
    config under the `page_service_pipelining` key. The base class itself has no fields.
    """
@dataclass
class PageServicePipeliningConfigSerial(PageServicePipeliningConfig):
    """Pipelining configuration that serializes to `{"mode": "serial"}`."""

    # Discriminator value emitted into the serialized config.
    mode: str = "serial"
@dataclass
class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
    """
    Pipelining configuration with `mode` defaulting to "pipelined".

    The three fields without defaults come first, so the constructor can be called
    positionally as `(max_batch_size, execution, batching)`.
    """

    # Batch size limit; this file only uses 32.
    max_batch_size: int
    # Execution strategy name; this file only uses "concurrent-futures".
    execution: str
    # Batching strategy name; this file only uses "scattered-lsn".
    batching: str
    # Discriminator value emitted into the serialized config.
    mode: str = "pipelined"
# Values for the pageserver `get_vectored_concurrent_io` mode that the tests are
# parametrized over.
PS_IO_CONCURRENCY = ["sidecar-task"]

# Pipelining configurations that the tests are parametrized over: the cross product of
# the value lists below (currently a single combination). Built with a comprehension so
# that the iteration variables do not leak into the module namespace.
PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = [
    PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
    for max_batch_size in [32]
    for execution in ["concurrent-futures"]
    for batching in ["scattered-lsn"]
]
@pytest.mark.parametrize(
    "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name",
    [
        # batchable workloads should show throughput and CPU efficiency improvements
        *[
            (
                50,
                config,
                TARGET_RUNTIME,
                ps_io_concurrency,
                100,
                128,
                f"batchable {dataclasses.asdict(config)}",
            )
            for config in PIPELINING_CONFIGS
            for ps_io_concurrency in PS_IO_CONCURRENCY
        ],
    ],
)
def test_postgres_seqscan(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    tablesize_mib: int,
    pipelining_config: PageServicePipeliningConfig,
    target_runtime: int,
    ps_io_concurrency: str,
    effective_io_concurrency: int,
    readhead_buffer_size: int,
    name: str,
):
    """
    Do a bunch of sequential scans with varying compute and pipelining configurations.

    Primary performance metrics are the achieved batching factor and throughput (wall clock time).
    Resource utilization is also interesting - we currently measure CPU time.

    The test is a fixed-runtime based type of test (target_runtime).
    Hence, the results are normalized to the number of iterations completed within target runtime.

    If the compute doesn't provide pipeline depth (effective_io_concurrency=1),
    performance should be about identical in all configurations.
    Pipelining can still yield improvements in these scenarios because it parses the
    next request while the current one is still being executed.

    If the compute provides pipeline depth (effective_io_concurrency=100), then
    pipelining configs, especially with max_batch_size>1 should yield dramatic improvements
    in all performance metrics.

    We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes
    in another table. The `scattered-lsn` batching mode handles this well whereas the
    initial implementation (`uniform-lsn`) would break the batch.

    Parameters (besides the fixtures):
    - tablesize_mib: size used to derive the number of rows inserted into the scanned table.
    - pipelining_config / ps_io_concurrency: written to the pageserver config before the workload.
    - target_runtime: minimum workload duration in seconds; at least two scans always run.
    - effective_io_concurrency / readhead_buffer_size: session settings applied on the compute.
    - name: not used in the body; it only makes the parametrized test id readable.
    """

    #
    # record perf-related parameters as metrics to simplify processing of results
    #
    params: dict[str, tuple[float | int, dict[str, Any]]] = {}

    params.update(
        {
            "tablesize_mib": (tablesize_mib, {"unit": "MiB"}),
            # target_runtime is just a polite ask to the workload to run for this long
            "effective_io_concurrency": (effective_io_concurrency, {}),
            "readhead_buffer_size": (readhead_buffer_size, {}),
            # name is not a metric, we just use it to identify the test easily in the `test_...[...]` notation
        }
    )
    # For storing configuration as a metric, insert a fake 0 with labels with actual data
    params.update(
        {
            "config": (
                0,
                {
                    "labels": {
                        "pipelining_config": dataclasses.asdict(pipelining_config),
                        "ps_io_concurrency": ps_io_concurrency,
                    }
                },
            )
        }
    )

    log.info("params: %s", params)

    for param, (value, kwargs) in params.items():
        # `unit` and `labels` are popped so that only unrecognized extras remain in **kwargs.
        zenbenchmark.record(
            param,
            metric_value=float(value),
            unit=kwargs.pop("unit", ""),
            report=MetricReport.TEST_PARAM,
            labels=kwargs.pop("labels", None),
            **kwargs,
        )

    #
    # Setup
    #

    env = neon_env_builder.init_start()
    ps_http = env.pageserver.http_client()
    endpoint = env.endpoints.create_start(
        "main",
        config_lines=[
            # minimal lfc & small shared buffers to force requests to pageserver
            "neon.max_file_cache_size=1MB",
            "shared_buffers=10MB",
        ],
    )
    conn = endpoint.connect()
    cur = conn.cursor()

    cur.execute("SET max_parallel_workers_per_gather=0")  # disable parallel backends
    cur.execute(f"SET effective_io_concurrency={effective_io_concurrency}")
    cur.execute(
        f"SET neon.readahead_buffer_size={readhead_buffer_size}"
    )  # this is the current default value, but let's hard-code that

    cur.execute("CREATE EXTENSION IF NOT EXISTS neon;")
    cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")

    log.info("Filling the table")
    cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)")
    tablesize = tablesize_mib * 1024 * 1024
    # One row is inserted per 8 KiB of requested table size.
    npages = tablesize // (8 * 1024)
    cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))

    #
    # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
    #

    @dataclass
    class Metrics:
        """Snapshot of the counters sampled around the workload; supports diffing and scaling."""

        time: float
        pageserver_batch_size_histo_sum: float
        pageserver_batch_size_histo_count: float
        pageserver_batch_breaks_reason_count: dict[str, int]
        compute_getpage_count: float
        pageserver_cpu_seconds_total: float

        def __sub__(self, other: "Metrics") -> "Metrics":
            """
            Field-wise difference `self - other`.

            Batch break reasons missing from `other` count as 0 there; reasons that only
            exist in `other` do not appear in the result.
            """
            return Metrics(
                time=self.time - other.time,
                pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum
                - other.pageserver_batch_size_histo_sum,
                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count
                - other.pageserver_batch_size_histo_count,
                compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                - other.pageserver_cpu_seconds_total,
                pageserver_batch_breaks_reason_count={
                    reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0)
                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
                },
            )

        def normalize(self, by) -> "Metrics":
            """Divide every field (and every per-reason count) by `by`."""
            return Metrics(
                time=self.time / by,
                pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by,
                pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
                compute_getpage_count=self.compute_getpage_count / by,
                pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
                pageserver_batch_breaks_reason_count={
                    reason: count / by
                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
                },
            )

    def get_metrics() -> Metrics:
        """Sample the compute's getpage counter and the pageserver metrics right now."""
        with conn.cursor() as cur:
            cur.execute(
                "select value from neon_perf_counters where metric='getpage_wait_seconds_count';"
            )
            compute_getpage_count = cur.fetchall()[0][0]
            pageserver_metrics = ps_http.get_metrics()
            # Dump every pageserver sample into the test log.
            for name, samples in pageserver_metrics.metrics.items():
                for sample in samples:
                    log.info(f"{name=} labels={sample.labels} {sample.value}")

            raw_batch_break_reason_count = pageserver_metrics.query_all(
                "pageserver_page_service_batch_break_reason_total",
                filter={"timeline_id": str(env.initial_timeline)},
            )

            batch_break_reason_count = {
                sample.labels["reason"]: int(sample.value)
                for sample in raw_batch_break_reason_count
            }

            return Metrics(
                time=time.time(),
                pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
                    "pageserver_page_service_batch_size_sum"
                ).value,
                pageserver_batch_size_histo_count=pageserver_metrics.query_one(
                    "pageserver_page_service_batch_size_count"
                ).value,
                pageserver_batch_breaks_reason_count=batch_break_reason_count,
                compute_getpage_count=compute_getpage_count,
                pageserver_cpu_seconds_total=pageserver_metrics.query_one(
                    "libmetrics_process_cpu_seconds_highres"
                ).value,
            )

    def workload(disruptor_started: threading.Event) -> Metrics:
        """
        Run sequential scans until `target_runtime` has elapsed (and at least twice).

        The first scan is a warm-up: the `before` snapshot is taken after it, and the
        result is normalized by the number of scans that followed it.
        """
        disruptor_started.wait()
        start = time.time()
        iters = 0
        while time.time() - start < target_runtime or iters < 2:
            if iters == 1:
                # round zero for warming up
                before = get_metrics()
            cur.execute("select sum(data::bigint) from t")
            # The table holds 1..npages, so the sum has a closed form.
            assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
            iters += 1
        after = get_metrics()
        return (after - before).normalize(iters - 1)

    def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
        """
        Emit logical messages in a loop on a separate connection until asked to stop.

        Signals `disruptor_started` once the first message has been emitted and returns
        the number of completed loop iterations.
        """
        conn = endpoint.connect()
        cur = conn.cursor()
        iters = 0
        while True:
            cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
            if stop_disruptor.is_set():
                break
            disruptor_started.set()
            iters += 1
            time.sleep(0.001)
        return iters

    env.pageserver.patch_config_toml_nonrecursive(
        {
            "page_service_pipelining": dataclasses.asdict(pipelining_config),
            "get_vectored_concurrent_io": {"mode": ps_io_concurrency},
        }
    )

    # Enable trace-level logging for page_service. Nothing in this test parses the log,
    # so this is only useful for manual inspection.
    env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})

    log.info("Starting workload")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        disruptor_started = threading.Event()
        stop_disruptor = threading.Event()
        disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
        workload_fut = executor.submit(workload, disruptor_started)
        try:
            metrics = workload_fut.result()
        finally:
            # Always release the disruptor, even if the workload raised. Otherwise the
            # executor's shutdown on context exit would wait forever on the disruptor
            # loop and the test would hang instead of reporting the failure.
            stop_disruptor.set()
        ndisruptions = disruptor_fut.result()
        log.info("Disruptor issued %d disrupting requests", ndisruptions)

    log.info("Results: %s", metrics)

    #
    # Sanity-checks on the collected data
    #
    # assert that getpage counts roughly match between compute and ps
    assert metrics.pageserver_batch_size_histo_sum == pytest.approx(
        metrics.compute_getpage_count, rel=0.01
    )

    #
    # Record the results
    #

    for metric, value in dataclasses.asdict(metrics).items():
        if metric == "pageserver_batch_breaks_reason_count":
            # The per-reason dict is flattened into one record per reason.
            assert isinstance(value, dict)
            for reason, count in value.items():
                zenbenchmark.record(
                    f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM
                )
        else:
            zenbenchmark.record(
                f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM
            )

    # Average batch size: histogram sum divided by histogram count.
    zenbenchmark.record(
        "perfmetric.batching_factor",
        metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count,
        unit="",
        report=MetricReport.HIGHER_IS_BETTER,
    )
@pytest.mark.parametrize(
    "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name",
    [
        (cfg, io_mode, stack_height, depth, f"{dataclasses.asdict(cfg)}")
        for cfg in PIPELINING_CONFIGS
        for io_mode in PS_IO_CONCURRENCY
        for depth in [1, 2, 32]
        for stack_height in [0, 20]
    ],
)
def test_random_reads(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    pg_bin: PgBin,
    pipelining_config: PageServicePipeliningConfig,
    ps_io_concurrency: str,
    l0_stack_height: int,
    queue_depth: int,
    name: str,
):
    """
    Throw pagebench random getpage at latest lsn workload from a single client against pageserver.

    The environment comes from a snapshot (one per `l0_stack_height`) that holds an L0
    stack built by `l0stack.make_l0_stack`. pagebench is restricted to the `data`
    relation and runs for 10s at the requested `queue_depth`. Recorded results are the
    request count, mean and percentile latencies reported by pagebench, plus the increase
    of the pageserver's read IO operation counter across the run. `name` is not used in
    the body.
    """

    #
    # Setup
    #

    def create_snapshot_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
        """Start a fresh env and populate it with the L0 stack of the requested height."""
        snapshot_env = neon_env_builder.init_start()
        snapshot_endpoint = snapshot_env.endpoints.create_start("main")
        shape = l0stack.L0StackShape(
            logical_table_size_mib=50, delta_stack_height=l0_stack_height
        )
        l0stack.make_l0_stack(snapshot_endpoint, shape)
        return snapshot_env

    snapshot_ident = f"test_page_service_batching--test_pagebench-{l0_stack_height}"
    env = neon_env_builder.build_and_use_snapshot(snapshot_ident, create_snapshot_env)

    def patch_ps_config(ps_config):
        """Write the parametrized pipelining and concurrent-IO settings into the config."""
        ps_config.update(
            {
                "page_service_pipelining": dataclasses.asdict(pipelining_config),
                "get_vectored_concurrent_io": {"mode": ps_io_concurrency},
            }
        )

    env.pageserver.edit_config_toml(patch_ps_config)

    env.start()

    # Start a compute at the safekeeper's commit LSN just long enough to look up the
    # relation that pagebench should be limited to.
    commit_lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline)
    lookup_endpoint = env.endpoints.create_start("main", lsn=commit_lsn)
    data_table_relnode_oid = lookup_endpoint.safe_psql_scalar("SELECT 'data'::regclass::oid")
    lookup_endpoint.stop_and_destroy()

    for safekeeper in env.safekeepers:
        safekeeper.stop()

    env.pageserver.allowed_errors.append(
        # https://github.com/neondatabase/neon/issues/6925
        r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*"
    )

    ps_http = env.pageserver.http_client()

    #
    # Run pagebench, sampling pageserver metrics right before and after
    #

    metrics_before = ps_http.get_metrics()

    pagebench_binary = str(env.neon_binpath / "pagebench")
    cmd = [pagebench_binary, "get-page-latest-lsn"]
    cmd += ["--mgmt-api-endpoint", ps_http.base_url]
    cmd += ["--page-service-connstring", env.pageserver.connstr(password=None)]
    cmd += ["--num-clients", "1"]
    cmd += ["--queue-depth", str(queue_depth)]
    cmd += ["--only-relnode", str(data_table_relnode_oid)]
    cmd += ["--runtime", "10s"]
    log.info(f"command: {' '.join(cmd)}")
    basepath = pg_bin.run_capture(cmd, with_command_header=False)
    # pagebench's stdout was captured next to `basepath`.
    results_path = Path(basepath + ".stdout")
    log.info(f"Benchmark results at: {results_path}")

    metrics_after = ps_http.get_metrics()

    with results_path.open() as results_file:
        results = json.load(results_file)
    log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}")

    #
    # Record the results
    #

    total = results["total"]

    zenbenchmark.record(
        "request_count",
        metric_value=total["request_count"],
        unit="",
        report=MetricReport.HIGHER_IS_BETTER,
    )

    zenbenchmark.record(
        "latency_mean",
        metric_value=humantime_to_ms(total["latency_mean"]),
        unit="ms",
        report=MetricReport.LOWER_IS_BETTER,
    )

    percentiles_key = "latency_percentiles"
    for percentile, latency in total[percentiles_key].items():
        zenbenchmark.record(
            f"{percentiles_key}.{percentile}",
            metric_value=humantime_to_ms(latency),
            unit="ms",
            report=MetricReport.LOWER_IS_BETTER,
        )

    def read_ops(snapshot):
        """Look up the read IO operation counter sample in a metrics snapshot."""
        return snapshot.query_one(
            "pageserver_io_operations_seconds_count", filter={"operation": "read"}
        )

    reads_before = read_ops(metrics_before)
    reads_after = read_ops(metrics_after)

    zenbenchmark.record(
        "virtual_file_reads",
        metric_value=reads_after.value - reads_before.value,
        unit="",
        report=MetricReport.LOWER_IS_BETTER,
    )