A newer version of mypy fixes a buggy error when trying to update only the boto3 stubs. However, it brings new checks and starts to complain when we index into the result of cursor.fetchone() without checking for None first. So this introduces a wrapper to simplify querying for scalar values. I tried to use the cursor_factory connection argument, but without success. There may be a better way to do this, but this looks the simplest.
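
A minimal sketch of what such a wrapper might look like (the exact helper in fixtures/utils.py may differ):

from typing import Any

def query_scalar(cur, query: str) -> Any:
    # Sketch only; the real helper in fixtures/utils.py may differ.
    # fetchone() is typed as Optional, so mypy requires narrowing
    # the row to non-None before we can index into it.
    cur.execute(query)
    row = cur.fetchone()
    assert row is not None, f"query returned no rows: {query}"
    return row[0]
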
import os
from contextlib import closing
from fixtures.benchmark_fixture import MetricReport
from fixtures.neon_fixtures import NeonEnv
from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare
from fixtures.log_helper import log

import psycopg2.extras
import random
import time

from fixtures.utils import query_scalar


# This is a clear-box test that demonstrates the worst-case scenario for the
# "1 segment per layer" implementation of the pageserver. It writes to random
# rows, while almost never writing to the same segment twice before flushing.
# A naive pageserver implementation would create a full image layer for each
# dirty segment, leading to write_amplification = segment_size / page_size,
# when compared to vanilla postgres. With segment_size = 10 MiB and 8 KiB
# pages, that's 1280.
def test_random_writes(neon_with_baseline: PgCompare):
    env = neon_with_baseline

    # Number of rows in the test database. 1M rows runs quickly, but implies
    # a small effective_checkpoint_distance, which makes the test less realistic.
    # Using a 300 TB database would imply a 250 MB effective_checkpoint_distance,
    # but it would take a very long time to run. From what I've seen so far,
    # increasing n_rows doesn't have an impact on the (neon_runtime / vanilla_runtime)
    # performance ratio.
    n_rows = 1 * 1000 * 1000  # around 36 MB table

    # Number of writes per 3 segments. A value of 1 should produce a random
    # workload where we almost never write to the same segment twice. Larger
    # values of load_factor produce a larger effective_checkpoint_distance,
    # making the test more realistic, but less effective. If you want a realistic
    # worst-case scenario and have time to wait, increase n_rows instead.
    load_factor = 1

    # Not sure why, but this matters in a weird way (up to a 2x difference in perf).
    # TODO: look into it
    n_iterations = 1

    with closing(env.pg.connect()) as conn:
        with conn.cursor() as cur:
            # Create the test table
            with env.record_duration('init'):
                cur.execute("""
                    CREATE TABLE Big(
                        pk integer primary key,
                        count integer default 0
                    );
                """)

            # Insert n_rows in batches to avoid query timeouts
            rows_inserted = 0
            while rows_inserted < n_rows:
                rows_to_insert = min(1000 * 1000, n_rows - rows_inserted)
                low = rows_inserted + 1
                high = rows_inserted + rows_to_insert
                cur.execute(f"INSERT INTO Big (pk) values (generate_series({low},{high}))")
                rows_inserted += rows_to_insert

            # Get the table size (it can't be predicted exactly because of
            # padding and alignment)
            table_size = query_scalar(cur, "SELECT pg_relation_size('Big')")
            env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM)

            # Decide how much to write, based on knowledge of pageserver implementation.
            # Avoiding segment collisions maximizes (neon_runtime / vanilla_runtime).
            segment_size = 10 * 1024 * 1024
            n_segments = table_size // segment_size
            n_writes = load_factor * n_segments // 3
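            # (Rough numbers, for orientation: with the default n_rows of 1M the
            # table is around 36 MB, so n_segments comes out to ~3 and n_writes
            # to ~1 per iteration at load_factor = 1.)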

            # The closer this is to 250 MB, the more realistic the test is.
            effective_checkpoint_distance = table_size * n_writes // n_rows
            env.zenbenchmark.record("effective_checkpoint_distance",
                                    effective_checkpoint_distance,
                                    'bytes',
                                    MetricReport.TEST_PARAM)
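            # (With the defaults that is roughly 36 MB * 1 // 1M = ~36 bytes,
            # far below the 250 MB target, which is why increasing n_rows makes
            # the test more realistic.)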

            # Update random keys
            with env.record_duration('run'):
                for it in range(n_iterations):
                    for i in range(n_writes):
                        key = random.randint(1, n_rows)
                        cur.execute(f"update Big set count=count+1 where pk={key}")
                    env.flush()