neon/test_runner/performance/test_write_amplification.py

# Demonstrate Write Amplification with the naive oldest-first layer
# checkpointing algorithm.
#
# In each iteration of the test, we create a new table that's slightly under 10
# MB in size (10 MB is the current "segment size" used by the page server). Then
# we make a tiny update to all the tables already created. This creates a WAL
# pattern where you have a lot of updates on one segment (the newly created
# one), alternating with small updates on all relations. This is the worst
# case scenario for the naive checkpointing policy where we write out the layers
# in LSN order, writing the oldest layer first. That causes a new 10 MB image
# layer to be created for each of those small updates.  This is the Write
# Amplification problem at its finest.
import os
from contextlib import closing
from fixtures.benchmark_fixture import MetricReport
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")


def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
    """Benchmark page server write amplification under a worst-case WAL pattern.

    Creates a dedicated branch, then runs 25 iterations. Each iteration creates
    a new 100000-row table with an index, deletes one row from previously
    created tables, and issues a 'do_gc' command to the page server.

    Recorded metrics:
      * 'pageserver_writes' - page server writes during the whole loop
      * 'run'               - wall-clock duration of the whole loop
      * 'size'              - timeline size on disk afterwards, in MB

    Args:
        zenith_simple_env: test environment providing the page server, the
            postgres factory and the CLI wrapper.
        zenbenchmark: benchmark fixture used to record the metrics above.
    """
    env = zenith_simple_env
    # Create a branch for us
    env.zenith_cli(["branch", "test_write_amplification", "empty"])

    pg = env.postgres.create_start('test_write_amplification')
    log.info("postgres is running on 'test_write_amplification' branch")

    # Open a connection directly to the page server that we'll use to force
    # flushing the layers to disk. Both the connection and its cursor are
    # wrapped in closing() so they are released even if the test fails midway,
    # the same way the compute-node connection below is handled.
    with closing(env.pageserver.connect()) as psconn, closing(psconn.cursor()) as pscur:
        with closing(pg.connect()) as conn, conn.cursor() as cur:
            # Get the timeline ID of our branch. We need it for the 'do_gc' command
            cur.execute("SHOW zenith.zenith_timeline")
            timeline = cur.fetchone()[0]

            with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
                with zenbenchmark.record_duration('run'):

                    # NOTE: Because each iteration updates every table already created,
                    # the runtime and write amplification is O(n^2), where n is the
                    # number of iterations.
                    for i in range(25):
                        cur.execute(f'''
                        CREATE TABLE tbl{i} AS
                            SELECT g as i, 'long string to consume some space' || g as t
                            FROM generate_series(1, 100000) g
                        ''')
                        cur.execute(f"create index on tbl{i} (i);")
                        # NOTE(review): range(1, i) never touches tbl0, although the
                        # file header says every previously created table gets a
                        # tiny update. Left as-is so the benchmark workload (and
                        # its historical numbers) do not change - confirm whether
                        # skipping tbl0 is intentional.
                        for j in range(1, i):
                            cur.execute(f"delete from tbl{j} where i = {i}")

                        # Force checkpointing. As of this writing, we don't have
                        # a back-pressure mechanism, and the page server cannot
                        # keep up digesting and checkpointing the WAL at the
                        # rate that it is generated. If we don't force a
                        # checkpoint, the WAL will just accumulate in memory
                        # until you hit OOM error. So in effect, we use much
                        # more memory to hold the incoming WAL, and write them
                        # out in larger batches than we'd really want. Using
                        # more memory hides the write amplification problem this
                        # test tries to demonstrate.
                        #
                        # The write amplification problem is real, and using
                        # more memory isn't the right solution. We could
                        # demonstrate the effect also by generating the WAL
                        # slower, adding some delays in this loop.  But forcing
                        # the checkpointing and GC makes the test go faster,
                        # with the same total I/O effect.
                        pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")

            # Report disk space used by the repository
            timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
                                                           env.initial_tenant,
                                                           timeline)
            zenbenchmark.record('size',
                                timeline_size / (1024 * 1024),
                                'MB',
                                report=MetricReport.LOWER_IS_BETTER)