From 60dae0b4ac2b9217f12750baa6c4357f4eb796c1 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Fri, 8 Oct 2021 00:34:29 +0300
Subject: [PATCH] Add test case that demonstrates Write Amplification.

---
 .../performance/test_write_amplification.py | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 test_runner/performance/test_write_amplification.py

diff --git a/test_runner/performance/test_write_amplification.py b/test_runner/performance/test_write_amplification.py
new file mode 100644
index 0000000000..09310c702b
--- /dev/null
+++ b/test_runner/performance/test_write_amplification.py
@@ -0,0 +1,74 @@
+# Demonstrate Write Amplification with the naive oldest-first layer
+# checkpointing algorithm.
+#
+# In each iteration of the test, we create a new table that's slightly under 10
+# MB in size (10 MB is the current "segment size" used by the page server). Then
+# we make a tiny update to all the tables already created. This creates a WAL
+# pattern where you have a lot of updates on one segment (the newly created
+# one), alternating with small updates on all relations. This is the worst-case
+# scenario for the naive checkpointing policy where we write out the layers
+# in LSN order, oldest layer first. That causes a new 10 MB image layer to be
+# created for each of those small updates. This is the Write Amplification
+# problem at its finest.
+import os
+from contextlib import closing
+from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
+
+pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
+
+def test_write_amplification(postgres: PostgresFactory, pageserver: ZenithPageserver, pg_bin, zenith_cli, zenbenchmark, repo_dir: str):
+    # Create a branch for us
+    zenith_cli.run(["branch", "test_write_amplification", "empty"])
+
+    pg = postgres.create_start('test_write_amplification')
+    print("postgres is running on 'test_write_amplification' branch")
+
+    # Open a connection directly to the page server that we'll use to force
+    # flushing the layers to disk
+    psconn = pageserver.connect()
+    pscur = psconn.cursor()
+
+    with closing(pg.connect()) as conn:
+        with conn.cursor() as cur:
+            # Get the timeline ID of our branch. We need it for the 'do_gc' command
+            cur.execute("SHOW zenith.zenith_timeline")
+            timeline = cur.fetchone()[0]
+
+            with zenbenchmark.record_pageserver_writes(pageserver, 'pageserver_writes'):
+                with zenbenchmark.record_duration('run'):
+
+                    # NOTE: Because each iteration updates every table already created,
+                    # the runtime and write amplification are O(n^2), where n is the
+                    # number of iterations.
+                    for i in range(25):
+                        cur.execute(f'''
+                        CREATE TABLE tbl{i} AS
+                            SELECT g as i, 'long string to consume some space' || g as t
+                            FROM generate_series(1, 100000) g
+                        ''')
+                        cur.execute(f"create index on tbl{i} (i);")
+                        for j in range(1, i):
+                            cur.execute(f"delete from tbl{j} where i = {i}")
+
+                        # Force checkpointing. As of this writing, we don't have
+                        # a back-pressure mechanism, and the page server cannot
+                        # keep up digesting and checkpointing the WAL at the
+                        # rate that it is generated. If we don't force a
+                        # checkpoint, the WAL will just accumulate in memory
+                        # until we hit an OOM error. So in effect, we use much
+                        # more memory to hold the incoming WAL, and write it
+                        # out in larger batches than we'd really want. Using
+                        # more memory hides the write amplification problem this
+                        # test tries to demonstrate.
+                        #
+                        # The write amplification problem is real, and using
+                        # more memory isn't the right solution. We could also
+                        # demonstrate the effect by generating the WAL more
+                        # slowly, adding some delays in this loop. But forcing
+                        # the checkpointing and GC makes the test go faster,
+                        # with the same total I/O effect.
+                        pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
+
+    # Report disk space used by the repository
+    timeline_size = zenbenchmark.get_timeline_size(repo_dir, pageserver.initial_tenant, timeline)
+    zenbenchmark.record('size', timeline_size / (1024*1024), 'MB')
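
For reference, here is a small back-of-the-envelope sketch (not part of the patch) of the amplification the test's comments describe. It assumes, per those comments, that under the naive oldest-first policy every tiny update to an already-existing table forces a fresh ~10 MB image layer to be written. The constants mirror the test (25 iterations, ~10 MB tables); the file name write_amp_estimate.py and the helper naive_policy_writes_mb are made up for illustration, and the model is only an approximation of the O(n^2) behavior, not a measurement of the page server.

    # write_amp_estimate.py -- illustrative only; assumes one ~10 MB image
    # layer is rewritten for every small update to an existing table (the
    # naive oldest-first policy described in the test's comments).

    SEGMENT_MB = 10    # approximate table / segment size used in the test
    ITERATIONS = 25    # matches `for i in range(25)` in the test

    def naive_policy_writes_mb(iterations: int, segment_mb: int) -> int:
        """MB written if each small update forces a full image layer rewrite."""
        total = 0
        for i in range(iterations):
            total += segment_mb          # the newly created table's own layer
            total += i * segment_mb      # one rewritten layer per older table
        return total

    logical_mb = ITERATIONS * SEGMENT_MB                  # data actually inserted
    physical_mb = naive_policy_writes_mb(ITERATIONS, SEGMENT_MB)
    print(f"logical data:        ~{logical_mb} MB")
    print(f"naive-policy writes: ~{physical_mb} MB "
          f"({physical_mb / logical_mb:.1f}x amplification)")

Under these assumptions, 25 iterations produce roughly 3250 MB of layer writes for about 250 MB of logical data, i.e. ~13x amplification, which is the quadratic growth the NOTE in the test refers to.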