neon/test_runner/batch_others/test_snapfiles_gc.py

from contextlib import closing
import psycopg2.extras
import time
from fixtures.utils import print_gc_result
from fixtures.zenith_fixtures import ZenithEnv
from fixtures.log_helper import log

# Plugin module for pytest to load; a single module name given as a plain string.
pytest_plugins = "fixtures.zenith_fixtures"


#
# Test Garbage Collection of old layer files
#
# This test is pretty tightly coupled with the current implementation of layered
# storage, in layered_repository.rs.
#
def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
    """Check the layer-file counters reported by the pageserver's 'do_gc' command.

    The test creates a branch with a one-column table, records a baseline
    count of remaining layer files, and then runs GC after inserts, after no
    change at all, and after DROP TABLE, asserting on the
    'layer_relfiles_*' fields of each result row.
    """
    env = zenith_simple_env
    env.zenith_cli(["branch", "test_layerfiles_gc", "empty"])
    pg = env.postgres.create_start('test_layerfiles_gc')

    with closing(pg.connect()) as conn, conn.cursor() as cur:
        with closing(env.pageserver.connect()) as psconn, \
                psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:

            # The 'do_gc' command needs the timeline ID of our branch.
            cur.execute("SHOW zenith.zenith_timeline")
            timeline = cur.fetchone()[0]

            def run_gc():
                # Issue 'do_gc' for our tenant/timeline, print the result row
                # and hand it back to the caller.
                pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
                gc_row = pscur.fetchone()
                print_gc_result(gc_row)
                return gc_row

            # Set up the test table.
            cur.execute("CREATE TABLE foo(x integer)")
            cur.execute("INSERT INTO foo VALUES (1)")

            cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
            relfilenode_row = cur.fetchone()
            log.info(f"relfilenode is {relfilenode_row[0]}")

            # Before the real checks we want a clean slate: any garbage the
            # CREATE TABLE left in the catalogs would otherwise throw off the
            # expected counts. VACUUM first, so that autovacuum or pruning is
            # less likely to kick in later and confuse the numbers.
            cur.execute("VACUUM")

            # Deleting the row updates the Visibility Map now, so that the VM
            # update does not confuse the numbers later either.
            cur.execute("DELETE FROM foo")

            log.info("Running GC before test")
            gc_stats = run_gc()
            # Baseline: how many layer files are left after this first GC.
            baseline_layers = (gc_stats['layer_relfiles_total'] -
                               gc_stats['layer_relfiles_removed'])
            assert baseline_layers > 0

            # One insert followed by GC. The checkpoint should freeze the
            # layer, leaving only the most recent image layer for the rel and
            # removing the old image and delta layer.
            log.info("Inserting one row and running GC")
            cur.execute("INSERT INTO foo VALUES (1)")
            gc_stats = run_gc()
            assert gc_stats['layer_relfiles_total'] == baseline_layers + 2
            assert gc_stats['layer_relfiles_removed'] == 2
            assert gc_stats['layer_relfiles_dropped'] == 0

            # Two rounds of "insert two rows, then GC". Each round should
            # create a new image and delta layer file with the new contents,
            # and then remove the old image and the just-created delta layer.
            for _ in range(2):
                log.info("Inserting two more rows and running GC")
                cur.execute("INSERT INTO foo VALUES (2)")
                cur.execute("INSERT INTO foo VALUES (3)")

                gc_stats = run_gc()
                assert gc_stats['layer_relfiles_total'] == baseline_layers + 2
                assert gc_stats['layer_relfiles_removed'] == 2
                assert gc_stats['layer_relfiles_dropped'] == 0

            # GC with no database changes in between should remove nothing.
            log.info("Run GC again, with nothing to do")
            gc_stats = run_gc()
            assert gc_stats['layer_relfiles_total'] == baseline_layers
            assert gc_stats['layer_relfiles_removed'] == 0
            assert gc_stats['layer_relfiles_dropped'] == 0

            #
            # DROP TABLE: check what GC reports for the dropped relation's
            # data and metadata.
            #
            log.info("Drop table and run GC again")
            cur.execute("DROP TABLE foo")

            gc_stats = run_gc()

            # The latest layers still cannot be removed, because they serve
            # as tombstones for earlier layers.
            assert gc_stats['layer_relfiles_dropped'] == 0
            # Each relation fork is counted separately, hence 3.
            assert gc_stats['layer_relfiles_needed_as_tombstone'] == 3

            # The catalog updates also create new layer files of the
            # catalogs, and those are counted as 'removed'.
            assert gc_stats['layer_relfiles_removed'] > 0

            # TODO Change the test to check actual GC of dropped layers.
            # Each relation fork is counted separately, hence 3.
            #assert gc_stats['layer_relfiles_dropped'] == 3

            # TODO: perhaps we should count catalog and user relations
            # separately, to make this kind of testing more robust